1. package com.sun.java_cup.internal;
  2. import com.sun.java_cup.internal.runtime.Symbol;
  3. import java.util.Hashtable;
  /** This class implements a small scanner (aka lexical analyzer or lexer) for
   *  the JavaCup specification.  This scanner reads characters from standard
   *  input (System.in) and returns Symbol objects carrying the terminal
   *  number of the next Symbol.  Once end of input is reached the EOF Symbol is
   *  returned on every subsequent call.<p>
   *  Symbols currently returned include: <pre>
   *    Symbol          Constant Returned     Symbol          Constant Returned
   *    ------          -----------------     ------          -----------------
   *    "package"       PACKAGE               "import"        IMPORT
   *    "code"          CODE                  "action"        ACTION
   *    "parser"        PARSER                "terminal"      TERMINAL
   *    "non"           NON                   "init"          INIT
   *    "scan"          SCAN                  "with"          WITH
   *    "start"         START                 "precedence"    PRECEDENCE
   *    "left"          LEFT                  "right"         RIGHT
   *    "nonassoc"      NONASSOC              "%prec"         PERCENT_PREC
   *       [            LBRACK                   ]            RBRACK
   *       ;            SEMI
   *       ,            COMMA                    *            STAR
   *       .            DOT                      :            COLON
   *       ::=          COLON_COLON_EQUALS       |            BAR
   *       identifier   ID                       {:...:}      CODE_STRING
   *    "nonterminal"   NONTERMINAL
   *  </pre>
   *  All symbol constants are defined in sym.java which is generated by
   *  JavaCup from parser.cup.<p>
   *
   *  In addition to the scanner proper (called first via init() then with
   *  next_token() to get each Symbol) this class provides simple error and
   *  warning routines and keeps a count of errors and warnings that is
   *  publicly accessible.<p>
   *
   *  This class is "static" (i.e., it has only static members and methods).
   *
   * @version last updated: 7/3/96
   * @author  Frank Flannery
   */
  41. public class lexer {
  42. /*-----------------------------------------------------------*/
  43. /*--- Constructor(s) ----------------------------------------*/
  44. /*-----------------------------------------------------------*/
  45. /** The only constructor is private, so no instances can be created. */
  46. private lexer() { }
  47. /*-----------------------------------------------------------*/
  48. /*--- Static (Class) Variables ------------------------------*/
  49. /*-----------------------------------------------------------*/
  50. /** First character of lookahead. */
  51. protected static int next_char;
  52. /** Second character of lookahead. */
  53. protected static int next_char2;
  54. /** Second character of lookahead. */
  55. protected static int next_char3;
  56. /** Second character of lookahead. */
  57. protected static int next_char4;
  58. /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
  59. /** EOF constant. */
  60. protected static final int EOF_CHAR = -1;
  61. /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
  62. /** Table of keywords. Keywords are initially treated as identifiers.
  63. * Just before they are returned we look them up in this table to see if
  64. * they match one of the keywords. The string of the name is the key here,
  65. * which indexes Integer objects holding the symbol number.
  66. */
  67. protected static Hashtable keywords = new Hashtable(23);
  68. /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
  69. /** Table of single character symbols. For ease of implementation, we
  70. * store all unambiguous single character Symbols in this table of Integer
  71. * objects keyed by Integer objects with the numerical value of the
  72. * appropriate char (currently Character objects have a bug which precludes
  73. * their use in tables).
  74. */
  75. protected static Hashtable char_symbols = new Hashtable(11);
  76. /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
  77. /** Current line number for use in error messages. */
  78. protected static int current_line = 1;
  79. /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
  80. /** Character position in current line. */
  81. protected static int current_position = 1;
  82. /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
  83. /** Character position in current line. */
  84. protected static int absolute_position = 1;
  85. /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
  86. /** Count of total errors detected so far. */
  87. public static int error_count = 0;
  88. /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
  89. /** Count of warnings issued so far */
  90. public static int warning_count = 0;
  91. /*-----------------------------------------------------------*/
  92. /*--- Static Methods ----------------------------------------*/
  93. /*-----------------------------------------------------------*/
  94. /** Initialize the scanner. This sets up the keywords and char_symbols
  95. * tables and reads the first two characters of lookahead.
  96. */
  97. public static void init() throws java.io.IOException
  98. {
  99. /* set up the keyword table */
  100. keywords.put("package", new Integer(sym.PACKAGE));
  101. keywords.put("import", new Integer(sym.IMPORT));
  102. keywords.put("code", new Integer(sym.CODE));
  103. keywords.put("action", new Integer(sym.ACTION));
  104. keywords.put("parser", new Integer(sym.PARSER));
  105. keywords.put("terminal", new Integer(sym.TERMINAL));
  106. keywords.put("non", new Integer(sym.NON));
  107. keywords.put("nonterminal",new Integer(sym.NONTERMINAL));// [CSA]
  108. keywords.put("init", new Integer(sym.INIT));
  109. keywords.put("scan", new Integer(sym.SCAN));
  110. keywords.put("with", new Integer(sym.WITH));
  111. keywords.put("start", new Integer(sym.START));
  112. keywords.put("precedence", new Integer(sym.PRECEDENCE));
  113. keywords.put("left", new Integer(sym.LEFT));
  114. keywords.put("right", new Integer(sym.RIGHT));
  115. keywords.put("nonassoc", new Integer(sym.NONASSOC));
  116. /* set up the table of single character symbols */
  117. char_symbols.put(new Integer(';'), new Integer(sym.SEMI));
  118. char_symbols.put(new Integer(','), new Integer(sym.COMMA));
  119. char_symbols.put(new Integer('*'), new Integer(sym.STAR));
  120. char_symbols.put(new Integer('.'), new Integer(sym.DOT));
  121. char_symbols.put(new Integer('|'), new Integer(sym.BAR));
  122. char_symbols.put(new Integer('['), new Integer(sym.LBRACK));
  123. char_symbols.put(new Integer(']'), new Integer(sym.RBRACK));
  124. /* read two characters of lookahead */
  125. next_char = System.in.read();
  126. if (next_char == EOF_CHAR) {
  127. next_char2 = EOF_CHAR;
  128. next_char3 = EOF_CHAR;
  129. next_char4 = EOF_CHAR;
  130. } else {
  131. next_char2 = System.in.read();
  132. if (next_char2 == EOF_CHAR) {
  133. next_char3 = EOF_CHAR;
  134. next_char4 = EOF_CHAR;
  135. } else {
  136. next_char3 = System.in.read();
  137. if (next_char3 == EOF_CHAR) {
  138. next_char4 = EOF_CHAR;
  139. } else {
  140. next_char4 = System.in.read();
  141. }
  142. }
  143. }
  144. }
  145. /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
  146. /** Advance the scanner one character in the input stream. This moves
  147. * next_char2 to next_char and then reads a new next_char2.
  148. */
  149. protected static void advance() throws java.io.IOException
  150. {
  151. int old_char;
  152. old_char = next_char;
  153. next_char = next_char2;
  154. if (next_char == EOF_CHAR) {
  155. next_char2 = EOF_CHAR;
  156. next_char3 = EOF_CHAR;
  157. next_char4 = EOF_CHAR;
  158. } else {
  159. next_char2 = next_char3;
  160. if (next_char2 == EOF_CHAR) {
  161. next_char3 = EOF_CHAR;
  162. next_char4 = EOF_CHAR;
  163. } else {
  164. next_char3 = next_char4;
  165. if (next_char3 == EOF_CHAR) {
  166. next_char4 = EOF_CHAR;
  167. } else {
  168. next_char4 = System.in.read();
  169. }
  170. }
  171. }
  172. /* count this */
  173. absolute_position++;
  174. current_position++;
  175. if (old_char == '\n')
  176. {
  177. current_line++;
  178. current_position = 1;
  179. }
  180. }
  181. /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
  182. /** Emit an error message. The message will be marked with both the
  183. * current line number and the position in the line. Error messages
  184. * are printed on standard error (System.err).
  185. * @param message the message to print.
  186. */
  187. public static void emit_error(String message)
  188. {
  189. System.err.println("Error at " + current_line + "(" + current_position +
  190. "): " + message);
  191. error_count++;
  192. }
  193. /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
  194. /** Emit a warning message. The message will be marked with both the
  195. * current line number and the position in the line. Messages are
  196. * printed on standard error (System.err).
  197. * @param message the message to print.
  198. */
  199. public static void emit_warn(String message)
  200. {
  201. System.err.println("Warning at " + current_line + "(" + current_position +
  202. "): " + message);
  203. warning_count++;
  204. }
  205. /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
  206. /** Determine if a character is ok to start an id.
  207. * @param ch the character in question.
  208. */
  209. protected static boolean id_start_char(int ch)
  210. {
  211. /* allow for % in identifiers. a hack to allow my
  212. %prec in. Should eventually make lex spec for this
  213. frankf */
  214. return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') ||
  215. (ch == '_');
  216. // later need to deal with non-8-bit chars here
  217. }
  218. /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
  219. /** Determine if a character is ok for the middle of an id.
  220. * @param ch the character in question.
  221. */
  222. protected static boolean id_char(int ch)
  223. {
  224. return id_start_char(ch) || (ch >= '0' && ch <= '9');
  225. }
  226. /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
  227. /** Try to look up a single character symbol, returns -1 for not found.
  228. * @param ch the character in question.
  229. */
  230. protected static int find_single_char(int ch)
  231. {
  232. Integer result;
  233. result = (Integer)char_symbols.get(new Integer((char)ch));
  234. if (result == null)
  235. return -1;
  236. else
  237. return result.intValue();
  238. }
  239. /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
  240. /** Handle swallowing up a comment. Both old style C and new style C++
  241. * comments are handled.
  242. */
  243. protected static void swallow_comment() throws java.io.IOException
  244. {
  245. /* next_char == '/' at this point */
  246. /* is it a traditional comment */
  247. if (next_char2 == '*')
  248. {
  249. /* swallow the opener */
  250. advance(); advance();
  251. /* swallow the comment until end of comment or EOF */
  252. for (;;)
  253. {
  254. /* if its EOF we have an error */
  255. if (next_char == EOF_CHAR)
  256. {
  257. emit_error("Specification file ends inside a comment");
  258. return;
  259. }
  260. /* if we can see the closer we are done */
  261. if (next_char == '*' && next_char2 == '/')
  262. {
  263. advance();
  264. advance();
  265. return;
  266. }
  267. /* otherwise swallow char and move on */
  268. advance();
  269. }
  270. }
  271. /* is its a new style comment */
  272. if (next_char2 == '/')
  273. {
  274. /* swallow the opener */
  275. advance(); advance();
  276. /* swallow to '\n', '\f', or EOF */
  277. while (next_char != '\n' && next_char != '\f' && next_char!=EOF_CHAR)
  278. advance();
  279. return;
  280. }
  281. /* shouldn't get here, but... if we get here we have an error */
  282. emit_error("Malformed comment in specification -- ignored");
  283. advance();
  284. }
  285. /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
  286. /** Swallow up a code string. Code strings begin with "{:" and include
  287. all characters up to the first occurrence of ":}" (there is no way to
  288. include ":}" inside a code string). The routine returns a String
  289. object suitable for return by the scanner.
  290. */
  291. protected static Symbol do_code_string() throws java.io.IOException
  292. {
  293. StringBuffer result = new StringBuffer();
  294. /* at this point we have lookahead of "{:" -- swallow that */
  295. advance(); advance();
  296. /* save chars until we see ":}" */
  297. while (!(next_char == ':' && next_char2 == '}'))
  298. {
  299. /* if we have run off the end issue a message and break out of loop */
  300. if (next_char == EOF_CHAR)
  301. {
  302. emit_error("Specification file ends inside a code string");
  303. break;
  304. }
  305. /* otherwise record the char and move on */
  306. result.append(new Character((char)next_char));
  307. advance();
  308. }
  309. /* advance past the closer and build a return Symbol */
  310. advance(); advance();
  311. return new Symbol(sym.CODE_STRING, result.toString());
  312. }
  313. /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
  314. /** Process an identifier. Identifiers begin with a letter, underscore,
  315. * or dollar sign, which is followed by zero or more letters, numbers,
  316. * underscores or dollar signs. This routine returns a String suitable
  317. * for return by the scanner.
  318. */
  319. protected static Symbol do_id() throws java.io.IOException
  320. {
  321. StringBuffer result = new StringBuffer();
  322. String result_str;
  323. Integer keyword_num;
  324. char buffer[] = new char[1];
  325. /* next_char holds first character of id */
  326. buffer[0] = (char)next_char;
  327. result.append(buffer,0,1);
  328. advance();
  329. /* collect up characters while they fit in id */
  330. while(id_char(next_char))
  331. {
  332. buffer[0] = (char)next_char;
  333. result.append(buffer,0,1);
  334. advance();
  335. }
  336. /* extract a string and try to look it up as a keyword */
  337. result_str = result.toString();
  338. keyword_num = (Integer)keywords.get(result_str);
  339. /* if we found something, return that keyword */
  340. if (keyword_num != null)
  341. return new Symbol(keyword_num.intValue());
  342. /* otherwise build and return an id Symbol with an attached string */
  343. return new Symbol(sym.ID, result_str);
  344. }
  345. /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
  346. /** Return one Symbol. This is the main external interface to the scanner.
  347. * It consumes sufficient characters to determine the next input Symbol
  348. * and returns it. To help with debugging, this routine actually calls
  349. * real_next_token() which does the work. If you need to debug the
  350. * parser, this can be changed to call debug_next_token() which prints
  351. * a debugging message before returning the Symbol.
  352. */
  353. public static Symbol next_token() throws java.io.IOException
  354. {
  355. return real_next_token();
  356. }
  357. /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
  358. /** Debugging version of next_token(). This routine calls the real scanning
  359. * routine, prints a message on System.out indicating what the Symbol is,
  360. * then returns it.
  361. */
  362. public static Symbol debug_next_token() throws java.io.IOException
  363. {
  364. Symbol result = real_next_token();
  365. System.out.println("# next_Symbol() => " + result.sym);
  366. return result;
  367. }
  368. /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
  369. /** The actual routine to return one Symbol. This is normally called from
  370. * next_token(), but for debugging purposes can be called indirectly from
  371. * debug_next_token().
  372. */
  373. protected static Symbol real_next_token() throws java.io.IOException
  374. {
  375. int sym_num;
  376. for (;;)
  377. {
  378. /* look for white space */
  379. if (next_char == ' ' || next_char == '\t' || next_char == '\n' ||
  380. next_char == '\f' || next_char == '\r')
  381. {
  382. /* advance past it and try the next character */
  383. advance();
  384. continue;
  385. }
  386. /* look for a single character symbol */
  387. sym_num = find_single_char(next_char);
  388. if (sym_num != -1)
  389. {
  390. /* found one -- advance past it and return a Symbol for it */
  391. advance();
  392. return new Symbol(sym_num);
  393. }
  394. /* look for : or ::= */
  395. if (next_char == ':')
  396. {
  397. /* if we don't have a second ':' return COLON */
  398. if (next_char2 != ':')
  399. {
  400. advance();
  401. return new Symbol(sym.COLON);
  402. }
  403. /* move forward and look for the '=' */
  404. advance();
  405. if (next_char2 == '=')
  406. {
  407. advance(); advance();
  408. return new Symbol(sym.COLON_COLON_EQUALS);
  409. }
  410. else
  411. {
  412. /* return just the colon (already consumed) */
  413. return new Symbol(sym.COLON);
  414. }
  415. }
  416. /* find a "%prec" string and return it. otherwise, a '%' was found,
  417. which has no right being in the specification otherwise */
  418. if (next_char == '%') {
  419. advance();
  420. if ((next_char == 'p') && (next_char2 == 'r') && (next_char3 == 'e') &&
  421. (next_char4 == 'c')) {
  422. advance();
  423. advance();
  424. advance();
  425. advance();
  426. return new Symbol(sym.PERCENT_PREC);
  427. } else {
  428. emit_error("Found extraneous percent sign");
  429. }
  430. }
  431. /* look for a comment */
  432. if (next_char == '/' && (next_char2 == '*' || next_char2 == '/'))
  433. {
  434. /* swallow then continue the scan */
  435. swallow_comment();
  436. continue;
  437. }
  438. /* look for start of code string */
  439. if (next_char == '{' && next_char2 == ':')
  440. return do_code_string();
  441. /* look for an id or keyword */
  442. if (id_start_char(next_char)) return do_id();
  443. /* look for EOF */
  444. if (next_char == EOF_CHAR) return new Symbol(sym.EOF);
  445. /* if we get here, we have an unrecognized character */
  446. emit_warn("Unrecognized character '" +
  447. new Character((char)next_char) + "'(" + next_char +
  448. ") -- ignored");
  449. /* advance past it */
  450. advance();
  451. }
  452. }
  453. /*-----------------------------------------------------------*/
  454. }