3 import java_cup.runtime.Symbol;
4 import java.util.Hashtable;
6 /** This class implements a small scanner (aka lexical analyzer or lexer) for
7 * the JavaCup specification. This scanner reads characters from standard
8 * input (System.in) and returns Symbol objects carrying the terminal
9 * number of the next token. Once end of input is reached the EOF Symbol is
10 * returned on every subsequent call.<p>
11 * Symbols currently returned include: <pre>
12 * Symbol Constant Returned Symbol Constant Returned
13 * ------ ----------------- ------ -----------------
14 * "package" PACKAGE "import" IMPORT
15 * "code" CODE "action" ACTION
16 * "parser" PARSER "terminal" TERMINAL
17 * "non" NON "init" INIT
18 * "scan" SCAN "with" WITH
19 * "start" START "precedence" PRECEDENCE
20 * "left" LEFT "right" RIGHT
21 * "nonassoc" NONASSOC "%prec" PERCENT_PREC
26 * ::= COLON_COLON_EQUALS | BAR
27 * identifier ID {:...:} CODE_STRING
28 * "nonterminal" NONTERMINAL
30 * All symbol constants are defined in sym.java which is generated by
31 * JavaCup from parser.cup.<p>
33 * In addition to the scanner proper (called first via init() then with
34 * next_token() to get each Symbol) this class provides simple error and
35 * warning routines and keeps a count of errors and warnings that is
36 * publicly accessible.<p>
38 * This class is "static" (i.e., it has only static members and methods).
40 * @version last updated: 7/3/96
41 * @author Frank Flannery
45 /*-----------------------------------------------------------*/
46 /*--- Constructor(s) ----------------------------------------*/
47 /*-----------------------------------------------------------*/
49 /** The only constructor is private, so no instances can be created. */
52 /*-----------------------------------------------------------*/
53 /*--- Static (Class) Variables ------------------------------*/
54 /*-----------------------------------------------------------*/
56 /** First character of lookahead. */
57 protected static int next_char;
59 /** Second character of lookahead. */
60 protected static int next_char2;
62 /** Third character of lookahead. */
63 protected static int next_char3;
65 /** Fourth character of lookahead. */
66 protected static int next_char4;
68 /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
/** Marker stored in the lookahead variables once input is exhausted;
 * init() and advance() compare the lookahead against this value. */
71 protected static final int EOF_CHAR = -1;
73 /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
75 /** Table of keywords. Keywords are initially treated as identifiers.
76 * Just before they are returned we look them up in this table to see if
77 * they match one of the keywords. The string of the name is the key here,
78 * which indexes Integer objects holding the symbol number.
 * The table is filled by init() and consulted by do_id().
80 protected static Hashtable keywords = new Hashtable(23);
82 /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
84 /** Table of single character symbols. For ease of implementation, we
85 * store all unambiguous single character Symbols in this table of Integer
86 * objects keyed by Integer objects with the numerical value of the
87 * appropriate char (currently Character objects have a bug which precludes
88 * their use in tables).
 * The table is filled by init() and consulted by find_single_char().
90 protected static Hashtable char_symbols = new Hashtable(11);
92 /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
94 /** Current line number for use in error messages. */
95 protected static int current_line = 1;
97 /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
99 /** Character position in current line. */
100 protected static int current_position = 1;
102 /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
104 /** Position counter kept separately from current_position.
 * NOTE(review): presumably the character offset from the start of the
 * input (going by the name) -- no line visible here updates or reads
 * it, so confirm before relying on this. */
105 protected static int absolute_position = 1;
107 /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
109 /** Count of total errors detected so far. */
110 public static int error_count = 0;
112 /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
114 /** Count of warnings issued so far */
115 public static int warning_count = 0;
117 /*-----------------------------------------------------------*/
118 /*--- Static Methods ----------------------------------------*/
119 /*-----------------------------------------------------------*/
121 /** Initialize the scanner. This sets up the keywords and char_symbols
122 * tables and reads the first four characters of lookahead (next_char
 * through next_char4) from System.in. Once EOF_CHAR has been read, the
 * remaining lookahead variables are set to EOF_CHAR as well.
 * @throws java.io.IOException if reading from System.in fails.
124 public static void init() throws java.io.IOException
126 /* set up the keyword table */
127 keywords.put("package", new Integer(sym.PACKAGE));
128 keywords.put("import", new Integer(sym.IMPORT));
129 keywords.put("code", new Integer(sym.CODE));
130 keywords.put("action", new Integer(sym.ACTION));
131 keywords.put("parser", new Integer(sym.PARSER));
132 keywords.put("terminal", new Integer(sym.TERMINAL));
133 keywords.put("non", new Integer(sym.NON));
134 keywords.put("nonterminal",new Integer(sym.NONTERMINAL));// [CSA]
135 keywords.put("init", new Integer(sym.INIT));
136 keywords.put("scan", new Integer(sym.SCAN));
137 keywords.put("with", new Integer(sym.WITH));
138 keywords.put("start", new Integer(sym.START));
139 keywords.put("precedence", new Integer(sym.PRECEDENCE));
140 keywords.put("left", new Integer(sym.LEFT));
141 keywords.put("right", new Integer(sym.RIGHT));
142 keywords.put("nonassoc", new Integer(sym.NONASSOC));
144 /* set up the table of single character symbols */
145 char_symbols.put(new Integer(';'), new Integer(sym.SEMI));
146 char_symbols.put(new Integer(','), new Integer(sym.COMMA));
147 char_symbols.put(new Integer('*'), new Integer(sym.STAR));
148 char_symbols.put(new Integer('.'), new Integer(sym.DOT));
149 char_symbols.put(new Integer('|'), new Integer(sym.BAR));
150 char_symbols.put(new Integer('['), new Integer(sym.LBRACK));
151 char_symbols.put(new Integer(']'), new Integer(sym.RBRACK));
153 /* read four characters of lookahead */
154 next_char = System.in.read();
155 if (next_char == EOF_CHAR) {
156 next_char2 = EOF_CHAR;
157 next_char3 = EOF_CHAR;
158 next_char4 = EOF_CHAR;
160 next_char2 = System.in.read();
161 if (next_char2 == EOF_CHAR) {
162 next_char3 = EOF_CHAR;
163 next_char4 = EOF_CHAR;
165 next_char3 = System.in.read();
166 if (next_char3 == EOF_CHAR) {
167 next_char4 = EOF_CHAR;
169 next_char4 = System.in.read();
175 /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
177 /** Advance the scanner one character in the input stream. Each lookahead
178 * variable takes the value of its successor (next_char2 moves to next_char,
 * next_char3 to next_char2, next_char4 to next_char3) and a new next_char4
 * is read from System.in. Once EOF_CHAR shows up in the lookahead, the
 * later lookahead variables are set to EOF_CHAR as well.
 * NOTE(review): the visible lines also test the character just consumed
 * for a line terminator ('\n', or '\r' not followed by '\n') and reset
 * current_position to 1; the statements in between are not visible here.
 * @throws java.io.IOException if reading from System.in fails.
180 protected static void advance() throws java.io.IOException
184 old_char = next_char;
185 next_char = next_char2;
186 if (next_char == EOF_CHAR) {
187 next_char2 = EOF_CHAR;
188 next_char3 = EOF_CHAR;
189 next_char4 = EOF_CHAR;
191 next_char2 = next_char3;
192 if (next_char2 == EOF_CHAR) {
193 next_char3 = EOF_CHAR;
194 next_char4 = EOF_CHAR;
196 next_char3 = next_char4;
197 if (next_char3 == EOF_CHAR) {
198 next_char4 = EOF_CHAR;
200 next_char4 = System.in.read();
208 if (old_char == '\n' || (old_char == '\r' && next_char!='\n'))
211 current_position = 1;
215 /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
217 /** Emit an error message. The message will be marked with both the
218 * current line number and the position in the line. Error messages
219 * are printed on standard error (System.err).
 * The printed text begins with "Error at ", current_line, "(" and
 * current_position.
220 * @param message the message to print.
222 public static void emit_error(String message)
224 System.err.println("Error at " + current_line + "(" + current_position +
229 /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
231 /** Emit a warning message. The message will be marked with both the
232 * current line number and the position in the line. Messages are
233 * printed on standard error (System.err).
 * The printed text begins with "Warning at ", current_line, "(" and
 * current_position.
234 * @param message the message to print.
236 public static void emit_warn(String message)
238 System.err.println("Warning at " + current_line + "(" + current_position +
243 /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
245 /** Determine if a character is ok to start an id.
246 * @param ch the character in question.
 * @return true when ch may begin an identifier.
 * NOTE(review): the visible part of the test accepts the ASCII letters
 * a-z and A-Z; the final operand of the expression is not visible here,
 * so any further accepted characters need to be confirmed.
248 protected static boolean id_start_char(int ch)
250 /* allow for % in identifiers. a hack to allow my
251 %prec in. Should eventually make lex spec for this
253 return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') ||
256 // later need to deal with non-8-bit chars here
259 /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
261 /** Determine if a character is ok for the middle of an id.
262 * @param ch the character in question.
 * @return true when ch is accepted by id_start_char() or is an ASCII
 * digit ('0' through '9').
264 protected static boolean id_char(int ch)
266 return id_start_char(ch) || (ch >= '0' && ch <= '9');
269 /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
271 /** Try to look up a single character symbol, returns -1 for not found.
272 * @param ch the character in question.
 * @return the symbol number stored in char_symbols for ch.
 * NOTE(review): the branch that produces -1 for a missing entry is not
 * visible here -- confirm against the full file.
274 protected static int find_single_char(int ch)
278 result = (Integer)char_symbols.get(new Integer((char)ch));
282 return result.intValue();
285 /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
287 /** Handle swallowing up a comment. Both old style C and new style C++
288 * comments are handled.
 * @throws java.io.IOException propagated from advance().
290 protected static void swallow_comment() throws java.io.IOException
292 /* next_char == '/' at this point */
294 /* is it a traditional comment */
295 if (next_char2 == '*')
297 /* swallow the opener */
298 advance(); advance();
300 /* swallow the comment until end of comment or EOF */
303 /* if it's EOF we have an error */
304 if (next_char == EOF_CHAR)
306 emit_error("Specification file ends inside a comment");
310 /* if we can see the closer we are done */
311 if (next_char == '*' && next_char2 == '/')
318 /* otherwise swallow char and move on */
323 /* is it a new style comment */
324 if (next_char2 == '/')
326 /* swallow the opener */
327 advance(); advance();
329 /* swallow to '\n', '\r', '\f', or EOF */
330 while (next_char != '\n' && next_char != '\r' &&
331 next_char != '\f' && next_char!=EOF_CHAR)
338 /* shouldn't get here, but... if we get here we have an error */
339 emit_error("Malformed comment in specification -- ignored");
343 /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
345 /** Swallow up a code string. Code strings begin with "{:" and include
346 all characters up to the first occurrence of ":}" (there is no way to
347 include ":}" inside a code string). The routine returns a Symbol of
348 type sym.CODE_STRING whose value is the collected text.
350 protected static Symbol do_code_string() throws java.io.IOException
352 StringBuffer result = new StringBuffer();
354 /* at this point we have lookahead of "{:" -- swallow that */
355 advance(); advance();
357 /* save chars until we see ":}" */
358 while (!(next_char == ':' && next_char2 == '}'))
360 /* if we have run off the end issue a message and break out of loop */
361 if (next_char == EOF_CHAR)
363 emit_error("Specification file ends inside a code string");
367 /* otherwise record the char and move on */
368 result.append(new Character((char)next_char));
372 /* advance past the closer and build a return Symbol */
373 advance(); advance();
374 return new Symbol(sym.CODE_STRING, result.toString());
377 /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
379 /** Process an identifier. Identifiers begin with a character accepted
380 * by id_start_char(), which is followed by zero or more characters
381 * accepted by id_char(). This routine returns a keyword Symbol when the
382 * text is found in the keywords table, and otherwise a sym.ID Symbol
 * carrying the text.
384 protected static Symbol do_id() throws java.io.IOException
386 StringBuffer result = new StringBuffer();
389 char buffer[] = new char[1];
391 /* next_char holds first character of id */
392 buffer[0] = (char)next_char;
393 result.append(buffer,0,1);
396 /* collect up characters while they fit in id */
397 while(id_char(next_char))
399 buffer[0] = (char)next_char;
400 result.append(buffer,0,1);
404 /* extract a string and try to look it up as a keyword */
405 result_str = result.toString();
406 keyword_num = (Integer)keywords.get(result_str);
408 /* if we found something, return that keyword */
409 if (keyword_num != null)
410 return new Symbol(keyword_num.intValue());
412 /* otherwise build and return an id Symbol with an attached string */
413 return new Symbol(sym.ID, result_str);
416 /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
418 /** Return one Symbol. This is the main external interface to the scanner.
419 * It consumes sufficient characters to determine the next input Symbol
420 * and returns it. To help with debugging, this routine actually calls
421 * real_next_token() which does the work. If you need to debug the
422 * parser, this can be changed to call debug_next_token() which prints
423 * a debugging message before returning the Symbol.
 * @return the Symbol produced by real_next_token().
 * @throws java.io.IOException propagated from real_next_token().
425 public static Symbol next_token() throws java.io.IOException
427 return real_next_token();
430 /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
432 /** Debugging version of next_token(). This routine calls the real scanning
433 * routine, prints a message on System.out indicating what the Symbol is,
 * (its sym number) and is declared to return a Symbol.
 * NOTE(review): the return statement itself is not visible here.
 * @throws java.io.IOException propagated from real_next_token().
436 public static Symbol debug_next_token() throws java.io.IOException
438 Symbol result = real_next_token();
439 System.out.println("# next_Symbol() => " + result.sym);
443 /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
445 /** The actual routine to return one Symbol. This is normally called from
446 * next_token(), but for debugging purposes can be called indirectly from
447 * debug_next_token().
 * Visible checks, in order: white space is skipped; a character found by
 * find_single_char() yields its Symbol; ':' yields COLON, or
 * COLON_COLON_EQUALS when it is part of "::="; a '%' with the letters
 * p, r, e, c in the lookahead yields PERCENT_PREC, and any other '%' is
 * reported through emit_error(); comment openers are recognized;
 * "{:" is handed to do_code_string(); an identifier start character is
 * handed to do_id(); EOF_CHAR yields an EOF Symbol; anything else is
 * reported through emit_warn().
 * @throws java.io.IOException propagated from the scanning helpers.
449 protected static Symbol real_next_token() throws java.io.IOException
455 /* look for white space */
456 if (next_char == ' ' || next_char == '\t' || next_char == '\n' ||
457 next_char == '\f' || next_char == '\r')
459 /* advance past it and try the next character */
464 /* look for a single character symbol */
465 sym_num = find_single_char(next_char);
468 /* found one -- advance past it and return a Symbol for it */
470 return new Symbol(sym_num);
473 /* look for : or ::= */
474 if (next_char == ':')
476 /* if we don't have a second ':' return COLON */
477 if (next_char2 != ':')
480 return new Symbol(sym.COLON);
483 /* move forward and look for the '=' */
485 if (next_char2 == '=')
487 advance(); advance();
488 return new Symbol(sym.COLON_COLON_EQUALS);
492 /* return just the colon (already consumed) */
493 return new Symbol(sym.COLON);
497 /* find a "%prec" string and return it. otherwise, a '%' was found,
498 which has no right being in the specification otherwise */
499 if (next_char == '%') {
501 if ((next_char == 'p') && (next_char2 == 'r') && (next_char3 == 'e') &&
502 (next_char4 == 'c')) {
507 return new Symbol(sym.PERCENT_PREC);
509 emit_error("Found extraneous percent sign");
513 /* look for a comment */
514 if (next_char == '/' && (next_char2 == '*' || next_char2 == '/'))
516 /* swallow then continue the scan */
521 /* look for start of code string */
522 if (next_char == '{' && next_char2 == ':')
523 return do_code_string();
525 /* look for an id or keyword */
526 if (id_start_char(next_char)) return do_id();
529 if (next_char == EOF_CHAR) return new Symbol(sym.EOF);
531 /* if we get here, we have an unrecognized character */
532 emit_warn("Unrecognized character '" +
533 new Character((char)next_char) + "'(" + next_char +
536 /* advance past it */
541 /*-----------------------------------------------------------*/