--- /dev/null
+/**\r
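+ * SourceParser class\r
+ * <p>\r
+ * Splits the characters read from a <code>Reader</code> into <code>Token</code>s,\r
+ * returning one token per call to <code>get_token</code>.\r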
+ *\r
+ * @author Daniel Jackson\r
+ * @version 0, 07/02/01\r
+ */\r
+\r
+package tagger;\r
+import java.io.*;\r
+import java.util.*;\r
+\r
+/**\r
+ * Tokenizer over a character stream. Characters are pulled one at a time from a\r
+ * <code>LineNumberReader</code> with a single character of lookahead, and grouped\r
+ * into <code>Token</code>s by {@link #get_token}. Every token carries the reader's\r
+ * line number at the moment the token was completed.\r
+ */\r
+public class SourceParser {\r
+    // names of the built-in commands, as they appear after the backslash\r
+    final static String loadcharmapcommand_name = "loadchars";\r
+    final static String loadstylesheetcommand_name = "loadstyles";\r
+    final static String preamblecommand_name = "preamble";\r
+    final static String refcommand_name = "ref";\r
+    final static String tagcommand_name = "tag";\r
+    final static String citecommand_name = "cite";\r
+    final static String separatorcommand_name = "sep";\r
+\r
+    // every character is read through this reader so that its line count stays accurate\r
+    private final LineNumberReader reader;\r
+\r
+    // holds set of strings recognized as paragraph styles\r
+    private final Set parastyles;\r
+\r
+    // holds the previous value of next_char (-1 until something has been consumed)\r
+    private int last_char;\r
+    // one character of lookahead; -1 at end of stream\r
+    private int next_char;\r
+    // quote state; both flags are cleared at every paragraph break\r
+    private boolean within_single_quotes;\r
+    private boolean within_double_quotes;\r
+    // true when next_char is known to be the first character of a line\r
+    private boolean at_start_of_line;\r
+    // text of the token currently being assembled\r
+    private String token;\r
+\r
+    /**\r
+     * Wraps <code>reader</code> in a <code>LineNumberReader</code> and primes the\r
+     * one-character lookahead.\r
+     *\r
+     * @param reader     source of characters\r
+     * @param parastyles set of command names to be reported as paragraph style commands\r
+     * @throws IOException if reading the first character fails\r
+     */\r
+    public SourceParser (Reader reader, Set parastyles) throws IOException {\r
+        this.reader = new LineNumberReader (reader);\r
+        this.parastyles = parastyles;\r
+        // read through the wrapper, not the raw parameter, so that the first character\r
+        // is line-counted and has its line terminator compressed like all the others\r
+        next_char = this.reader.read ();\r
+        last_char = -1;\r
+        at_start_of_line = true;\r
+    }\r
+\r
+    /**\r
+     * @return true until the lookahead character is the end of the stream\r
+     */\r
+    public boolean has_more_tokens () {\r
+        return (next_char != -1);\r
+    }\r
+\r
+    // appends the lookahead character to the current token and advances by one character\r
+    private void consume_char () throws IOException {\r
+        token += (char) next_char;\r
+        last_char = next_char;\r
+        next_char = reader.read ();\r
+    }\r
+\r
+    // requires: next_char is an open curly\r
+    // consumes up to the matching close curly and returns the text between the curlies.\r
+    // an empty argument yields the empty string. an argument that is still open at the\r
+    // end of the line or of the stream is cut off there, and the line end is left unread.\r
+    private String consume_arg () throws IOException {\r
+        consume_char (); // consume open curly\r
+        token = "";\r
+        while (!is_close_curly (next_char) && !is_eol (next_char) && !is_eos (next_char))\r
+            consume_char ();\r
+        String arg = token;\r
+        if (is_close_curly (next_char)) consume_char (); // consume close curly\r
+        return arg;\r
+    }\r
+\r
+    // clears the quote state and returns a paragraph break token\r
+    private Token parabreak_token () {\r
+        within_single_quotes = false;\r
+        within_double_quotes = false;\r
+        return new Token (Token.PARABREAK, reader.getLineNumber ());\r
+    }\r
+\r
+    // requires: next_char is an end of line\r
+    // consumes it; if the following line is empty, consumes that too and returns a\r
+    // paragraph break, otherwise returns a single space as whitespace\r
+    private Token line_end_token () throws IOException {\r
+        consume_char ();\r
+        at_start_of_line = true;\r
+        if (is_eol (next_char)) {\r
+            consume_char ();\r
+            return parabreak_token ();\r
+        }\r
+        return new Token (Token.WHITESPACE, " ", reader.getLineNumber ());\r
+    }\r
+\r
+    /**\r
+     * Returns the next token.\r
+     * <p>\r
+     * requires: next_char contains next character in reader <p>\r
+     * ensures: returns next token according to one of these productions: <p>\r
+     * <blockquote><pre>\r
+     * alphabetic       ::= letter+              (a-z, A-Z)\r
+     * numeric          ::= digit+\r
+     * whitespace       ::= (space | tab)+       or a single line end\r
+     * comment          ::= hash at start of line, up to end of line\r
+     * command          ::= backslash letter alphanum* [star]\r
+     * escaped-char     ::= backslash non-letter (reported as OTHER)\r
+     * paragraph-break  ::= empty line\r
+     * line-break       ::= backslash backslash\r
+     * hyphen-sequence  ::= hyphen+\r
+     * dot-sequence     ::= dot+\r
+     * underscore, dollar, greater-than ::= the single character\r
+     * </pre></blockquote>\r
+     * A command is reported according to its name and what follows it: the preamble\r
+     * and separator commands by name; any name followed by less-than as a format\r
+     * command; the load, ref, tag and cite commands when followed by a curly\r
+     * argument; names in the paragraph style set as paragraph style commands; and\r
+     * everything else as a character command. <p>\r
+     * quote characters, disambiguated by context:\r
+     *   apostrophe: between alphanumerics, or when followed by numeric\r
+     *   close-single-quote: after alphanumeric, when not followed by alphanumeric,\r
+     *     and an open-single-quote is pending\r
+     *   prime: after alphanumeric, when not followed by alphanumeric,\r
+     *     and no open-single-quote is pending\r
+     *   open-single-quote: when not preceded by alphanumeric and not followed by numeric\r
+     *   open-double-quote / close-double-quote: alternate, starting with open\r
+     * Any other character is returned on its own as OTHER.\r
+     *\r
+     * @return the next token.\r
+     *   explicitly returns end of stream token.\r
+     * @throws IOException if the underlying reader fails\r
+     */\r
+    public Token get_token () throws IOException {\r
+        token = "";\r
+        if (is_eos (next_char))\r
+            return new Token (Token.ENDOFSTREAM, reader.getLineNumber ());\r
+        if (at_start_of_line) {\r
+            if (is_eol (next_char)) {\r
+                // empty line\r
+                consume_char ();\r
+                return parabreak_token ();\r
+            }\r
+            if (is_hash (next_char)) {\r
+                // the hash is already held in next_char, so readLine returns the rest of\r
+                // the line; consume_char then moves on to the first character of the next line\r
+                String line = reader.readLine ();\r
+                consume_char ();\r
+                return new Token (Token.COMMENT, line, reader.getLineNumber ());\r
+            }\r
+            at_start_of_line = false;\r
+        }\r
+        if (is_eol (next_char))\r
+            return line_end_token ();\r
+        if (is_slash (next_char))\r
+            return slash_token ();\r
+        if (is_alphabetic (next_char)) {\r
+            consume_char ();\r
+            while (is_alphabetic (next_char)) consume_char ();\r
+            return new Token (Token.ALPHABETIC, token, reader.getLineNumber ());\r
+        }\r
+        if (is_numeric (next_char)) {\r
+            consume_char ();\r
+            while (is_numeric (next_char)) consume_char ();\r
+            return new Token (Token.NUMERIC, token, reader.getLineNumber ());\r
+        }\r
+        if (is_whitespace (next_char)) {\r
+            consume_char ();\r
+            while (is_whitespace (next_char)) consume_char ();\r
+            // trailing blanks are folded into the line end, which must still mark the\r
+            // start of the next line so that a following empty line or comment is seen\r
+            if (is_eol (next_char))\r
+                return line_end_token ();\r
+            return new Token (Token.WHITESPACE, token, reader.getLineNumber ());\r
+        }\r
+        if (is_hyphen (next_char)) {\r
+            consume_char ();\r
+            while (is_hyphen (next_char)) consume_char ();\r
+            return new Token (Token.HYPHENS, token, reader.getLineNumber ());\r
+        }\r
+        if (is_dot (next_char)) {\r
+            consume_char ();\r
+            while (is_dot (next_char)) consume_char ();\r
+            return new Token (Token.DOTS, token, reader.getLineNumber ());\r
+        }\r
+        if (is_underscore (next_char)) {\r
+            consume_char ();\r
+            return new Token (Token.UNDERSCORE, reader.getLineNumber ());\r
+        }\r
+        if (is_dollar (next_char)) {\r
+            consume_char ();\r
+            return new Token (Token.DOLLAR, reader.getLineNumber ());\r
+        }\r
+        if (is_greater_than (next_char)) {\r
+            consume_char ();\r
+            return new Token (Token.POPFORMATCOMMAND, reader.getLineNumber ());\r
+        }\r
+        if (is_single_quote (next_char))\r
+            return single_quote_token ();\r
+        if (is_double_quote (next_char)) {\r
+            consume_char ();\r
+            within_double_quotes = !within_double_quotes;\r
+            // the flag now tells whether this quote opened or closed the quotation\r
+            if (within_double_quotes)\r
+                return new Token (Token.OPENDOUBLEQUOTE, reader.getLineNumber ());\r
+            return new Token (Token.CLOSEDOUBLEQUOTE, reader.getLineNumber ());\r
+        }\r
+        consume_char ();\r
+        return new Token (Token.OTHER, token, reader.getLineNumber ());\r
+    }\r
+\r
+    // requires: next_char is a backslash\r
+    // returns the line break, escaped character or command token that it introduces\r
+    private Token slash_token () throws IOException {\r
+        consume_char ();\r
+        token = "";\r
+        if (is_slash (next_char)) {\r
+            consume_char ();\r
+            return new Token (Token.LINEBREAK, reader.getLineNumber ());\r
+        }\r
+        if (!is_alphabetic (next_char)) {\r
+            // next character assumed prefixed with slash to avoid special treatment\r
+            // eg, \< for <, \$ for $\r
+            if (is_eos (next_char))\r
+                // nothing left to escape: report the backslash itself\r
+                return new Token (Token.OTHER, "\\", reader.getLineNumber ());\r
+            String escaped = String.valueOf ((char) next_char);\r
+            // the escaped character must be consumed, or the next call would tokenize it\r
+            // a second time with its special meaning. a line end is the exception: it is\r
+            // left unread so that line and paragraph tracking still sees it.\r
+            if (!is_eol (next_char)) consume_char ();\r
+            return new Token (Token.OTHER, escaped, reader.getLineNumber ());\r
+        }\r
+        while (is_alphanumeric (next_char)) consume_char ();\r
+        String command_name = token;\r
+        // a trailing star is consumed but is not part of the command name\r
+        if (is_star (next_char)) consume_char ();\r
+        if (command_name.equals (preamblecommand_name)) {\r
+            return new Token (Token.PREAMBLECOMMAND, reader.getLineNumber ());\r
+        }\r
+        if (command_name.equals (separatorcommand_name)) {\r
+            // consume whitespace until next token\r
+            while (is_whitespace (next_char)) consume_char ();\r
+            return new Token (Token.SEPARATORCOMMAND, reader.getLineNumber ());\r
+        }\r
+        if (is_less_than (next_char)) {\r
+            consume_char ();\r
+            return new Token (Token.FORMATCOMMAND, command_name, reader.getLineNumber ());\r
+        }\r
+        if (is_open_curly (next_char)) {\r
+            // the argument is consumed for every command; for names not listed below it\r
+            // is dropped and the command is classified by the code after this block\r
+            String arg = consume_arg ();\r
+            if (command_name.equals (loadcharmapcommand_name)) {\r
+                return new Token (Token.LOADCHARMAPCOMMAND, arg, reader.getLineNumber ());\r
+            }\r
+            if (command_name.equals (loadstylesheetcommand_name)) {\r
+                return new Token (Token.LOADSTYLESHEETCOMMAND, arg, reader.getLineNumber ());\r
+            }\r
+            if (command_name.equals (refcommand_name)) {\r
+                return new Token (Token.REFCOMMAND, arg, reader.getLineNumber ());\r
+            }\r
+            if (command_name.equals (tagcommand_name)) {\r
+                return new Token (Token.TAGCOMMAND, arg, reader.getLineNumber ());\r
+            }\r
+            if (command_name.equals (citecommand_name)) {\r
+                return new Token (Token.CITECOMMAND, arg, reader.getLineNumber ());\r
+            }\r
+        }\r
+        if (parastyles.contains (command_name)) {\r
+            while (is_whitespace (next_char)) consume_char ();\r
+            // paragraph style command consumes the first linebreak following it also\r
+            // NOTE(review): at_start_of_line is not set here, so an empty line or comment\r
+            // right after the command is not recognized as such - confirm this is intended\r
+            if (is_eol (next_char)) consume_char ();\r
+            return new Token (Token.PARASTYLECOMMAND, command_name, reader.getLineNumber ());\r
+        }\r
+        // temporary\r
+        return new Token (Token.CHARCOMMAND, command_name, reader.getLineNumber ());\r
+    }\r
+\r
+    // requires: next_char is a single quote\r
+    // consumes it and classifies it from the characters on either side\r
+    private Token single_quote_token () throws IOException {\r
+        // must be sampled before consuming: consume_char overwrites last_char with the quote\r
+        boolean after_alphanumeric = is_alphanumeric (last_char);\r
+        consume_char ();\r
+        // from here on next_char is the character that follows the quote\r
+        if (after_alphanumeric) {\r
+            if (is_alphanumeric (next_char))\r
+                return new Token (Token.APOSTROPHE, reader.getLineNumber ());\r
+            if (within_single_quotes) {\r
+                within_single_quotes = false;\r
+                return new Token (Token.CLOSESINGLEQUOTE, reader.getLineNumber ());\r
+            }\r
+            return new Token (Token.PRIME, reader.getLineNumber ());\r
+        }\r
+        if (is_numeric (next_char))\r
+            return new Token (Token.APOSTROPHE, reader.getLineNumber ());\r
+        within_single_quotes = true;\r
+        return new Token (Token.OPENSINGLEQUOTE, reader.getLineNumber ());\r
+    }\r
+\r
+    // character classification; all take the int returned by Reader.read, so -1 is a valid argument\r
+    static boolean is_eol (int c) {return c == '\n';}\r
+    static boolean is_eos (int c) {return c == -1;}\r
+    static boolean is_star (int c) {return c == '*';}\r
+    static boolean is_hash (int c) {return c == '#';}\r
+    static boolean is_dot (int c) {return c == '.';}\r
+    // the "slash" of the command syntax is the backslash character\r
+    static boolean is_slash (int c) {return c == '\\';}\r
+    static boolean is_hyphen (int c) {return c == '-';}\r
+    static boolean is_underscore (int c) {return c == '_';}\r
+    static boolean is_dollar (int c) {return c == '$';}\r
+    static boolean is_single_quote (int c) {return c == '\'';}\r
+    static boolean is_double_quote (int c) {return c == '\"';}\r
+    static boolean is_open_curly (int c) {return c == '{';}\r
+    static boolean is_close_curly (int c) {return c == '}';}\r
+    static boolean is_less_than (int c) {return c == '<';}\r
+    static boolean is_greater_than (int c) {return c == '>';}\r
+\r
+    // should perhaps use Character.isLetter? not sure, because that allows Unicode chars for\r
+    // other languages that are outside the a-z and A-Z ranges.\r
+    static boolean is_alphabetic (int c) {\r
+        return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');\r
+    }\r
+    static boolean is_numeric (int c) {return c >= '0' && c <= '9';}\r
+    static boolean is_alphanumeric (int c) {\r
+        return is_numeric (c) || is_alphabetic (c);\r
+    }\r
+    static boolean is_whitespace (int c) {return c == ' ' || c == '\t';}\r
+}\r