tagger benchmark
[IRC.git] / Robust / src / Benchmarks / mlp / tagger / original-java / src / tagger / SourceParser.java
diff --git a/Robust/src/Benchmarks/mlp/tagger/original-java/src/tagger/SourceParser.java b/Robust/src/Benchmarks/mlp/tagger/original-java/src/tagger/SourceParser.java
new file mode 100755 (executable)
index 0000000..a5d5fbf
--- /dev/null
@@ -0,0 +1,285 @@
+/**\r
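+ * SourceParser class\r
+ * <p>\r
+ * Reads characters from a <code>Reader</code> and hands them back one <code>Token</code>\r
+ * at a time through <code>get_token</code>.\r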
+ *\r
+ * @author  Daniel Jackson\r
+ * @version 0, 07/02/01\r
+ */\r
+\r
+package tagger;\r
+import java.io.*;\r
+import java.util.*;\r
+\r
+public class SourceParser {\r
+       final static String loadcharmapcommand_name = "loadchars";\r
+       final static String loadstylesheetcommand_name = "loadstyles";\r
+       final static String preamblecommand_name = "preamble";\r
+       final static String refcommand_name = "ref";\r
+       final static String tagcommand_name = "tag";\r
+       final static String citecommand_name = "cite";\r
+       final static String separatorcommand_name = "sep";\r
+\r
+       private LineNumberReader reader;\r
+\r
+       // holds set of strings recognized as paragraph styles\r
+       private Set parastyles;\r
+\r
+       // holds the previous value of next_char\r
+       private int last_char;\r
+       private int next_char;\r
+       private boolean within_single_quotes;\r
+       private boolean within_double_quotes;\r
+       private boolean at_start_of_line;\r
+       private String token;\r
+\r
+       public SourceParser (Reader reader, Set parastyles) throws IOException {\r
+               this.reader = new LineNumberReader (reader);\r
+               this.parastyles = parastyles;\r
+               next_char = reader.read ();\r
+               last_char = -1;\r
+               at_start_of_line = true;\r
+               }\r
+\r
+       public boolean has_more_tokens () {\r
+               return (next_char != -1);\r
+               }\r
+\r
+       private void consume_char ()  throws IOException {\r
+               token += (char) next_char;\r
+               last_char = next_char;\r
+               next_char = reader.read ();\r
+               }\r
+\r
+       // consume until next close curly and return string excluding curly\r
+       private String consume_arg ()  throws IOException {\r
+               consume_char (); // consume open curly\r
+               token = "";\r
+               consume_char ();\r
+               while (!is_close_curly (next_char) && !is_eol (next_char)) consume_char ();\r
+               String arg = token;\r
+               consume_char (); // consume close curly\r
+               return arg;\r
+               }\r
+\r
+    /**\r
+       * requires: next_char contains next character in reader <p>\r
+       * ensures: returns next token according to one of these productions: <p>\r
+       * <blockquote><pre>\r
+       *       char-sequence = alphanumeric+\r
+       *               whitespace ::= (space | tab)+\r
+       *               command ::= slash alphanum* [star]\r
+       *               paragraph-break ::= <blank line>\r
+       *               line-break ::= slash slash\r
+       *               hyphen-sequence ::= hyphen+\r
+       *               dot-sequence ::= dot+\r
+       *               underscore ::= underscore\r
+       * </pre></blockquote>\r
+       *       quote characters, disambiguated by context:\r
+       *               open-single-quote: when not preceded by alphanumeric\r
+       *               close-single-quote: when not followed by alphanumeric and preceded by\r
+       *                       open-single-quote\r
+       *               open-double-quote: when not preceded by open-double-quote\r
+       *               close-double-quote: when preceded by open-double-quote\r
+       *               apostrophe: between alphanumerics, or when followed by numeric\r
+       *               prime: after alphanumeric, when not followed by alphanumeric,\r
+       *                       and not preceded by open-single-quote\r
+       * @return the next token.\r
+       *       explicitly returns end of stream token.\r
+       */\r
+       public Token get_token () throws IOException {\r
+               token = new String ();\r
+               if (is_eos (next_char))\r
+                       return new Token (Token.ENDOFSTREAM, reader.getLineNumber ());\r
+               if (at_start_of_line) {\r
+                       if (is_eol (next_char)) {\r
+                               consume_char ();\r
+                               within_single_quotes = false;\r
+                               within_double_quotes = false;\r
+                               return new Token (Token.PARABREAK, reader.getLineNumber ());\r
+                               }\r
+                       else if (is_hash (next_char)) {\r
+                               String line = reader.readLine ();\r
+                               consume_char ();\r
+                               return new Token (Token.COMMENT, line, reader.getLineNumber ());\r
+                               }\r
+                       else\r
+                               at_start_of_line = false;\r
+                       }\r
+               if (is_eol (next_char)) {\r
+                       consume_char ();\r
+                       at_start_of_line = true;\r
+                       if (is_eol (next_char)) {\r
+                               consume_char ();\r
+                               within_single_quotes = false;\r
+                               within_double_quotes = false;\r
+                               return new Token (Token.PARABREAK, reader.getLineNumber ());\r
+                               }\r
+                       // check this\r
+                       return new Token (Token.WHITESPACE, " ", reader.getLineNumber ());\r
+                       }\r
+               if (is_slash (next_char)) {\r
+                       consume_char ();\r
+                       token = "";\r
+                       if (is_slash (next_char)) {\r
+                               consume_char ();\r
+                               return new Token (Token.LINEBREAK, reader.getLineNumber ());\r
+                               }\r
+                       if (!is_alphabetic (next_char)) {\r
+                               // next character assumed prefixed with slash to avoid special treatment\r
+                               // eg, \< for <, \$ for $\r
+                               token = new Character ((char) next_char).toString ();\r
+                               return new Token (Token.OTHER, token, reader.getLineNumber ());\r
+                               }\r
+                       while (is_alphanumeric (next_char)) consume_char ();\r
+                       String command_name = token;\r
+                       if (is_star (next_char)) consume_char ();\r
+                       if (command_name.equals (preamblecommand_name)) {\r
+                               return new Token (Token.PREAMBLECOMMAND, reader.getLineNumber ());\r
+                               }\r
+                       if (command_name.equals (separatorcommand_name)) {\r
+                               // consume whitespace until next token\r
+                               while (is_whitespace (next_char)) consume_char ();\r
+                               return new Token (Token.SEPARATORCOMMAND, reader.getLineNumber ());\r
+                               }\r
+                       if (is_less_than (next_char)) {\r
+                               consume_char ();\r
+                               return new Token (Token.FORMATCOMMAND, command_name, reader.getLineNumber ());\r
+                               }\r
+                       if (is_open_curly (next_char)) {\r
+                               String arg = consume_arg ();\r
+                               if (command_name.equals (loadcharmapcommand_name)) {\r
+                                       return new Token (Token.LOADCHARMAPCOMMAND, arg, reader.getLineNumber ());\r
+                                       }\r
+                               if (command_name.equals (loadstylesheetcommand_name)) {\r
+                                       return new Token (Token.LOADSTYLESHEETCOMMAND, arg, reader.getLineNumber ());\r
+                                       }\r
+                               if (command_name.equals (refcommand_name)) {\r
+                                       return new Token (Token.REFCOMMAND, arg, reader.getLineNumber ());\r
+                                       }\r
+                               if (command_name.equals (tagcommand_name)) {\r
+                                       return new Token (Token.TAGCOMMAND, arg, reader.getLineNumber ());\r
+                                       }\r
+                               if (command_name.equals (citecommand_name)) {\r
+                                       return new Token (Token.CITECOMMAND, arg, reader.getLineNumber ());\r
+                                       }\r
+                               }\r
+                       if (parastyles.contains (command_name)) {\r
+                               while (is_whitespace (next_char)) consume_char ();\r
+                               // paragraph style command consumes the first linebreak following it also\r
+                               if (is_eol (next_char)) consume_char ();\r
+                               return new Token (Token.PARASTYLECOMMAND, command_name, reader.getLineNumber ());\r
+                               }\r
+                       else\r
+                               // temporary\r
+                               return new Token (Token.CHARCOMMAND, command_name, reader.getLineNumber ());\r
+                       }\r
+               if (is_alphabetic (next_char)) {\r
+                       consume_char ();\r
+                       while (is_alphabetic (next_char)) consume_char ();\r
+                       return new Token (Token.ALPHABETIC, token, reader.getLineNumber ());\r
+                       }\r
+               if (is_numeric (next_char)) {\r
+                       consume_char ();\r
+                       while (is_numeric (next_char)) consume_char ();\r
+                       return new Token (Token.NUMERIC, token, reader.getLineNumber ());\r
+                       }\r
+               if (is_whitespace (next_char)) {\r
+                       consume_char ();\r
+                       while (is_whitespace (next_char)) consume_char ();\r
+                       if (is_eol (next_char)) {\r
+                               consume_char ();\r
+                               // check this\r
+                               return new Token (Token.WHITESPACE, " ", reader.getLineNumber ());\r
+                               }\r
+                       return new Token (Token.WHITESPACE, token, reader.getLineNumber ());\r
+                       }\r
+               if (is_hyphen (next_char)) {\r
+                       consume_char ();\r
+                       while (is_hyphen (next_char)) consume_char ();\r
+                       return new Token (Token.HYPHENS, token, reader.getLineNumber ());\r
+                       }\r
+               if (is_dot (next_char)) {\r
+                       consume_char ();\r
+                       while (is_dot (next_char)) consume_char ();\r
+                       return new Token (Token.DOTS, token, reader.getLineNumber ());\r
+                       }\r
+               if (is_underscore (next_char)) {\r
+                       consume_char ();\r
+                       return new Token (Token.UNDERSCORE, reader.getLineNumber ());\r
+                       }\r
+               if (is_dollar (next_char)) {\r
+                       consume_char ();\r
+                       return new Token (Token.DOLLAR, reader.getLineNumber ());\r
+                       }\r
+               if (is_greater_than (next_char)) {\r
+                       consume_char ();\r
+                       return new Token (Token.POPFORMATCOMMAND, reader.getLineNumber ());\r
+                       }\r
+               if (is_single_quote (next_char)) {\r
+                       if (is_alphanumeric (last_char)) {\r
+                               if (is_alphanumeric (next_char)) {\r
+                                       consume_char ();\r
+                                       return new Token (Token.APOSTROPHE, reader.getLineNumber ());\r
+                                       }\r
+                               else if (within_single_quotes) {\r
+                                       within_single_quotes = false;\r
+                                       consume_char ();\r
+                                       return new Token (Token.CLOSESINGLEQUOTE, reader.getLineNumber ());\r
+                                       }\r
+                               else {\r
+                                       consume_char ();\r
+                                       return new Token (Token.PRIME, reader.getLineNumber ());\r
+                                       }\r
+                               }\r
+                       consume_char ();\r
+                       if (is_numeric (next_char)) {\r
+                               return new Token (Token.APOSTROPHE, reader.getLineNumber ());\r
+                               }\r
+                       else {\r
+                               within_single_quotes = true;\r
+                               return new Token (Token.OPENSINGLEQUOTE, reader.getLineNumber ());\r
+                               }\r
+                       }\r
+               if (is_double_quote (next_char)) {\r
+                       consume_char ();\r
+                       if (within_double_quotes) {\r
+                               within_double_quotes = false;\r
+                               return new Token (Token.CLOSEDOUBLEQUOTE, reader.getLineNumber ());\r
+                               }\r
+                       else {\r
+                               within_double_quotes = true;\r
+                               return new Token (Token.OPENDOUBLEQUOTE, reader.getLineNumber ());\r
+                               }\r
+                       }\r
+               consume_char ();\r
+               return new Token (Token.OTHER, token, reader.getLineNumber ());\r
+               }\r
+\r
+       static boolean is_eol (int c) {return c == '\n';}\r
+       static boolean is_eos (int c) {return c == -1;}\r
+       static boolean is_star (int c) {return c == '*';}\r
+       static boolean is_hash (int c) {return c == '#';}\r
+       static boolean is_dot (int c) {return c == '.';}\r
+       static boolean is_slash (int c) {return c == '\\';}\r
+       static boolean is_hyphen (int c) {return c == '-';}\r
+       static boolean is_underscore (int c) {return c == '_';}\r
+       static boolean is_dollar (int c) {return c == '$';}\r
+       static boolean is_single_quote (int c) {return c == '\'';}\r
+       static boolean is_double_quote (int c) {return c == '\"';}\r
+       static boolean is_open_curly (int c) {return c == '{';}\r
+       static boolean is_close_curly (int c) {return c == '}';}\r
+       static boolean is_less_than (int c) {return c == '<';}\r
+       static boolean is_greater_than (int c) {return c == '>';}\r
+\r
+       // should perhaps use Character.isLetter? not sure, because that allows Unicode chars for\r
+       // other languages that are outside the a-Z range.\r
+       static boolean is_alphabetic (int c) {\r
+               return c >= 'a' && c <= 'z' || c >= 'A' && c <= 'Z';\r
+               }\r
+       static boolean is_numeric (int c) {return c >= '0' && c <= '9';}\r
+       static boolean is_alphanumeric (int c) {\r
+               return is_numeric (c) || is_alphabetic (c);\r
+               }\r
+       static boolean is_whitespace (int c) {return c == ' ' || c == '\t';}\r
+}\r