lib/TableGen/TGLexer.cpp

   1 //===- TGLexer.cpp - Lexer for TableGen -----------------------------------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 //
  10 // Implement the Lexer for TableGen.
  11 //
  12 //===----------------------------------------------------------------------===//
  13
  14 #include "TGLexer.h"
  15 #include "llvm/TableGen/Error.h"
  16 #include "llvm/Support/SourceMgr.h"
  17 #include "llvm/Support/MemoryBuffer.h"
  18 #include "llvm/Config/config.h"
  19 #include "llvm/ADT/StringSwitch.h"
  20 #include "llvm/ADT/Twine.h"
  21 #include <cctype>
  22 #include <cstdio>
  23 #include <cstdlib>
  24 #include <cstring>
  25 #include <cerrno>
  26 using namespace llvm;
  27
  28 TGLexer::TGLexer(SourceMgr &SM) : SrcMgr(SM) {
  29   CurBuffer = 0;
  30   CurBuf = SrcMgr.getMemoryBuffer(CurBuffer);
  31   CurPtr = CurBuf->getBufferStart();
  32   TokStart = 0;
  33 }
  34
  35 SMLoc TGLexer::getLoc() const {
  36   return SMLoc::getFromPointer(TokStart);
  37 }
  38
  39 /// ReturnError - Set the error to the specified string at the specified
  40 /// location.  This is defined to always return tgtok::Error.
  41 tgtok::TokKind TGLexer::ReturnError(const char *Loc, const Twine &Msg) {
  42   PrintError(Loc, Msg);
  43   return tgtok::Error;
  44 }
  45
  46 int TGLexer::getNextChar() {
  47   char CurChar = *CurPtr++;
  48   switch (CurChar) {
  49   default:
  50     return (unsigned char)CurChar;
  51   case 0: {
  52     // A nul character in the stream is either the end of the current buffer or
  53     // a random nul in the file.  Disambiguate that here.
  54     if (CurPtr-1 != CurBuf->getBufferEnd())
  55       return 0;  // Just whitespace.
  56
  57     // If this is the end of an included file, pop the parent file off the
  58     // include stack.
  59     SMLoc ParentIncludeLoc = SrcMgr.getParentIncludeLoc(CurBuffer);
  60     if (ParentIncludeLoc != SMLoc()) {
  61       CurBuffer = SrcMgr.FindBufferContainingLoc(ParentIncludeLoc);
  62       CurBuf = SrcMgr.getMemoryBuffer(CurBuffer);
  63       CurPtr = ParentIncludeLoc.getPointer();
  64       return getNextChar();
  65     }
  66
  67     // Otherwise, return end of file.
  68     --CurPtr;  // Another call to lex will return EOF again.
  69     return EOF;
  70   }
  71   case '\n':
  72   case '\r':
  73     // Handle the newline character by ignoring it and incrementing the line
  74     // count.  However, be careful about 'dos style' files with \n\r in them.
  75     // Only treat a \n\r or \r\n as a single line.
  76     if ((*CurPtr == '\n' || (*CurPtr == '\r')) &&
  77         *CurPtr != CurChar)
  78       ++CurPtr;  // Eat the two char newline sequence.
  79     return '\n';
  80   }
  81 }
  82
  83 int TGLexer::peekNextChar(int Index) {
  84   return *(CurPtr + Index);
  85 }
  86
  87 tgtok::TokKind TGLexer::LexToken() {
  88   TokStart = CurPtr;
  89   // This always consumes at least one character.
  90   int CurChar = getNextChar();
  91
  92   switch (CurChar) {
  93   default:
  94     // Handle letters: [a-zA-Z_#]
  95     if (isalpha(CurChar) || CurChar == '_' || CurChar == '#')
  96       return LexIdentifier();
  97
  98     // Unknown character, emit an error.
  99     return ReturnError(TokStart, "Unexpected character");
 100   case EOF: return tgtok::Eof;
 101   case ':': return tgtok::colon;
 102   case ';': return tgtok::semi;
 103   case '.': return tgtok::period;
 104   case ',': return tgtok::comma;
 105   case '<': return tgtok::less;
 106   case '>': return tgtok::greater;
 107   case ']': return tgtok::r_square;
 108   case '{': return tgtok::l_brace;
 109   case '}': return tgtok::r_brace;
 110   case '(': return tgtok::l_paren;
 111   case ')': return tgtok::r_paren;
 112   case '=': return tgtok::equal;
 113   case '?': return tgtok::question;
 114
 115   case 0:
 116   case ' ':
 117   case '\t':
 118   case '\n':
 119   case '\r':
 120     // Ignore whitespace.
 121     return LexToken();
 122   case '/':
 123     // If this is the start of a // comment, skip until the end of the line or
 124     // the end of the buffer.
 125     if (*CurPtr == '/')
 126       SkipBCPLComment();
 127     else if (*CurPtr == '*') {
 128       if (SkipCComment())
 129         return tgtok::Error;
 130     } else // Otherwise, this is an error.
 131       return ReturnError(TokStart, "Unexpected character");
 132     return LexToken();
 133   case '-': case '+':
 134   case '0': case '1': case '2': case '3': case '4': case '5': case '6':
 135   case '7': case '8': case '9':
 136     return LexNumber();
 137   case '"': return LexString();
 138   case '$': return LexVarName();
 139   case '[': return LexBracket();
 140   case '!': return LexExclaim();
 141   }
 142 }
 143
 144 /// LexString - Lex "[^"]*"
 145 tgtok::TokKind TGLexer::LexString() {
 146   const char *StrStart = CurPtr;
 147
 148   CurStrVal = "";
 149
 150   while (*CurPtr != '"') {
 151     // If we hit the end of the buffer, report an error.
 152     if (*CurPtr == 0 && CurPtr == CurBuf->getBufferEnd())
 153       return ReturnError(StrStart, "End of file in string literal");
 154
 155     if (*CurPtr == '\n' || *CurPtr == '\r')
 156       return ReturnError(StrStart, "End of line in string literal");
 157
 158     if (*CurPtr != '\\') {
 159       CurStrVal += *CurPtr++;
 160       continue;
 161     }
 162
 163     ++CurPtr;
 164
 165     switch (*CurPtr) {
 166     case '\\': case '\'': case '"':
 167       // These turn into their literal character.
 168       CurStrVal += *CurPtr++;
 169       break;
 170     case 't':
 171       CurStrVal += '\t';
 172       ++CurPtr;
 173       break;
 174     case 'n':
 175       CurStrVal += '\n';
 176       ++CurPtr;
 177       break;
 178
 179     case '\n':
 180     case '\r':
 181       return ReturnError(CurPtr, "escaped newlines not supported in tblgen");
 182
 183     // If we hit the end of the buffer, report an error.
 184     case '\0':
 185       if (CurPtr == CurBuf->getBufferEnd())
 186         return ReturnError(StrStart, "End of file in string literal");
 187       // FALL THROUGH
 188     default:
 189       return ReturnError(CurPtr, "invalid escape in string literal");
 190     }
 191   }
 192
 193   ++CurPtr;
 194   return tgtok::StrVal;
 195 }
 196
 197 tgtok::TokKind TGLexer::LexVarName() {
 198   if (!isalpha(CurPtr[0]) && CurPtr[0] != '_')
 199     return ReturnError(TokStart, "Invalid variable name");
 200
 201   // Otherwise, we're ok, consume the rest of the characters.
 202   const char *VarNameStart = CurPtr++;
 203
 204   while (isalpha(*CurPtr) || isdigit(*CurPtr) || *CurPtr == '_')
 205     ++CurPtr;
 206
 207   CurStrVal.assign(VarNameStart, CurPtr);
 208   return tgtok::VarName;
 209 }
 210
 211
 212 tgtok::TokKind TGLexer::LexIdentifier() {
 213   // The first letter is [a-zA-Z_#].
 214   const char *IdentStart = TokStart;
 215
 216   // Match the rest of the identifier regex: [0-9a-zA-Z_#]*
 217   while (isalpha(*CurPtr) || isdigit(*CurPtr) || *CurPtr == '_' ||
 218          *CurPtr == '#')
 219     ++CurPtr;
 220
 221   // Check to see if this identifier is a keyword.
 222   StringRef Str(IdentStart, CurPtr-IdentStart);
 223
 224   if (Str == "include") {
 225     if (LexInclude()) return tgtok::Error;
 226     return Lex();
 227   }
 228
 229   tgtok::TokKind Kind = StringSwitch<tgtok::TokKind>(Str)
 230     .Case("int", tgtok::Int)
 231     .Case("bit", tgtok::Bit)
 232     .Case("bits", tgtok::Bits)
 233     .Case("string", tgtok::String)
 234     .Case("list", tgtok::List)
 235     .Case("code", tgtok::Code)
 236     .Case("dag", tgtok::Dag)
 237     .Case("class", tgtok::Class)
 238     .Case("def", tgtok::Def)
 239     .Case("defm", tgtok::Defm)
 240     .Case("multiclass", tgtok::MultiClass)
 241     .Case("field", tgtok::Field)
 242     .Case("let", tgtok::Let)
 243     .Case("in", tgtok::In)
 244     .Default(tgtok::Id);
 245
 246   if (Kind == tgtok::Id)
 247     CurStrVal.assign(Str.begin(), Str.end());
 248   return Kind;
 249 }
 250
 251 /// LexInclude - We just read the "include" token.  Get the string token that
 252 /// comes next and enter the include.
 253 bool TGLexer::LexInclude() {
 254   // The token after the include must be a string.
 255   tgtok::TokKind Tok = LexToken();
 256   if (Tok == tgtok::Error) return true;
 257   if (Tok != tgtok::StrVal) {
 258     PrintError(getLoc(), "Expected filename after include");
 259     return true;
 260   }
 261
 262   // Get the string.
 263   std::string Filename = CurStrVal;
 264   std::string IncludedFile;
 265
 266
 267   CurBuffer = SrcMgr.AddIncludeFile(Filename, SMLoc::getFromPointer(CurPtr),
 268                                     IncludedFile);
 269   if (CurBuffer == -1) {
 270     PrintError(getLoc(), "Could not find include file '" + Filename + "'");
 271     return true;
 272   }
 273
 274   Dependencies.push_back(IncludedFile);
 275   // Save the line number and lex buffer of the includer.
 276   CurBuf = SrcMgr.getMemoryBuffer(CurBuffer);
 277   CurPtr = CurBuf->getBufferStart();
 278   return false;
 279 }
 280
 281 void TGLexer::SkipBCPLComment() {
 282   ++CurPtr;  // skip the second slash.
 283   while (1) {
 284     switch (*CurPtr) {
 285     case '\n':
 286     case '\r':
 287       return;  // Newline is end of comment.
 288     case 0:
 289       // If this is the end of the buffer, end the comment.
 290       if (CurPtr == CurBuf->getBufferEnd())
 291         return;
 292       break;
 293     }
 294     // Otherwise, skip the character.
 295     ++CurPtr;
 296   }
 297 }
 298
 299 /// SkipCComment - This skips C-style /**/ comments.  The only difference from C
 300 /// is that we allow nesting.
 301 bool TGLexer::SkipCComment() {
 302   ++CurPtr;  // skip the star.
 303   unsigned CommentDepth = 1;
 304
 305   while (1) {
 306     int CurChar = getNextChar();
 307     switch (CurChar) {
 308     case EOF:
 309       PrintError(TokStart, "Unterminated comment!");
 310       return true;
 311     case '*':
 312       // End of the comment?
 313       if (CurPtr[0] != '/') break;
 314
 315       ++CurPtr;   // End the */.
 316       if (--CommentDepth == 0)
 317         return false;
 318       break;
 319     case '/':
 320       // Start of a nested comment?
 321       if (CurPtr[0] != '*') break;
 322       ++CurPtr;
 323       ++CommentDepth;
 324       break;
 325     }
 326   }
 327 }
 328
 329 /// LexNumber - Lex:
 330 ///    [-+]?[0-9]+
 331 ///    0x[0-9a-fA-F]+
 332 ///    0b[01]+
 333 tgtok::TokKind TGLexer::LexNumber() {
 334   if (CurPtr[-1] == '0') {
 335     if (CurPtr[0] == 'x') {
 336       ++CurPtr;
 337       const char *NumStart = CurPtr;
 338       while (isxdigit(CurPtr[0]))
 339         ++CurPtr;
 340
 341       // Requires at least one hex digit.
 342       if (CurPtr == NumStart)
 343         return ReturnError(TokStart, "Invalid hexadecimal number");
 344
 345       errno = 0;
 346       CurIntVal = strtoll(NumStart, 0, 16);
 347       if (errno == EINVAL)
 348         return ReturnError(TokStart, "Invalid hexadecimal number");
 349       if (errno == ERANGE) {
 350         errno = 0;
 351         CurIntVal = (int64_t)strtoull(NumStart, 0, 16);
 352         if (errno == EINVAL)
 353           return ReturnError(TokStart, "Invalid hexadecimal number");
 354         if (errno == ERANGE)
 355           return ReturnError(TokStart, "Hexadecimal number out of range");
 356       }
 357       return tgtok::IntVal;
 358     } else if (CurPtr[0] == 'b') {
 359       ++CurPtr;
 360       const char *NumStart = CurPtr;
 361       while (CurPtr[0] == '0' || CurPtr[0] == '1')
 362         ++CurPtr;
 363
 364       // Requires at least one binary digit.
 365       if (CurPtr == NumStart)
 366         return ReturnError(CurPtr-2, "Invalid binary number");
 367       CurIntVal = strtoll(NumStart, 0, 2);
 368       return tgtok::IntVal;
 369     }
 370   }
 371
 372   // Check for a sign without a digit.
 373   if (!isdigit(CurPtr[0])) {
 374     if (CurPtr[-1] == '-')
 375       return tgtok::minus;
 376     else if (CurPtr[-1] == '+')
 377       return tgtok::plus;
 378   }
 379
 380   while (isdigit(CurPtr[0]))
 381     ++CurPtr;
 382   CurIntVal = strtoll(TokStart, 0, 10);
 383   return tgtok::IntVal;
 384 }
 385
 386 /// LexBracket - We just read '['.  If this is a code block, return it,
 387 /// otherwise return the bracket.  Match: '[' and '[{ ( [^}]+ | }[^]] )* }]'
 388 tgtok::TokKind TGLexer::LexBracket() {
 389   if (CurPtr[0] != '{')
 390     return tgtok::l_square;
 391   ++CurPtr;
 392   const char *CodeStart = CurPtr;
 393   while (1) {
 394     int Char = getNextChar();
 395     if (Char == EOF) break;
 396
 397     if (Char != '}') continue;
 398
 399     Char = getNextChar();
 400     if (Char == EOF) break;
 401     if (Char == ']') {
 402       CurStrVal.assign(CodeStart, CurPtr-2);
 403       return tgtok::CodeFragment;
 404     }
 405   }
 406
 407   return ReturnError(CodeStart-2, "Unterminated Code Block");
 408 }
 409
 410 /// LexExclaim - Lex '!' and '![a-zA-Z]+'.
 411 tgtok::TokKind TGLexer::LexExclaim() {
 412   if (!isalpha(*CurPtr))
 413     return ReturnError(CurPtr - 1, "Invalid \"!operator\"");
 414
 415   const char *Start = CurPtr++;
 416   while (isalpha(*CurPtr))
 417     ++CurPtr;
 418
 419   // Check to see which operator this is.
 420   tgtok::TokKind Kind =
 421     StringSwitch<tgtok::TokKind>(StringRef(Start, CurPtr - Start))
 422     .Case("eq", tgtok::XEq)
 423     .Case("if", tgtok::XIf)
 424     .Case("head", tgtok::XHead)
 425     .Case("tail", tgtok::XTail)
 426     .Case("con", tgtok::XConcat)
 427     .Case("shl", tgtok::XSHL)
 428     .Case("sra", tgtok::XSRA)
 429     .Case("srl", tgtok::XSRL)
 430     .Case("cast", tgtok::XCast)
 431     .Case("empty", tgtok::XEmpty)
 432     .Case("subst", tgtok::XSubst)
 433     .Case("foreach", tgtok::XForEach)
 434     .Case("strconcat", tgtok::XStrConcat)
 435     .Default(tgtok::Error);
 436
 437   return Kind != tgtok::Error ? Kind : ReturnError(Start-1, "Unknown operator");
 438 }
 439