1 //===- TGLexer.cpp - Lexer for TableGen -----------------------------------===//
3 // The LLVM Compiler Infrastructure
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
8 //===----------------------------------------------------------------------===//
10 // Implement the Lexer for TableGen.
12 //===----------------------------------------------------------------------===//
15 #include "llvm/TableGen/Error.h"
16 #include "llvm/Support/SourceMgr.h"
17 #include "llvm/Support/MemoryBuffer.h"
18 #include "llvm/Config/config.h"
19 #include "llvm/ADT/StringSwitch.h"
20 #include "llvm/ADT/Twine.h"
/// TGLexer - Construct a lexer on top of the source manager SM, which is
/// stored in SrcMgr and used for all buffer lookups.
28 TGLexer::TGLexer(SourceMgr &SM) : SrcMgr(SM) {
// Fetch the memory buffer identified by CurBuffer and position the read
// cursor on its first character.
30 CurBuf = SrcMgr.getMemoryBuffer(CurBuffer);
31 CurPtr = CurBuf->getBufferStart();
/// getLoc - Return a source location built from the TokStart pointer.
/// TokStart is presumably the first character of the token being lexed --
/// TODO confirm against where TokStart is assigned.
35 SMLoc TGLexer::getLoc() const {
36 return SMLoc::getFromPointer(TokStart);
39 /// ReturnError - Set the error to the specified string at the specified
40 /// location. This is defined to always return tgtok::Error.
///
/// Loc is a character pointer identifying where the error occurred; callers
/// in this file pass TokStart, CurPtr, or the start of the construct being
/// lexed. Msg is the diagnostic text.
41 tgtok::TokKind TGLexer::ReturnError(const char *Loc, const Twine &Msg) {
/// getNextChar - Read the character at CurPtr and advance past it.
///
/// Ordinary characters are returned as their unsigned char value. A nul
/// byte that is not at the end of the current buffer is returned as 0. A
/// nul at the end of a buffer either switches lexing back to the including
/// file or, when there is no parent include location, backs CurPtr up so the
/// end-of-file condition is seen again on the next call.
46 int TGLexer::getNextChar() {
// Post-increment: CurPtr now points one past the character just read.
47 char CurChar = *CurPtr++;
// Cast through unsigned char so the widened int value is never negative.
50 return (unsigned char)CurChar;
52 // A nul character in the stream is either the end of the current buffer or
53 // a random nul in the file. Disambiguate that here.
// CurPtr was already advanced, so CurPtr-1 is the position of the nul.
54 if (CurPtr-1 != CurBuf->getBufferEnd())
55 return 0; // Just whitespace.
57 // If this is the end of an included file, resume lexing in the including
// file at the location recorded for the include.
59 SMLoc ParentIncludeLoc = SrcMgr.getParentIncludeLoc(CurBuffer);
// A default-constructed SMLoc means there is no parent include location.
60 if (ParentIncludeLoc != SMLoc()) {
61 CurBuffer = SrcMgr.FindBufferContainingLoc(ParentIncludeLoc);
62 CurBuf = SrcMgr.getMemoryBuffer(CurBuffer);
63 CurPtr = ParentIncludeLoc.getPointer();
67 // Otherwise, return end of file.
68 --CurPtr; // Another call to lex will return EOF again.
73 // Handle the newline character by ignoring it and incrementing the line
74 // count. However, be careful about 'dos style' files with \n\r in them.
75 // Only treat a \n\r or \r\n as a single line.
76 if ((*CurPtr == '\n' || (*CurPtr == '\r')) &&
78 ++CurPtr; // Eat the two char newline sequence.
/// peekNextChar - Return the character Index positions past CurPtr without
/// advancing CurPtr. No bounds check is performed here.
///
/// NOTE(review): unlike getNextChar, the value is not cast through unsigned
/// char, so bytes with the high bit set may come back negative where char is
/// signed -- confirm callers only compare against ASCII.
83 int TGLexer::peekNextChar(int Index) {
84 return *(CurPtr + Index);
/// LexToken - Read the next character and classify it into a token kind.
///
/// Single-character punctuation maps directly to its tgtok value. Letters
/// and '_' start an identifier, and a run of digits immediately followed by
/// a letter or '_' is also lexed as an identifier. The characters '"', '$',
/// '[' and '!' are handed to their dedicated helpers. Anything unrecognized
/// is reported through ReturnError.
87 tgtok::TokKind TGLexer::LexToken() {
89 // This always consumes at least one character.
90 int CurChar = getNextChar();
94 // Handle letters: [a-zA-Z_]
95 if (isalpha(CurChar) || CurChar == '_')
96 return LexIdentifier();
98 // Unknown character, emit an error.
99 return ReturnError(TokStart, "Unexpected character");
100 case EOF: return tgtok::Eof;
// Single-character punctuation tokens.
101 case ':': return tgtok::colon;
102 case ';': return tgtok::semi;
103 case '.': return tgtok::period;
104 case ',': return tgtok::comma;
105 case '<': return tgtok::less;
106 case '>': return tgtok::greater;
107 case ']': return tgtok::r_square;
108 case '{': return tgtok::l_brace;
109 case '}': return tgtok::r_brace;
110 case '(': return tgtok::l_paren;
111 case ')': return tgtok::r_paren;
112 case '=': return tgtok::equal;
113 case '?': return tgtok::question;
114 case '#': return tgtok::paste;
121 // Ignore whitespace.
124 // If this is the start of a // comment, skip until the end of the line or
125 // the end of the buffer.
// A '*' at CurPtr starts a C-style block comment instead.
128 else if (*CurPtr == '*') {
131 } else // Otherwise, this is an error.
132 return ReturnError(TokStart, "Unexpected character");
135 case '0': case '1': case '2': case '3': case '4': case '5': case '6':
136 case '7': case '8': case '9': {
138 if (isdigit(CurChar)) {
139 // Allow identifiers to start with a number if it is followed by
140 // an identifier. This can happen with paste operations like
// Peek ahead, without consuming, until the first non-digit character.
144 NextChar = peekNextChar(i++);
145 } while (isdigit(NextChar));
147 if (NextChar == 'x' || NextChar == 'b') {
148 // If this is [0-9]b[01] or [0-9]x[0-9A-Fa-f] this is most
// Inspect the character after the 'x'/'b' to decide.
150 int NextNextChar = peekNextChar(i);
151 switch (NextNextChar) {
158 case '2': case '3': case '4': case '5':
159 case '6': case '7': case '8': case '9':
160 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
161 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
// Digits followed by a letter or '_' form an identifier.
169 if (isalpha(NextChar) || NextChar == '_')
170 return LexIdentifier();
// Tokens that need a dedicated sub-lexer.
174 case '"': return LexString();
175 case '$': return LexVarName();
176 case '[': return LexBracket();
177 case '!': return LexExclaim();
181 /// LexString - Lex "[^"]*"
///
/// Called from LexToken when it sees '"'. Characters are accumulated into
/// CurStrVal until the closing quote. Returns tgtok::StrVal on success. An
/// error is returned for end of file or a raw newline inside the literal,
/// for an escaped newline, and for an unrecognized escape sequence.
182 tgtok::TokKind TGLexer::LexString() {
// Remembered so that errors can point at the start of the literal.
183 const char *StrStart = CurPtr;
187 while (*CurPtr != '"') {
188 // If we hit the end of the buffer, report an error.
189 if (*CurPtr == 0 && CurPtr == CurBuf->getBufferEnd())
190 return ReturnError(StrStart, "End of file in string literal");
// String literals may not span lines.
192 if (*CurPtr == '\n' || *CurPtr == '\r')
193 return ReturnError(StrStart, "End of line in string literal");
// Anything other than a backslash is copied through unchanged.
195 if (*CurPtr != '\\') {
196 CurStrVal += *CurPtr++;
203 case '\\': case '\'': case '"':
204 // These turn into their literal character.
205 CurStrVal += *CurPtr++;
218 return ReturnError(CurPtr, "escaped newlines not supported in tblgen");
220 // If we hit the end of the buffer, report an error.
222 if (CurPtr == CurBuf->getBufferEnd())
223 return ReturnError(StrStart, "End of file in string literal");
226 return ReturnError(CurPtr, "invalid escape in string literal");
231 return tgtok::StrVal;
/// LexVarName - Lex a variable name; LexToken calls this when it sees '$'.
///
/// The name must start with a letter or '_' and continues over letters,
/// digits and '_'. On success the name, starting at the first character at
/// CurPtr on entry, is stored in CurStrVal and tgtok::VarName is returned.
///
/// NOTE(review): the <cctype> predicates are applied to values read through
/// CurPtr; if those are plain (signed) chars, bytes above 0x7F would be
/// passed as negative values -- confirm input is ASCII.
234 tgtok::TokKind TGLexer::LexVarName() {
235 if (!isalpha(CurPtr[0]) && CurPtr[0] != '_')
236 return ReturnError(TokStart, "Invalid variable name");
238 // Otherwise, we're ok, consume the rest of the characters.
// VarNameStart keeps the first name character; CurPtr moves past it.
239 const char *VarNameStart = CurPtr++;
241 while (isalpha(*CurPtr) || isdigit(*CurPtr) || *CurPtr == '_')
// Copy the half-open range [VarNameStart, CurPtr) as the variable name.
244 CurStrVal.assign(VarNameStart, CurPtr);
245 return tgtok::VarName;
/// LexIdentifier - Lex an identifier or keyword beginning at TokStart.
///
/// The word "include" is handed to LexInclude, and a failure there yields
/// tgtok::Error. Other words are matched against the keyword table below.
/// For a plain identifier (tgtok::Id) the spelling is copied into CurStrVal.
249 tgtok::TokKind TGLexer::LexIdentifier() {
250 // The first letter is [a-zA-Z_#].
// NOTE(review): LexToken enters here for a letter, '_', or a digit run
// followed by a letter or '_'; no visible caller passes a leading '#'.
251 const char *IdentStart = TokStart;
253 // Match the rest of the identifier: letters, digits and '_'. '#' is not
// accepted by the loop condition below.
254 while (isalpha(*CurPtr) || isdigit(*CurPtr) || *CurPtr == '_')
257 // Check to see if this identifier is a keyword.
// Str is a non-owning view of the characters from IdentStart up to CurPtr.
258 StringRef Str(IdentStart, CurPtr-IdentStart);
260 if (Str == "include") {
261 if (LexInclude()) return tgtok::Error;
265 tgtok::TokKind Kind = StringSwitch<tgtok::TokKind>(Str)
266 .Case("int", tgtok::Int)
267 .Case("bit", tgtok::Bit)
268 .Case("bits", tgtok::Bits)
269 .Case("string", tgtok::String)
270 .Case("list", tgtok::List)
271 .Case("code", tgtok::Code)
272 .Case("dag", tgtok::Dag)
273 .Case("class", tgtok::Class)
274 .Case("def", tgtok::Def)
275 .Case("defm", tgtok::Defm)
276 .Case("multiclass", tgtok::MultiClass)
277 .Case("field", tgtok::Field)
278 .Case("let", tgtok::Let)
279 .Case("in", tgtok::In)
// Only plain identifiers need their spelling saved.
282 if (Kind == tgtok::Id)
283 CurStrVal.assign(Str.begin(), Str.end());
287 /// LexInclude - We just read the "include" token. Get the string token that
288 /// comes next and enter the include.
///
/// Returns true on error: when the next token fails to lex, or is not a
/// string literal. A diagnostic is also printed when the file cannot be
/// found. When the file is found, its name is appended to Dependencies and
/// lexing continues at the start of the new buffer.
289 bool TGLexer::LexInclude() {
290 // The token after the include must be a string.
291 tgtok::TokKind Tok = LexToken();
292 if (Tok == tgtok::Error) return true;
293 if (Tok != tgtok::StrVal) {
294 PrintError(getLoc(), "Expected filename after include");
// The string token's text was left in CurStrVal; copy it as the file name.
299 std::string Filename = CurStrVal;
300 std::string IncludedFile;
303 CurBuffer = SrcMgr.AddIncludeFile(Filename, SMLoc::getFromPointer(CurPtr),
// A buffer id of -1 is treated as "file not found".
305 if (CurBuffer == -1) {
306 PrintError(getLoc(), "Could not find include file '" + Filename + "'");
310 Dependencies.push_back(IncludedFile);
311 // Switch the lexer to the start of the newly added include buffer.
312 CurBuf = SrcMgr.getMemoryBuffer(CurBuffer);
313 CurPtr = CurBuf->getBufferStart();
/// SkipBCPLComment - Skip a '//' comment. On entry CurPtr is at the second
/// slash. The comment ends at a newline or at the end of the buffer,
/// whichever comes first.
317 void TGLexer::SkipBCPLComment() {
318 ++CurPtr; // skip the second slash.
323 return; // Newline is end of comment.
325 // If this is the end of the buffer, end the comment.
326 if (CurPtr == CurBuf->getBufferEnd())
330 // Otherwise, skip the character.
335 /// SkipCComment - This skips C-style /**/ comments. The only difference from C
336 /// is that we allow nesting.
///
/// On entry CurPtr is at the '*' of the opening "/*". CommentDepth tracks
/// the nesting level and the comment ends when it drops back to zero. An
/// unterminated comment is reported with PrintError.
/// NOTE(review): the meaning of the bool result is not visible in these
/// lines; presumably true signals an error, as in LexInclude -- confirm.
337 bool TGLexer::SkipCComment() {
338 ++CurPtr; // skip the star.
339 unsigned CommentDepth = 1;
342 int CurChar = getNextChar();
345 PrintError(TokStart, "Unterminated comment!");
348 // End of the comment?
// Any character other than '/' here means this is not a "*/" terminator.
349 if (CurPtr[0] != '/') break;
351 ++CurPtr; // End the */.
352 if (--CommentDepth == 0)
356 // Start of a nested comment?
// Any character other than '*' here means this is not a nested "/*".
357 if (CurPtr[0] != '*') break;
/// LexNumber - Lex an integer literal and store its value in CurIntVal.
///
/// The first character of the token has already been consumed, so the code
/// inspects it through CurPtr[-1]. Three forms are handled:
///   - '0' followed by 'x': hexadecimal, converted with strtoll base 16; on
///     ERANGE the conversion is retried with strtoull and the result is cast
///     to int64_t.
///   - '0' followed by 'b': binary, converted with strtoll base 2.
///   - otherwise: decimal, converted with strtoll base 10 from TokStart.
/// Returns tgtok::IntVal on success, or the result of ReturnError for a
/// malformed or out-of-range literal.
369 tgtok::TokKind TGLexer::LexNumber() {
370 if (CurPtr[-1] == '0') {
371 if (CurPtr[0] == 'x') {
// NumStart marks where the hex digits are expected to begin.
373 const char *NumStart = CurPtr;
374 while (isxdigit(CurPtr[0]))
377 // Requires at least one hex digit.
378 if (CurPtr == NumStart)
379 return ReturnError(TokStart, "Invalid hexadecimal number");
382 CurIntVal = strtoll(NumStart, 0, 16);
384 return ReturnError(TokStart, "Invalid hexadecimal number");
// The signed conversion overflowed; retry as unsigned so values that need
// the top bit of a 64-bit word can still be represented.
385 if (errno == ERANGE) {
387 CurIntVal = (int64_t)strtoull(NumStart, 0, 16);
389 return ReturnError(TokStart, "Invalid hexadecimal number");
391 return ReturnError(TokStart, "Hexadecimal number out of range");
393 return tgtok::IntVal;
394 } else if (CurPtr[0] == 'b') {
// NumStart marks where the binary digits are expected to begin.
396 const char *NumStart = CurPtr;
397 while (CurPtr[0] == '0' || CurPtr[0] == '1')
400 // Requires at least one binary digit.
401 if (CurPtr == NumStart)
402 return ReturnError(CurPtr-2, "Invalid binary number");
// NOTE(review): no errno/range check is visible for this conversion.
403 CurIntVal = strtoll(NumStart, 0, 2);
404 return tgtok::IntVal;
408 // Check for a sign without a digit.
409 if (!isdigit(CurPtr[0])) {
410 if (CurPtr[-1] == '-')
412 else if (CurPtr[-1] == '+')
// Consume the remaining decimal digits.
416 while (isdigit(CurPtr[0]))
// Convert from TokStart so that any leading sign is included.
// NOTE(review): no errno/range check is visible for this conversion.
418 CurIntVal = strtoll(TokStart, 0, 10);
419 return tgtok::IntVal;
422 /// LexBracket - We just read '['. If this is a code block, return it,
423 /// otherwise return the bracket. Match: '[' and '[{ ( [^}]+ | }[^]] )* }]'
///
/// Returns tgtok::l_square when '[' is not followed by '{'. For a code
/// block, the text is stored in CurStrVal and tgtok::CodeFragment is
/// returned. Reaching end of file first is reported as an error.
424 tgtok::TokKind TGLexer::LexBracket() {
425 if (CurPtr[0] != '{')
426 return tgtok::l_square;
428 const char *CodeStart = CurPtr;
430 int Char = getNextChar();
431 if (Char == EOF) break;
// Only a '}' can begin the closing delimiter; keep scanning otherwise.
433 if (Char != '}') continue;
435 Char = getNextChar();
436 if (Char == EOF) break;
// CurPtr is past the two-character terminator; exclude it from the text.
438 CurStrVal.assign(CodeStart, CurPtr-2);
439 return tgtok::CodeFragment;
// Reached after a break on EOF: the block was never closed.
443 return ReturnError(CodeStart-2, "Unterminated Code Block");
446 /// LexExclaim - Lex '![a-zA-Z]+'. A '!' that is not followed by a letter
/// is reported as an error.
///
/// The letters after '!' are matched against the operator table below.
/// Returns the matching tgtok kind, or the result of ReturnError when the
/// name is not a known operator.
447 tgtok::TokKind TGLexer::LexExclaim() {
448 if (!isalpha(*CurPtr))
// CurPtr - 1 points back at the '!' itself.
449 return ReturnError(CurPtr - 1, "Invalid \"!operator\"");
// Start keeps the first letter of the operator name; CurPtr moves past it.
451 const char *Start = CurPtr++;
452 while (isalpha(*CurPtr))
455 // Check to see which operator this is.
456 tgtok::TokKind Kind =
457 StringSwitch<tgtok::TokKind>(StringRef(Start, CurPtr - Start))
458 .Case("eq", tgtok::XEq)
459 .Case("if", tgtok::XIf)
460 .Case("head", tgtok::XHead)
461 .Case("tail", tgtok::XTail)
462 .Case("con", tgtok::XConcat)
463 .Case("shl", tgtok::XSHL)
464 .Case("sra", tgtok::XSRA)
465 .Case("srl", tgtok::XSRL)
466 .Case("cast", tgtok::XCast)
467 .Case("empty", tgtok::XEmpty)
468 .Case("subst", tgtok::XSubst)
469 .Case("foreach", tgtok::XForEach)
470 .Case("strconcat", tgtok::XStrConcat)
471 .Default(tgtok::Error);
// tgtok::Error doubles as the "no match" sentinel; Start-1 is the '!'.
473 return Kind != tgtok::Error ? Kind : ReturnError(Start-1, "Unknown operator");