From f654cdae59a71576b4da603c44ea78e24fe3a2a4 Mon Sep 17 00:00:00 2001 From: Alex Lorenz Date: Mon, 22 Jun 2015 20:37:46 +0000 Subject: [PATCH] MIR Serialization: Introduce a lexer for machine instructions. This commit adds a function that tokenizes the string containing the machine instruction. This commit also adds a struct called 'MIToken' which is used to represent the lexer's tokens. Reviewers: Sean Silva Differential Revision: http://reviews.llvm.org/D10521 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@240323 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/CodeGen/MIRParser/CMakeLists.txt | 1 + lib/CodeGen/MIRParser/MILexer.cpp | 87 +++++++++++++++++++ lib/CodeGen/MIRParser/MILexer.h | 65 ++++++++++++++ lib/CodeGen/MIRParser/MIParser.cpp | 51 ++++++++--- test/CodeGen/MIR/X86/machine-instructions.mir | 2 +- test/CodeGen/MIR/X86/missing-instruction.mir | 18 ++++ .../MIR/X86/unrecognized-character.mir | 18 ++++ 7 files changed, 230 insertions(+), 12 deletions(-) create mode 100644 lib/CodeGen/MIRParser/MILexer.cpp create mode 100644 lib/CodeGen/MIRParser/MILexer.h create mode 100644 test/CodeGen/MIR/X86/missing-instruction.mir create mode 100644 test/CodeGen/MIR/X86/unrecognized-character.mir diff --git a/lib/CodeGen/MIRParser/CMakeLists.txt b/lib/CodeGen/MIRParser/CMakeLists.txt index d9cf3d8893e..7e757f68208 100644 --- a/lib/CodeGen/MIRParser/CMakeLists.txt +++ b/lib/CodeGen/MIRParser/CMakeLists.txt @@ -1,4 +1,5 @@ add_llvm_library(LLVMMIRParser + MILexer.cpp MIParser.cpp MIRParser.cpp ) diff --git a/lib/CodeGen/MIRParser/MILexer.cpp b/lib/CodeGen/MIRParser/MILexer.cpp new file mode 100644 index 00000000000..69fbba60085 --- /dev/null +++ b/lib/CodeGen/MIRParser/MILexer.cpp @@ -0,0 +1,87 @@ +//===- MILexer.cpp - Machine instructions lexer implementation ----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// This file implements the lexing of machine instructions. +// +//===----------------------------------------------------------------------===// + +#include "MILexer.h" +#include "llvm/ADT/Twine.h" +#include <cctype> + +using namespace llvm; + +namespace { + +/// This class provides a way to iterate and get characters from the source +/// string. +class Cursor { + const char *Ptr; + const char *End; + +public: + explicit Cursor(StringRef Str) { + Ptr = Str.data(); + End = Ptr + Str.size(); + } + + bool isEOF() const { return Ptr == End; } + + char peek() const { return isEOF() ? 0 : *Ptr; } + + void advance() { ++Ptr; } + + StringRef remaining() const { return StringRef(Ptr, End - Ptr); } + + StringRef upto(Cursor C) const { + assert(C.Ptr >= Ptr && C.Ptr <= End); + return StringRef(Ptr, C.Ptr - Ptr); + } + + StringRef::iterator location() const { return Ptr; } +}; + +} // end anonymous namespace + +/// Skip the leading whitespace characters and return the updated cursor. 
+static Cursor skipWhitespace(Cursor C) { + while (isspace(C.peek())) + C.advance(); + return C; +} + +static bool isIdentifierChar(char C) { + return isalpha(C) || isdigit(C) || C == '_' || C == '-' || C == '.'; +} + +static Cursor lexIdentifier(Cursor C, MIToken &Token) { + auto Range = C; + while (isIdentifierChar(C.peek())) + C.advance(); + Token = MIToken(MIToken::Identifier, Range.upto(C)); + return C; +} + +StringRef llvm::lexMIToken( + StringRef Source, MIToken &Token, + function_ref<void(StringRef::iterator, const Twine &)> ErrorCallback) { + auto C = skipWhitespace(Cursor(Source)); + if (C.isEOF()) { + Token = MIToken(MIToken::Eof, C.remaining()); + return C.remaining(); + } + + auto Char = C.peek(); + if (isalpha(Char) || Char == '_') + return lexIdentifier(C, Token).remaining(); + Token = MIToken(MIToken::Error, C.remaining()); + ErrorCallback(C.location(), + Twine("unexpected character '") + Twine(Char) + "'"); + return C.remaining(); +} diff --git a/lib/CodeGen/MIRParser/MILexer.h b/lib/CodeGen/MIRParser/MILexer.h new file mode 100644 index 00000000000..d6a5d1f4ec9 --- /dev/null +++ b/lib/CodeGen/MIRParser/MILexer.h @@ -0,0 +1,65 @@ +//===- MILexer.h - Lexer for machine instructions -------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file declares the function that lexes the machine instruction source +// string. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_CODEGEN_MIRPARSER_MILEXER_H +#define LLVM_LIB_CODEGEN_MIRPARSER_MILEXER_H + +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/STLExtras.h" +#include <functional> + +namespace llvm { + +class Twine; + +/// A token produced by the machine instruction lexer. 
+struct MIToken { + enum TokenKind { + // Markers + Eof, + Error, + + // Identifier tokens + Identifier + }; + +private: + TokenKind Kind; + StringRef Range; + +public: + MIToken(TokenKind Kind, StringRef Range) : Kind(Kind), Range(Range) {} + + TokenKind kind() const { return Kind; } + + bool isError() const { return Kind == Error; } + + bool is(TokenKind K) const { return Kind == K; } + + bool isNot(TokenKind K) const { return Kind != K; } + + StringRef::iterator location() const { return Range.begin(); } + + StringRef stringValue() const { return Range; } +}; + +/// Consume a single machine instruction token in the given source and return +/// the remaining source string. +StringRef lexMIToken( + StringRef Source, MIToken &Token, + function_ref<void(StringRef::iterator, const Twine &)> ErrorCallback); + +} // end namespace llvm + +#endif diff --git a/lib/CodeGen/MIRParser/MIParser.cpp b/lib/CodeGen/MIRParser/MIParser.cpp index 9427de4f015..a677b7c47f7 100644 --- a/lib/CodeGen/MIRParser/MIParser.cpp +++ b/lib/CodeGen/MIRParser/MIParser.cpp @@ -12,6 +12,7 @@ //===----------------------------------------------------------------------===// #include "MIParser.h" +#include "MILexer.h" #include "llvm/ADT/StringMap.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunction.h" @@ -29,7 +30,8 @@ class MIParser { SourceMgr &SM; MachineFunction &MF; SMDiagnostic &Error; - StringRef Source; + StringRef Source, CurrentSource; + MIToken Token; /// Maps from instruction names to op codes. StringMap<unsigned> Names2InstrOpCodes; @@ -37,11 +39,18 @@ public: MIParser(SourceMgr &SM, MachineFunction &MF, SMDiagnostic &Error, StringRef Source); + void lex(); + /// Report an error at the current location with the given message. /// /// This function always return true. bool error(const Twine &Msg); + /// Report an error at the given location with the given message. + /// + /// This function always return true. 
+ bool error(StringRef::iterator Loc, const Twine &Msg); + MachineInstr *parse(); private: @@ -50,31 +59,42 @@ private: /// Try to convert an instruction name to an opcode. Return true if the /// instruction name is invalid. bool parseInstrName(StringRef InstrName, unsigned &OpCode); + + bool parseInstruction(unsigned &OpCode); }; } // end anonymous namespace MIParser::MIParser(SourceMgr &SM, MachineFunction &MF, SMDiagnostic &Error, StringRef Source) - : SM(SM), MF(MF), Error(Error), Source(Source) {} + : SM(SM), MF(MF), Error(Error), Source(Source), CurrentSource(Source), + Token(MIToken::Error, StringRef()) {} + +void MIParser::lex() { + CurrentSource = lexMIToken( + CurrentSource, Token, + [this](StringRef::iterator Loc, const Twine &Msg) { error(Loc, Msg); }); +} + +bool MIParser::error(const Twine &Msg) { return error(Token.location(), Msg); } -bool MIParser::error(const Twine &Msg) { +bool MIParser::error(StringRef::iterator Loc, const Twine &Msg) { // TODO: Get the proper location in the MIR file, not just a location inside // the string. - Error = - SMDiagnostic(SM, SMLoc(), SM.getMemoryBuffer(SM.getMainFileID()) - ->getBufferIdentifier(), - 1, 0, SourceMgr::DK_Error, Msg.str(), Source, None, None); + assert(Loc >= Source.data() && Loc <= (Source.data() + Source.size())); + Error = SMDiagnostic( + SM, SMLoc(), + SM.getMemoryBuffer(SM.getMainFileID())->getBufferIdentifier(), 1, + Loc - Source.data(), SourceMgr::DK_Error, Msg.str(), Source, None, None); return true; } MachineInstr *MIParser::parse() { - StringRef InstrName = Source; + lex(); + unsigned OpCode; - if (parseInstrName(InstrName, OpCode)) { - error(Twine("unknown machine instruction name '") + InstrName + "'"); + if (Token.isError() || parseInstruction(OpCode)) return nullptr; - } // TODO: Parse the rest of instruction - machine operands, etc. 
const auto &MCID = MF.getSubtarget().getInstrInfo()->get(OpCode); @@ -82,6 +102,15 @@ MachineInstr *MIParser::parse() { return MI; } +bool MIParser::parseInstruction(unsigned &OpCode) { + if (Token.isNot(MIToken::Identifier)) + return error("expected a machine instruction"); + StringRef InstrName = Token.stringValue(); + if (parseInstrName(InstrName, OpCode)) + return error(Twine("unknown machine instruction name '") + InstrName + "'"); + return false; +} + void MIParser::initNames2InstrOpCodes() { if (!Names2InstrOpCodes.empty()) return; diff --git a/test/CodeGen/MIR/X86/machine-instructions.mir b/test/CodeGen/MIR/X86/machine-instructions.mir index 801a6d8f357..a78cd57e9b6 100644 --- a/test/CodeGen/MIR/X86/machine-instructions.mir +++ b/test/CodeGen/MIR/X86/machine-instructions.mir @@ -20,5 +20,5 @@ body: # CHECK: - IMUL32rri8 # CHECK-NEXT: - RETQ - IMUL32rri8 - - RETQ + - ' RETQ ' ... diff --git a/test/CodeGen/MIR/X86/missing-instruction.mir b/test/CodeGen/MIR/X86/missing-instruction.mir new file mode 100644 index 00000000000..0cf4371d349 --- /dev/null +++ b/test/CodeGen/MIR/X86/missing-instruction.mir @@ -0,0 +1,18 @@ +# RUN: not llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s + +--- | + + define void @foo() { + entry: + ret void + } + +... +--- +name: foo +body: + - name: entry + instructions: + # CHECK: 1:1: expected a machine instruction + - '' +... diff --git a/test/CodeGen/MIR/X86/unrecognized-character.mir b/test/CodeGen/MIR/X86/unrecognized-character.mir new file mode 100644 index 00000000000..b645018c428 --- /dev/null +++ b/test/CodeGen/MIR/X86/unrecognized-character.mir @@ -0,0 +1,18 @@ +# RUN: not llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s + +--- | + + define void @foo() { + entry: + ret void + } + +... +--- +name: foo +body: + - name: entry + instructions: + # CHECK: 1:1: unexpected character '`' + - '` RETQ' +... -- 2.34.1