If we see a UTF-8 BOM sequence at the beginning of a response file, we shall remove these bytes before parsing.

author Yunzhong Gao <Yunzhong_Gao@playstation.sony.com>

Sat, 24 Jan 2015 04:23:08 +0000 (04:23 +0000)

committer Yunzhong Gao <Yunzhong_Gao@playstation.sony.com>

Sat, 24 Jan 2015 04:23:08 +0000 (04:23 +0000)
author Yunzhong Gao <Yunzhong_Gao@playstation.sony.com>
Sat, 24 Jan 2015 04:23:08 +0000 (04:23 +0000)
committer Yunzhong Gao <Yunzhong_Gao@playstation.sony.com>
Sat, 24 Jan 2015 04:23:08 +0000 (04:23 +0000)
diff --git a/lib/Support/CommandLine.cpp b/lib/Support/CommandLine.cpp

index a774421b26c54a983ab86cc1a397d1c1b579401b..b4e32257a01f6224e2c8739e060ba04b2e101138 100644 (file)
--- a/lib/Support/CommandLine.cpp
+++ b/lib/Support/CommandLine.cpp
@@ -655,6 +655,13 @@ void cl::TokenizeWindowsCommandLine(StringRef Src, StringSaver &Saver,
      NewArgv.push_back(nullptr);
  }
  
+// Returns true if S begins with the 3-byte UTF-8 byte order mark (EF BB BF).
+// Despite its name, the UTF-8 BOM is not affected by the host's endianness.
+static bool hasUTF8ByteOrderMark(ArrayRef<char> S) {
+  return (S.size() >= 3 &&
+          S[0] == '\xef' && S[1] == '\xbb' && S[2] == '\xbf');
+}
+
  static bool ExpandResponseFile(const char *FName, StringSaver &Saver,
                                 TokenizerCallback Tokenizer,
                                 SmallVectorImpl<const char *> &NewArgv,
@@ -674,6 +681,11 @@ static bool ExpandResponseFile(const char *FName, StringSaver &Saver,
        return false;
      Str = StringRef(UTF8Buf);
    }
+  // If we see UTF-8 BOM sequence at the beginning of a file, we shall remove
+  // these bytes before parsing.
+  // Reference: http://en.wikipedia.org/wiki/UTF-8#Byte_order_mark
+  else if (hasUTF8ByteOrderMark(BufRef))
+    Str = StringRef(BufRef.data() + 3, BufRef.size() - 3);
  
    // Tokenize the contents into NewArgv.
    Tokenizer(Str, Saver, NewArgv, MarkEOLs);
diff --git a/test/Other/Inputs/utf8-bom-response b/test/Other/Inputs/utf8-bom-response

new file mode 100644 (file)

index 0000000..9dae315
--- /dev/null
+++ b/test/Other/Inputs/utf8-bom-response
@@ -0,0 +1 @@
+-help
diff --git a/test/Other/Inputs/utf8-response b/test/Other/Inputs/utf8-response

new file mode 100644 (file)

index 0000000..97f455a
--- /dev/null
+++ b/test/Other/Inputs/utf8-response
@@ -0,0 +1 @@
+-help
diff --git a/test/Other/ResponseFile.ll b/test/Other/ResponseFile.ll

index 914e5480f2033896ce82d3323650dd59ac7f8a79..92648b86f5fb910569536043dec64aa7a456ec17 100644 (file)
--- a/test/Other/ResponseFile.ll
+++ b/test/Other/ResponseFile.ll
@@ -6,6 +6,11 @@
  ; RUN: llvm-as @%t.list2 -o %t.bc
  ; RUN: llvm-nm %t.bc 2>&1 | FileCheck %s
  
+; When the response file begins with a UTF-8 BOM sequence, we shall remove it.
+; Neither command below should return a "Could not open input file" error.
+; RUN: llvm-as @%S/Inputs/utf8-response > /dev/null
+; RUN: llvm-as @%S/Inputs/utf8-bom-response > /dev/null
+
  ; CHECK: T foobar
  
  define void @foobar() {
author	Yunzhong Gao <Yunzhong_Gao@playstation.sony.com>
	Sat, 24 Jan 2015 04:23:08 +0000 (04:23 +0000)
committer	Yunzhong Gao <Yunzhong_Gao@playstation.sony.com>
	Sat, 24 Jan 2015 04:23:08 +0000 (04:23 +0000)
lib/Support/CommandLine.cpp		patch \| blob \| history
test/Other/Inputs/utf8-bom-response	[new file with mode: 0644]	patch \| blob
test/Other/Inputs/utf8-response	[new file with mode: 0644]	patch \| blob
test/Other/ResponseFile.ll		patch \| blob \| history