From 47e7c040f4e03a5fd162ee7c8ea061cfe873eb23 Mon Sep 17 00:00:00 2001
From: Daniel Dunbar <daniel@zuster.org>
Date: Wed, 14 Aug 2013 15:24:58 +0000
Subject: [PATCH] [lit] Support parsing scripts with inconsistent or invalid
 encodings.

 - For whatever reason, we have a lot of test files with bogus Unicode
   characters. This patch allows those scripts to still be parsed on Python 3 by
   changing the parsing logic to work on binary files, and only requiring the
   actual script commands to be convertible to ASCII.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@188376 91177308-0d34-0410-b5e6-96231b3b80d8
---
 utils/lit/lit/TestRunner.py        | 54 +++++++++++++++++++++++-------
 utils/lit/tests/shtest-encoding.py |  3 ++
 2 files changed, 45 insertions(+), 12 deletions(-)
 create mode 100644 utils/lit/tests/shtest-encoding.py

diff --git a/utils/lit/lit/TestRunner.py b/utils/lit/lit/TestRunner.py
index 068e4991b21..8a9bddd2a23 100644
--- a/utils/lit/lit/TestRunner.py
+++ b/utils/lit/lit/TestRunner.py
@@ -305,24 +305,54 @@ def isExpectedFail(test, xfails):
 
     return False
 
-def parseIntegratedTestScriptCommands(sourcepath):
+def parseIntegratedTestScriptCommands(source_path):
     """
     parseIntegratedTestScriptCommands(source_path) -> commands
 
     Parse the commands in an integrated test script file into a list of
     (line_number, command_type, line).
     """
-    line_number = 0
-    for ln in open(sourcepath):
-        line_number += 1
-        if 'RUN:' in ln:
-            yield (line_number, 'RUN', ln[ln.index('RUN:')+4:])
-        elif 'XFAIL:' in ln:
-            yield (line_number, 'XFAIL', ln[ln.index('XFAIL:') + 6:])
-        elif 'REQUIRES:' in ln:
-            yield (line_number, 'REQUIRES', ln[ln.index('REQUIRES:') + 9:])
-        elif 'END.' in ln:
-            yield (line_number, 'END', ln[ln.index('END.') + 4:])
+
+    # This code is carefully written to be compatible with both Python 2.5+ and
+    # Python 3 without requiring input files to always have valid encodings. The
+    # trick we use is to open the file in binary mode and use the regular
+    # expression library to find the commands, with it scanning strings in
+    # Python 2 and bytes in Python 3.
+    #
+    # Once we find a match, we do require each script line to be decodable to
+    # ascii, so we convert the outputs to ascii before returning. This way the
+    # remaining code can work with "strings" agnostic of the executing Python
+    # version.
+
+    def to_bytes(text):
+        # Encode to Latin1 to get binary data.
+        return text.encode('ISO-8859-1')
+    keywords = ('RUN:', 'XFAIL:', 'REQUIRES:', 'END.')
+    # Escape the keywords (the '.' in 'END.' is a regex metacharacter), drop a
+    # Windows '\r' line ending, and accept a last line without a newline.
+    keywords_re = re.compile(to_bytes(r"(%s)(.*?)\r?(?:\n|\Z)" % (
+        "|".join(re.escape(k) for k in keywords),)))
+
+    # Read the whole file and close it before the first yield; try/finally is
+    # used instead of 'with' to stay compatible with Python 2.5.
+    f = open(source_path, 'rb')
+    try:
+        data = f.read()
+    finally:
+        f.close()
+
+    # Walk the matches, counting intervening newlines to track line numbers.
+    newline = to_bytes('\n')
+    line_number = 1
+    last_match_position = 0
+    for match in keywords_re.finditer(data):
+        match_position = match.start()
+        line_number += data.count(newline, last_match_position, match_position)
+        last_match_position = match_position
+
+        # Convert the keyword and line to ascii and yield the command.
+        keyword, ln = match.groups()
+        yield (line_number, keyword[:-1].decode('ascii'), ln.decode('ascii'))
 
 def parseIntegratedTestScript(test, normalize_slashes=False,
                               extra_substitutions=[]):
diff --git a/utils/lit/tests/shtest-encoding.py b/utils/lit/tests/shtest-encoding.py
new file mode 100644
index 00000000000..dfc987f6df7
--- /dev/null
+++ b/utils/lit/tests/shtest-encoding.py
@@ -0,0 +1,3 @@
+# RUN: true
+
+# Here is a string that cannot be decoded in line mode: Â.
-- 
2.34.1