Add writeFileWithEncoding to LibLLVMSupport.

author Rafael Espindola <rafael.espindola@gmail.com>

Wed, 3 Sep 2014 20:02:00 +0000 (20:02 +0000)

committer Rafael Espindola <rafael.espindola@gmail.com>

Wed, 3 Sep 2014 20:02:00 +0000 (20:02 +0000)
author Rafael Espindola <rafael.espindola@gmail.com>
Wed, 3 Sep 2014 20:02:00 +0000 (20:02 +0000)
committer Rafael Espindola <rafael.espindola@gmail.com>
Wed, 3 Sep 2014 20:02:00 +0000 (20:02 +0000)
diff --git a/include/llvm/Support/Program.h b/include/llvm/Support/Program.h

index 51279a9b864ae6b91059b94d98dd8d3d4168198b..01165e8bf7484dae7799b3b9a5064ed5b8f66aab 100644 (file)
--- a/include/llvm/Support/Program.h
+++ b/include/llvm/Support/Program.h
@@ -126,6 +126,40 @@ struct ProcessInfo {
    /// argument length limits.
    bool argumentsFitWithinSystemLimits(ArrayRef<const char*> Args);
  
+  /// File encoding options when writing contents that a non-UTF8 tool will
+  /// read (on Windows systems). For UNIX, we always use UTF-8.
+  enum WindowsEncodingMethod {
+    /// UTF-8 is the LLVM native encoding, being the same as "do not perform
+    /// encoding conversion".
+    WEM_UTF8,
+    WEM_CurrentCodePage,
+    WEM_UTF16
+  };
+
+  /// Saves the UTF8-encoded \p contents string into the file \p FileName
+  /// using a specific encoding.
+  ///
+  /// This write file function adds the possibility to choose which encoding
+  /// to use when writing a text file. On Windows, this is important when
+  /// writing files with internationalization support with an encoding that is
+  /// different from the one used in LLVM (UTF-8). We use this when writing
+  /// response files, since GCC tools on MinGW only understand legacy code
+  /// pages, and VisualStudio tools only understand UTF-16.
+  /// For UNIX, using different encodings is silently ignored, since all tools
+  /// work well with UTF-8.
+  /// This function assumes that you only use UTF-8 *text* data and will convert
+  /// it to your desired encoding before writing to the file.
+  ///
+  /// FIXME: We use WEM_CurrentCodePage to write response files for GNU tools in
+  /// a MinGW/MinGW-w64 environment, which has serious flaws but currently is
+  /// our best shot to make gcc/ld understand international characters. This
+  /// should be changed as soon as binutils fix this to support UTF16 on mingw.
+  ///
+  /// \returns non-zero error_code if failed
+  std::error_code
+  writeFileWithEncoding(StringRef FileName, StringRef Contents,
+                        WindowsEncodingMethod Encoding = WEM_UTF8);
+
    /// This function waits for the process specified by \p PI to finish.
    /// \returns A \see ProcessInfo struct with Pid set to:
    /// \li The process id of the child process if the child process has changed
diff --git a/lib/Support/Unix/Program.inc b/lib/Support/Unix/Program.inc

index 63d7ec22ebb0f4396fa3e28f95f9a3967077c262..905c78f79bc323916eb4ac67ee7d21e278dbfc0c 100644 (file)
--- a/lib/Support/Unix/Program.inc
+++ b/lib/Support/Unix/Program.inc
@@ -19,6 +19,7 @@
  #include "Unix.h"
  #include "llvm/Support/Compiler.h"
  #include "llvm/Support/FileSystem.h"
+#include "llvm/Support/raw_ostream.h"
  #include <llvm/Config/config.h>
  #if HAVE_SYS_STAT_H
  #include <sys/stat.h>
@@ -440,6 +441,23 @@ ProcessInfo sys::Wait(const ProcessInfo &PI, unsigned SecondsToWait,
      return std::error_code();
  }
  
+std::error_code
+llvm::sys::writeFileWithEncoding(StringRef FileName, StringRef Contents,
+                                 WindowsEncodingMethod Encoding /*unused*/) {
+  std::error_code EC;
+  llvm::raw_fd_ostream OS(FileName, EC, llvm::sys::fs::OpenFlags::F_Text);
+
+  if (EC)
+    return EC;
+
+  OS << Contents;
+
+  if (OS.has_error())
+    return std::make_error_code(std::errc::io_error);
+
+  return EC;
+}
+
  bool llvm::sys::argumentsFitWithinSystemLimits(ArrayRef<const char*> Args) {
    static long ArgMax = sysconf(_SC_ARG_MAX);
  
diff --git a/lib/Support/Windows/Path.inc b/lib/Support/Windows/Path.inc

index b09c198969938376bde89a2011adce86e9d325bd..bff82141b343aecfa0b49b4a2dc51651522cc837 100644 (file)
--- a/lib/Support/Windows/Path.inc
+++ b/lib/Support/Windows/Path.inc
@@ -919,11 +919,13 @@ std::error_code UTF8ToUTF16(llvm::StringRef utf8,
    return std::error_code();
  }
  
-std::error_code UTF16ToUTF8(const wchar_t *utf16, size_t utf16_len,
-                            llvm::SmallVectorImpl<char> &utf8) {
+static
+std::error_code UTF16ToCodePage(unsigned codepage, const wchar_t *utf16,
+                                size_t utf16_len,
+                                llvm::SmallVectorImpl<char> &utf8) {
    if (utf16_len) {
      // Get length.
-    int len = ::WideCharToMultiByte(CP_UTF8, 0, utf16, utf16_len, utf8.begin(),
+    int len = ::WideCharToMultiByte(codepage, 0, utf16, utf16_len, utf8.begin(),
                                      0, NULL, NULL);
  
      if (len == 0)
@@ -933,7 +935,7 @@ std::error_code UTF16ToUTF8(const wchar_t *utf16, size_t utf16_len,
      utf8.set_size(len);
  
      // Now do the actual conversion.
-    len = ::WideCharToMultiByte(CP_UTF8, 0, utf16, utf16_len, utf8.data(),
+    len = ::WideCharToMultiByte(codepage, 0, utf16, utf16_len, utf8.data(),
                                  utf8.size(), NULL, NULL);
  
      if (len == 0)
@@ -946,6 +948,16 @@ std::error_code UTF16ToUTF8(const wchar_t *utf16, size_t utf16_len,
  
    return std::error_code();
  }
+
+std::error_code UTF16ToUTF8(const wchar_t *utf16, size_t utf16_len,
+                            llvm::SmallVectorImpl<char> &utf8) {
+  return UTF16ToCodePage(CP_UTF8, utf16, utf16_len, utf8);
+}
+
+std::error_code UTF16ToCurCP(const wchar_t *utf16, size_t utf16_len,
+                             llvm::SmallVectorImpl<char> &utf8) {
+  return UTF16ToCodePage(CP_ACP, utf16, utf16_len, utf8);
+}
  } // end namespace windows
  } // end namespace sys
  } // end namespace llvm
diff --git a/lib/Support/Windows/Program.inc b/lib/Support/Windows/Program.inc

index 74a0066de8d0c11a4357dff3c8d28f85cdf21f7c..affaf03eb61d0445c71e4127ba0b38c246e0e0fc 100644 (file)
--- a/lib/Support/Windows/Program.inc
+++ b/lib/Support/Windows/Program.inc
@@ -12,7 +12,9 @@
  //===----------------------------------------------------------------------===//
  
  #include "WindowsSupport.h"
+#include "llvm/Support/ConvertUTF.h"
  #include "llvm/Support/FileSystem.h"
+#include "llvm/Support/raw_ostream.h"
  #include <cstdio>
  #include <fcntl.h>
  #include <io.h>
@@ -440,6 +442,50 @@ ProcessInfo sys::Wait(const ProcessInfo &PI, unsigned SecondsToWait,
    return std::error_code();
  }
  
+std::error_code
+llvm::sys::writeFileWithEncoding(StringRef FileName, StringRef Contents,
+                                 WindowsEncodingMethod Encoding) {
+  std::error_code EC;
+  llvm::raw_fd_ostream OS(FileName, EC, llvm::sys::fs::OpenFlags::F_Text);
+  if (EC)
+    return EC;
+
+  if (Encoding == WEM_UTF8) {
+    OS << Contents;
+  } else if (Encoding == WEM_CurrentCodePage) {
+    SmallVector<wchar_t, 1> ArgsUTF16;
+    SmallVector<char, 1> ArgsCurCP;
+
+    if ((EC = windows::UTF8ToUTF16(Contents, ArgsUTF16)))
+      return EC;
+
+    if ((EC = windows::UTF16ToCurCP(
+             ArgsUTF16.data(), ArgsUTF16.size(), ArgsCurCP)))
+      return EC;
+
+    OS.write(ArgsCurCP.data(), ArgsCurCP.size());
+  } else if (Encoding == WEM_UTF16) {
+    SmallVector<wchar_t, 1> ArgsUTF16;
+
+    if ((EC = windows::UTF8ToUTF16(Contents, ArgsUTF16)))
+      return EC;
+
+    // Endianness guessing
+    char BOM[2];
+    uint16_t src = UNI_UTF16_BYTE_ORDER_MARK_NATIVE;
+    memcpy(BOM, &src, 2);
+    OS.write(BOM, 2);
+    OS.write((char *)ArgsUTF16.data(), ArgsUTF16.size() << 1);
+  } else {
+    llvm_unreachable("Unknown encoding");
+  }
+
+  if (OS.has_error())
+    return std::make_error_code(std::errc::io_error);
+
+  return EC;
+}
+
  bool llvm::sys::argumentsFitWithinSystemLimits(ArrayRef<const char*> Args) {
    // The documented max length of the command line passed to CreateProcess.
    static const size_t MaxCommandStringLength = 32768;
diff --git a/lib/Support/Windows/WindowsSupport.h b/lib/Support/Windows/WindowsSupport.h

index f68835b1a71a7463a89eaff1b580e75bb904d4fd..6d9c5fb24ff9c534738c30d2e983c08e5ead005a 100644 (file)
--- a/lib/Support/Windows/WindowsSupport.h
+++ b/lib/Support/Windows/WindowsSupport.h
@@ -166,6 +166,9 @@ namespace windows {
  std::error_code UTF8ToUTF16(StringRef utf8, SmallVectorImpl<wchar_t> &utf16);
  std::error_code UTF16ToUTF8(const wchar_t *utf16, size_t utf16_len,
                              SmallVectorImpl<char> &utf8);
+/// Convert from UTF16 to the current code page used in the system
+std::error_code UTF16ToCurCP(const wchar_t *utf16, size_t utf16_len,
+                             SmallVectorImpl<char> &utf8);
  } // end namespace windows
  } // end namespace sys
  } // end namespace llvm.
diff --git a/unittests/Support/ProgramTest.cpp b/unittests/Support/ProgramTest.cpp

index 4e7316fb3ace5f3f37ac9642e1cf51f6ba325075..c0e6e80e358b6f37d75c8aead36cdf3683a8aa15 100644 (file)
--- a/unittests/Support/ProgramTest.cpp
+++ b/unittests/Support/ProgramTest.cpp
@@ -34,6 +34,16 @@ void sleep_for(unsigned int seconds) {
  #error sleep_for is not implemented on your platform.
  #endif
  
+#define ASSERT_NO_ERROR(x)                                                     \
+  if (std::error_code ASSERT_NO_ERROR_ec = x) {                                \
+    SmallString<128> MessageStorage;                                           \
+    raw_svector_ostream Message(MessageStorage);                               \
+    Message << #x ": did not return errc::success.\n"                          \
+            << "error number: " << ASSERT_NO_ERROR_ec.value() << "\n"          \
+            << "error message: " << ASSERT_NO_ERROR_ec.message() << "\n";      \
+    GTEST_FATAL_FAILURE_(MessageStorage.c_str());                              \
+  } else {                                                                     \
+  }
  // From TestMain.cpp.
  extern const char *TestMainArgv0;
  
@@ -220,4 +230,44 @@ TEST(ProgramTest, TestExecuteNegative) {
  
  }
  
+#ifdef LLVM_ON_WIN32
+const char utf16le_text[] =
+    "\x6c\x00\x69\x00\x6e\x00\x67\x00\xfc\x00\x69\x00\xe7\x00\x61\x00";
+const char utf16be_text[] =
+    "\x00\x6c\x00\x69\x00\x6e\x00\x67\x00\xfc\x00\x69\x00\xe7\x00\x61";
+#endif
+const char utf8_text[] = "\x6c\x69\x6e\x67\xc3\xbc\x69\xc3\xa7\x61";
+
+TEST(ProgramTest, TestWriteWithSystemEncoding) {
+  SmallString<128> TestDirectory;
+  ASSERT_NO_ERROR(fs::createUniqueDirectory("program-test", TestDirectory));
+  errs() << "Test Directory: " << TestDirectory << '\n';
+  errs().flush();
+  SmallString<128> file_pathname(TestDirectory);
+  path::append(file_pathname, "international-file.txt");
+  // Only on Windows we should encode in UTF16. For other systems, use UTF8
+  ASSERT_NO_ERROR(sys::writeFileWithEncoding(file_pathname.c_str(), utf8_text,
+                                             sys::WEM_UTF16));
+  int fd = 0;
+  ASSERT_NO_ERROR(fs::openFileForRead(file_pathname.c_str(), fd));
+#if defined(LLVM_ON_WIN32)
+  char buf[18];
+  ASSERT_EQ(::read(fd, buf, 18), 18);
+  if (strncmp(buf, "\xfe\xff", 2) == 0) { // UTF16-BE
+    ASSERT_EQ(strncmp(&buf[2], utf16be_text, 16), 0);
+  } else if (strncmp(buf, "\xff\xfe", 2) == 0) { // UTF16-LE
+    ASSERT_EQ(strncmp(&buf[2], utf16le_text, 16), 0);
+  } else {
+    FAIL() << "Invalid BOM in UTF-16 file";
+  }
+#else
+  char buf[10];
+  ASSERT_EQ(::read(fd, buf, 10), 10);
+  ASSERT_EQ(strncmp(buf, utf8_text, 10), 0);
+#endif
+  ::close(fd);
+  ASSERT_NO_ERROR(fs::remove(file_pathname.str()));
+  ASSERT_NO_ERROR(fs::remove(TestDirectory.str()));
+}
+
  } // end anonymous namespace
author	Rafael Espindola <rafael.espindola@gmail.com>
	Wed, 3 Sep 2014 20:02:00 +0000 (20:02 +0000)
committer	Rafael Espindola <rafael.espindola@gmail.com>
	Wed, 3 Sep 2014 20:02:00 +0000 (20:02 +0000)
include/llvm/Support/Program.h		patch \| blob \| history
lib/Support/Unix/Program.inc		patch \| blob \| history
lib/Support/Windows/Path.inc		patch \| blob \| history
lib/Support/Windows/Program.inc		patch \| blob \| history
lib/Support/Windows/WindowsSupport.h		patch \| blob \| history
unittests/Support/ProgramTest.cpp		patch \| blob \| history