Add a UTF8 to UTF16 conversion wrapper for use in the pdb dumper
author     Reid Kleckner <reid@kleckner.net>
           Mon, 26 Jan 2015 19:51:00 +0000 (19:51 +0000)
committer  Reid Kleckner <reid@kleckner.net>
           Mon, 26 Jan 2015 19:51:00 +0000 (19:51 +0000)
This can also be used instead of the WindowsSupport.h ConvertUTF8ToUTF16
helpers, but that will require massaging some character types. The
Windows support routines want wchar_t output, but wchar_t is often 32
bits on non-Windows OSs.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@227122 91177308-0d34-0410-b5e6-96231b3b80d8
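
As context for the type massaging mentioned above, here is a minimal sketch (not part of this commit) of how the UTF16 output could feed APIs that want wchar_t: where wchar_t is 16 bits, as on Windows, the code units can be copied straight into a std::wstring, while a platform with a 32-bit wchar_t would need a real conversion instead. The utf8ToWide helper is hypothetical.

    #include "llvm/ADT/SmallVector.h"
    #include "llvm/ADT/StringRef.h"
    #include "llvm/Support/ConvertUTF.h"
    #include <string>

    using namespace llvm;

    // Hypothetical helper; compiles only where wchar_t is 16 bits (e.g. Windows).
    static bool utf8ToWide(StringRef Src, std::wstring &Wide) {
      static_assert(sizeof(wchar_t) == sizeof(UTF16),
                    "wchar_t must be 16 bits to reuse the UTF-16 code units");
      SmallVector<UTF16, 128> Buf;
      if (!convertUTF8ToUTF16String(Src, Buf))
        return false;
      Wide.assign(Buf.begin(), Buf.end());
      return true;
    }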

include/llvm/Support/ConvertUTF.h
lib/Support/ConvertUTFWrapper.cpp
unittests/Support/ConvertUTFTest.cpp

index a184d0df21322cd7a9519da7ee0b48b6169107b8..38952ec99e61cf05e2038485fb820e29fb03e3b4 100644 (file)
@@ -251,6 +251,14 @@ bool hasUTF16ByteOrderMark(ArrayRef<char> SrcBytes);
  */
 bool convertUTF16ToUTF8String(ArrayRef<char> SrcBytes, std::string &Out);
 
+/**
+ * Converts a UTF-8 string into a UTF-16 string with native endianness.
+ *
+ * \returns true on success
+ */
+bool convertUTF8ToUTF16String(StringRef SrcUTF8,
+                              SmallVectorImpl<UTF16> &DstUTF16);
+
 } /* end namespace llvm */
 
 #endif
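
A brief usage sketch of the new API (illustrative, not part of the commit): the destination vector must be empty on entry, and on a conversion failure the wrapper clears it and returns false. The implementation below pre-sizes the output to SrcUTF8.size() code units, which is always enough because a BMP code point needs one to three UTF-8 units but only one UTF-16 unit, and a supplementary code point needs four UTF-8 units but two UTF-16 units.

    #include "llvm/ADT/SmallVector.h"
    #include "llvm/Support/ConvertUTF.h"

    using namespace llvm;

    void example() {
      SmallVector<UTF16, 16> Units; // must be empty when passed in
      if (convertUTF8ToUTF16String("caf\xc3\xa9", Units)) {
        // Units now holds {0x0063, 0x0061, 0x0066, 0x00E9}.
      }
    }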
index e45335ddcb6c84467c184d95c10f0bea9c3ec04b..8f77bff4668c992923c8d57da80fbfb6c18fc3d5 100644 (file)
@@ -127,5 +127,36 @@ bool convertUTF16ToUTF8String(ArrayRef<char> SrcBytes, std::string &Out) {
   return true;
 }
 
+bool convertUTF8ToUTF16String(StringRef SrcUTF8,
+                              SmallVectorImpl<UTF16> &DstUTF16) {
+  assert(DstUTF16.empty());
+
+  // Avoid OOB by returning early on empty input.
+  if (SrcUTF8.empty())
+    return true;
+
+  const UTF8 *Src = reinterpret_cast<const UTF8 *>(SrcUTF8.begin());
+  const UTF8 *SrcEnd = reinterpret_cast<const UTF8 *>(SrcUTF8.end());
+
+  // Allocate the same number of UTF-16 code units as UTF-8 code units. Encoding
+  // as UTF-16 should always require the same number of code units or fewer than
+  // the UTF-8 encoding.
+  DstUTF16.resize(SrcUTF8.size());
+  UTF16 *Dst = &DstUTF16[0];
+  UTF16 *DstEnd = Dst + DstUTF16.size();
+
+  ConversionResult CR =
+      ConvertUTF8toUTF16(&Src, SrcEnd, &Dst, DstEnd, strictConversion);
+  assert(CR != targetExhausted);
+
+  if (CR != conversionOK) {
+    DstUTF16.clear();
+    return false;
+  }
+
+  DstUTF16.resize(Dst - &DstUTF16[0]);
+  return true;
+}
+
 } // end namespace llvm
 
index 49748db4ae971171950e8cb4c04723234d92f59e..a6dbe4c475fcded14ea182941cb38ae3e24b39d3 100644 (file)
@@ -8,6 +8,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Support/ConvertUTF.h"
+#include "llvm/Support/Format.h"
 #include "gtest/gtest.h"
 #include <string>
 #include <utility>
 #include "gtest/gtest.h"
 #include <string>
 #include <utility>
@@ -37,6 +38,19 @@ TEST(ConvertUTFTest, ConvertUTF16BigEndianToUTF8String) {
   EXPECT_EQ(Expected, Result);
 }
 
+TEST(ConvertUTFTest, ConvertUTF8ToUTF16String) {
+  // Src is the look of disapproval.
+  static const char Src[] = "\xe0\xb2\xa0_\xe0\xb2\xa0";
+  StringRef Ref(Src, sizeof(Src) - 1);
+  SmallVector<UTF16, 5> Result;
+  bool Success = convertUTF8ToUTF16String(Ref, Result);
+  EXPECT_TRUE(Success);
+  static const UTF16 Expected[] = {0x0CA0, 0x005f, 0x0CA0, 0};
+  ASSERT_EQ(3, Result.size());
+  for (int I = 0, E = 3; I != E; ++I)
+    EXPECT_EQ(Expected[I], Result[I]);
+}
+
 TEST(ConvertUTFTest, OddLengthInput) {
   std::string Result;
   bool Success = convertUTF16ToUTF8String(makeArrayRef("xxxxx", 5), Result);