Move internal `decodeUtf8` method from json.cpp to public util Unicode.h

author Zbigniew Szymanski <zbsz@fb.com>

Fri, 30 Dec 2016 13:19:20 +0000 (05:19 -0800)

committer Facebook Github Bot <facebook-github-bot@users.noreply.github.com>

Fri, 30 Dec 2016 13:32:55 +0000 (05:32 -0800)
author Zbigniew Szymanski <zbsz@fb.com>
Fri, 30 Dec 2016 13:19:20 +0000 (05:19 -0800)
committer Facebook Github Bot <facebook-github-bot@users.noreply.github.com>
Fri, 30 Dec 2016 13:32:55 +0000 (05:32 -0800)
diff --git a/folly/Unicode.cpp b/folly/Unicode.cpp

index c36bd077082ff0007228ea5610d4ceb156653300..b76318ed5f1864412ceaaa6aebd949a8ac0b4d87 100644 (file)
--- a/folly/Unicode.cpp
+++ b/folly/Unicode.cpp
@@ -15,6 +15,7 @@
   */
  
  #include <folly/Unicode.h>
+#include <folly/Conv.h>
  
  namespace folly {
  
@@ -48,6 +49,91 @@ std::string codePointToUtf8(char32_t cp) {
    return result;
  }
  
+
+char32_t utf8ToCodePoint(
+    const unsigned char*& p,
+    const unsigned char* const e,
+    bool skipOnError) {
+  /* The following encodings are valid, except for the 5 and 6 byte
+   * combinations:
+   * 0xxxxxxx
+   * 110xxxxx 10xxxxxx
+   * 1110xxxx 10xxxxxx 10xxxxxx
+   * 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+   * 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+   * 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+   */
+
+  auto skip = [&] { ++p; return U'\ufffd'; };
+
+  if (p >= e) {
+    if (skipOnError) return skip();
+    throw std::runtime_error("folly::utf8ToCodePoint empty/invalid string");
+  }
+
+  unsigned char fst = *p;
+  if (!(fst & 0x80)) {
+    // trivial case
+    return *p++;
+  }
+
+  static const uint32_t bitMask[] = {
+    (1 << 7) - 1,
+    (1 << 11) - 1,
+    (1 << 16) - 1,
+    (1 << 21) - 1
+  };
+
+  // upper control bits are masked out later
+  uint32_t d = fst;
+
+  if ((fst & 0xC0) != 0xC0) {
+    if (skipOnError) return skip();
+    throw std::runtime_error(to<std::string>("folly::utf8ToCodePoint i=0 d=", d));
+  }
+
+  fst <<= 1;
+
+  for (unsigned int i = 1; i != 3 && p + i < e; ++i) {
+    unsigned char tmp = p[i];
+
+    if ((tmp & 0xC0) != 0x80) {
+      if (skipOnError) return skip();
+      throw std::runtime_error(
+        to<std::string>("folly::utf8ToCodePoint i=", i, " tmp=", (uint32_t)tmp));
+    }
+
+    d = (d << 6) | (tmp & 0x3F);
+    fst <<= 1;
+
+    if (!(fst & 0x80)) {
+      d &= bitMask[i];
+
+      // overlong, could have been encoded with i bytes
+      if ((d & ~bitMask[i - 1]) == 0) {
+        if (skipOnError) return skip();
+        throw std::runtime_error(
+          to<std::string>("folly::utf8ToCodePoint i=", i, " d=", d));
+      }
+
+      // check for surrogates only needed for 3 bytes
+      if (i == 2) {
+        if ((d >= 0xD800 && d <= 0xDFFF) || d > 0x10FFFF) {
+          if (skipOnError) return skip();
+          throw std::runtime_error(
+            to<std::string>("folly::utf8ToCodePoint i=", i, " d=", d));
+        }
+      }
+
+      p += i + 1;
+      return d;
+    }
+  }
+
+  if (skipOnError) return skip();
+  throw std::runtime_error("folly::utf8ToCodePoint encoding length maxed out");
+}
+
  //////////////////////////////////////////////////////////////////////
  
  }
diff --git a/folly/Unicode.h b/folly/Unicode.h

index 542dc10babad2706e5efc2fa10e253e755c9b138..f1dc9ed0bb04790d4f91f3d25c5bc978ec4ac7d2 100644 (file)
--- a/folly/Unicode.h
+++ b/folly/Unicode.h
@@ -31,6 +31,14 @@ namespace folly {
   */
  std::string codePointToUtf8(char32_t cp);
  
+/*
+ * Decode a single unicode code point from UTF-8 byte sequence.
+ *
+ * p            in/out cursor into the input; bytes are read starting at *p.
+ *              On success it is advanced past the bytes that were decoded.
+ * e            end of the input; bytes at or beyond e are never read.
+ * skipOnError  selects what happens when no valid code point can be decoded
+ *              at p: if false, std::runtime_error is thrown; if true, p is
+ *              advanced by one byte and U+FFFD is returned instead.
+ *
+ * With skipOnError set, p is advanced even when p >= e on entry, so callers
+ * are expected to check p < e before calling.
+ */
+char32_t utf8ToCodePoint(
+    const unsigned char*& p,
+    const unsigned char* const e,
+    bool skipOnError);
+
  //////////////////////////////////////////////////////////////////////
  
  }
diff --git a/folly/json.cpp b/folly/json.cpp

index 882d3bf0953e134675d1a3c295520045fb9dc18d..b56e6f4bdeeab444d7ccfbbe54ead1436007c02c 100644 (file)
--- a/folly/json.cpp
+++ b/folly/json.cpp
@@ -33,90 +33,6 @@ namespace folly {
  namespace json {
  namespace {
  
-char32_t decodeUtf8(
-    const unsigned char*& p,
-    const unsigned char* const e,
-    bool skipOnError) {
-  /* The following encodings are valid, except for the 5 and 6 byte
-   * combinations:
-   * 0xxxxxxx
-   * 110xxxxx 10xxxxxx
-   * 1110xxxx 10xxxxxx 10xxxxxx
-   * 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
-   * 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
-   * 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
-   */
-
-  auto skip = [&] { ++p; return U'\ufffd'; };
-
-  if (p >= e) {
-    if (skipOnError) return skip();
-    throw std::runtime_error("folly::decodeUtf8 empty/invalid string");
-  }
-
-  unsigned char fst = *p;
-  if (!(fst & 0x80)) {
-    // trivial case
-    return *p++;
-  }
-
-  static const uint32_t bitMask[] = {
-    (1 << 7) - 1,
-    (1 << 11) - 1,
-    (1 << 16) - 1,
-    (1 << 21) - 1
-  };
-
-  // upper control bits are masked out later
-  uint32_t d = fst;
-
-  if ((fst & 0xC0) != 0xC0) {
-    if (skipOnError) return skip();
-    throw std::runtime_error(to<std::string>("folly::decodeUtf8 i=0 d=", d));
-  }
-
-  fst <<= 1;
-
-  for (unsigned int i = 1; i != 3 && p + i < e; ++i) {
-    unsigned char tmp = p[i];
-
-    if ((tmp & 0xC0) != 0x80) {
-      if (skipOnError) return skip();
-      throw std::runtime_error(
-        to<std::string>("folly::decodeUtf8 i=", i, " tmp=", (uint32_t)tmp));
-    }
-
-    d = (d << 6) | (tmp & 0x3F);
-    fst <<= 1;
-
-    if (!(fst & 0x80)) {
-      d &= bitMask[i];
-
-      // overlong, could have been encoded with i bytes
-      if ((d & ~bitMask[i - 1]) == 0) {
-        if (skipOnError) return skip();
-        throw std::runtime_error(
-          to<std::string>("folly::decodeUtf8 i=", i, " d=", d));
-      }
-
-      // check for surrogates only needed for 3 bytes
-      if (i == 2) {
-        if ((d >= 0xD800 && d <= 0xDFFF) || d > 0x10FFFF) {
-          if (skipOnError) return skip();
-          throw std::runtime_error(
-            to<std::string>("folly::decodeUtf8 i=", i, " d=", d));
-        }
-      }
-
-      p += i + 1;
-      return d;
-    }
-  }
-
-  if (skipOnError) return skip();
-  throw std::runtime_error("folly::decodeUtf8 encoding length maxed out");
-}
-
  struct Printer {
    explicit Printer(
        std::string& out,
@@ -716,7 +632,7 @@ void escapeString(
        if (q == p) {
          // calling utf8_decode has the side effect of
          // checking that utf8 encodings are valid
-        char32_t v = decodeUtf8(q, e, opts.skip_invalid_utf8);
+        char32_t v = utf8ToCodePoint(q, e, opts.skip_invalid_utf8);
          if (opts.skip_invalid_utf8 && v == U'\ufffd') {
            out.append(u8"\ufffd");
            p = q;
@@ -727,7 +643,7 @@ void escapeString(
      if (opts.encode_non_ascii && (*p & 0x80)) {
        // note that this if condition captures utf8 chars
        // with value > 127, so size > 1 byte
-      char32_t v = decodeUtf8(p, e, opts.skip_invalid_utf8);
+      char32_t v = utf8ToCodePoint(p, e, opts.skip_invalid_utf8);
        out.append("\\u");
        out.push_back(hexDigit(uint8_t(v >> 12)));
        out.push_back(hexDigit((v >> 8) & 0x0f));
author	Zbigniew Szymanski <zbsz@fb.com>
	Fri, 30 Dec 2016 13:19:20 +0000 (05:19 -0800)
committer	Facebook Github Bot <facebook-github-bot@users.noreply.github.com>
	Fri, 30 Dec 2016 13:32:55 +0000 (05:32 -0800)
folly/Unicode.cpp		patch \| blob \| history
folly/Unicode.h		patch \| blob \| history
folly/json.cpp		patch \| blob \| history