*/
#include <folly/Unicode.h>
+#include <folly/Conv.h>
namespace folly {
return result;
}
+
+char32_t utf8ToCodePoint(
+ const unsigned char*& p, // in/out: next byte to decode; advanced past what was consumed
+ const unsigned char* const e, // end of input: bytes at or past e are never read
+ bool skipOnError) { // true: return U+FFFD on bad input; false: throw std::runtime_error
+ /* Decodes the single UTF-8 sequence that starts at p and returns its code
+ * point, advancing p past the bytes that were consumed.
+ *
+ * On malformed input the behaviour depends on skipOnError:
+ * - true: p is advanced by exactly one byte and U+FFFD is returned;
+ * - false: std::runtime_error is thrown and p is left unchanged.
+ *
+ * Rejected inputs: an empty range (p >= e), a lead byte of the form
+ * 10xxxxxx, a continuation byte that is not of the form 10xxxxxx, an
+ * overlong encoding, a surrogate (U+D800..U+DFFF), and any sequence that
+ * is not complete after two continuation bytes or before e is reached.
+ *
+ * UTF-8 byte layouts, for reference (the 5 and 6 byte combinations are
+ * not valid):
+ * 0xxxxxxx
+ * 110xxxxx 10xxxxxx
+ * 1110xxxx 10xxxxxx 10xxxxxx
+ * 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+ * 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+ * 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+ *
+ * NOTE(review): the 4 byte layout is a valid encoding, but the loop bound
+ * `i != 3` below stops after two continuation bytes, so a 4 byte sequence
+ * ends up on the "encoding length maxed out" path and bitMask[3] is never
+ * read. Confirm whether rejecting 4 byte sequences is intended before
+ * changing the bound; code that consumes the result may assume a value
+ * that fits in 16 bits.
+ */
+
+ auto skip = [&] { ++p; return U'\ufffd'; }; // skipOnError path: consume one byte, yield U+FFFD
+
+ if (p >= e) { // nothing left to decode
+ if (skipOnError) return skip(); // note: p is still incremented here, which moves it past e
+ throw std::runtime_error("folly::utf8ToCodePoint empty/invalid string");
+ }
+
+ unsigned char fst = *p; // lead byte; shifted left below to walk its length-marker bits
+ if (!(fst & 0x80)) {
+ // single byte (0xxxxxxx): the byte value is the code point
+ return *p++;
+ }
+
+ static const uint32_t bitMask[] = { // bitMask[k] keeps the payload bits of a (k + 1)-byte sequence
+ (1 << 7) - 1,
+ (1 << 11) - 1,
+ (1 << 16) - 1,
+ (1 << 21) - 1
+ };
+
+ // d starts as the whole lead byte; its length-marker bits are removed by the bitMask[i] mask below
+ uint32_t d = fst;
+
+ if ((fst & 0xC0) != 0xC0) { // top bits are 10: a continuation byte cannot start a sequence
+ if (skipOnError) return skip();
+ throw std::runtime_error(to<std::string>("folly::utf8ToCodePoint i=0 d=", d));
+ }
+
+ fst <<= 1; // each shift moves the next length-marker bit of the lead byte into bit 7
+
+ for (unsigned int i = 1; i != 3 && p + i < e; ++i) { // i indexes the continuation byte; at most two are read
+ unsigned char tmp = p[i];
+
+ if ((tmp & 0xC0) != 0x80) { // continuation bytes must be of the form 10xxxxxx
+ if (skipOnError) return skip();
+ throw std::runtime_error(
+ to<std::string>("folly::utf8ToCodePoint i=", i, " tmp=", (uint32_t)tmp));
+ }
+
+ d = (d << 6) | (tmp & 0x3F); // append this byte's 6 payload bits
+ fst <<= 1;
+
+ if (!(fst & 0x80)) { // marker bits exhausted: the sequence is i + 1 bytes long
+ d &= bitMask[i]; // strip the lead byte's marker bits that were shifted up into d
+
+ // overlong: no bits set beyond the payload of an i-byte sequence, so i bytes would have sufficed
+ if ((d & ~bitMask[i - 1]) == 0) {
+ if (skipOnError) return skip();
+ throw std::runtime_error(
+ to<std::string>("folly::utf8ToCodePoint i=", i, " d=", d));
+ }
+
+ // surrogate check, only needed for the 3 byte form; NOTE(review): d was masked to 16 bits above, so d > 0x10FFFF is always false here
+ if (i == 2) {
+ if ((d >= 0xD800 && d <= 0xDFFF) || d > 0x10FFFF) {
+ if (skipOnError) return skip();
+ throw std::runtime_error(
+ to<std::string>("folly::utf8ToCodePoint i=", i, " d=", d));
+ }
+ }
+
+ p += i + 1; // consume the lead byte plus i continuation bytes
+ return d;
+ }
+ }
+
+ if (skipOnError) return skip(); // reached when input ends mid-sequence or more than two continuation bytes are needed
+ throw std::runtime_error("folly::utf8ToCodePoint encoding length maxed out");
+}
+
//////////////////////////////////////////////////////////////////////
}
namespace json {
namespace {
-char32_t decodeUtf8(
- const unsigned char*& p,
- const unsigned char* const e,
- bool skipOnError) {
- /* The following encodings are valid, except for the 5 and 6 byte
- * combinations:
- * 0xxxxxxx
- * 110xxxxx 10xxxxxx
- * 1110xxxx 10xxxxxx 10xxxxxx
- * 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
- * 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
- * 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
- */
-
- auto skip = [&] { ++p; return U'\ufffd'; };
-
- if (p >= e) {
- if (skipOnError) return skip();
- throw std::runtime_error("folly::decodeUtf8 empty/invalid string");
- }
-
- unsigned char fst = *p;
- if (!(fst & 0x80)) {
- // trivial case
- return *p++;
- }
-
- static const uint32_t bitMask[] = {
- (1 << 7) - 1,
- (1 << 11) - 1,
- (1 << 16) - 1,
- (1 << 21) - 1
- };
-
- // upper control bits are masked out later
- uint32_t d = fst;
-
- if ((fst & 0xC0) != 0xC0) {
- if (skipOnError) return skip();
- throw std::runtime_error(to<std::string>("folly::decodeUtf8 i=0 d=", d));
- }
-
- fst <<= 1;
-
- for (unsigned int i = 1; i != 3 && p + i < e; ++i) {
- unsigned char tmp = p[i];
-
- if ((tmp & 0xC0) != 0x80) {
- if (skipOnError) return skip();
- throw std::runtime_error(
- to<std::string>("folly::decodeUtf8 i=", i, " tmp=", (uint32_t)tmp));
- }
-
- d = (d << 6) | (tmp & 0x3F);
- fst <<= 1;
-
- if (!(fst & 0x80)) {
- d &= bitMask[i];
-
- // overlong, could have been encoded with i bytes
- if ((d & ~bitMask[i - 1]) == 0) {
- if (skipOnError) return skip();
- throw std::runtime_error(
- to<std::string>("folly::decodeUtf8 i=", i, " d=", d));
- }
-
- // check for surrogates only needed for 3 bytes
- if (i == 2) {
- if ((d >= 0xD800 && d <= 0xDFFF) || d > 0x10FFFF) {
- if (skipOnError) return skip();
- throw std::runtime_error(
- to<std::string>("folly::decodeUtf8 i=", i, " d=", d));
- }
- }
-
- p += i + 1;
- return d;
- }
- }
-
- if (skipOnError) return skip();
- throw std::runtime_error("folly::decodeUtf8 encoding length maxed out");
-}
-
struct Printer {
explicit Printer(
std::string& out,
if (q == p) {
// calling utf8_decode has the side effect of
// checking that utf8 encodings are valid
- char32_t v = decodeUtf8(q, e, opts.skip_invalid_utf8);
+ char32_t v = utf8ToCodePoint(q, e, opts.skip_invalid_utf8);
if (opts.skip_invalid_utf8 && v == U'\ufffd') {
out.append(u8"\ufffd");
p = q;
if (opts.encode_non_ascii && (*p & 0x80)) {
// note that this if condition captures utf8 chars
// with value > 127, so size > 1 byte
- char32_t v = decodeUtf8(p, e, opts.skip_invalid_utf8);
+ char32_t v = utf8ToCodePoint(p, e, opts.skip_invalid_utf8);
out.append("\\u");
out.push_back(hexDigit(uint8_t(v >> 12)));
out.push_back(hexDigit((v >> 8) & 0x0f));