From 67f7bb722a46b2e7226a73aae0ce803df8444efd Mon Sep 17 00:00:00 2001
From: Zbigniew Szymanski
Date: Fri, 30 Dec 2016 05:19:20 -0800
Subject: [PATCH] Move internal `decodeUtf8` method from json.cpp to public util Unicode.h

Summary:
Moved decodeUtf8 -> folly::utf8ToCodePoint.
Implementation was not changed to make sure no bugs are introduced.

Reviewed By: yfeldblum

Differential Revision: D4372739

fbshipit-source-id: a015a9c47ece825e09e7c243fae454f21f99db80
---
 folly/Unicode.cpp | 86 +++++++++++++++++++++++++++++++++++++++++++++
 folly/Unicode.h   |  8 +++++
 folly/json.cpp    | 88 ++---------------------------------------------
 3 files changed, 96 insertions(+), 86 deletions(-)

diff --git a/folly/Unicode.cpp b/folly/Unicode.cpp
index c36bd077..b76318ed 100644
--- a/folly/Unicode.cpp
+++ b/folly/Unicode.cpp
@@ -15,6 +15,7 @@
  */
 
 #include <folly/Unicode.h>
+#include <folly/Conv.h>
 
 namespace folly {
 
@@ -48,6 +49,91 @@ std::string codePointToUtf8(char32_t cp) {
   return result;
 }
 
+
+char32_t utf8ToCodePoint(
+    const unsigned char*& p,
+    const unsigned char* const e,
+    bool skipOnError) {
+  /* The following encodings are valid, except for the 5 and 6 byte
+   * combinations:
+   * 0xxxxxxx
+   * 110xxxxx 10xxxxxx
+   * 1110xxxx 10xxxxxx 10xxxxxx
+   * 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+   * 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+   * 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+   */
+
+  auto skip = [&] { ++p; return U'\ufffd'; };
+
+  if (p >= e) {
+    if (skipOnError) return skip();
+    throw std::runtime_error("folly::utf8ToCodePoint empty/invalid string");
+  }
+
+  unsigned char fst = *p;
+  if (!(fst & 0x80)) {
+    // trivial case
+    return *p++;
+  }
+
+  static const uint32_t bitMask[] = {
+    (1 << 7) - 1,
+    (1 << 11) - 1,
+    (1 << 16) - 1,
+    (1 << 21) - 1
+  };
+
+  // upper control bits are masked out later
+  uint32_t d = fst;
+
+  if ((fst & 0xC0) != 0xC0) {
+    if (skipOnError) return skip();
+    throw std::runtime_error(to<std::string>("folly::utf8ToCodePoint i=0 d=", d));
+  }
+
+  fst <<= 1;
+
+  for (unsigned int i = 1; i != 3 && p + i < e; ++i) {
+    unsigned char tmp = p[i];
+
+    if ((tmp & 0xC0) != 0x80) {
+      if (skipOnError) return skip();
+      throw std::runtime_error(
+        to<std::string>("folly::utf8ToCodePoint i=", i, " tmp=", (uint32_t)tmp));
+    }
+
+    d = (d << 6) | (tmp & 0x3F);
+    fst <<= 1;
+
+    if (!(fst & 0x80)) {
+      d &= bitMask[i];
+
+      // overlong, could have been encoded with i bytes
+      if ((d & ~bitMask[i - 1]) == 0) {
+        if (skipOnError) return skip();
+        throw std::runtime_error(
+          to<std::string>("folly::utf8ToCodePoint i=", i, " d=", d));
+      }
+
+      // check for surrogates only needed for 3 bytes
+      if (i == 2) {
+        if ((d >= 0xD800 && d <= 0xDFFF) || d > 0x10FFFF) {
+          if (skipOnError) return skip();
+          throw std::runtime_error(
+            to<std::string>("folly::utf8ToCodePoint i=", i, " d=", d));
+        }
+      }
+
+      p += i + 1;
+      return d;
+    }
+  }
+
+  if (skipOnError) return skip();
+  throw std::runtime_error("folly::utf8ToCodePoint encoding length maxed out");
+}
+
 //////////////////////////////////////////////////////////////////////
 
 }
diff --git a/folly/Unicode.h b/folly/Unicode.h
index 542dc10b..f1dc9ed0 100644
--- a/folly/Unicode.h
+++ b/folly/Unicode.h
@@ -31,6 +31,14 @@ namespace folly {
  */
 std::string codePointToUtf8(char32_t cp);
 
+/* Decode one code point from the UTF-8 bytes in [p, e), advancing p past it.
+ * On invalid input: returns U+FFFD and does ++p if skipOnError, else throws
+ * std::runtime_error. NOTE(review): 4-byte forms look rejected; confirm. */
+char32_t utf8ToCodePoint(
+    const unsigned char*& p,
+    const unsigned char* const e,
+    bool skipOnError);
+
 //////////////////////////////////////////////////////////////////////
 
 }
diff --git a/folly/json.cpp b/folly/json.cpp
index 882d3bf0..b56e6f4b 100644
--- a/folly/json.cpp
+++ b/folly/json.cpp
@@ -33,90 +33,6 @@ namespace folly {
 namespace json {
 namespace {
 
-char32_t decodeUtf8(
-    const unsigned char*& p,
-    const unsigned char* const e,
-    bool skipOnError) {
-  /* The following encodings are valid, except for the 5 and 6 byte
-   * combinations:
-   * 0xxxxxxx
-   * 110xxxxx 10xxxxxx
-   * 1110xxxx 10xxxxxx 10xxxxxx
-   * 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
-   * 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
-   * 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
-   */
-
-  auto skip = [&] { ++p; return U'\ufffd'; };
-
-  if (p >= e) {
-    if (skipOnError) return skip();
-    throw std::runtime_error("folly::decodeUtf8 empty/invalid string");
-  }
-
-  unsigned char fst = *p;
-  if (!(fst & 0x80)) {
-    // trivial case
-    return *p++;
-  }
-
-  static const uint32_t bitMask[] = {
-    (1 << 7) - 1,
-    (1 << 11) - 1,
-    (1 << 16) - 1,
-    (1 << 21) - 1
-  };
-
-  // upper control bits are masked out later
-  uint32_t d = fst;
-
-  if ((fst & 0xC0) != 0xC0) {
-    if (skipOnError) return skip();
-    throw std::runtime_error(to<std::string>("folly::decodeUtf8 i=0 d=", d));
-  }
-
-  fst <<= 1;
-
-  for (unsigned int i = 1; i != 3 && p + i < e; ++i) {
-    unsigned char tmp = p[i];
-
-    if ((tmp & 0xC0) != 0x80) {
-      if (skipOnError) return skip();
-      throw std::runtime_error(
-        to<std::string>("folly::decodeUtf8 i=", i, " tmp=", (uint32_t)tmp));
-    }
-
-    d = (d << 6) | (tmp & 0x3F);
-    fst <<= 1;
-
-    if (!(fst & 0x80)) {
-      d &= bitMask[i];
-
-      // overlong, could have been encoded with i bytes
-      if ((d & ~bitMask[i - 1]) == 0) {
-        if (skipOnError) return skip();
-        throw std::runtime_error(
-          to<std::string>("folly::decodeUtf8 i=", i, " d=", d));
-      }
-
-      // check for surrogates only needed for 3 bytes
-      if (i == 2) {
-        if ((d >= 0xD800 && d <= 0xDFFF) || d > 0x10FFFF) {
-          if (skipOnError) return skip();
-          throw std::runtime_error(
-            to<std::string>("folly::decodeUtf8 i=", i, " d=", d));
-        }
-      }
-
-      p += i + 1;
-      return d;
-    }
-  }
-
-  if (skipOnError) return skip();
-  throw std::runtime_error("folly::decodeUtf8 encoding length maxed out");
-}
-
 struct Printer {
   explicit Printer(
       std::string& out,
@@ -716,7 +632,7 @@ void escapeString(
       if (q == p) {
         // calling utf8_decode has the side effect of
         // checking that utf8 encodings are valid
-        char32_t v = decodeUtf8(q, e, opts.skip_invalid_utf8);
+        char32_t v = utf8ToCodePoint(q, e, opts.skip_invalid_utf8);
         if (opts.skip_invalid_utf8 && v == U'\ufffd') {
           out.append(u8"\ufffd");
           p = q;
@@ -727,7 +643,7 @@ void escapeString(
     if (opts.encode_non_ascii && (*p & 0x80)) {
       // note that this if condition captures utf8 chars
       // with value > 127, so size > 1 byte
-      char32_t v = decodeUtf8(p, e, opts.skip_invalid_utf8);
+      char32_t v = utf8ToCodePoint(p, e, opts.skip_invalid_utf8);
       out.append("\\u");
       out.push_back(hexDigit(uint8_t(v >> 12)));
       out.push_back(hexDigit((v >> 8) & 0x0f));
-- 
2.34.1