X-Git-Url: http://plrg.eecs.uci.edu/git/?p=folly.git;a=blobdiff_plain;f=folly%2FUnicode.cpp;h=b13f4ae93c46dc92698c53be36e4c13ac26818c2;hp=7ac48931898e2463558efe1dcaee0deaf3126487;hb=24d6b776bc7f5608d7e553f361eb79f2dcf6d7f7;hpb=27494a20393fa45072e7d526d358835f3abe312a diff --git a/folly/Unicode.cpp b/folly/Unicode.cpp index 7ac48931..b13f4ae9 100644 --- a/folly/Unicode.cpp +++ b/folly/Unicode.cpp @@ -1,5 +1,5 @@ /* - * Copyright 2012 Facebook, Inc. + * Copyright 2017 Facebook, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,14 +14,15 @@ * limitations under the License. */ -#include "folly/Unicode.h" +#include +#include namespace folly { ////////////////////////////////////////////////////////////////////// -fbstring codePointToUtf8(char32_t cp) { - fbstring result; +std::string codePointToUtf8(char32_t cp) { + std::string result; // Based on description from http://en.wikipedia.org/wiki/UTF-8. 
/**
 * Decodes one UTF-8 encoded code point starting at `p`.
 *
 * Accepted encodings (RFC 3629): 1 to 4 bytes. The historical 5- and 6-byte
 * forms are rejected.
 *   0xxxxxxx
 *   110xxxxx 10xxxxxx
 *   1110xxxx 10xxxxxx 10xxxxxx
 *   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 *
 * Rejected input: stray continuation bytes, truncated sequences, overlong
 * encodings, UTF-16 surrogates (U+D800..U+DFFF) and values above U+10FFFF.
 *
 * @param p            Read cursor. On success it is advanced past the whole
 *                     sequence. On error with `skipOnError` set it is
 *                     advanced by exactly one byte (this includes the
 *                     `p >= e` case).
 * @param e            One past the last readable byte.
 * @param skipOnError  If true, malformed input yields U+FFFD instead of
 *                     throwing.
 * @return The decoded code point, or U+FFFD on error when `skipOnError`.
 * @throws std::runtime_error on malformed input when `skipOnError` is false.
 */
char32_t utf8ToCodePoint(
    const unsigned char*& p,
    const unsigned char* const e,
    bool skipOnError) {
  // Error recovery: consume a single byte and report REPLACEMENT CHARACTER.
  const auto skip = [&] {
    ++p;
    return U'\ufffd';
  };

  if (p >= e) {
    if (skipOnError) {
      return skip();
    }
    throw std::runtime_error("folly::utf8ToCodePoint empty/invalid string");
  }

  unsigned char fst = *p;
  if (!(fst & 0x80)) {
    // Plain ASCII: a single byte is the code point.
    return *p++;
  }

  // bitMask[k] keeps the payload bits of a (k + 1)-byte sequence.
  static const uint32_t bitMask[] = {
      (1u << 7) - 1,
      (1u << 11) - 1,
      (1u << 16) - 1,
      (1u << 21) - 1,
  };

  // The length-marker bits of the lead byte are masked out once the
  // sequence length is known.
  uint32_t d = fst;

  if ((fst & 0xC0) != 0xC0) {
    // 10xxxxxx: a continuation byte cannot start a sequence.
    if (skipOnError) {
      return skip();
    }
    throw std::runtime_error(
        "folly::utf8ToCodePoint i=0 d=" + std::to_string(d));
  }

  // Shift the length marker left; after consuming continuation byte i the
  // top bit of `fst` is clear exactly when the sequence is (i + 1) bytes.
  fst <<= 1;

  // Up to three continuation bytes (i = 1..3) so that 4-byte sequences,
  // i.e. code points U+10000..U+10FFFF, are decoded.
  for (unsigned int i = 1; i != 4 && p + i < e; ++i) {
    const unsigned char tmp = p[i];

    if ((tmp & 0xC0) != 0x80) {
      if (skipOnError) {
        return skip();
      }
      throw std::runtime_error(
          "folly::utf8ToCodePoint i=" + std::to_string(i) +
          " tmp=" + std::to_string(static_cast<uint32_t>(tmp)));
    }

    d = (d << 6) | (tmp & 0x3F);
    fst <<= 1;

    if (!(fst & 0x80)) {
      d &= bitMask[i];

      // Overlong: the value would have fit in i bytes.
      if ((d & ~bitMask[i - 1]) == 0) {
        if (skipOnError) {
          return skip();
        }
        throw std::runtime_error(
            "folly::utf8ToCodePoint i=" + std::to_string(i) +
            " d=" + std::to_string(d));
      }

      // Surrogates can only appear in 3-byte sequences; values beyond
      // U+10FFFF can only appear in 4-byte sequences.
      if ((i == 2 && d >= 0xD800 && d <= 0xDFFF) || d > 0x10FFFF) {
        if (skipOnError) {
          return skip();
        }
        throw std::runtime_error(
            "folly::utf8ToCodePoint i=" + std::to_string(i) +
            " d=" + std::to_string(d));
      }

      p += i + 1;
      return d;
    }
  }

  // Either the input was truncated or the lead byte announces a 5/6-byte
  // sequence, which is not valid UTF-8.
  if (skipOnError) {
    return skip();
  }
  throw std::runtime_error("folly::utf8ToCodePoint encoding length maxed out");
}

//////////////////////////////////////////////////////////////////////