X-Git-Url: http://plrg.eecs.uci.edu/git/?a=blobdiff_plain;f=folly%2FUnicode.cpp;h=b222fe6aa2c1ddc56b11ef526bda682968106dee;hb=b367f0fada0a53564f83e5072bbd45994a1c0795;hp=e71cbe1cfff0962dab8692fe5245d0860c4f5b9c;hpb=5c77fedbef46995a71ffa268c9fcaf49efddd01b;p=folly.git diff --git a/folly/Unicode.cpp b/folly/Unicode.cpp index e71cbe1c..b222fe6a 100644 --- a/folly/Unicode.cpp +++ b/folly/Unicode.cpp @@ -1,5 +1,5 @@ /* - * Copyright 2013 Facebook, Inc. + * Copyright 2017 Facebook, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,14 +14,15 @@ * limitations under the License. */ -#include "folly/Unicode.h" +#include +#include namespace folly { ////////////////////////////////////////////////////////////////////// -fbstring codePointToUtf8(char32_t cp) { - fbstring result; +std::string codePointToUtf8(char32_t cp) { + std::string result; // Based on description from http://en.wikipedia.org/wiki/UTF-8. 
/**
 * Decodes a single UTF-8 encoded code point from the byte range [p, e).
 *
 * @param p            In/out cursor. On success it is advanced past the whole
 *                     encoded sequence. On error with skipOnError set it is
 *                     advanced by exactly one byte.
 * @param e            One past the last readable byte.
 * @param skipOnError  When true, malformed input yields U+FFFD instead of
 *                     throwing.
 * @return The decoded code point, or U+FFFD for malformed input when
 *         skipOnError is true.
 * @throws std::runtime_error on malformed input when skipOnError is false.
 *
 * Accepted encodings (5 and 6 byte forms are rejected):
 *   0xxxxxxx
 *   110xxxxx 10xxxxxx
 *   1110xxxx 10xxxxxx 10xxxxxx
 *   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 *
 * Overlong encodings, UTF-16 surrogates (U+D800..U+DFFF) and values above
 * U+10FFFF are treated as errors.
 *
 * NOTE: when called with p >= e and skipOnError set, p is still incremented
 * (historical behavior, kept for compatibility); callers should guard with
 * p < e before calling.
 */
char32_t utf8ToCodePoint(
    const unsigned char*& p,
    const unsigned char* const e,
    bool skipOnError) {
  // Error recovery: consume one byte and report the replacement character.
  auto skip = [&] {
    ++p;
    return U'\ufffd';
  };

  if (p >= e) {
    if (skipOnError) {
      return skip();
    }
    throw std::runtime_error("folly::utf8ToCodePoint empty/invalid string");
  }

  unsigned char fst = *p;
  if (!(fst & 0x80)) {
    // Single-byte (ASCII) case.
    return *p++;
  }

  // bitMask[k] keeps the payload bits of a (k + 1)-byte sequence.
  static constexpr uint32_t bitMask[] = {
      (1 << 7) - 1,
      (1 << 11) - 1,
      (1 << 16) - 1,
      (1 << 21) - 1,
  };

  // The lead byte's control bits are masked out once the length is known.
  uint32_t d = fst;

  // A lead byte must start with 11; 10xxxxxx is a stray continuation byte.
  if ((fst & 0xC0) != 0xC0) {
    if (skipOnError) {
      return skip();
    }
    throw std::runtime_error(
        "folly::utf8ToCodePoint i=0 d=" + std::to_string(d));
  }

  // Shift the lead byte so that, after consuming continuation byte i, bit 7
  // tells whether yet another continuation byte is expected.
  fst <<= 1;

  // Up to three continuation bytes (4-byte sequences) are legal. The previous
  // bound of 3 stopped before the last byte of every 4-byte sequence.
  for (unsigned int i = 1; i != 4 && p + i < e; ++i) {
    const unsigned char tmp = p[i];

    if ((tmp & 0xC0) != 0x80) {
      if (skipOnError) {
        return skip();
      }
      throw std::runtime_error(
          "folly::utf8ToCodePoint i=" + std::to_string(i) +
          " tmp=" + std::to_string(static_cast<uint32_t>(tmp)));
    }

    d = (d << 6) | (tmp & 0x3F);
    fst <<= 1;

    if (!(fst & 0x80)) {
      // Sequence complete: drop the lead byte's control bits.
      d &= bitMask[i];

      // Overlong: the value would have fit in i bytes.
      if ((d & ~bitMask[i - 1]) == 0) {
        if (skipOnError) {
          return skip();
        }
        throw std::runtime_error(
            "folly::utf8ToCodePoint i=" + std::to_string(i) +
            " d=" + std::to_string(d));
      }

      // Surrogates can only appear in 3-byte sequences; values above
      // U+10FFFF can only appear in 4-byte sequences.
      const bool isSurrogate = (i == 2) && d >= 0xD800 && d <= 0xDFFF;
      if (isSurrogate || d > 0x10FFFF) {
        if (skipOnError) {
          return skip();
        }
        throw std::runtime_error(
            "folly::utf8ToCodePoint i=" + std::to_string(i) +
            " d=" + std::to_string(d));
      }

      p += i + 1;
      return d;
    }
  }

  // Either the input was truncated or the lead byte announced a 5/6 byte form.
  if (skipOnError) {
    return skip();
  }
  throw std::runtime_error("folly::utf8ToCodePoint encoding length maxed out");
}