folly/String-inl.h

   1 /*
   2  * Copyright 2015 Facebook, Inc.
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at
   7  *
   8  *   http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  */
  16
  17 #ifndef FOLLY_STRING_INL_H_
  18 #define FOLLY_STRING_INL_H_
  19
  20 #include <stdexcept>
  21 #include <iterator>
  22
  23 #ifndef FOLLY_BASE_STRING_H_
  24 #error This file may only be included from String.h
  25 #endif
  26
  27 namespace folly {
  28
  29 namespace detail {
  30 // Map from character code to value of one-character escape sequence
  31 // ('\n' = 10 maps to 'n'), 'O' if the character should be printed as
  32 // an octal escape sequence, or 'P' if the character is printable and
  33 // should be printed as is.
  34 extern const char cEscapeTable[];
  35 }  // namespace detail
  36
  37 template <class String>
  38 void cEscape(StringPiece str, String& out) {
  39   char esc[4];
  40   esc[0] = '\\';
  41   out.reserve(out.size() + str.size());
  42   auto p = str.begin();
  43   auto last = p;  // last regular character
  44   // We advance over runs of regular characters (printable, not double-quote or
  45   // backslash) and copy them in one go; this is faster than calling push_back
  46   // repeatedly.
  47   while (p != str.end()) {
  48     char c = *p;
  49     unsigned char v = static_cast<unsigned char>(c);
  50     char e = detail::cEscapeTable[v];
  51     if (e == 'P') {  // printable
  52       ++p;
  53     } else if (e == 'O') {  // octal
  54       out.append(&*last, p - last);
  55       esc[1] = '0' + ((v >> 6) & 7);
  56       esc[2] = '0' + ((v >> 3) & 7);
  57       esc[3] = '0' + (v & 7);
  58       out.append(esc, 4);
  59       ++p;
  60       last = p;
  61     } else {  // special 1-character escape
  62       out.append(&*last, p - last);
  63       esc[1] = e;
  64       out.append(esc, 2);
  65       ++p;
  66       last = p;
  67     }
  68   }
  69   out.append(&*last, p - last);
  70 }
  71
  72 namespace detail {
  73 // Map from the character code of the character following a backslash to
  74 // the unescaped character if a valid one-character escape sequence
  75 // ('n' maps to 10 = '\n'), 'O' if this is the first character of an
  76 // octal escape sequence, 'X' if this is the first character of a
  77 // hexadecimal escape sequence, or 'I' if this escape sequence is invalid.
  78 extern const char cUnescapeTable[];
  79
  80 // Map from the character code to the hex value, or 16 if invalid hex char.
  81 extern const unsigned char hexTable[];
  82 }  // namespace detail
  83
  84 template <class String>
  85 void cUnescape(StringPiece str, String& out, bool strict) {
  86   out.reserve(out.size() + str.size());
  87   auto p = str.begin();
  88   auto last = p;  // last regular character (not part of an escape sequence)
  89   // We advance over runs of regular characters (not backslash) and copy them
  90   // in one go; this is faster than calling push_back repeatedly.
  91   while (p != str.end()) {
  92     char c = *p;
  93     if (c != '\\') {  // normal case
  94       ++p;
  95       continue;
  96     }
  97     out.append(&*last, p - last);
  98     if (p == str.end()) {  // backslash at end of string
  99       if (strict) {
 100         throw std::invalid_argument("incomplete escape sequence");
 101       }
 102       out.push_back('\\');
 103       last = p;
 104       continue;
 105     }
 106     ++p;
 107     char e = detail::cUnescapeTable[static_cast<unsigned char>(*p)];
 108     if (e == 'O') {  // octal
 109       unsigned char val = 0;
 110       for (int i = 0; i < 3 && p != str.end() && *p >= '0' && *p <= '7';
 111            ++i, ++p) {
 112         val = (val << 3) | (*p - '0');
 113       }
 114       out.push_back(val);
 115       last = p;
 116     } else if (e == 'X') {  // hex
 117       ++p;
 118       if (p == str.end()) {  // \x at end of string
 119         if (strict) {
 120           throw std::invalid_argument("incomplete hex escape sequence");
 121         }
 122         out.append("\\x");
 123         last = p;
 124         continue;
 125       }
 126       unsigned char val = 0;
 127       unsigned char h;
 128       for (; (p != str.end() &&
 129               (h = detail::hexTable[static_cast<unsigned char>(*p)]) < 16);
 130            ++p) {
 131         val = (val << 4) | h;
 132       }
 133       out.push_back(val);
 134       last = p;
 135     } else if (e == 'I') {  // invalid
 136       if (strict) {
 137         throw std::invalid_argument("invalid escape sequence");
 138       }
 139       out.push_back('\\');
 140       out.push_back(*p);
 141       ++p;
 142       last = p;
 143     } else {  // standard escape sequence, \' etc
 144       out.push_back(e);
 145       ++p;
 146       last = p;
 147     }
 148   }
 149   out.append(&*last, p - last);
 150 }
 151
 152 namespace detail {
 153 // Map from character code to escape mode:
 154 // 0 = pass through
 155 // 1 = unused
 156 // 2 = pass through in PATH mode
 157 // 3 = space, replace with '+' in QUERY mode
 158 // 4 = percent-encode
 159 extern const unsigned char uriEscapeTable[];
 160 }  // namespace detail
 161
 162 template <class String>
 163 void uriEscape(StringPiece str, String& out, UriEscapeMode mode) {
 164   static const char hexValues[] = "0123456789abcdef";
 165   char esc[3];
 166   esc[0] = '%';
 167   // Preallocate assuming that 25% of the input string will be escaped
 168   out.reserve(out.size() + str.size() + 3 * (str.size() / 4));
 169   auto p = str.begin();
 170   auto last = p;  // last regular character
 171   // We advance over runs of passthrough characters and copy them in one go;
 172   // this is faster than calling push_back repeatedly.
 173   unsigned char minEncode = static_cast<unsigned char>(mode);
 174   while (p != str.end()) {
 175     char c = *p;
 176     unsigned char v = static_cast<unsigned char>(c);
 177     unsigned char discriminator = detail::uriEscapeTable[v];
 178     if (LIKELY(discriminator <= minEncode)) {
 179       ++p;
 180     } else if (mode == UriEscapeMode::QUERY && discriminator == 3) {
 181       out.append(&*last, p - last);
 182       out.push_back('+');
 183       ++p;
 184       last = p;
 185     } else {
 186       out.append(&*last, p - last);
 187       esc[1] = hexValues[v >> 4];
 188       esc[2] = hexValues[v & 0x0f];
 189       out.append(esc, 3);
 190       ++p;
 191       last = p;
 192     }
 193   }
 194   out.append(&*last, p - last);
 195 }
 196
 197 template <class String>
 198 void uriUnescape(StringPiece str, String& out, UriEscapeMode mode) {
 199   out.reserve(out.size() + str.size());
 200   auto p = str.begin();
 201   auto last = p;
 202   // We advance over runs of passthrough characters and copy them in one go;
 203   // this is faster than calling push_back repeatedly.
 204   while (p != str.end()) {
 205     char c = *p;
 206     unsigned char v = static_cast<unsigned char>(v);
 207     switch (c) {
 208     case '%':
 209       {
 210         if (UNLIKELY(std::distance(p, str.end()) < 3)) {
 211           throw std::invalid_argument("incomplete percent encode sequence");
 212         }
 213         auto h1 = detail::hexTable[static_cast<unsigned char>(p[1])];
 214         auto h2 = detail::hexTable[static_cast<unsigned char>(p[2])];
 215         if (UNLIKELY(h1 == 16 || h2 == 16)) {
 216           throw std::invalid_argument("invalid percent encode sequence");
 217         }
 218         out.append(&*last, p - last);
 219         out.push_back((h1 << 4) | h2);
 220         p += 3;
 221         last = p;
 222         break;
 223       }
 224     case '+':
 225       if (mode == UriEscapeMode::QUERY) {
 226         out.append(&*last, p - last);
 227         out.push_back(' ');
 228         ++p;
 229         last = p;
 230         break;
 231       }
 232       // else fallthrough
 233     default:
 234       ++p;
 235       break;
 236     }
 237   }
 238   out.append(&*last, p - last);
 239 }
 240
 241 namespace detail {
 242
 243 /*
 244  * The following functions are type-overloaded helpers for
 245  * internalSplit().
 246  */
 247 inline size_t delimSize(char)          { return 1; }
 248 inline size_t delimSize(StringPiece s) { return s.size(); }
 249 inline bool atDelim(const char* s, char c) {
 250  return *s == c;
 251 }
 252 inline bool atDelim(const char* s, StringPiece sp) {
 253   return !std::memcmp(s, sp.start(), sp.size());
 254 }
 255
 256 // These are used to short-circuit internalSplit() in the case of
 257 // 1-character strings.
 258 inline char delimFront(char c) {
 259   // This one exists only for compile-time; it should never be called.
 260   std::abort();
 261   return c;
 262 }
 263 inline char delimFront(StringPiece s) {
 264   assert(!s.empty() && s.start() != nullptr);
 265   return *s.start();
 266 }
 267
 268 /*
 269  * These output conversion templates allow us to support multiple
 270  * output string types, even when we are using an arbitrary
 271  * OutputIterator.
 272  */
 273 template<class OutStringT> struct OutputConverter {};
 274
 275 template<> struct OutputConverter<std::string> {
 276   std::string operator()(StringPiece sp) const {
 277     return sp.toString();
 278   }
 279 };
 280
 281 template<> struct OutputConverter<fbstring> {
 282   fbstring operator()(StringPiece sp) const {
 283     return sp.toFbstring();
 284   }
 285 };
 286
 287 template<> struct OutputConverter<StringPiece> {
 288   StringPiece operator()(StringPiece sp) const { return sp; }
 289 };
 290
 291 /*
 292  * Shared implementation for all the split() overloads.
 293  *
 294  * This uses some external helpers that are overloaded to let this
 295  * algorithm be more performant if the deliminator is a single
 296  * character instead of a whole string.
 297  *
 298  * @param ignoreEmpty iff true, don't copy empty segments to output
 299  */
 300 template<class OutStringT, class DelimT, class OutputIterator>
 301 void internalSplit(DelimT delim, StringPiece sp, OutputIterator out,
 302     bool ignoreEmpty) {
 303   assert(sp.empty() || sp.start() != nullptr);
 304
 305   const char* s = sp.start();
 306   const size_t strSize = sp.size();
 307   const size_t dSize = delimSize(delim);
 308
 309   OutputConverter<OutStringT> conv;
 310
 311   if (dSize > strSize || dSize == 0) {
 312     if (!ignoreEmpty || strSize > 0) {
 313       *out++ = conv(sp);
 314     }
 315     return;
 316   }
 317   if (boost::is_same<DelimT,StringPiece>::value && dSize == 1) {
 318     // Call the char version because it is significantly faster.
 319     return internalSplit<OutStringT>(delimFront(delim), sp, out,
 320       ignoreEmpty);
 321   }
 322
 323   size_t tokenStartPos = 0;
 324   size_t tokenSize = 0;
 325   for (size_t i = 0; i <= strSize - dSize; ++i) {
 326     if (atDelim(&s[i], delim)) {
 327       if (!ignoreEmpty || tokenSize > 0) {
 328         *out++ = conv(StringPiece(&s[tokenStartPos], tokenSize));
 329       }
 330
 331       tokenStartPos = i + dSize;
 332       tokenSize = 0;
 333       i += dSize - 1;
 334     } else {
 335       ++tokenSize;
 336     }
 337   }
 338   tokenSize = strSize - tokenStartPos;
 339   if (!ignoreEmpty || tokenSize > 0) {
 340     *out++ = conv(StringPiece(&s[tokenStartPos], tokenSize));
 341   }
 342 }
 343
 344 template<class String> StringPiece prepareDelim(const String& s) {
 345   return StringPiece(s);
 346 }
 347 inline char prepareDelim(char c) { return c; }
 348
 349 template <class Dst>
 350 struct convertTo {
 351   template <class Src>
 352   static Dst from(const Src& src) { return folly::to<Dst>(src); }
 353   static Dst from(const Dst& src) { return src; }
 354 };
 355
 356 template<bool exact,
 357          class Delim,
 358          class OutputType>
 359 typename std::enable_if<IsSplitTargetType<OutputType>::value, bool>::type
 360 splitFixed(const Delim& delimiter,
 361            StringPiece input,
 362            OutputType& out) {
 363   if (exact && UNLIKELY(std::string::npos != input.find(delimiter))) {
 364     return false;
 365   }
 366   out = convertTo<OutputType>::from(input);
 367   return true;
 368 }
 369
 370 template<bool exact,
 371          class Delim,
 372          class OutputType,
 373          class... OutputTypes>
 374 typename std::enable_if<IsSplitTargetType<OutputType>::value, bool>::type
 375 splitFixed(const Delim& delimiter,
 376            StringPiece input,
 377            OutputType& outHead,
 378            OutputTypes&... outTail) {
 379   size_t cut = input.find(delimiter);
 380   if (UNLIKELY(cut == std::string::npos)) {
 381     return false;
 382   }
 383   StringPiece head(input.begin(), input.begin() + cut);
 384   StringPiece tail(input.begin() + cut + detail::delimSize(delimiter),
 385                    input.end());
 386   if (LIKELY(splitFixed<exact>(delimiter, tail, outTail...))) {
 387     outHead = convertTo<OutputType>::from(head);
 388     return true;
 389   }
 390   return false;
 391 }
 392
 393 }
 394
 395 //////////////////////////////////////////////////////////////////////
 396
 397 template<class Delim, class String, class OutputType>
 398 void split(const Delim& delimiter,
 399            const String& input,
 400            std::vector<OutputType>& out,
 401            bool ignoreEmpty) {
 402   detail::internalSplit<OutputType>(
 403     detail::prepareDelim(delimiter),
 404     StringPiece(input),
 405     std::back_inserter(out),
 406     ignoreEmpty);
 407 }
 408
 409 template<class Delim, class String, class OutputType>
 410 void split(const Delim& delimiter,
 411            const String& input,
 412            fbvector<OutputType>& out,
 413            bool ignoreEmpty) {
 414   detail::internalSplit<OutputType>(
 415     detail::prepareDelim(delimiter),
 416     StringPiece(input),
 417     std::back_inserter(out),
 418     ignoreEmpty);
 419 }
 420
 421 template<class OutputValueType, class Delim, class String,
 422          class OutputIterator>
 423 void splitTo(const Delim& delimiter,
 424              const String& input,
 425              OutputIterator out,
 426              bool ignoreEmpty) {
 427   detail::internalSplit<OutputValueType>(
 428     detail::prepareDelim(delimiter),
 429     StringPiece(input),
 430     out,
 431     ignoreEmpty);
 432 }
 433
 434 template<bool exact,
 435          class Delim,
 436          class OutputType,
 437          class... OutputTypes>
 438 typename std::enable_if<IsSplitTargetType<OutputType>::value, bool>::type
 439 split(const Delim& delimiter,
 440       StringPiece input,
 441       OutputType& outHead,
 442       OutputTypes&... outTail) {
 443   return detail::splitFixed<exact>(
 444     detail::prepareDelim(delimiter),
 445     input,
 446     outHead,
 447     outTail...);
 448 }
 449
 450 namespace detail {
 451
 452 /*
 453  * If a type can have its string size determined cheaply, we can more
 454  * efficiently append it in a loop (see internalJoinAppend). Note that the
 455  * struct need not conform to the std::string api completely (ex. does not need
 456  * to implement append()).
 457  */
 458 template <class T> struct IsSizableString {
 459   enum { value = IsSomeString<T>::value
 460          || std::is_same<T, StringPiece>::value };
 461 };
 462
 463 template <class Iterator>
 464 struct IsSizableStringContainerIterator :
 465   IsSizableString<typename std::iterator_traits<Iterator>::value_type> {
 466 };
 467
 468 template <class Delim, class Iterator, class String>
 469 void internalJoinAppend(Delim delimiter,
 470                         Iterator begin,
 471                         Iterator end,
 472                         String& output) {
 473   assert(begin != end);
 474   if (std::is_same<Delim, StringPiece>::value &&
 475       delimSize(delimiter) == 1) {
 476     internalJoinAppend(delimFront(delimiter), begin, end, output);
 477     return;
 478   }
 479   toAppend(*begin, &output);
 480   while (++begin != end) {
 481     toAppend(delimiter, *begin, &output);
 482   }
 483 }
 484
 485 template <class Delim, class Iterator, class String>
 486 typename std::enable_if<IsSizableStringContainerIterator<Iterator>::value>::type
 487 internalJoin(Delim delimiter,
 488              Iterator begin,
 489              Iterator end,
 490              String& output) {
 491   output.clear();
 492   if (begin == end) {
 493     return;
 494   }
 495   const size_t dsize = delimSize(delimiter);
 496   Iterator it = begin;
 497   size_t size = it->size();
 498   while (++it != end) {
 499     size += dsize + it->size();
 500   }
 501   output.reserve(size);
 502   internalJoinAppend(delimiter, begin, end, output);
 503 }
 504
 505 template <class Delim, class Iterator, class String>
 506 typename
 507 std::enable_if<!IsSizableStringContainerIterator<Iterator>::value>::type
 508 internalJoin(Delim delimiter,
 509              Iterator begin,
 510              Iterator end,
 511              String& output) {
 512   output.clear();
 513   if (begin == end) {
 514     return;
 515   }
 516   internalJoinAppend(delimiter, begin, end, output);
 517 }
 518
 519 }  // namespace detail
 520
 521 template <class Delim, class Iterator, class String>
 522 void join(const Delim& delimiter,
 523           Iterator begin,
 524           Iterator end,
 525           String& output) {
 526   detail::internalJoin(
 527     detail::prepareDelim(delimiter),
 528     begin,
 529     end,
 530     output);
 531 }
 532
 533 template <class String1, class String2>
 534 void backslashify(const String1& input, String2& output, bool hex_style) {
 535   static const char hexValues[] = "0123456789abcdef";
 536   output.clear();
 537   output.reserve(3 * input.size());
 538   for (unsigned char c : input) {
 539     // less than space or greater than '~' are considered unprintable
 540     if (c < 0x20 || c > 0x7e || c == '\\') {
 541       bool hex_append = false;
 542       output.push_back('\\');
 543       if (hex_style) {
 544         hex_append = true;
 545       } else {
 546         if (c == '\r') output += 'r';
 547         else if (c == '\n') output += 'n';
 548         else if (c == '\t') output += 't';
 549         else if (c == '\a') output += 'a';
 550         else if (c == '\b') output += 'b';
 551         else if (c == '\0') output += '0';
 552         else if (c == '\\') output += '\\';
 553         else {
 554           hex_append = true;
 555         }
 556       }
 557       if (hex_append) {
 558         output.push_back('x');
 559         output.push_back(hexValues[(c >> 4) & 0xf]);
 560         output.push_back(hexValues[c & 0xf]);
 561       }
 562     } else {
 563       output += c;
 564     }
 565   }
 566 }
 567
 568 template <class String1, class String2>
 569 void humanify(const String1& input, String2& output) {
 570   size_t numUnprintable = 0;
 571   size_t numPrintablePrefix = 0;
 572   for (unsigned char c : input) {
 573     if (c < 0x20 || c > 0x7e || c == '\\') {
 574       ++numUnprintable;
 575     }
 576     if (numUnprintable == 0) {
 577       ++numPrintablePrefix;
 578     }
 579   }
 580
 581   // hexlify doubles a string's size; backslashify can potentially
 582   // explode it by 4x.  Now, the printable range of the ascii
 583   // "spectrum" is around 95 out of 256 values, so a "random" binary
 584   // string should be around 60% unprintable.  We use a 50% hueristic
 585   // here, so if a string is 60% unprintable, then we just use hex
 586   // output.  Otherwise we backslash.
 587   //
 588   // UTF8 is completely ignored; as a result, utf8 characters will
 589   // likely be \x escaped (since most common glyphs fit in two bytes).
 590   // This is a tradeoff of complexity/speed instead of a convenience
 591   // that likely would rarely matter.  Moreover, this function is more
 592   // about displaying underlying bytes, not about displaying glyphs
 593   // from languages.
 594   if (numUnprintable == 0) {
 595     output = input;
 596   } else if (5 * numUnprintable >= 3 * input.size()) {
 597     // However!  If we have a "meaningful" prefix of printable
 598     // characters, say 20% of the string, we backslashify under the
 599     // assumption viewing the prefix as ascii is worth blowing the
 600     // output size up a bit.
 601     if (5 * numPrintablePrefix >= input.size()) {
 602       backslashify(input, output);
 603     } else {
 604       output = "0x";
 605       hexlify(input, output, true /* append output */);
 606     }
 607   } else {
 608     backslashify(input, output);
 609   }
 610 }
 611
 612 template<class InputString, class OutputString>
 613 bool hexlify(const InputString& input, OutputString& output,
 614              bool append_output) {
 615   if (!append_output) output.clear();
 616
 617   static char hexValues[] = "0123456789abcdef";
 618   auto j = output.size();
 619   output.resize(2 * input.size() + output.size());
 620   for (size_t i = 0; i < input.size(); ++i) {
 621     int ch = input[i];
 622     output[j++] = hexValues[(ch >> 4) & 0xf];
 623     output[j++] = hexValues[ch & 0xf];
 624   }
 625   return true;
 626 }
 627
 628 template<class InputString, class OutputString>
 629 bool unhexlify(const InputString& input, OutputString& output) {
 630   if (input.size() % 2 != 0) {
 631     return false;
 632   }
 633   output.resize(input.size() / 2);
 634   int j = 0;
 635   auto unhex = [](char c) -> int {
 636     return c >= '0' && c <= '9' ? c - '0' :
 637            c >= 'A' && c <= 'F' ? c - 'A' + 10 :
 638            c >= 'a' && c <= 'f' ? c - 'a' + 10 :
 639            -1;
 640   };
 641
 642   for (size_t i = 0; i < input.size(); i += 2) {
 643     int highBits = unhex(input[i]);
 644     int lowBits = unhex(input[i + 1]);
 645     if (highBits < 0 || lowBits < 0) {
 646       return false;
 647     }
 648     output[j++] = (highBits << 4) + lowBits;
 649   }
 650   return true;
 651 }
 652
 653 namespace detail {
 654 /**
 655  * Hex-dump at most 16 bytes starting at offset from a memory area of size
 656  * bytes.  Return the number of bytes actually dumped.
 657  */
 658 size_t hexDumpLine(const void* ptr, size_t offset, size_t size,
 659                    std::string& line);
 660 }  // namespace detail
 661
 662 template <class OutIt>
 663 void hexDump(const void* ptr, size_t size, OutIt out) {
 664   size_t offset = 0;
 665   std::string line;
 666   while (offset < size) {
 667     offset += detail::hexDumpLine(ptr, offset, size, line);
 668     *out++ = line;
 669   }
 670 }
 671
 672 }  // namespace folly
 673
 674 #endif /* FOLLY_STRING_INL_H_ */