folly/String-inl.h

   1 /*
   2  * Copyright 2012-present Facebook, Inc.
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at
   7  *
   8  *   http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  */
  16
  17 #pragma once
  18
  19 #include <iterator>
  20 #include <stdexcept>
  21
  22 #include <folly/CppAttributes.h>
  23
  24 #ifndef FOLLY_STRING_H_
  25 #error This file may only be included from String.h
  26 #endif
  27
  28 namespace folly {
  29
  30 namespace detail {
  31 // Map from character code to value of one-character escape sequence
  32 // ('\n' = 10 maps to 'n'), 'O' if the character should be printed as
  33 // an octal escape sequence, or 'P' if the character is printable and
  34 // should be printed as is.
  35 extern const char cEscapeTable[];
  36 } // namespace detail
  37
  38 template <class String>
  39 void cEscape(StringPiece str, String& out) {
  40   char esc[4];
  41   esc[0] = '\\';
  42   out.reserve(out.size() + str.size());
  43   auto p = str.begin();
  44   auto last = p;  // last regular character
  45   // We advance over runs of regular characters (printable, not double-quote or
  46   // backslash) and copy them in one go; this is faster than calling push_back
  47   // repeatedly.
  48   while (p != str.end()) {
  49     char c = *p;
  50     unsigned char v = static_cast<unsigned char>(c);
  51     char e = detail::cEscapeTable[v];
  52     if (e == 'P') {  // printable
  53       ++p;
  54     } else if (e == 'O') {  // octal
  55       out.append(&*last, size_t(p - last));
  56       esc[1] = '0' + ((v >> 6) & 7);
  57       esc[2] = '0' + ((v >> 3) & 7);
  58       esc[3] = '0' + (v & 7);
  59       out.append(esc, 4);
  60       ++p;
  61       last = p;
  62     } else {  // special 1-character escape
  63       out.append(&*last, size_t(p - last));
  64       esc[1] = e;
  65       out.append(esc, 2);
  66       ++p;
  67       last = p;
  68     }
  69   }
  70   out.append(&*last, size_t(p - last));
  71 }
  72
  73 namespace detail {
  74 // Map from the character code of the character following a backslash to
  75 // the unescaped character if a valid one-character escape sequence
  76 // ('n' maps to 10 = '\n'), 'O' if this is the first character of an
  77 // octal escape sequence, 'X' if this is the first character of a
  78 // hexadecimal escape sequence, or 'I' if this escape sequence is invalid.
  79 extern const char cUnescapeTable[];
  80
  81 // Map from the character code to the hex value, or 16 if invalid hex char.
  82 extern const unsigned char hexTable[];
  83 } // namespace detail
  84
  85 template <class String>
  86 void cUnescape(StringPiece str, String& out, bool strict) {
  87   out.reserve(out.size() + str.size());
  88   auto p = str.begin();
  89   auto last = p;  // last regular character (not part of an escape sequence)
  90   // We advance over runs of regular characters (not backslash) and copy them
  91   // in one go; this is faster than calling push_back repeatedly.
  92   while (p != str.end()) {
  93     char c = *p;
  94     if (c != '\\') {  // normal case
  95       ++p;
  96       continue;
  97     }
  98     out.append(&*last, p - last);
  99     if (p == str.end()) {  // backslash at end of string
 100       if (strict) {
 101         throw std::invalid_argument("incomplete escape sequence");
 102       }
 103       out.push_back('\\');
 104       last = p;
 105       continue;
 106     }
 107     ++p;
 108     char e = detail::cUnescapeTable[static_cast<unsigned char>(*p)];
 109     if (e == 'O') {  // octal
 110       unsigned char val = 0;
 111       for (int i = 0; i < 3 && p != str.end() && *p >= '0' && *p <= '7';
 112            ++i, ++p) {
 113         val = (val << 3) | (*p - '0');
 114       }
 115       out.push_back(val);
 116       last = p;
 117     } else if (e == 'X') {  // hex
 118       ++p;
 119       if (p == str.end()) {  // \x at end of string
 120         if (strict) {
 121           throw std::invalid_argument("incomplete hex escape sequence");
 122         }
 123         out.append("\\x");
 124         last = p;
 125         continue;
 126       }
 127       unsigned char val = 0;
 128       unsigned char h;
 129       for (; (p != str.end() &&
 130               (h = detail::hexTable[static_cast<unsigned char>(*p)]) < 16);
 131            ++p) {
 132         val = (val << 4) | h;
 133       }
 134       out.push_back(val);
 135       last = p;
 136     } else if (e == 'I') {  // invalid
 137       if (strict) {
 138         throw std::invalid_argument("invalid escape sequence");
 139       }
 140       out.push_back('\\');
 141       out.push_back(*p);
 142       ++p;
 143       last = p;
 144     } else {  // standard escape sequence, \' etc
 145       out.push_back(e);
 146       ++p;
 147       last = p;
 148     }
 149   }
 150   out.append(&*last, p - last);
 151 }
 152
 153 namespace detail {
 154 // Map from character code to escape mode:
 155 // 0 = pass through
 156 // 1 = unused
 157 // 2 = pass through in PATH mode
 158 // 3 = space, replace with '+' in QUERY mode
 159 // 4 = percent-encode
 160 extern const unsigned char uriEscapeTable[];
 161 } // namespace detail
 162
 163 template <class String>
 164 void uriEscape(StringPiece str, String& out, UriEscapeMode mode) {
 165   static const char hexValues[] = "0123456789abcdef";
 166   char esc[3];
 167   esc[0] = '%';
 168   // Preallocate assuming that 25% of the input string will be escaped
 169   out.reserve(out.size() + str.size() + 3 * (str.size() / 4));
 170   auto p = str.begin();
 171   auto last = p;  // last regular character
 172   // We advance over runs of passthrough characters and copy them in one go;
 173   // this is faster than calling push_back repeatedly.
 174   unsigned char minEncode = static_cast<unsigned char>(mode);
 175   while (p != str.end()) {
 176     char c = *p;
 177     unsigned char v = static_cast<unsigned char>(c);
 178     unsigned char discriminator = detail::uriEscapeTable[v];
 179     if (LIKELY(discriminator <= minEncode)) {
 180       ++p;
 181     } else if (mode == UriEscapeMode::QUERY && discriminator == 3) {
 182       out.append(&*last, size_t(p - last));
 183       out.push_back('+');
 184       ++p;
 185       last = p;
 186     } else {
 187       out.append(&*last, size_t(p - last));
 188       esc[1] = hexValues[v >> 4];
 189       esc[2] = hexValues[v & 0x0f];
 190       out.append(esc, 3);
 191       ++p;
 192       last = p;
 193     }
 194   }
 195   out.append(&*last, size_t(p - last));
 196 }
 197
 198 template <class String>
 199 void uriUnescape(StringPiece str, String& out, UriEscapeMode mode) {
 200   out.reserve(out.size() + str.size());
 201   auto p = str.begin();
 202   auto last = p;
 203   // We advance over runs of passthrough characters and copy them in one go;
 204   // this is faster than calling push_back repeatedly.
 205   while (p != str.end()) {
 206     char c = *p;
 207     switch (c) {
 208     case '%':
 209       {
 210         if (UNLIKELY(std::distance(p, str.end()) < 3)) {
 211           throw std::invalid_argument("incomplete percent encode sequence");
 212         }
 213         auto h1 = detail::hexTable[static_cast<unsigned char>(p[1])];
 214         auto h2 = detail::hexTable[static_cast<unsigned char>(p[2])];
 215         if (UNLIKELY(h1 == 16 || h2 == 16)) {
 216           throw std::invalid_argument("invalid percent encode sequence");
 217         }
 218         out.append(&*last, size_t(p - last));
 219         out.push_back((h1 << 4) | h2);
 220         p += 3;
 221         last = p;
 222         break;
 223       }
 224     case '+':
 225       if (mode == UriEscapeMode::QUERY) {
 226         out.append(&*last, size_t(p - last));
 227         out.push_back(' ');
 228         ++p;
 229         last = p;
 230         break;
 231       }
 232       // else fallthrough
 233       FOLLY_FALLTHROUGH;
 234     default:
 235       ++p;
 236       break;
 237     }
 238   }
 239   out.append(&*last, size_t(p - last));
 240 }
 241
 242 namespace detail {
 243
 244 /*
 245  * The following functions are type-overloaded helpers for
 246  * internalSplit().
 247  */
 248 inline size_t delimSize(char)          { return 1; }
 249 inline size_t delimSize(StringPiece s) { return s.size(); }
 250 inline bool atDelim(const char* s, char c) {
 251  return *s == c;
 252 }
 253 inline bool atDelim(const char* s, StringPiece sp) {
 254   return !std::memcmp(s, sp.start(), sp.size());
 255 }
 256
 257 // These are used to short-circuit internalSplit() in the case of
 258 // 1-character strings.
 259 inline char delimFront(char c) {
 260   // This one exists only for compile-time; it should never be called.
 261   std::abort();
 262   return c;
 263 }
 264 inline char delimFront(StringPiece s) {
 265   assert(!s.empty() && s.start() != nullptr);
 266   return *s.start();
 267 }
 268
 269 /*
 270  * Shared implementation for all the split() overloads.
 271  *
 272  * This uses some external helpers that are overloaded to let this
 273  * algorithm be more performant if the deliminator is a single
 274  * character instead of a whole string.
 275  *
 276  * @param ignoreEmpty iff true, don't copy empty segments to output
 277  */
 278 template <class OutStringT, class DelimT, class OutputIterator>
 279 void internalSplit(DelimT delim, StringPiece sp, OutputIterator out,
 280     bool ignoreEmpty) {
 281   assert(sp.empty() || sp.start() != nullptr);
 282
 283   const char* s = sp.start();
 284   const size_t strSize = sp.size();
 285   const size_t dSize = delimSize(delim);
 286
 287   if (dSize > strSize || dSize == 0) {
 288     if (!ignoreEmpty || strSize > 0) {
 289       *out++ = to<OutStringT>(sp);
 290     }
 291     return;
 292   }
 293   if (boost::is_same<DelimT,StringPiece>::value && dSize == 1) {
 294     // Call the char version because it is significantly faster.
 295     return internalSplit<OutStringT>(delimFront(delim), sp, out,
 296       ignoreEmpty);
 297   }
 298
 299   size_t tokenStartPos = 0;
 300   size_t tokenSize = 0;
 301   for (size_t i = 0; i <= strSize - dSize; ++i) {
 302     if (atDelim(&s[i], delim)) {
 303       if (!ignoreEmpty || tokenSize > 0) {
 304         *out++ = to<OutStringT>(sp.subpiece(tokenStartPos, tokenSize));
 305       }
 306
 307       tokenStartPos = i + dSize;
 308       tokenSize = 0;
 309       i += dSize - 1;
 310     } else {
 311       ++tokenSize;
 312     }
 313   }
 314   tokenSize = strSize - tokenStartPos;
 315   if (!ignoreEmpty || tokenSize > 0) {
 316     *out++ = to<OutStringT>(sp.subpiece(tokenStartPos, tokenSize));
 317   }
 318 }
 319
 320 template <class String> StringPiece prepareDelim(const String& s) {
 321   return StringPiece(s);
 322 }
 323 inline char prepareDelim(char c) { return c; }
 324
 325 template <bool exact, class Delim, class OutputType>
 326 bool splitFixed(const Delim& delimiter, StringPiece input, OutputType& output) {
 327   static_assert(
 328       exact || std::is_same<OutputType, StringPiece>::value ||
 329           IsSomeString<OutputType>::value,
 330       "split<false>() requires that the last argument be a string type");
 331   if (exact && UNLIKELY(std::string::npos != input.find(delimiter))) {
 332     return false;
 333   }
 334   output = folly::to<OutputType>(input);
 335   return true;
 336 }
 337
 338 template <bool exact, class Delim, class OutputType, class... OutputTypes>
 339 bool splitFixed(
 340     const Delim& delimiter,
 341     StringPiece input,
 342     OutputType& outHead,
 343     OutputTypes&... outTail) {
 344   size_t cut = input.find(delimiter);
 345   if (UNLIKELY(cut == std::string::npos)) {
 346     return false;
 347   }
 348   StringPiece head(input.begin(), input.begin() + cut);
 349   StringPiece tail(input.begin() + cut + detail::delimSize(delimiter),
 350                    input.end());
 351   if (LIKELY(splitFixed<exact>(delimiter, tail, outTail...))) {
 352     outHead = folly::to<OutputType>(head);
 353     return true;
 354   }
 355   return false;
 356 }
 357
 358 } // namespace detail
 359
 360 //////////////////////////////////////////////////////////////////////
 361
 362 template <class Delim, class String, class OutputType>
 363 void split(const Delim& delimiter,
 364            const String& input,
 365            std::vector<OutputType>& out,
 366            bool ignoreEmpty) {
 367   detail::internalSplit<OutputType>(
 368     detail::prepareDelim(delimiter),
 369     StringPiece(input),
 370     std::back_inserter(out),
 371     ignoreEmpty);
 372 }
 373
 374 template <class Delim, class String, class OutputType>
 375 void split(const Delim& delimiter,
 376            const String& input,
 377            fbvector<OutputType>& out,
 378            bool ignoreEmpty) {
 379   detail::internalSplit<OutputType>(
 380     detail::prepareDelim(delimiter),
 381     StringPiece(input),
 382     std::back_inserter(out),
 383     ignoreEmpty);
 384 }
 385
 386 template <
 387     class OutputValueType,
 388     class Delim,
 389     class String,
 390     class OutputIterator>
 391 void splitTo(const Delim& delimiter,
 392              const String& input,
 393              OutputIterator out,
 394              bool ignoreEmpty) {
 395   detail::internalSplit<OutputValueType>(
 396     detail::prepareDelim(delimiter),
 397     StringPiece(input),
 398     out,
 399     ignoreEmpty);
 400 }
 401
 402 template <bool exact, class Delim, class... OutputTypes>
 403 typename std::enable_if<
 404     StrictConjunction<IsConvertible<OutputTypes>...>::value &&
 405         sizeof...(OutputTypes) >= 1,
 406     bool>::type
 407 split(const Delim& delimiter, StringPiece input, OutputTypes&... outputs) {
 408   return detail::splitFixed<exact>(
 409       detail::prepareDelim(delimiter), input, outputs...);
 410 }
 411
 412 namespace detail {
 413
 414 /*
 415  * If a type can have its string size determined cheaply, we can more
 416  * efficiently append it in a loop (see internalJoinAppend). Note that the
 417  * struct need not conform to the std::string api completely (ex. does not need
 418  * to implement append()).
 419  */
 420 template <class T> struct IsSizableString {
 421   enum { value = IsSomeString<T>::value
 422          || std::is_same<T, StringPiece>::value };
 423 };
 424
 425 template <class Iterator>
 426 struct IsSizableStringContainerIterator :
 427   IsSizableString<typename std::iterator_traits<Iterator>::value_type> {
 428 };
 429
 430 template <class Delim, class Iterator, class String>
 431 void internalJoinAppend(Delim delimiter,
 432                         Iterator begin,
 433                         Iterator end,
 434                         String& output) {
 435   assert(begin != end);
 436   if (std::is_same<Delim, StringPiece>::value &&
 437       delimSize(delimiter) == 1) {
 438     internalJoinAppend(delimFront(delimiter), begin, end, output);
 439     return;
 440   }
 441   toAppend(*begin, &output);
 442   while (++begin != end) {
 443     toAppend(delimiter, *begin, &output);
 444   }
 445 }
 446
 447 template <class Delim, class Iterator, class String>
 448 typename std::enable_if<IsSizableStringContainerIterator<Iterator>::value>::type
 449 internalJoin(Delim delimiter,
 450              Iterator begin,
 451              Iterator end,
 452              String& output) {
 453   output.clear();
 454   if (begin == end) {
 455     return;
 456   }
 457   const size_t dsize = delimSize(delimiter);
 458   Iterator it = begin;
 459   size_t size = it->size();
 460   while (++it != end) {
 461     size += dsize + it->size();
 462   }
 463   output.reserve(size);
 464   internalJoinAppend(delimiter, begin, end, output);
 465 }
 466
 467 template <class Delim, class Iterator, class String>
 468 typename
 469 std::enable_if<!IsSizableStringContainerIterator<Iterator>::value>::type
 470 internalJoin(Delim delimiter,
 471              Iterator begin,
 472              Iterator end,
 473              String& output) {
 474   output.clear();
 475   if (begin == end) {
 476     return;
 477   }
 478   internalJoinAppend(delimiter, begin, end, output);
 479 }
 480
 481 } // namespace detail
 482
 483 template <class Delim, class Iterator, class String>
 484 void join(const Delim& delimiter,
 485           Iterator begin,
 486           Iterator end,
 487           String& output) {
 488   detail::internalJoin(
 489     detail::prepareDelim(delimiter),
 490     begin,
 491     end,
 492     output);
 493 }
 494
 495 template <class OutputString>
 496 void backslashify(
 497     folly::StringPiece input,
 498     OutputString& output,
 499     bool hex_style) {
 500   static const char hexValues[] = "0123456789abcdef";
 501   output.clear();
 502   output.reserve(3 * input.size());
 503   for (unsigned char c : input) {
 504     // less than space or greater than '~' are considered unprintable
 505     if (c < 0x20 || c > 0x7e || c == '\\') {
 506       bool hex_append = false;
 507       output.push_back('\\');
 508       if (hex_style) {
 509         hex_append = true;
 510       } else {
 511         if (c == '\r') {
 512           output += 'r';
 513         } else if (c == '\n') {
 514           output += 'n';
 515         } else if (c == '\t') {
 516           output += 't';
 517         } else if (c == '\a') {
 518           output += 'a';
 519         } else if (c == '\b') {
 520           output += 'b';
 521         } else if (c == '\0') {
 522           output += '0';
 523         } else if (c == '\\') {
 524           output += '\\';
 525         } else {
 526           hex_append = true;
 527         }
 528       }
 529       if (hex_append) {
 530         output.push_back('x');
 531         output.push_back(hexValues[(c >> 4) & 0xf]);
 532         output.push_back(hexValues[c & 0xf]);
 533       }
 534     } else {
 535       output += c;
 536     }
 537   }
 538 }
 539
 540 template <class String1, class String2>
 541 void humanify(const String1& input, String2& output) {
 542   size_t numUnprintable = 0;
 543   size_t numPrintablePrefix = 0;
 544   for (unsigned char c : input) {
 545     if (c < 0x20 || c > 0x7e || c == '\\') {
 546       ++numUnprintable;
 547     }
 548     if (numUnprintable == 0) {
 549       ++numPrintablePrefix;
 550     }
 551   }
 552
 553   // hexlify doubles a string's size; backslashify can potentially
 554   // explode it by 4x.  Now, the printable range of the ascii
 555   // "spectrum" is around 95 out of 256 values, so a "random" binary
 556   // string should be around 60% unprintable.  We use a 50% hueristic
 557   // here, so if a string is 60% unprintable, then we just use hex
 558   // output.  Otherwise we backslash.
 559   //
 560   // UTF8 is completely ignored; as a result, utf8 characters will
 561   // likely be \x escaped (since most common glyphs fit in two bytes).
 562   // This is a tradeoff of complexity/speed instead of a convenience
 563   // that likely would rarely matter.  Moreover, this function is more
 564   // about displaying underlying bytes, not about displaying glyphs
 565   // from languages.
 566   if (numUnprintable == 0) {
 567     output = input;
 568   } else if (5 * numUnprintable >= 3 * input.size()) {
 569     // However!  If we have a "meaningful" prefix of printable
 570     // characters, say 20% of the string, we backslashify under the
 571     // assumption viewing the prefix as ascii is worth blowing the
 572     // output size up a bit.
 573     if (5 * numPrintablePrefix >= input.size()) {
 574       backslashify(input, output);
 575     } else {
 576       output = "0x";
 577       hexlify(input, output, true /* append output */);
 578     }
 579   } else {
 580     backslashify(input, output);
 581   }
 582 }
 583
 584 template <class InputString, class OutputString>
 585 bool hexlify(const InputString& input, OutputString& output,
 586              bool append_output) {
 587   if (!append_output) {
 588     output.clear();
 589   }
 590
 591   static char hexValues[] = "0123456789abcdef";
 592   auto j = output.size();
 593   output.resize(2 * input.size() + output.size());
 594   for (size_t i = 0; i < input.size(); ++i) {
 595     int ch = input[i];
 596     output[j++] = hexValues[(ch >> 4) & 0xf];
 597     output[j++] = hexValues[ch & 0xf];
 598   }
 599   return true;
 600 }
 601
 602 template <class InputString, class OutputString>
 603 bool unhexlify(const InputString& input, OutputString& output) {
 604   if (input.size() % 2 != 0) {
 605     return false;
 606   }
 607   output.resize(input.size() / 2);
 608   int j = 0;
 609
 610   for (size_t i = 0; i < input.size(); i += 2) {
 611     int highBits = detail::hexTable[static_cast<uint8_t>(input[i])];
 612     int lowBits = detail::hexTable[static_cast<uint8_t>(input[i + 1])];
 613     if ((highBits | lowBits) & 0x10) {
 614       // One of the characters wasn't a hex digit
 615       return false;
 616     }
 617     output[j++] = (highBits << 4) + lowBits;
 618   }
 619   return true;
 620 }
 621
 622 namespace detail {
 623 /**
 624  * Hex-dump at most 16 bytes starting at offset from a memory area of size
 625  * bytes.  Return the number of bytes actually dumped.
 626  */
 627 size_t hexDumpLine(const void* ptr, size_t offset, size_t size,
 628                    std::string& line);
 629 } // namespace detail
 630
 631 template <class OutIt>
 632 void hexDump(const void* ptr, size_t size, OutIt out) {
 633   size_t offset = 0;
 634   std::string line;
 635   while (offset < size) {
 636     offset += detail::hexDumpLine(ptr, offset, size, line);
 637     *out++ = line;
 638   }
 639 }
 640
 641 } // namespace folly