folly/String-inl.h

   1 /*
   2  * Copyright 2012 Facebook, Inc.
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at
   7  *
   8  *   http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  */
  16
  17 #ifndef FOLLY_STRING_INL_H_
  18 #define FOLLY_STRING_INL_H_
  19
  20 #include <stdexcept>
  21
  22 #ifndef FOLLY_BASE_STRING_H_
  23 #error This file may only be included from String.h
  24 #endif
  25
  26 namespace folly {
  27
  28 namespace detail {
  29 // Map from character code to value of one-character escape sequence
  30 // ('\n' = 10 maps to 'n'), 'O' if the character should be printed as
  31 // an octal escape sequence, or 'P' if the character is printable and
  32 // should be printed as is.
  33 extern const char cEscapeTable[];
  34 }  // namespace detail
  35
  36 template <class String>
  37 void cEscape(StringPiece str, String& out) {
  38   char esc[4];
  39   esc[0] = '\\';
  40   out.reserve(out.size() + str.size());
  41   auto p = str.begin();
  42   auto last = p;  // last regular character
  43   // We advance over runs of regular characters (printable, not double-quote or
  44   // backslash) and copy them in one go; this is faster than calling push_back
  45   // repeatedly.
  46   while (p != str.end()) {
  47     char c = *p;
  48     unsigned char v = static_cast<unsigned char>(c);
  49     char e = detail::cEscapeTable[v];
  50     if (e == 'P') {  // printable
  51       ++p;
  52     } else if (e == 'O') {  // octal
  53       out.append(&*last, p - last);
  54       esc[1] = '0' + ((v >> 6) & 7);
  55       esc[2] = '0' + ((v >> 3) & 7);
  56       esc[3] = '0' + (v & 7);
  57       out.append(esc, 4);
  58       ++p;
  59       last = p;
  60     } else {  // special 1-character escape
  61       out.append(&*last, p - last);
  62       esc[1] = e;
  63       out.append(esc, 2);
  64       ++p;
  65       last = p;
  66     }
  67   }
  68   out.append(&*last, p - last);
  69 }
  70
  71 namespace detail {
  72 // Map from the character code of the character following a backslash to
  73 // the unescaped character if a valid one-character escape sequence
  74 // ('n' maps to 10 = '\n'), 'O' if this is the first character of an
  75 // octal escape sequence, 'X' if this is the first character of a
  76 // hexadecimal escape sequence, or 'I' if this escape sequence is invalid.
  77 extern const char cUnescapeTable[];
  78
  79 // Map from the character code to the hex value, or 16 if invalid hex char.
  80 extern const unsigned char hexTable[];
  81 }  // namespace detail
  82
  83 template <class String>
  84 void cUnescape(StringPiece str, String& out, bool strict) {
  85   out.reserve(out.size() + str.size());
  86   auto p = str.begin();
  87   auto last = p;  // last regular character (not part of an escape sequence)
  88   // We advance over runs of regular characters (not backslash) and copy them
  89   // in one go; this is faster than calling push_back repeatedly.
  90   while (p != str.end()) {
  91     char c = *p;
  92     if (c != '\\') {  // normal case
  93       ++p;
  94       continue;
  95     }
  96     out.append(&*last, p - last);
  97     if (p == str.end()) {  // backslash at end of string
  98       if (strict) {
  99         throw std::invalid_argument("incomplete escape sequence");
 100       }
 101       out.push_back('\\');
 102       last = p;
 103       continue;
 104     }
 105     ++p;
 106     char e = detail::cUnescapeTable[static_cast<unsigned char>(*p)];
 107     if (e == 'O') {  // octal
 108       unsigned char val = 0;
 109       for (int i = 0; i < 3 && p != str.end() && *p >= '0' && *p <= '7';
 110            ++i, ++p) {
 111         val = (val << 3) | (*p - '0');
 112       }
 113       out.push_back(val);
 114       last = p;
 115     } else if (e == 'X') {  // hex
 116       ++p;
 117       if (p == str.end()) {  // \x at end of string
 118         if (strict) {
 119           throw std::invalid_argument("incomplete hex escape sequence");
 120         }
 121         out.append("\\x");
 122         last = p;
 123         continue;
 124       }
 125       unsigned char val = 0;
 126       unsigned char h;
 127       for (; (p != str.end() &&
 128               (h = detail::hexTable[static_cast<unsigned char>(*p)]) < 16);
 129            ++p) {
 130         val = (val << 4) | h;
 131       }
 132       out.push_back(val);
 133       last = p;
 134     } else if (e == 'I') {  // invalid
 135       if (strict) {
 136         throw std::invalid_argument("invalid escape sequence");
 137       }
 138       out.push_back('\\');
 139       out.push_back(*p);
 140       ++p;
 141       last = p;
 142     } else {  // standard escape sequence, \' etc
 143       out.push_back(e);
 144       ++p;
 145       last = p;
 146     }
 147   }
 148   out.append(&*last, p - last);
 149 }
 150
 151 namespace detail {
 152
 153 /*
 154  * The following functions are type-overloaded helpers for
 155  * internalSplit().
 156  */
 157 inline size_t delimSize(char)          { return 1; }
 158 inline size_t delimSize(StringPiece s) { return s.size(); }
 159 inline bool atDelim(const char* s, char c) {
 160  return *s == c;
 161 }
 162 inline bool atDelim(const char* s, StringPiece sp) {
 163   return !std::memcmp(s, sp.start(), sp.size());
 164 }
 165
 166 // These are used to short-circuit internalSplit() in the case of
 167 // 1-character strings.
 168 inline char delimFront(char c) {
 169   // This one exists only for compile-time; it should never be called.
 170   std::abort();
 171   return c;
 172 }
 173 inline char delimFront(StringPiece s) {
 174   assert(!s.empty() && s.start() != nullptr);
 175   return *s.start();
 176 }
 177
 178 /*
 179  * These output conversion templates allow us to support multiple
 180  * output string types, even when we are using an arbitrary
 181  * OutputIterator.
 182  */
 183 template<class OutStringT> struct OutputConverter {};
 184
 185 template<> struct OutputConverter<std::string> {
 186   std::string operator()(StringPiece sp) const {
 187     return sp.toString();
 188   }
 189 };
 190
 191 template<> struct OutputConverter<fbstring> {
 192   fbstring operator()(StringPiece sp) const {
 193     return sp.toFbstring();
 194   }
 195 };
 196
 197 template<> struct OutputConverter<StringPiece> {
 198   StringPiece operator()(StringPiece sp) const { return sp; }
 199 };
 200
 201 /*
 202  * Shared implementation for all the split() overloads.
 203  *
 204  * This uses some external helpers that are overloaded to let this
 205  * algorithm be more performant if the deliminator is a single
 206  * character instead of a whole string.
 207  *
 208  * @param ignoreEmpty iff true, don't copy empty segments to output
 209  */
 210 template<class OutStringT, class DelimT, class OutputIterator>
 211 void internalSplit(DelimT delim, StringPiece sp, OutputIterator out,
 212     bool ignoreEmpty) {
 213   assert(sp.start() != nullptr);
 214
 215   const char* s = sp.start();
 216   const size_t strSize = sp.size();
 217   const size_t dSize = delimSize(delim);
 218
 219   OutputConverter<OutStringT> conv;
 220
 221   if (dSize > strSize || dSize == 0) {
 222     if (!ignoreEmpty || strSize > 0) {
 223       *out++ = conv(sp);
 224     }
 225     return;
 226   }
 227   if (boost::is_same<DelimT,StringPiece>::value && dSize == 1) {
 228     // Call the char version because it is significantly faster.
 229     return internalSplit<OutStringT>(delimFront(delim), sp, out,
 230       ignoreEmpty);
 231   }
 232
 233   int tokenStartPos = 0;
 234   int tokenSize = 0;
 235   for (int i = 0; i <= strSize - dSize; ++i) {
 236     if (atDelim(&s[i], delim)) {
 237       if (!ignoreEmpty || tokenSize > 0) {
 238         *out++ = conv(StringPiece(&s[tokenStartPos], tokenSize));
 239       }
 240
 241       tokenStartPos = i + dSize;
 242       tokenSize = 0;
 243       i += dSize - 1;
 244     } else {
 245       ++tokenSize;
 246     }
 247   }
 248
 249   if (!ignoreEmpty || tokenSize > 0) {
 250     tokenSize = strSize - tokenStartPos;
 251     *out++ = conv(StringPiece(&s[tokenStartPos], tokenSize));
 252   }
 253 }
 254
 255 template<class String> StringPiece prepareDelim(const String& s) {
 256   return StringPiece(s);
 257 }
 258 inline char prepareDelim(char c) { return c; }
 259
 260 }
 261
 262 //////////////////////////////////////////////////////////////////////
 263
 264 template<class Delim, class String, class OutputType>
 265 void split(const Delim& delimiter,
 266            const String& input,
 267            std::vector<OutputType>& out,
 268            bool ignoreEmpty) {
 269   detail::internalSplit<OutputType>(
 270     detail::prepareDelim(delimiter),
 271     StringPiece(input),
 272     std::back_inserter(out),
 273     ignoreEmpty);
 274 }
 275
 276 template<class Delim, class String, class OutputType>
 277 void split(const Delim& delimiter,
 278            const String& input,
 279            fbvector<OutputType>& out,
 280            bool ignoreEmpty = false) {
 281   detail::internalSplit<OutputType>(
 282     detail::prepareDelim(delimiter),
 283     StringPiece(input),
 284     std::back_inserter(out),
 285     ignoreEmpty);
 286 }
 287
 288 template<class OutputValueType, class Delim, class String,
 289          class OutputIterator>
 290 void splitTo(const Delim& delimiter,
 291              const String& input,
 292              OutputIterator out,
 293              bool ignoreEmpty) {
 294   detail::internalSplit<OutputValueType>(
 295     detail::prepareDelim(delimiter),
 296     StringPiece(input),
 297     out,
 298     ignoreEmpty);
 299 }
 300
 301 template <class String1, class String2>
 302 void backslashify(const String1& input, String2& output, bool hex_style) {
 303   static const char hexValues[] = "0123456789abcdef";
 304   output.clear();
 305   output.reserve(3 * input.size());
 306   for (unsigned char c : input) {
 307     // less than space or greater than '~' are considered unprintable
 308     if (c < 0x20 || c > 0x7e || c == '\\') {
 309       bool hex_append = false;
 310       output.push_back('\\');
 311       if (hex_style) {
 312         hex_append = true;
 313       } else {
 314         if (c == '\r') output += 'r';
 315         else if (c == '\n') output += 'n';
 316         else if (c == '\t') output += 't';
 317         else if (c == '\a') output += 'a';
 318         else if (c == '\b') output += 'b';
 319         else if (c == '\0') output += '0';
 320         else if (c == '\\') output += '\\';
 321         else {
 322           hex_append = true;
 323         }
 324       }
 325       if (hex_append) {
 326         output.push_back('x');
 327         output.push_back(hexValues[(c >> 4) & 0xf]);
 328         output.push_back(hexValues[c & 0xf]);
 329       }
 330     } else {
 331       output += c;
 332     }
 333   }
 334 }
 335
 336 template <class String1, class String2>
 337 void humanify(const String1& input, String2& output) {
 338   int numUnprintable = 0;
 339   int numPrintablePrefix = 0;
 340   for (unsigned char c : input) {
 341     if (c < 0x20 || c > 0x7e || c == '\\') {
 342       ++numUnprintable;
 343     }
 344     if (numUnprintable == 0) {
 345       ++numPrintablePrefix;
 346     }
 347   }
 348
 349   // hexlify doubles a string's size; backslashify can potentially
 350   // explode it by 4x.  Now, the printable range of the ascii
 351   // "spectrum" is around 95 out of 256 values, so a "random" binary
 352   // string should be around 60% unprintable.  We use a 50% hueristic
 353   // here, so if a string is 60% unprintable, then we just use hex
 354   // output.  Otherwise we backslash.
 355   //
 356   // UTF8 is completely ignored; as a result, utf8 characters will
 357   // likely be \x escaped (since most common glyphs fit in two bytes).
 358   // This is a tradeoff of complexity/speed instead of a convenience
 359   // that likely would rarely matter.  Moreover, this function is more
 360   // about displaying underlying bytes, not about displaying glyphs
 361   // from languages.
 362   if (numUnprintable == 0) {
 363     output = input;
 364   } else if (5 * numUnprintable >= 3 * input.size()) {
 365     // However!  If we have a "meaningful" prefix of printable
 366     // characters, say 20% of the string, we backslashify under the
 367     // assumption viewing the prefix as ascii is worth blowing the
 368     // output size up a bit.
 369     if (5 * numPrintablePrefix >= input.size()) {
 370       backslashify(input, output);
 371     } else {
 372       output = "0x";
 373       hexlify(input, output, true /* append output */);
 374     }
 375   } else {
 376     backslashify(input, output);
 377   }
 378 }
 379
 380 template<class InputString, class OutputString>
 381 bool hexlify(const InputString& input, OutputString& output,
 382              bool append_output=false) {
 383   if (!append_output) output.clear();
 384
 385   static char hexValues[] = "0123456789abcdef";
 386   int j = output.size();
 387   output.resize(2 * input.size() + output.size());
 388   for (int i = 0; i < input.size(); ++i) {
 389     int ch = input[i];
 390     output[j++] = hexValues[(ch >> 4) & 0xf];
 391     output[j++] = hexValues[ch & 0xf];
 392   }
 393   return true;
 394 }
 395
 396 template<class InputString, class OutputString>
 397 bool unhexlify(const InputString& input, OutputString& output) {
 398   if (input.size() % 2 != 0) {
 399     return false;
 400   }
 401   output.resize(input.size() / 2);
 402   int j = 0;
 403   auto unhex = [](char c) -> int {
 404     return c >= '0' && c <= '9' ? c - '0' :
 405            c >= 'A' && c <= 'F' ? c - 'A' + 10 :
 406            c >= 'a' && c <= 'f' ? c - 'a' + 10 :
 407            -1;
 408   };
 409
 410   for (int i = 0; i < input.size(); i += 2) {
 411     int highBits = unhex(input[i]);
 412     int lowBits = unhex(input[i + 1]);
 413     if (highBits < 0 || lowBits < 0) {
 414       return false;
 415     }
 416     output[j++] = (highBits << 4) + lowBits;
 417   }
 418   return true;
 419 }
 420
 421 namespace detail {
 422 /**
 423  * Hex-dump at most 16 bytes starting at offset from a memory area of size
 424  * bytes.  Return the number of bytes actually dumped.
 425  */
 426 size_t hexDumpLine(const void* ptr, size_t offset, size_t size,
 427                    std::string& line);
 428 }  // namespace detail
 429
 430 template <class OutIt>
 431 void hexDump(const void* ptr, size_t size, OutIt out) {
 432   size_t offset = 0;
 433   std::string line;
 434   while (offset < size) {
 435     offset += detail::hexDumpLine(ptr, offset, size, line);
 436     *out++ = line;
 437   }
 438 }
 439
 440 }  // namespace folly
 441
 442 #endif /* FOLLY_STRING_INL_H_ */
 443