folly/String.h

   1 /*
   2  * Copyright 2014 Facebook, Inc.
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at
   7  *
   8  *   http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  */
  16
  17 #ifndef FOLLY_BASE_STRING_H_
  18 #define FOLLY_BASE_STRING_H_
  19
  20 #include <exception>
  21 #include <string>
  22 #include <boost/type_traits.hpp>
  23
  24 #ifdef _GLIBCXX_SYMVER
  25 #include <ext/hash_set>
  26 #include <ext/hash_map>
  27 #endif
  28
  29 #include <unordered_set>
  30 #include <unordered_map>
  31
  32 #include "folly/Conv.h"
  33 #include "folly/FBString.h"
  34 #include "folly/FBVector.h"
  35 #include "folly/Portability.h"
  36 #include "folly/Range.h"
  37 #include "folly/ScopeGuard.h"
  38
  39 // Compatibility function, to make sure toStdString(s) can be called
  40 // to convert a std::string or fbstring variable s into type std::string
  41 // with very little overhead if s was already std::string
  42 namespace folly {
  43
  44 inline
  45 std::string toStdString(const folly::fbstring& s) {
  46   return std::string(s.data(), s.size());
  47 }
  48
  49 inline
  50 const std::string& toStdString(const std::string& s) {
  51   return s;
  52 }
  53
  54 // If called with a temporary, the compiler will select this overload instead
  55 // of the above, so we don't return a (lvalue) reference to a temporary.
  56 inline
  57 std::string&& toStdString(std::string&& s) {
  58   return std::move(s);
  59 }
  60
/**
 * C-Escape a string, making it suitable for representation as a C string
 * literal.  Appends the result to the output string.
 *
 * Backslashes all occurrences of backslash and double-quote:
 *   "  ->  \"
 *   \  ->  \\
 *
 * Replaces all non-printable ASCII characters with backslash-octal
 * representation:
 *   <ASCII 254> -> \376
 *
 * Note that we use backslash-octal instead of backslash-hex because the octal
 * representation is guaranteed to consume no more than 3 characters; "\3760"
 * represents two characters, one with value 254, and one with value 48 ('0'),
 * whereas "\xfe0" represents only one character (with value 4064, which leads
 * to implementation-defined behavior).
 *
 * @param str  the text to escape
 * @param out  string the escaped text is appended to; its existing contents
 *             are kept
 *
 * Only declared here.  NOTE(review): the definition presumably lives in
 * folly/String-inl.h, which this header includes at the bottom -- confirm.
 */
template <class String>
void cEscape(StringPiece str, String& out);
  81
  82 /**
  83  * Similar to cEscape above, but returns the escaped string.
  84  */
  85 template <class String>
  86 String cEscape(StringPiece str) {
  87   String out;
  88   cEscape(str, out);
  89   return out;
  90 }
  91
/**
 * C-Unescape a string; the opposite of cEscape above.  Appends the result
 * to the output string.
 *
 * Recognizes the standard C escape sequences:
 *
 * \' \" \? \\ \a \b \f \n \r \t \v
 * \[0-7]+
 * \x[0-9a-fA-F]+
 *
 * In strict mode (default), throws std::invalid_argument if it encounters
 * an unrecognized escape sequence.  In non-strict mode, it leaves
 * the escape sequence unchanged.
 *
 * @param str     the escaped text to decode
 * @param out     string the unescaped text is appended to
 * @param strict  true (the default) to throw on an unrecognized escape
 *                sequence, false to leave such sequences unchanged
 */
template <class String>
void cUnescape(StringPiece str, String& out, bool strict = true);
 108
 109 /**
 110  * Similar to cUnescape above, but returns the escaped string.
 111  */
 112 template <class String>
 113 String cUnescape(StringPiece str, bool strict = true) {
 114   String out;
 115   cUnescape(str, out, strict);
 116   return out;
 117 }
 118
/**
 * Escaping mode accepted by uriEscape and uriUnescape.
 */
enum class UriEscapeMode : unsigned char {
  // The values are meaningful, see generate_escape_tables.py
  ALL = 0,    // only alphanumerics and RFC 3986 unreserved chars stay as-is
  QUERY = 1,  // additionally, spaces are replaced by '+'
  PATH = 2    // additionally, the forward slash (/) is left unchanged
};

/**
 * URI-escape a string.  Appends the result to the output string.
 *
 * Alphanumeric characters and other characters marked as "unreserved" in RFC
 * 3986 ( -_.~ ) are left unchanged.  In PATH mode, the forward slash (/) is
 * also left unchanged.  In QUERY mode, spaces are replaced by '+'.  All other
 * characters are percent-encoded.
 *
 * @param str   the text to escape
 * @param out   string the escaped text is appended to
 * @param mode  escaping mode; defaults to UriEscapeMode::ALL
 */
template <class String>
void uriEscape(StringPiece str,
               String& out,
               UriEscapeMode mode = UriEscapeMode::ALL);
 137
 138 /**
 139  * Similar to uriEscape above, but returns the escaped string.
 140  */
 141 template <class String>
 142 String uriEscape(StringPiece str, UriEscapeMode mode = UriEscapeMode::ALL) {
 143   String out;
 144   uriEscape(str, out, mode);
 145   return out;
 146 }
 147
/**
 * URI-unescape a string.  Appends the result to the output string.
 *
 * In QUERY mode, '+' are replaced by space.  %XX sequences are decoded if
 * XX is a valid hex sequence, otherwise we throw invalid_argument.
 *
 * @param str   the escaped text to decode
 * @param out   string the decoded text is appended to
 * @param mode  escaping mode the input was produced with; defaults to
 *              UriEscapeMode::ALL
 */
template <class String>
void uriUnescape(StringPiece str,
                 String& out,
                 UriEscapeMode mode = UriEscapeMode::ALL);
 158
 159 /**
 160  * Similar to uriUnescape above, but returns the unescaped string.
 161  */
 162 template <class String>
 163 String uriUnescape(StringPiece str, UriEscapeMode mode = UriEscapeMode::ALL) {
 164   String out;
 165   uriUnescape(str, out, mode);
 166   return out;
 167 }
 168
/**
 * stringPrintf is much like printf but deposits its result into a
 * string.  This overload simply returns the resulting string.
 *
 * The format attribute lets the compiler check the variadic arguments
 * against the printf-style format string (argument 1, values from
 * argument 2 on).
 */
std::string stringPrintf(const char* format, ...)
  __attribute__ ((format (printf, 1, 2)));

/**
 * Similar to stringPrintf, with a different signature: the result is
 * deposited into the string pointed to by out instead of being returned.
 *
 * NOTE(review): whether *out is replaced or appended to cannot be told from
 * this declaration -- confirm against the implementation.
 */
void stringPrintf(std::string* out, const char* fmt, ...)
  __attribute__ ((format (printf, 2, 3)));

/**
 * printf-style formatting that returns a reference to a std::string.
 *
 * NOTE(review): judging by its name and return type this is presumably the
 * form that appends the produced characters to *output and returns a
 * reference to it -- confirm against the implementation.
 */
std::string& stringAppendf(std::string* output, const char* format, ...)
  __attribute__ ((format (printf, 2, 3)));
 185
/**
 * Backslashify a string, that is, replace non-printable characters
 * with C-style (but NOT C compliant) "\xHH" encoding.  If hex_style
 * is false, then shorthand notations like "\0" will be used instead
 * of "\x00" for the most common backslash cases.
 *
 * There are two forms, one returning the resulting string, and one
 * creating output in the specified output string.
 *
 * This is mainly intended for printing to a terminal, so it is not
 * particularly optimized.
 *
 * Do *not* use this in situations where you expect to be able to feed
 * the string to a C or C++ compiler, as there are nuances with how C
 * parses such strings that lead to failures.  If you want a string you
 * can embed for use in C or C++, use cEscape instead.  This function is
 * for display purposes only.
 *
 * @param input      the text to process
 * @param output     string that receives the result
 * @param hex_style  selects "\xHH" encoding over the shorthand notations;
 *                   defaults to false
 */
template <class String1, class String2>
void backslashify(const String1& input, String2& output, bool hex_style=false);
 207
 208 template <class String>
 209 String backslashify(const String& input, bool hex_style=false) {
 210   String output;
 211   backslashify(input, output, hex_style);
 212   return output;
 213 }
 214
/**
 * Take a string and "humanify" it -- that is, make it look better.
 * Since "better" is subjective, caveat emptor.  The basic approach is
 * to count the number of unprintable characters.  If there are none,
 * then the output is the input.  If there are relatively few, or if
 * there is a long "enough" prefix of printable characters, use
 * backslashify.  If it is mostly binary, then simply hex encode.
 *
 * This is an attempt to make a computer smart, and so likely is wrong
 * most of the time.
 *
 * @param input   the text to process
 * @param output  string that receives the result
 */
template <class String1, class String2>
void humanify(const String1& input, String2& output);
 228
 229 template <class String>
 230 String humanify(const String& input) {
 231   String output;
 232   humanify(input, output);
 233   return output;
 234 }
 235
/**
 * Same functionality as Python's binascii.hexlify.  Returns true
 * on successful conversion.
 *
 * If append is true, append data to the output rather than
 * replace it.
 *
 * @param input   the data to hex-encode
 * @param output  string that receives the hex text
 * @param append  false (the default) to replace output's contents, true to
 *                append to them
 */
template<class InputString, class OutputString>
bool hexlify(const InputString& input, OutputString& output,
             bool append=false);
 246
/**
 * Same functionality as Python's binascii.unhexlify.  Returns true
 * on successful conversion.
 *
 * @param input   the hex text to decode
 * @param output  string that receives the decoded data
 */
template<class InputString, class OutputString>
bool unhexlify(const InputString& input, OutputString& output);
 253
/*
 * A pretty-printer for numbers that appends suffixes of units of the
 * given type.  It prints 4 sig-figs of value with the most
 * appropriate unit.
 *
 * If `addSpace' is true, we put a space between the units suffix and
 * the value.
 *
 * Current types are:
 *   PRETTY_TIME         - s, ms, us, ns, etc.
 *   PRETTY_BYTES_METRIC - kB, MB, GB, etc (goes up by 10^3 = 1000 each time)
 *   PRETTY_BYTES        - kB, MB, GB, etc (goes up by 2^10 = 1024 each time)
 *   PRETTY_BYTES_IEC    - KiB, MiB, GiB, etc
 *   PRETTY_UNITS_METRIC - k, M, G, etc (goes up by 10^3 = 1000 each time)
 *   PRETTY_UNITS_BINARY - k, M, G, etc (goes up by 2^10 = 1024 each time)
 *   PRETTY_UNITS_BINARY_IEC - Ki, Mi, Gi, etc
 *
 * PRETTY_BYTES and PRETTY_BYTES_IEC are aliases for PRETTY_BYTES_BINARY and
 * PRETTY_BYTES_BINARY_IEC respectively.
 *
 * @author Mark Rabkin <mrabkin@fb.com>
 */
enum PrettyType {
  PRETTY_TIME,

  PRETTY_BYTES_METRIC,
  PRETTY_BYTES_BINARY,
  PRETTY_BYTES = PRETTY_BYTES_BINARY,  // alias, same value
  PRETTY_BYTES_BINARY_IEC,
  PRETTY_BYTES_IEC = PRETTY_BYTES_BINARY_IEC,  // alias, same value

  PRETTY_UNITS_METRIC,
  PRETTY_UNITS_BINARY,
  PRETTY_UNITS_BINARY_IEC,

  // Not a unit type: one past the last enumerator, so it evaluates to the
  // number of distinct values above.
  PRETTY_NUM_TYPES
};

// Pretty-prints val using the units of the given PrettyType; addSpace
// (default true) puts a space between the value and the units suffix.
std::string prettyPrint(double val, PrettyType, bool addSpace = true);
 290
/**
 * Write a hex dump of size bytes starting at ptr to out.
 *
 * The hex dump is formatted as follows:
 *
 * for the string "abcdefghijklmnopqrstuvwxyz\x02"
00000000  61 62 63 64 65 66 67 68  69 6a 6b 6c 6d 6e 6f 70  |abcdefghijklmnop|
00000010  71 72 73 74 75 76 77 78  79 7a 02                 |qrstuvwxyz.     |
 *
 * that is, we write 16 bytes per line, both as hex bytes and as printable
 * characters.  Non-printable characters are replaced with '.'
 * Lines are written to out one by one (one StringPiece at a time) without
 * delimiters.
 *
 * @param ptr   start of the memory to dump
 * @param size  number of bytes to dump
 * @param out   output iterator that receives each formatted line
 */
template <class OutIt>
void hexDump(const void* ptr, size_t size, OutIt out);
 307
/**
 * Return the hex dump of size bytes starting at ptr as a string.
 *
 * @param ptr   start of the memory to dump
 * @param size  number of bytes to dump
 *
 * NOTE(review): presumably uses the same line layout as the
 * output-iterator overload of hexDump -- confirm against the implementation.
 */
std::string hexDump(const void* ptr, size_t size);
 312
/**
 * Return a fbstring containing the description of the given errno value.
 * Takes care not to overwrite the actual system errno, so calling
 * errnoStr(errno) is valid.
 *
 * @param err  the errno value to describe
 */
fbstring errnoStr(int err);
 319
 320 /**
 321  * Return the demangled (prettyfied) version of a C++ type.
 322  *
 323  * This function tries to produce a human-readable type, but the type name will
 324  * be returned unchanged in case of error or if demangling isn't supported on
 325  * your system.
 326  *
 327  * Use for debugging -- do not rely on demangle() returning anything useful.
 328  *
 329  * This function may allocate memory (and therefore throw std::bad_alloc).
 330  */
 331 fbstring demangle(const char* name);
 332 inline fbstring demangle(const std::type_info& type) {
 333   return demangle(type.name());
 334 }
 335
 336 /**
 337  * Return the demangled (prettyfied) version of a C++ type in a user-provided
 338  * buffer.
 339  *
 340  * The semantics are the same as for snprintf or strlcpy: bufSize is the size
 341  * of the buffer, the string is always null-terminated, and the return value is
 342  * the number of characters (not including the null terminator) that would have
 343  * been written if the buffer was big enough. (So a return value >= bufSize
 344  * indicates that the output was truncated)
 345  *
 346  * This function does not allocate memory and is async-signal-safe.
 347  *
 348  * Note that the underlying function for the fbstring-returning demangle is
 349  * somewhat standard (abi::__cxa_demangle, which uses malloc), the underlying
 350  * function for this version is less so (cplus_demangle_v3_callback from
 351  * libiberty), so it is possible for the fbstring version to work, while this
 352  * version returns the original, mangled name.
 353  */
 354 size_t demangle(const char* name, char* buf, size_t bufSize);
 355 inline size_t demangle(const std::type_info& type, char* buf, size_t bufSize) {
 356   return demangle(type.name(), buf, bufSize);
 357 }
 358
 359 /**
 360  * Debug string for an exception: include type and what().
 361  */
 362 inline fbstring exceptionStr(const std::exception& e) {
 363   return folly::to<fbstring>(demangle(typeid(e)), ": ", e.what());
 364 }
 365
 366 inline fbstring exceptionStr(std::exception_ptr ep) {
 367   try {
 368     std::rethrow_exception(ep);
 369   } catch (const std::exception& e) {
 370     return exceptionStr(e);
 371   } catch (...) {
 372     return "<unknown exception>";
 373   }
 374 }
 375
/*
 * Split a string into a list of tokens by delimiter.
 *
 * The split interface here supports different output types, selected
 * at compile time: StringPiece, fbstring, or std::string.  If you are
 * using a vector to hold the output, it detects the type based on
 * what your vector contains.  If the output vector is not empty, split
 * will append to the end of the vector.
 *
 * You can also use splitTo() to write the output to an arbitrary
 * OutputIterator (e.g. std::inserter() on a std::set<>), in which
 * case you have to tell the function the type.  (Rationale:
 * OutputIterators don't have a value_type, so we can't detect the
 * type in splitTo without being told.)
 *
 * Examples:
 *
 *   std::vector<folly::StringPiece> v;
 *   folly::split(":", "asd:bsd", v);
 *
 *   std::set<StringPiece> s;
 *   folly::splitTo<StringPiece>(":", "asd:bsd:asd:csd",
 *    std::inserter(s, s.begin()));
 *
 * Split also takes a flag (ignoreEmpty) that indicates whether adjacent
 * delimiters should be treated as one single separator (ignoring empty tokens)
 * or not (generating empty tokens).  It defaults to false, i.e. empty tokens
 * are generated.
 */

// std::vector output: tokens are appended to out.
template<class Delim, class String, class OutputType>
void split(const Delim& delimiter,
           const String& input,
           std::vector<OutputType>& out,
           bool ignoreEmpty = false);

// folly::fbvector output: same interface as the std::vector overload.
template<class Delim, class String, class OutputType>
void split(const Delim& delimiter,
           const String& input,
           folly::fbvector<OutputType>& out,
           bool ignoreEmpty = false);

// Output-iterator form: OutputValueType must be given explicitly because it
// cannot be deduced from the iterator.
template<class OutputValueType, class Delim, class String,
         class OutputIterator>
void splitTo(const Delim& delimiter,
             const String& input,
             OutputIterator out,
             bool ignoreEmpty = false);
 423
/*
 * Split a string into a fixed number of pieces by delimiter. Returns 'true' if
 * the fields were all successfully populated.
 *
 * Example:
 *
 *  folly::StringPiece name, key, value;
 *  if (folly::split('\t', line, name, key, value))
 *    ...
 *
 * The 'exact' template parameter specifies how the function behaves when too
 * many fields are present in the input string. When 'exact' is set to its
 * default value of 'true', a call to split will fail if the number of fields in
 * the input string does not exactly match the number of output parameters
 * passed. If 'exact' is overridden to 'false', all remaining fields will be
 * stored, unsplit, in the last field, as shown below:
 *
 *  folly::StringPiece x, y;
 *  if (folly::split<false>(':', "a:b:c", x, y))
 *    assert(x == "a" && y == "b:c");
 *
 * The number of pieces is the number of StringPiece output arguments
 * (outHead plus every element of outTail).
 */
template<bool exact = true,
         class Delim,
         class... StringPieces>
bool split(const Delim& delimiter,
           StringPiece input,
           StringPiece& outHead,
           StringPieces&... outTail);
 452
/*
 * Join list of tokens.
 *
 * Stores a string representation of tokens in the same order with
 * delimiter between each element.
 *
 * This is the iterator-range form: the tokens are the elements of
 * [begin, end) and the result is stored in output.  The container and
 * initializer_list overloads of join forward to it.
 */

template <class Delim, class Iterator, class String>
void join(const Delim& delimiter,
          Iterator begin,
          Iterator end,
          String& output);
 465
 466 template <class Delim, class Container, class String>
 467 void join(const Delim& delimiter,
 468           const Container& container,
 469           String& output) {
 470   join(delimiter, container.begin(), container.end(), output);
 471 }
 472
 473 template <class Delim, class Value, class String>
 474 void join(const Delim& delimiter,
 475           const std::initializer_list<Value>& values,
 476           String& output) {
 477   join(delimiter, values.begin(), values.end(), output);
 478 }
 479
 480 template <class Delim, class Container>
 481 std::string join(const Delim& delimiter,
 482                  const Container& container) {
 483   std::string output;
 484   join(delimiter, container.begin(), container.end(), output);
 485   return output;
 486 }
 487
 488 template <class Delim, class Value>
 489 std::string join(const Delim& delimiter,
 490                  const std::initializer_list<Value>& values) {
 491   std::string output;
 492   join(delimiter, values.begin(), values.end(), output);
 493   return output;
 494 }
 495
 496 } // namespace folly
 497
// Hash functions for string and fbstring usable with e.g. hash_map
//
// We let Boost pick the namespace here for us, since it has logic to do the
// right thing based on the C++ standard library implementation being used.
namespace BOOST_STD_EXTENSION_NAMESPACE {

// hash specialization for folly::basic_fbstring<C>: forwards s.c_str() to
// the hash<const C*> it privately inherits from.
// NOTE(review): only the c_str() pointer is handed on, so the result depends
// entirely on what hash<const C*> does in the selected namespace (presumably
// it hashes the NUL-terminated characters -- confirm).  If so, characters
// after an embedded NUL would not contribute to the hash.
template <class C>
struct hash<folly::basic_fbstring<C> > : private hash<const C*> {
  size_t operator()(const folly::basic_fbstring<C> & s) const {
    return hash<const C*>::operator()(s.c_str());
  }
};

// Same scheme for std::basic_string<C>: forwards s.c_str() to the
// hash<const C*> base.  The NOTE(review) on the fbstring specialization
// applies here as well.
template <class C>
struct hash<std::basic_string<C> > : private hash<const C*> {
  size_t operator()(const std::basic_string<C> & s) const {
    return hash<const C*>::operator()(s.c_str());
  }
};

}
 519
// Hook into boost's type traits
namespace boost {
// Specializes boost::has_nothrow_constructor for every
// folly::basic_fbstring<T> so that the trait reports true for it.
template <class T>
struct has_nothrow_constructor<folly::basic_fbstring<T> > : true_type {
  // Explicit value member, declared in addition to the true_type base.
  enum { value = true };
};
} // namespace boost
 527
 528 #include "folly/String-inl.h"
 529
 530 #endif