2 * Copyright 2014 Facebook, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
17 #ifndef FOLLY_BASE_STRING_H_
18 #define FOLLY_BASE_STRING_H_
22 #include <boost/type_traits.hpp>
24 #ifdef _GLIBCXX_SYMVER
25 #include <ext/hash_set>
26 #include <ext/hash_map>
29 #include <unordered_set>
30 #include <unordered_map>
32 #include "folly/Conv.h"
33 #include "folly/FBString.h"
34 #include "folly/FBVector.h"
35 #include "folly/Portability.h"
36 #include "folly/Range.h"
37 #include "folly/ScopeGuard.h"
39 // Compatibility function, to make sure toStdString(s) can be called
40 // to convert a std::string or fbstring variable s into type std::string
41 // with very little overhead if s was already std::string
45 std::string toStdString(const folly::fbstring& s) {
46 return std::string(s.data(), s.size());
50 const std::string& toStdString(const std::string& s) {
54 // If called with a temporary, the compiler will select this overload instead
55 // of the above, so we don't return an (lvalue) reference to a temporary.
57 std::string&& toStdString(std::string&& s) {
62 * C-Escape a string, making it suitable for representation as a C string
63 * literal. Appends the result to the output string.
65 * Backslashes all occurrences of backslash and double-quote:
69 * Replaces all non-printable ASCII characters with backslash-octal
73 * Note that we use backslash-octal instead of backslash-hex because the octal
74 * representation is guaranteed to consume no more than 3 characters; "\3760"
75 * represents two characters, one with value 254, and one with value 48 ('0'),
76 * whereas "\xfe0" represents only one character (with value 4064, which leads
77 * to implementation-defined behavior).
// str: the input to escape. out: the output string that the escaped result
// is appended to.
79 template <class String>
80 void cEscape(StringPiece str, String& out);
83 * Similar to cEscape above, but returns the escaped string.
85 template <class String>
86 String cEscape(StringPiece str) {
93 * C-Unescape a string; the opposite of cEscape above. Appends the result
94 * to the output string.
96 * Recognizes the standard C escape sequences:
98 * \' \" \? \\ \a \b \f \n \r \t \v
102 * In strict mode (default), throws std::invalid_argument if it encounters
103 * an unrecognized escape sequence. In non-strict mode, it leaves
104 * the escape sequence unchanged.
// str: the escaped input. out: the output string that the unescaped result
// is appended to. strict: defaults to true; when true an unrecognized escape
// sequence throws std::invalid_argument, when false the sequence is left
// unchanged.
106 template <class String>
107 void cUnescape(StringPiece str, String& out, bool strict = true);
110 * Similar to cUnescape above, but returns the unescaped string.
112 template <class String>
113 String cUnescape(StringPiece str, bool strict = true) {
115 cUnescape(str, out, strict);
120 * URI-escape a string. Appends the result to the output string.
122 * Alphanumeric characters and other characters marked as "unreserved" in RFC
123 * 3986 ( -_.~ ) are left unchanged. In PATH mode, the forward slash (/) is
124 * also left unchanged. In QUERY mode, spaces are replaced by '+'. All other
125 * characters are percent-encoded.
127 enum class UriEscapeMode : unsigned char {
128 // The values are meaningful, see generate_escape_tables.py
133 template <class String>
134 void uriEscape(StringPiece str,
136 UriEscapeMode mode = UriEscapeMode::ALL);
139 * Similar to uriEscape above, but returns the escaped string.
141 template <class String>
142 String uriEscape(StringPiece str, UriEscapeMode mode = UriEscapeMode::ALL) {
144 uriEscape(str, out, mode);
149 * URI-unescape a string. Appends the result to the output string.
151 * In QUERY mode, '+' characters are replaced by spaces. %XX sequences are decoded if
152 * XX is a valid hex sequence, otherwise we throw invalid_argument.
154 template <class String>
155 void uriUnescape(StringPiece str,
157 UriEscapeMode mode = UriEscapeMode::ALL);
160 * Similar to uriUnescape above, but returns the unescaped string.
162 template <class String>
163 String uriUnescape(StringPiece str, UriEscapeMode mode = UriEscapeMode::ALL) {
165 uriUnescape(str, out, mode);
170 * stringPrintf is much like printf but deposits its result into a
171 * string. Two signatures are supported: the first simply returns the
172 * resulting string, and the second appends the produced characters to
173 * the specified string and returns a reference to it.
// Returning form: the formatted text comes back as a new std::string.
// The format attribute asks GCC-compatible compilers to type-check the
// variadic arguments against `format` (parameter 1; the variadic arguments
// start at parameter 2).
175 std::string stringPrintf(const char* format, ...)
176 __attribute__ ((format (printf, 1, 2)));
178 /** Similar to stringPrintf, but with a different signature.
// Pointer form: the destination string is passed in as `out` and nothing is
// returned. The format attribute type-checks the variadic arguments against
// `fmt` (parameter 2; the variadic arguments start at parameter 3).
// NOTE(review): whether *out is overwritten or appended to is not visible
// from this declaration -- confirm against the implementation.
180 void stringPrintf(std::string* out, const char* fmt, ...)
181 __attribute__ ((format (printf, 2, 3)));
// printf-style formatting into *output; returns a std::string reference.
// NOTE(review): presumably this is the appending form described in the
// stringPrintf comment above (appends the produced characters to *output and
// returns a reference to it) -- confirm against the implementation.
// The format attribute type-checks the variadic arguments against `format`
// (parameter 2; the variadic arguments start at parameter 3).
183 std::string& stringAppendf(std::string* output, const char* format, ...)
184 __attribute__ ((format (printf, 2, 3)));
187 * Backslashify a string, that is, replace non-printable characters
188 * with C-style (but NOT C compliant) "\xHH" encoding. If hex_style
189 * is false, then shorthand notations like "\0" will be used instead
190 * of "\x00" for the most common backslash cases.
192 * There are two forms, one returning the input string, and one
193 * creating output in the specified output string.
195 * This is mainly intended for printing to a terminal, so it is not
196 * particularly optimized.
198 * Do *not* use this in situations where you expect to be able to feed
199 * the string to a C or C++ compiler, as there are nuances with how C
200 * parses such strings that lead to failures. This is for display
201 * purposes only. If you want a string you can embed for use in C or
202 * C++, use cEscape instead. This function is for display purposes
// Two-argument form: reads `input` and creates the result in `output`.
// hex_style defaults to false, which per the description above means
// shorthand notations like "\0" are used instead of "\x00" for the most
// common cases.
205 template <class String1, class String2>
206 void backslashify(const String1& input, String2& output, bool hex_style=false);
208 template <class String>
209 String backslashify(const String& input, bool hex_style=false) {
211 backslashify(input, output, hex_style);
216 * Take a string and "humanify" it -- that is, make it look better.
217 * Since "better" is subjective, caveat emptor. The basic approach is
218 * to count the number of unprintable characters. If there are none,
219 * then the output is the input. If there are relatively few, or if
220 * there is a long "enough" prefix of printable characters, use
221 * backslashify. If it is mostly binary, then simply hex encode.
223 * This is an attempt to make a computer smart, and so likely is wrong
// Two-argument form: reads `input` and places the humanified result in
// `output`. The input and output string types may differ.
226 template <class String1, class String2>
227 void humanify(const String1& input, String2& output);
229 template <class String>
230 String humanify(const String& input) {
232 humanify(input, output);
237 * Same functionality as Python's binascii.hexlify. Returns true
238 * on successful conversion.
240 * If append_output is true, append data to the output rather than
243 template<class InputString, class OutputString>
244 bool hexlify(const InputString& input, OutputString& output,
248 * Same functionality as Python's binascii.unhexlify. Returns true
249 * on successful conversion.
// input: the text to convert; output: receives the converted data.
// The bool result is true on successful conversion.
251 template<class InputString, class OutputString>
252 bool unhexlify(const InputString& input, OutputString& output);
255 * A pretty-printer for numbers that appends suffixes of units of the
256 * given type. It prints 4 sig-figs of value with the most
259 * If `addSpace' is true, we put a space between the units suffix and
263 * PRETTY_TIME - s, ms, us, ns, etc.
264 * PRETTY_BYTES_METRIC - kB, MB, GB, etc (goes up by 10^3 = 1000 each time)
265 * PRETTY_BYTES - kB, MB, GB, etc (goes up by 2^10 = 1024 each time)
266 * PRETTY_BYTES_IEC - KiB, MiB, GiB, etc
267 * PRETTY_UNITS_METRIC - k, M, G, etc (goes up by 10^3 = 1000 each time)
268 * PRETTY_UNITS_BINARY - k, M, G, etc (goes up by 2^10 = 1024 each time)
269 * PRETTY_UNITS_BINARY_IEC - Ki, Mi, Gi, etc
271 * @author Mark Rabkin <mrabkin@fb.com>
278 PRETTY_BYTES = PRETTY_BYTES_BINARY,
279 PRETTY_BYTES_BINARY_IEC,
280 PRETTY_BYTES_IEC = PRETTY_BYTES_BINARY_IEC,
284 PRETTY_UNITS_BINARY_IEC,
// val: the number to print. The PrettyType argument picks which of the unit
// types listed above supplies the suffix. addSpace defaults to true.
289 std::string prettyPrint(double val, PrettyType, bool addSpace = true);
292 * Write a hex dump of size bytes starting at ptr to out.
294 * The hex dump is formatted as follows:
296 * for the string "abcdefghijklmnopqrstuvwxyz\x02"
297 00000000 61 62 63 64 65 66 67 68 69 6a 6b 6c 6d 6e 6f 70 |abcdefghijklmnop|
298 00000010 71 72 73 74 75 76 77 78 79 7a 02 |qrstuvwxyz. |
300 * that is, we write 16 bytes per line, both as hex bytes and as printable
301 * characters. Non-printable characters are replaced with '.'
302 * Lines are written to out one by one (one StringPiece at a time) without
// ptr/size: the byte range to dump (size bytes starting at ptr).
// out: output iterator that receives the dump line by line, one StringPiece
// per line.
305 template <class OutIt>
306 void hexDump(const void* ptr, size_t size, OutIt out);
309 * Return the hex dump of size bytes starting at ptr as a string.
// String-returning overload: takes the same ptr/size byte range but hands the
// dump back as a std::string instead of writing to an output iterator.
311 std::string hexDump(const void* ptr, size_t size);
314 * Return a fbstring containing the description of the given errno value.
315 * Takes care not to overwrite the actual system errno, so calling
316 * errnoStr(errno) is valid.
// err: the errno value to describe; passing errno itself is supported, as in
// errnoStr(errno).
318 fbstring errnoStr(int err);
321 * Return the demangled (prettified) version of a C++ type.
323 * This function tries to produce a human-readable type, but the type name will
324 * be returned unchanged in case of error or if demangling isn't supported on
327 * Use for debugging -- do not rely on demangle() returning anything useful.
329 * This function may allocate memory (and therefore throw std::bad_alloc).
// name: a type name string, e.g. the value of std::type_info::name() (the
// inline overload that follows forwards type.name() to this function).
331 fbstring demangle(const char* name);
332 inline fbstring demangle(const std::type_info& type) {
333 return demangle(type.name());
337 * Return the demangled (prettified) version of a C++ type in a user-provided
340 * The semantics are the same as for snprintf or strlcpy: bufSize is the size
341 * of the buffer, the string is always null-terminated, and the return value is
342 * the number of characters (not including the null terminator) that would have
343 * been written if the buffer was big enough. (So a return value >= bufSize
344 * indicates that the output was truncated)
346 * This function does not allocate memory and is async-signal-safe.
348 * Note that the underlying function for the fbstring-returning demangle is
349 * somewhat standard (abi::__cxa_demangle, which uses malloc), the underlying
350 * function for this version is less so (cplus_demangle_v3_callback from
351 * libiberty), so it is possible for the fbstring version to work, while this
352 * version returns the original, mangled name.
// name: the type name to demangle. buf/bufSize: caller-provided output buffer
// and its size. Returns the number of characters the full result needs, not
// counting the null terminator; a value >= bufSize means the output was
// truncated.
354 size_t demangle(const char* name, char* buf, size_t bufSize);
355 inline size_t demangle(const std::type_info& type, char* buf, size_t bufSize) {
356 return demangle(type.name(), buf, bufSize);
360 * Debug string for an exception: include type and what().
362 inline fbstring exceptionStr(const std::exception& e) {
363 return folly::to<fbstring>(demangle(typeid(e)), ": ", e.what());
366 inline fbstring exceptionStr(std::exception_ptr ep) {
368 std::rethrow_exception(ep);
369 } catch (const std::exception& e) {
370 return exceptionStr(e);
372 return "<unknown exception>";
377 * Split a string into a list of tokens by delimiter.
379 * The split interface here supports different output types, selected
380 * at compile time: StringPiece, fbstring, or std::string. If you are
381 * using a vector to hold the output, it detects the type based on
382 * what your vector contains. If the output vector is not empty, split
383 * will append to the end of the vector.
385 * You can also use splitTo() to write the output to an arbitrary
386 * OutputIterator (e.g. std::inserter() on a std::set<>), in which
387 * case you have to tell the function the type. (Rationale:
388 * OutputIterators don't have a value_type, so we can't detect the
389 * type in splitTo without being told.)
393 * std::vector<folly::StringPiece> v;
394 * folly::split(":", "asd:bsd", v);
396 * std::set<StringPiece> s;
397 * folly::splitTo<StringPiece>(":", "asd:bsd:asd:csd",
398 * std::inserter(s, s.begin()));
400 * Split also takes a flag (ignoreEmpty) that indicates whether adjacent
401 * delimiters should be treated as one single separator (ignoring empty tokens)
402 * or not (generating empty tokens).
405 template<class Delim, class String, class OutputType>
406 void split(const Delim& delimiter,
408 std::vector<OutputType>& out,
409 bool ignoreEmpty = false);
411 template<class Delim, class String, class OutputType>
412 void split(const Delim& delimiter,
414 folly::fbvector<OutputType>& out,
415 bool ignoreEmpty = false);
417 template<class OutputValueType, class Delim, class String,
418 class OutputIterator>
419 void splitTo(const Delim& delimiter,
422 bool ignoreEmpty = false);
425 * Split a string into a fixed number of pieces by delimiter. Returns 'true' if
426 * the fields were all successfully populated.
430 * folly::StringPiece name, key, value;
431 * if (folly::split('\t', line, name, key, value))
434 * The 'exact' template parameter specifies how the function behaves when too
435 * many fields are present in the input string. When 'exact' is set to its
436 * default value of 'true', a call to split will fail if the number of fields in
437 * the input string does not exactly match the number of output parameters
438 * passed. If 'exact' is overridden to 'false', all remaining fields will be
439 * stored, unsplit, in the last field, as shown below:
441 * folly::StringPiece x, y;
442 * if (folly::split<false>(':', "a:b:c", x, y))
443 * assert(x == "a" && y == "b:c");
445 template<bool exact = true,
447 class... StringPieces>
448 bool split(const Delim& delimiter,
450 StringPiece& outHead,
451 StringPieces&... outTail);
454 * Join list of tokens.
456 * Stores a string representation of tokens in the same order with
457 * delimiter between each element.
460 template <class Delim, class Iterator, class String>
461 void join(const Delim& delimiter,
466 template <class Delim, class Container, class String>
467 void join(const Delim& delimiter,
468 const Container& container,
470 join(delimiter, container.begin(), container.end(), output);
473 template <class Delim, class Value, class String>
474 void join(const Delim& delimiter,
475 const std::initializer_list<Value>& values,
477 join(delimiter, values.begin(), values.end(), output);
480 template <class Delim, class Container>
481 std::string join(const Delim& delimiter,
482 const Container& container) {
484 join(delimiter, container.begin(), container.end(), output);
488 template <class Delim, class Value>
489 std::string join(const Delim& delimiter,
490 const std::initializer_list<Value>& values) {
492 join(delimiter, values.begin(), values.end(), output);
498 // Hash functions for string and fbstring usable with e.g. hash_map
500 // We let Boost pick the namespace here for us, since it has logic to do the
501 // right thing based on the C++ standard library implementation being used.
502 namespace BOOST_STD_EXTENSION_NAMESPACE {
505 struct hash<folly::basic_fbstring<C> > : private hash<const C*> {
506 size_t operator()(const folly::basic_fbstring<C> & s) const {
507 return hash<const C*>::operator()(s.c_str());
512 struct hash<std::basic_string<C> > : private hash<const C*> {
513 size_t operator()(const std::basic_string<C> & s) const {
514 return hash<const C*>::operator()(s.c_str());
520 // Hook into boost's type traits
523 struct has_nothrow_constructor<folly::basic_fbstring<T> > : true_type {
524 enum { value = true };
528 #include "folly/String-inl.h"