2 * Eddie Kohler, Yandong Mao, Robert Morris
3 * Copyright (c) 2012-2013 President and Fellows of Harvard College
4 * Copyright (c) 2012-2013 Massachusetts Institute of Technology
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, subject to the conditions
9 * listed in the Masstree LICENSE file. These conditions include: you must
10 * preserve this copyright notice, and you cannot mention the copyright
11 * holders in advertising related to the Software without their permission.
12 * The Software is provided WITHOUT ANY WARRANTY, EXPRESS OR IMPLIED. This
13 * notice is a summary of the Masstree LICENSE file; the license in that file
16 #ifndef STRING_BASE_HH
17 #define STRING_BASE_HH
18 #include "compiler.hh"
19 #include "hashcode.hh"
27 #define LCDF_CONSTANT_CSTR(cstr) ((cstr) && __builtin_constant_p(strlen((cstr))))
29 class String_generic {
31 static const char empty_data[1];
32 static const char bool_data[11]; // "false\0true\0"
33 static const char out_of_memory_data[15];
34 static const char base64_encoding_table[65];
35 static const unsigned char base64_decoding_map[256];
36 enum { out_of_memory_length = 14 };
37 static bool out_of_memory(const char* s) {
38 return unlikely(s >= out_of_memory_data
39 && s <= out_of_memory_data + out_of_memory_length);
41 static bool equals(const char* a, int a_len, const char* b, int b_len) {
42 return a_len == b_len && memcmp(a, b, a_len) == 0;
44 static int compare(const char* a, int a_len, const char* b, int b_len);
45 static inline int compare(const unsigned char* a, int a_len,
46 const unsigned char* b, int b_len) {
47 return compare(reinterpret_cast<const char*>(a), a_len,
48 reinterpret_cast<const char*>(b), b_len);
50 static int natural_compare(const char* a, int a_len, const char* b, int b_len);
51 static int natural_compare(const unsigned char* a, int a_len,
52 const unsigned char* b, int b_len) {
53 return natural_compare(reinterpret_cast<const char*>(a), a_len,
54 reinterpret_cast<const char*>(b), b_len);
56 static bool starts_with(const char *a, int a_len, const char *b, int b_len) {
57 return a_len >= b_len && memcmp(a, b, b_len) == 0;
59 static int find_left(const char *s, int len, int start, char x);
60 static int find_left(const char *s, int len, int start, const char *x, int x_len);
61 static int find_right(const char *s, int len, int start, char x);
62 static int find_right(const char *s, int len, int start, const char *x, int x_len);
63 static bool glob_match(const char* s, int slen, const char* pattern, int plen);
64 template <typename T> static inline typename T::substring_type ltrim(const T &str);
65 template <typename T> static inline typename T::substring_type rtrim(const T &str);
66 template <typename T> static inline typename T::substring_type trim(const T &str);
67 static hashcode_t hashcode(const char *s, int len);
68 static hashcode_t hashcode(const char *first, const char *last) {
69 return hashcode(first, last - first);
71 static long to_i(const char* first, const char* last);
78 typedef const char* const_iterator;
79 typedef const_iterator iterator;
80 typedef const unsigned char* const_unsigned_iterator;
81 typedef const_unsigned_iterator unsigned_iterator;
82 typedef int (String_base<T>::*unspecified_bool_type)() const;
84 const char* data() const {
85 return static_cast<const T*>(this)->data();
88 return static_cast<const T*>(this)->length();
91 /** @brief Return a pointer to the string's data as unsigned chars.
93 Only the first length() characters are valid, and the string data
94 might not be null-terminated. @sa data() */
95 const unsigned char* udata() const {
96 return reinterpret_cast<const unsigned char*>(data());
98 /** @brief Return an iterator for the beginning of the string.
100 String iterators are simply pointers into string data, so they are
101 quite efficient. @sa String::data */
102 const_iterator begin() const {
105 /** @brief Return an iterator for the end of the string.
107 The result points one character beyond the last character in the
109 const_iterator end() const {
110 return data() + length();
112 /** @brief Return an unsigned iterator for the beginning of the string.
114 This is equivalent to reinterpret_cast<const unsigned char
116 const_unsigned_iterator ubegin() const {
117 return reinterpret_cast<const_unsigned_iterator>(data());
119 /** @brief Return an unsigned iterator for the end of the string.
121 This is equivalent to reinterpret_cast<const unsigned char
123 const_unsigned_iterator uend() const {
124 return reinterpret_cast<const_unsigned_iterator>(data() + length());
126 /** @brief Test if the string is nonempty. */
127 operator unspecified_bool_type() const {
128 return length() ? &String_base<T>::length : 0;
130 /** @brief Test if the string is empty. */
131 bool operator!() const {
132 return length() == 0;
134 /** @brief Test if the string is empty. */
136 return length() == 0;
138 /** @brief Test if the string is an out-of-memory string. */
139 bool out_of_memory() const {
140 return String_generic::out_of_memory(data());
142 /** @brief Return the @a i th character in the string.
144 Does not check bounds. @sa at() */
145 const char& operator[](int i) const {
148 /** @brief Return the @a i th character in the string.
150 Checks bounds: an assertion will fail if @a i is less than 0 or not
151 less than length(). @sa operator[] */
152 const char& at(int i) const {
153 assert(unsigned(i) < unsigned(length()));
156 /** @brief Return the first character in the string.
158 Does not check bounds. Same as (*this)[0]. */
159 const char& front() const {
162 /** @brief Return the last character in the string.
164 Does not check bounds. Same as (*this)[length() - 1]. */
165 const char& back() const {
166 return data()[length() - 1];
168 /** @brief Test if this string is equal to the C string @a cstr. */
169 bool equals(const char *cstr) const {
170 return String_generic::equals(data(), length(), cstr, strlen(cstr));
172 /** @brief Test if this string is equal to the first @a len characters
174 bool equals(const char *s, int len) const {
175 return String_generic::equals(data(), length(), s, len);
177 /** @brief Test if this string is equal to @a x. */
178 template <typename TT>
179 bool equals(const String_base<TT>& x) const {
180 return String_generic::equals(data(), length(), x.data(), x.length());
182 /** @brief Compare this string with the C string @a cstr.
184 Returns 0 if this string equals @a cstr, negative if this string is
185 less than @a cstr in lexicographic order, and positive if this
186 string is greater than @a cstr. Lexicographic order treats
187 characters as unsigned. */
188 int compare(const char* cstr) const {
189 return String_generic::compare(data(), length(), cstr, strlen(cstr));
191 /** @brief Compare this string with the first @a len characters of @a
193 int compare(const char* s, int len) const {
194 return String_generic::compare(data(), length(), s, len);
196 /** @brief Compare this string with @a x. */
197 template <typename TT>
198 int compare(const String_base<TT>& x) const {
199 return String_generic::compare(data(), length(), x.data(), x.length());
201 /** @brief Compare strings @a a and @a b. */
202 template <typename TT, typename UU>
203 static int compare(const String_base<TT>& a, const String_base<UU>& b) {
204 return String_generic::compare(a.data(), a.length(), b.data(), b.length());
206 /** @brief Compare strings @a a and @a b. */
207 template <typename UU>
208 static int compare(const char* a, const String_base<UU> &b) {
209 return String_generic::compare(a, strlen(a), b.data(), b.length());
211 /** @brief Compare strings @a a and @a b. */
212 template <typename TT>
213 static int compare(const String_base<TT>& a, const char* b) {
214 return String_generic::compare(a.data(), a.length(), b, strlen(b));
216 /** @brief Compare strings @a a and @a b. */
217 static int compare(const char* a, const char* b) {
218 return String_generic::compare(a, strlen(a), b, strlen(b));
220 /** @brief Compare this string with the C string @a cstr using natural order.
222 Natural string comparison attempts to order embedded decimal number
223 reprepresentations according to their numeric order; thus, the
224 string "100" compares greater than the string "2". Returns 0 if this
225 string equals @a cstr (only possible if the strings are
226 byte-for-byte identical), negative if this string is less than @a
227 cstr in natural order, and positive if this string is greater
228 than @a cstr in natural order. */
229 int natural_compare(const char *cstr) const {
230 return String_generic::natural_compare(data(), length(), cstr, strlen(cstr));
232 /** @brief Compare this string with the first @a len characters of @a
233 s using natural order. */
234 int natural_compare(const char *s, int len) const {
235 return String_generic::natural_compare(data(), length(), s, len);
237 /** @brief Compare this string with @a x using natural order. */
238 template <typename TT>
239 int natural_compare(const String_base<TT> &x) const {
240 return String_generic::natural_compare(data(), length(), x.data(), x.length());
242 /** @brief Compare strings @a a and @a b using natural order. */
243 template <typename TT, typename UU>
244 static int natural_compare(const String_base<TT> &a, const String_base<UU> &b) {
245 return String_generic::natural_compare(a.data(), a.length(), b.data(), b.length());
247 /** @brief Compare strings @a a and @a b using natural order. */
248 template <typename UU>
249 static int natural_compare(const char* a, const String_base<UU> &b) {
250 return String_generic::natural_compare(a, strlen(a), b.data(), b.length());
252 /** @brief Compare strings @a a and @a b using natural order. */
253 template <typename TT>
254 static int natural_compare(const String_base<TT>& a, const char* b) {
255 return String_generic::natural_compare(a.data(), a.length(), b, strlen(b));
257 /** @brief Compare strings @a a and @a b using natural order. */
258 static int natural_compare(const char* a, const char* b) {
259 return String_generic::natural_compare(a, strlen(a), b, strlen(b));
261 /** @brief Compare strings @a a and @a b using natural order. */
262 static int natural_compare(const std::string& a, const std::string& b) {
263 return String_generic::natural_compare(a.data(), a.length(), b.data(), b.length());
265 /** @brief Comparator function object for natural string comparison. */
266 class natural_comparator {
268 template <typename TT, typename UU>
269 bool operator()(const TT& a, const UU& b) const {
270 return String_base<T>::natural_compare(a, b) < 0;
273 /** @brief Test if this string begins with the C string @a cstr. */
274 bool starts_with(const char *cstr) const {
275 return String_generic::starts_with(data(), length(), cstr, strlen(cstr));
277 /** @brief Test if this string begins with the first @a len characters
279 bool starts_with(const char *s, int len) const {
280 return String_generic::starts_with(data(), length(), s, len);
282 /** @brief Test if this string begins with @a x. */
283 template <typename TT>
284 bool starts_with(const String_base<TT> &x) const {
285 return String_generic::starts_with(data(), length(), x.data(), x.length());
287 /** @brief Search for a character in this string.
289 Return the index of the leftmost occurrence of @a c, starting at
290 index @a start and working up to the end of the string. Return -1 if
291 the character is not found. */
292 int find_left(char x, int start = 0) const {
293 return String_generic::find_left(data(), length(), start, x);
295 /** @brief Search for the C string @a cstr as a substring in this string.
297 Return the index of the leftmost occurrence of @a cstr, starting at
298 index @a start. Return -1 if the substring is not found. */
299 int find_left(const char *cstr, int start = 0) const {
300 return String_generic::find_left(data(), length(), start, cstr, strlen(cstr));
302 /** @brief Search for @a x as a substring in this string.
304 Return the index of the leftmost occurrence of @a x, starting at
305 index @a start. Return -1 if the substring is not found. */
306 template <typename TT>
307 int find_left(const String_base<TT> &x, int start = 0) const {
308 return String_generic::find_left(data(), length(), start, x.data(), x.length());
310 /** @brief Search backwards for a character in this string.
312 Return the index of the rightmost occurrence of @a c, starting at
313 index @a start and working up to the end of the string. Return -1 if
314 the character is not found. */
315 int find_right(char c, int start = INT_MAX) const {
316 return String_generic::find_right(data(), length(), start, c);
318 /** @brief Search backwards for the C string @a cstr as a substring in
321 Return the index of the rightmost occurrence of @a cstr, starting
322 at index @a start. Return -1 if the substring is not found. */
323 int find_right(const char *cstr, int start = INT_MAX) const {
324 return String_generic::find_right(data(), length(), start, cstr, strlen(cstr));
326 /** @brief Search backwards for @a x as a substring in this string.
328 Return the index of the rightmost occurrence of @a x, starting at
329 index @a start. Return -1 if the substring is not found. */
330 template <typename TT>
331 int find_right(const String_base<TT> &x, int start = INT_MAX) const {
332 return String_generic::find_right(data(), length(), start, x.data(), x.length());
334 /** @brief Test if this string matches the glob @a pattern.
336 Glob pattern syntax allows * (any number of characters), ? (one
337 arbitrary character), [] (character classes, possibly negated), and
339 bool glob_match(const char* pattern) const {
340 return String_generic::glob_match(data(), length(), pattern, strlen(pattern));
343 template <typename TT>
344 bool glob_match(const String_base<TT>& pattern) const {
345 return String_generic::glob_match(data(), length(), pattern.data(), pattern.length());
347 /** @brief Return a 32-bit hash function of the characters in [@a first, @a last).
349 Uses Paul Hsieh's "SuperFastHash" algorithm, described at
350 http://www.azillionmonkeys.com/qed/hash.html
351 This hash function uses all characters in the string.
353 @invariant If last1 - first1 == last2 - first2 and memcmp(first1,
354 first2, last1 - first1) == 0, then hashcode(first1, last1) ==
355 hashcode(first2, last2). */
356 static hashcode_t hashcode(const char *first, const char *last) {
357 return String_generic::hashcode(first, last);
359 /** @brief Return a 32-bit hash function of the characters in this string. */
360 hashcode_t hashcode() const {
361 return String_generic::hashcode(data(), length());
364 /** @brief Return the integer value of this string. */
366 return String_generic::to_i(begin(), end());
369 template <typename E>
370 const_iterator encode_json_partial(E& e) const;
371 template <typename E>
372 inline void encode_json(E& e) const;
373 template <typename E>
374 void encode_base64(E& e, bool pad = false) const;
375 template <typename E>
376 bool decode_base64(E& e) const;
378 /** @brief Return this string as a std::string. */
379 inline operator std::string() const {
380 return std::string(begin(), end());
384 String_base() = default;
387 template <typename T, typename U>
388 inline bool operator==(const String_base<T> &a, const String_base<U> &b) {
392 template <typename T>
393 inline bool operator==(const String_base<T> &a, const std::string &b) {
394 return a.equals(b.data(), b.length());
397 template <typename T>
398 inline bool operator==(const std::string &a, const String_base<T> &b) {
399 return b.equals(a.data(), a.length());
402 template <typename T>
403 inline bool operator==(const String_base<T> &a, const char *b) {
404 return a.equals(b, strlen(b));
407 template <typename T>
408 inline bool operator==(const char *a, const String_base<T> &b) {
409 return b.equals(a, strlen(a));
412 template <typename T, typename U>
413 inline bool operator!=(const String_base<T> &a, const String_base<U> &b) {
417 template <typename T>
418 inline bool operator!=(const String_base<T> &a, const std::string &b) {
422 template <typename T>
423 inline bool operator!=(const std::string &a, const String_base<T> &b) {
427 template <typename T>
428 inline bool operator!=(const String_base<T> &a, const char *b) {
432 template <typename T>
433 inline bool operator!=(const char *a, const String_base<T> &b) {
437 template <typename T, typename U>
438 inline bool operator<(const String_base<T> &a, const String_base<U> &b) {
439 return a.compare(b) < 0;
442 template <typename T, typename U>
443 inline bool operator<=(const String_base<T> &a, const String_base<U> &b) {
444 return a.compare(b) <= 0;
447 template <typename T, typename U>
448 inline bool operator>=(const String_base<T> &a, const String_base<U> &b) {
449 return a.compare(b) >= 0;
452 template <typename T, typename U>
453 inline bool operator>(const String_base<T> &a, const String_base<U> &b) {
454 return a.compare(b) > 0;
457 template <typename T>
458 inline std::ostream &operator<<(std::ostream &f, const String_base<T> &str) {
459 return f.write(str.data(), str.length());
462 template <typename T>
463 inline hashcode_t hashcode(const String_base<T>& x) {
464 return String_generic::hashcode(x.data(), x.length());
468 template <typename T>
469 inline size_t hash_value(const String_base<T>& x) {
470 return String_generic::hashcode(x.data(), x.length());
473 template <typename T>
474 inline typename T::substring_type String_generic::ltrim(const T &str) {
475 const char *b = str.begin(), *e = str.end();
476 while (b != e && isspace((unsigned char) b[0]))
478 return str.fast_substring(b, e);
481 template <typename T>
482 inline typename T::substring_type String_generic::rtrim(const T &str) {
483 const char *b = str.begin(), *e = str.end();
484 while (b != e && isspace((unsigned char) e[-1]))
486 return str.fast_substring(b, e);
489 template <typename T>
490 inline typename T::substring_type String_generic::trim(const T &str) {
491 const char *b = str.begin(), *e = str.end();
492 while (b != e && isspace((unsigned char) e[-1]))
494 while (b != e && isspace((unsigned char) b[0]))
496 return str.fast_substring(b, e);
500 # define LCDF_MAKE_STRING_HASH(type) \
501 namespace std { template <> struct hash<type> \
502 : public unary_function<const type&, size_t> { \
503 size_t operator()(const type& x) const noexcept { \
504 return x.hashcode(); \
507 # define LCDF_MAKE_STRING_HASH(type)
510 template <typename T> template <typename E>
511 typename String_base<T>::const_iterator String_base<T>::encode_json_partial(E& enc) const {
512 const char *last = this->begin(), *end = this->end();
513 for (const char *s = last; s != end; ++s) {
514 int c = (unsigned char) *s;
516 // U+2028 and U+2029 can't appear in Javascript strings! (Though
517 // they are legal in JSON strings, according to the JSON
519 if (unlikely(c == 0xE2)
520 && s + 2 < end && (unsigned char) s[1] == 0x80
521 && (unsigned char) (s[2] | 1) == 0xA9)
522 c = 0x2028 + (s[2] & 1);
523 else if (likely(c >= 32 && c != '\\' && c != '\"' && c != '/'))
549 default: { // c is a control character, 0x2028, or 0x2029
550 char* x = enc.reserve(5);
551 snprintf(x, 5, "u%04X", c);
552 if (c > 255) // skip rest of encoding of U+202[89]
562 template <typename T> template <typename E>
563 inline void String_base<T>::encode_json(E& enc) const {
564 const char* last = encode_json_partial(enc);
565 enc.append(last, end());
568 template <typename T> template <typename E>
569 void String_base<T>::encode_base64(E& enc, bool pad) const {
570 char* out = enc.reserve(((length() + 2) * 4) / 3);
571 const unsigned char* s = this->ubegin(), *end = this->uend();
572 for (; end - s >= 3; s += 3) {
573 unsigned x = (s[0] << 16) | (s[1] << 8) | s[2];
574 *out++ = String_generic::base64_encoding_table[x >> 18];
575 *out++ = String_generic::base64_encoding_table[(x >> 12) & 63];
576 *out++ = String_generic::base64_encoding_table[(x >> 6) & 63];
577 *out++ = String_generic::base64_encoding_table[x & 63];
580 unsigned x = s[0] << 16;
583 *out++ = String_generic::base64_encoding_table[x >> 18];
584 *out++ = String_generic::base64_encoding_table[(x >> 12) & 63];
586 *out++ = String_generic::base64_encoding_table[(x >> 6) & 63];
595 template <typename T> template <typename E>
596 bool String_base<T>::decode_base64(E& enc) const {
597 char* out = enc.reserve((length() * 3) / 4 + 1);
598 const unsigned char* s = this->ubegin(), *end = this->uend();
599 while (end > s && end[-1] == '=')
601 for (; end - s >= 4; s += 4) {
602 unsigned x = ((((unsigned) String_generic::base64_decoding_map[s[0]]) - 1) << 18)
603 | ((((unsigned) String_generic::base64_decoding_map[s[1]]) - 1) << 12)
604 | ((((unsigned) String_generic::base64_decoding_map[s[2]]) - 1) << 6)
605 | (((unsigned) String_generic::base64_decoding_map[s[3]]) - 1);
608 *out++ = (unsigned char) (x >> 16);
609 *out++ = (unsigned char) (x >> 8);
610 *out++ = (unsigned char) x;
613 unsigned x = ((((unsigned) String_generic::base64_decoding_map[s[0]]) - 1) << 18)
614 | ((((unsigned) String_generic::base64_decoding_map[s[1]]) - 1) << 12)
615 | (end - s == 3 ? (((unsigned) String_generic::base64_decoding_map[s[2]]) - 1) << 6 : 0);
618 *out++ = (unsigned char) (x >> 16);
620 *out++ = (unsigned char) (x >> 8);