2 * Eddie Kohler, Yandong Mao, Robert Morris
3 * Copyright (c) 2012-2013 President and Fellows of Harvard College
4 * Copyright (c) 2012-2013 Massachusetts Institute of Technology
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, subject to the conditions
9 * listed in the Masstree LICENSE file. These conditions include: you must
10 * preserve this copyright notice, and you cannot mention the copyright
11 * holders in advertising related to the Software without their permission.
12 * The Software is provided WITHOUT ANY WARRANTY, EXPRESS OR IMPLIED. This
13 * notice is a summary of the Masstree LICENSE file; the license in that file
17 * string.{cc,hh} -- a String class with shared substrings
20 * Copyright (c) 1999-2000 Massachusetts Institute of Technology
21 * Copyright (c) 2001-2013 Eddie Kohler
22 * Copyright (c) 2008-2009 Meraki, Inc.
24 * Permission is hereby granted, free of charge, to any person obtaining a
25 * copy of this software and associated documentation files (the "Software"),
26 * to deal in the Software without restriction, subject to the conditions
27 * listed in the Click LICENSE file. These conditions include: you must
28 * preserve this copyright notice, and you cannot mention the copyright
29 * holders in advertising related to the Software without their permission.
30 * The Software is provided WITHOUT ANY WARRANTY, EXPRESS OR IMPLIED. This
31 * notice is a summary of the Click LICENSE file; the license in that file
36 #include "straccum.hh"
45 * @brief The LCDF String class.
49 * @brief A string of characters.
51 * The String class represents a string of characters. Strings may be
52 * constructed from C strings, characters, numbers, and so forth. They may
53 * also be added together. The underlying character arrays are dynamically
54 * allocated; String operations allocate and free memory as needed. A String
55 * and its substrings generally share memory. Accessing a character by index
56 * takes O(1) time; so does creating a substring.
58 * <h3>Out-of-memory strings</h3>
60 * When there is not enough memory to create a particular string, a special
61 * "out-of-memory" string is returned instead. Out-of-memory strings are
62 * contagious: the result of any concatenation operation involving an
63 * out-of-memory string is another out-of-memory string. Thus, the final
64 * result of a series of String operations will be an out-of-memory string,
65 * even if the out-of-memory condition occurs in the middle.
67 * The canonical out-of-memory string is 14 bytes long, and equals the UTF-8
68 * encoding of "\U0001F4A3ENOMEM\U0001F4A3" (that is, U+1F4A3 BOMB +
69 * "ENOMEM" + U+1F4A3 BOMB). This sequence is unlikely to show up in normal
70 * text, compares high relative to most other textual strings, and is valid
73 * All canonical out-of-memory strings are equal and share the same data(),
74 * which is different from the data() of any other string. See
75 * String::out_of_memory_data(). The String::make_out_of_memory() function
76 * returns a canonical out-of-memory string.
78 * Other strings may also be out-of-memory strings. For example,
79 * String::make_stable(String::out_of_memory_data()) ==
80 * String::make_out_of_memory(), and some (but not all) substrings of
81 * out-of-memory strings are also out-of-memory strings.
84 const char String_generic::empty_data[] = "";
85 // oom_data is the UTF-8 encoding of U+1F4A3 BOMB + "ENOMEM" + U+1F4A3 BOMB
86 const char String_generic::out_of_memory_data[] = "\360\237\222\243ENOMEM\360\237\222\243";
87 const char String_generic::bool_data[] = "false\0true";
88 const char String_generic::base64_encoding_table[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
89 const unsigned char String_generic::base64_decoding_map[] =
90 "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
91 "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
92 "\0\0\0\0\0\0\0\0\0\0\0\x3F\0\0\0\x40"
93 "\x35\x36\x37\x38\x39\x3A\x3B\x3C\x3D\x3E\0\0\0\0\0\0"
94 "\0\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0A\x0B\x0C\x0D\x0E\x0F"
95 "\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\0\0\0\0\0"
96 "\0\x1B\x1C\x1D\x1E\x1F\x20\x21\x22\x23\x24\x25\x26\x27\x28\x29"
97 "\x2A\x2B\x2C\x2D\x2E\x2F\x30\x31\x32\x33\x34\0\0\0\0\0"
98 "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
99 "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
100 "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
101 "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
102 "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
103 "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
104 "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
105 "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0";
106 const char String::int_data[] = "0\0001\0002\0003\0004\0005\0006\0007\0008\0009";
108 #if HAVE_STRING_PROFILING > 1
109 # define MEMO_INITIALIZER_TAIL , 0, 0
111 # define MEMO_INITIALIZER_TAIL
114 const String::rep_type String::null_string_rep = {
115 String_generic::empty_data, 0, 0
117 const String::rep_type String::oom_string_rep = {
118 String_generic::out_of_memory_data, String_generic::out_of_memory_length, 0
120 const String::rep_type String::zero_string_rep = {
124 #if HAVE_STRING_PROFILING
125 uint64_t String::live_memo_count;
126 uint64_t String::memo_sizes[55];
127 uint64_t String::live_memo_sizes[55];
128 uint64_t String::live_memo_bytes[55];
129 # if HAVE_STRING_PROFILING > 1
130 String::memo_type *String::live_memos[55];
135 String_generic::compare(const char* a, int a_len, const char* b, int b_len)
138 int len = a_len < b_len ? a_len : b_len;
139 int cmp = memcmp(a, b, len);
143 return a_len - b_len;
146 int String_generic::natural_compare(const char* a, int a_len,
147 const char* b, int b_len) {
148 const char* ae = a + a_len;
149 const char* be = b + b_len;
150 const char* aperiod = 0;
151 bool aperiod_negative = false;
154 while (a < ae && b < be) {
155 if (isdigit((unsigned char) *a) && isdigit((unsigned char) *b)) {
156 // compare the two numbers, but treat them as strings
157 // (a decimal conversion might cause overflow)
158 bool potential_decimal = (a == aperiod);
160 // check if both are negative (note that if we get here, entire
161 // string prefixes are identical)
162 bool negative = false;
163 if (a > ae - a_len && a[-1] == '-'
164 && (a == ae - a_len + 1
165 || isspace((unsigned char) a[-2])))
168 // skip initial '0's, but remember any difference in length
169 const char *ia = a, *ib = b;
170 while (a < ae && *a == '0')
172 while (b < be && *b == '0')
174 int longer_zeros = (a - ia) - (b - ib);
176 // walk over digits, remembering first nonidentical digit comparison
177 int digit_compare = 0;
180 a_good = a < ae && isdigit((unsigned char) *a);
181 b_good = b < be && isdigit((unsigned char) *b);
182 if (!a_good || !b_good)
184 if (digit_compare == 0)
185 digit_compare = *a - *b;
190 // real number comparison: leading zeros are significant,
191 // digit comparisons take precedence
192 if (potential_decimal) {
193 const char *ax = a, *bx = b;
194 while (ax < ae && isdigit((unsigned char) *ax))
196 while (bx < be && isdigit((unsigned char) *bx))
198 // watch for IP addresses: don't treat "0.2." like a decimal
199 if (!(ax + 1 < ae && *ax == '.' && !isspace((unsigned char) ax[1]))
200 && !(bx + 1 < be && *bx == '.' && !isspace((unsigned char) bx[1]))) {
201 negative = aperiod_negative;
203 return negative ? 1 : -1;
208 // if one number is longer, it must also be larger
209 if (a_good != b_good)
210 return negative == a_good ? -1 : 1;
211 // otherwise, digit comparisons take precedence
213 return negative == (digit_compare > 0) ? -1 : 1;
214 // as a last resort, the longer string of zeros is greater
217 // prepare for potential decimal comparison later
219 a_good = a + 1 < ae && *a == '.'
220 && isdigit((unsigned char) a[1]);
221 b_good = b + 1 < be && *b == '.'
222 && isdigit((unsigned char) b[1]);
223 if (a_good != b_good)
224 return negative == b_good ? 1 : -1;
227 aperiod_negative = negative;
231 // if we get here, the numeric portions were byte-for-byte
232 // identical; move on
233 } else if (isdigit((unsigned char) *a))
234 return isalpha((unsigned char) *b) ? -1 : 1;
235 else if (isdigit((unsigned char) *b))
236 return isalpha((unsigned char) *a) ? 1 : -1;
238 int alower = (unsigned char) tolower((unsigned char) *a);
239 int blower = (unsigned char) tolower((unsigned char) *b);
240 if (alower != blower)
241 return alower - blower;
242 if (raw_compare == 0)
243 raw_compare = (unsigned char) *a - (unsigned char) *b;
251 if ((ae - a) != (be - b))
252 return (ae - a) - (be - b);
258 String_generic::hashcode(const char *s, int len)
265 const char *end = s + len - rem;
268 #if !HAVE_INDIFFERENT_ALIGNMENT
269 if (!(reinterpret_cast<uintptr_t>(s) & 1)) {
271 #define get16(p) (*reinterpret_cast<const uint16_t *>((p)))
272 for (; s != end; s += 4) {
274 uint32_t tmp = (get16(s + 2) << 11) ^ hash;
275 hash = (hash << 16) ^ tmp;
283 #if !HAVE_INDIFFERENT_ALIGNMENT
285 # if !__i386__ && !__x86_64__ && !__arch_um__
286 # define get16(p) (((unsigned char) (p)[0] << 8) + (unsigned char) (p)[1])
288 # define get16(p) ((unsigned char) (p)[0] + ((unsigned char) (p)[1] << 8))
290 // should be exactly the same as the code above
291 for (; s != end; s += 4) {
293 uint32_t tmp = (get16(s + 2) << 11) ^ hash;
294 hash = (hash << 16) ^ tmp;
305 /* Handle end cases */
306 if (0) { // weird organization avoids uninitialized
307 rem2: // variable warnings
311 hash ^= ((unsigned char) s[2]) << 18;
318 } else if (rem == 1) {
319 hash += (unsigned char) *s;
324 /* Force "avalanching" of final 127 bits */
336 String_generic::find_left(const char *s, int len, int start,
341 for (int i = start; i < len; ++i)
348 String_generic::find_left(const char *s, int len, int start,
349 const char *x, int x_len)
354 return start <= len ? start : -1;
355 int max_pos = len - x_len;
356 for (int i = start; i <= max_pos; ++i)
357 if (memcmp(s + i, x, x_len) == 0)
363 String_generic::find_right(const char *s, int len, int start,
368 for (int i = start; i >= 0; --i)
375 String_generic::find_right(const char *s, int len, int start,
376 const char *x, int x_len)
381 return start >= 0 ? start : -1;
382 for (int i = start; i >= 0; --i)
383 if (memcmp(s + i, x, x_len) == 0)
388 long String_generic::to_i(const char* s, const char* ends) {
390 if (s != ends && (s[0] == '-' || s[0] == '+')) {
395 if (s == ends || !isdigit((unsigned char) *s))
397 unsigned long x = (unsigned char) *s - '0';
398 for (++s; s != ends && isdigit((unsigned char) *s); ++s)
399 x = x * 10 + *s - '0'; // XXX overflow
403 bool String_generic::glob_match(const char* sbegin, int slen,
404 const char* pbegin, int plen) {
405 const char* send = sbegin + slen;
406 const char* pend = pbegin + plen;
408 // quick common-case check for suffix matches
409 while (pbegin < pend && sbegin < send
410 && pend[-1] != '*' && pend[-1] != '?' && pend[-1] != ']'
411 && (pbegin + 1 == pend || pend[-2] != '\\'))
412 if (pend[-1] == send[-1])
417 std::vector<const char*> state, nextstate;
418 state.push_back(pbegin);
420 for (const char* s = sbegin; s != send && state.size(); ++s) {
422 for (const char** pp = state.data(); pp != state.data() + state.size(); ++pp)
427 nextstate.push_back(*pp + 1);
432 if (nextstate.empty() || nextstate.back() != *pp)
433 nextstate.push_back(*pp);
441 const char *ec = *pp + 1;
443 if (ec != pend && *ec == '^') {
455 } while (ec != pend && *ec != ']');
459 if (found == !negated)
460 nextstate.push_back(ec + 1);
466 nextstate.push_back(*pp + 1);
470 state.swap(nextstate);
473 for (const char** pp = state.data(); pp != state.data() + state.size(); ++pp) {
474 while (*pp != pend && **pp == '*')
484 #if HAVE_STRING_PROFILING
485 void String::memo_type::account_new() {
486 int bucket = profile_memo_size_bucket(this->dirty, this->capacity);
487 ++memo_sizes[bucket];
488 ++live_memo_sizes[bucket];
489 live_memo_bytes[bucket] += this->capacity;
491 # if HAVE_STRING_PROFILING > 1
492 this->pprev = &live_memos[bucket];
493 if ((this->next = *this->pprev))
494 this->next->pprev = &this->next;
499 void String::memo_type::account_destroy() {
500 int bucket = profile_memo_size_bucket(this->dirty, this->capacity);
501 --live_memo_sizes[bucket];
502 live_memo_bytes[bucket] -= this->capacity;
504 # if HAVE_STRING_PROFILING > 1
505 if ((*this->pprev = this->next))
506 this->next->pprev = this->pprev;
511 inline String::memo_type* String::create_memo(int capacity, int dirty) {
512 assert(capacity > 0 && capacity >= dirty);
514 reinterpret_cast<memo_type *>(new char[capacity + MEMO_SPACE]);
516 memo->initialize(capacity, dirty);
521 String::delete_memo(memo_type *memo)
523 assert(memo->capacity > 0);
524 assert(memo->capacity >= memo->dirty);
525 memo->account_destroy();
526 delete[] reinterpret_cast<char*>(memo);
530 #if HAVE_STRING_PROFILING
532 String::one_profile_report(StringAccum &sa, int i, int examples)
535 sa << "memo_dirty_" << i;
537 uint32_t s = (i - 17) * 2 + 17;
538 sa << "memo_cap_" << s << '_' << (s + 1);
540 uint32_t s = (i - 25) * 8 + 33;
541 sa << "memo_cap_" << s << '_' << (s + 7);
543 uint32_t s1 = (1U << (i - 23)) + 1;
544 uint32_t s2 = (s1 - 1) << 1;
545 sa << "memo_cap_" << s1 << '_' << s2;
547 sa << '\t' << live_memo_sizes[i] << '\t' << memo_sizes[i] << '\t' << live_memo_bytes[i] << '\n';
549 # if HAVE_STRING_PROFILING > 1
550 for (memo_type *m = live_memos[i]; m; m = m->next) {
551 sa << " [" << m->dirty << "] ";
552 uint32_t dirty = m->dirty;
553 if (dirty > 0 && m->real_data[dirty - 1] == '\0')
555 sa.append(m->real_data, dirty > 128 ? 128 : dirty);
563 String::profile_report(StringAccum &sa, int examples)
565 uint64_t all_live_sizes = 0, all_sizes = 0, all_live_bytes = 0;
566 for (int i = 0; i < 55; ++i) {
568 one_profile_report(sa, i, examples);
569 all_live_sizes += live_memo_sizes[i];
570 all_sizes += memo_sizes[i];
571 all_live_bytes += live_memo_bytes[i];
573 sa << "memo_total\t" << all_live_sizes << '\t' << all_sizes << '\t' << all_live_bytes << '\n';
577 /** @endcond never */
580 /** @brief Construct a base-10 string representation of @a x. */
581 String::String(int x)
583 if (x >= 0 && x < 10)
584 _r.assign(int_data + 2 * x, 1, 0);
587 sprintf(buf, "%d", x);
588 assign(buf, -1, false);
593 String::String(unsigned x)
596 _r.assign(int_data + 2 * x, 1, 0);
599 sprintf(buf, "%u", x);
600 assign(buf, -1, false);
605 String::String(long x)
607 if (x >= 0 && x < 10)
608 _r.assign(int_data + 2 * x, 1, 0);
611 sprintf(buf, "%ld", x);
612 assign(buf, -1, false);
617 String::String(unsigned long x)
620 _r.assign(int_data + 2 * x, 1, 0);
623 sprintf(buf, "%lu", x);
624 assign(buf, -1, false);
629 String::String(long long x)
631 if (x >= 0 && x < 10)
632 _r.assign(int_data + 2 * x, 1, 0);
635 sprintf(buf, "%lld", x);
636 assign(buf, -1, false);
641 String::String(unsigned long long x)
644 _r.assign(int_data + 2 * x, 1, 0);
647 sprintf(buf, "%llu", x);
648 assign(buf, -1, false);
652 String::String(double x)
655 int len = sprintf(buf, "%.12g", x);
656 assign(buf, len, false);
660 String::hard_make_stable(const char *s, int len)
664 return String(s, len, 0);
668 String::make_fill(int c, int len)
671 s.append_fill(c, len);
676 String::assign_out_of_memory()
683 String::assign(const char *s, int len, bool need_deref)
691 // need to start with dereference
694 if (unlikely((m = _r.memo())
696 && s + len <= m->real_data + m->capacity)) {
697 // Be careful about "String s = ...; s = s.c_str();"
698 _r.assign_noref(s, len, m);
706 s = String_generic::empty_data;
709 // Make the memo a multiple of 16 characters and bigger than 'len'.
710 int memo_capacity = (len + 15 + MEMO_SPACE) & ~15;
711 m = create_memo(memo_capacity - MEMO_SPACE, len);
714 assign_out_of_memory();
717 memcpy(m->real_data, s, len);
721 _r.assign_noref(s, len, m);
724 /** @brief Append @a len unknown characters to this string.
725 * @return Modifiable pointer to the appended characters.
727 * The caller may safely modify the returned memory. Null is returned if
728 * the string becomes out-of-memory. */
730 String::append_uninitialized(int len)
732 // Appending anything to "out of memory" leaves it as "out of memory"
733 if (unlikely(len <= 0) || out_of_memory())
736 // If we can, append into unused space. First, we check that there's
737 // enough unused space for 'len' characters to fit; then, we check
738 // that the unused space immediately follows the data in '*this'.
740 memo_type* m = _r.memo();
741 if (m && ((dirty = m->dirty), m->capacity > dirty + len)) {
742 char *real_dirty = m->real_data + dirty;
743 if (real_dirty == _r.data + _r.length) {
744 m->dirty = dirty + len;
746 assert(m->dirty < m->capacity);
747 #if HAVE_STRING_PROFILING
748 profile_update_memo_dirty(m, dirty, dirty + len, m->capacity);
754 // Now we have to make new space. Make sure the memo is a multiple of 16
755 // bytes and that it is at least 16. But for large strings, allocate a
756 // power of 2, since power-of-2 sizes minimize waste in frequently-used
757 // allocators, like Linux kmalloc.
758 int want_memo_len = _r.length + len + MEMO_SPACE;
760 if (want_memo_len <= 1024)
761 memo_capacity = (want_memo_len + 15) & ~15;
763 for (memo_capacity = 2048; memo_capacity < want_memo_len; )
766 m = create_memo(memo_capacity - MEMO_SPACE, _r.length + len);
768 assign_out_of_memory();
772 char *new_data = m->real_data;
773 memcpy(new_data, _r.data, _r.length);
776 _r.assign_noref(new_data, _r.length + len, m);
777 return const_cast<char*>(_r.data + _r.length - len);
781 String::append(const char* s, int len, memo_type* memo)
790 if (unlikely(len == 0) || out_of_memory())
792 else if (unlikely(s == String_generic::out_of_memory_data) && !memo)
793 // Appending "out of memory" to a regular string makes it "out of
795 assign_out_of_memory();
796 else if (_r.length == 0 && reinterpret_cast<uintptr_t>(memo) > 1) {
798 _r.assign(s, len, memo);
799 } else if (likely(!((my_memo = _r.memo())
800 && s >= my_memo->real_data
801 && s + len <= my_memo->real_data + my_memo->capacity))) {
802 if (char *space = append_uninitialized(len))
803 memcpy(space, s, len);
805 String preserve_s(*this);
806 if (char *space = append_uninitialized(len))
807 memcpy(space, s, len);
811 /** @brief Append @a len copies of character @a c to this string. */
813 String::append_fill(int c, int len)
816 if (char *space = append_uninitialized(len))
817 memset(space, c, len);
820 /** @brief Ensure the string's data is unshared and return a mutable
823 String::mutable_data()
825 // If _memo has a capacity (it's not one of the special strings) and it's
826 // uniquely referenced, return _data right away.
828 return const_cast<char *>(_r.data);
830 // Otherwise, make a copy of it. Rely on: deref() doesn't change _data or
831 // _length; and if _capacity == 0, then deref() doesn't free _real_data.
832 // But in multithreaded situations we must hold a local copy of memo!
833 String do_not_delete_underlying_memo(*this);
835 assign(_r.data, _r.length, false);
836 return const_cast<char *>(_r.data);
840 String::hard_c_str() const
843 // We may already have a '\0' in the right place. If _memo has no
844 // capacity, then this is one of the special strings (null or
845 // stable). We are guaranteed, in these strings, that _data[_length]
846 // exists. Otherwise must check that _data[_length] exists.
847 const char *end_data = _r.data + _r.length;
848 memo_type* m = _r.memo();
849 if ((m && end_data >= m->real_data + m->dirty)
850 || *end_data != '\0') {
851 if (char *x = const_cast<String *>(this)->append_uninitialized(1)) {
859 /** @brief Null-terminate the string and return a mutable pointer to its
863 String::mutable_c_str()
865 (void) mutable_data();
867 return const_cast<char *>(_r.data);
870 /** @brief Return a substring of this string, consisting of the @a len
871 characters starting at index @a pos.
872 @param pos substring's first position relative to the string
873 @param len length of substring
875 If @a pos is negative, starts that far from the end of the string. If @a
876 len is negative, leaves that many characters off the end of the string.
877 If @a pos and @a len specify a substring that is partly outside the
878 string, only the part within the string is returned. If the substring is
879 beyond either end of the string, returns an empty string (but this
880 should be considered a programming error; a future version may generate
881 a warning for this case).
883 @note String::substr() is intended to behave like Perl's substr(). */
885 String::substr(int pos, int len) const
892 pos2 = _r.length + len;
893 else if (pos >= 0 && len >= _r.length) // avoid integer overflow
900 if (pos2 > _r.length)
907 return String(_r.data + pos, pos2 - pos, _r.memo());
912 hard_lower(const String &s, int pos)
914 String new_s(s.data(), s.length());
915 char *x = const_cast<char *>(new_s.data()); // know it's mutable
916 int len = s.length();
917 for (; pos < len; pos++)
918 x[pos] = tolower((unsigned char) x[pos]);
922 /** @brief Return a lowercased version of this string.
924 Translates the ASCII characters 'A' through 'Z' into their lowercase
927 String::lower() const
930 if (!out_of_memory())
931 for (int i = 0; i < _r.length; i++)
932 if (_r.data[i] >= 'A' && _r.data[i] <= 'Z')
933 return hard_lower(*this, i);
938 hard_upper(const String &s, int pos)
940 String new_s(s.data(), s.length());
941 char *x = const_cast<char *>(new_s.data()); // know it's mutable
942 int len = s.length();
943 for (; pos < len; pos++)
944 x[pos] = toupper((unsigned char) x[pos]);
948 /** @brief Return an uppercased version of this string.
950 Translates the ASCII characters 'a' through 'z' into their uppercase
953 String::upper() const
956 for (int i = 0; i < _r.length; i++)
957 if (_r.data[i] >= 'a' && _r.data[i] <= 'z')
958 return hard_upper(*this, i);
963 hard_printable(const String &s, int pos, int type)
965 StringAccum sa(s.length() * 2);
966 sa.append(s.data(), pos);
967 const unsigned char *x = reinterpret_cast<const unsigned char *>(s.data());
968 int len = s.length();
969 for (; pos < len; pos++) {
970 if (type == 2 && (x[pos] == '\\' || x[pos] == '\"'))
971 sa << '\\' << x[pos];
972 else if (x[pos] >= 32 && x[pos] < 127)
974 else if (x[pos] < 32 && type == 2) {
975 if (x[pos] >= 9 && x[pos] <= 13)
976 sa << '\\' << ("tnvfr"[x[pos] - 9]);
977 else if (char *buf = sa.extend(4, 1))
978 sprintf(buf, "\\%03o", x[pos]);
979 } else if (x[pos] < 32 && type != 1)
980 sa << '^' << (unsigned char)(x[pos] + 64);
981 else if (char *buf = sa.extend(4, 1))
982 sprintf(buf, "\\%03o", x[pos]);
984 return sa.take_string();
987 /** @brief Return a "printable" version of this string.
988 @param type quoting type
990 The default quoting type (0) translates control characters 0-31 into
991 "control" sequences, such as "^@" for the null character, and characters
992 127-255 into octal escape sequences, such as "\377" for 255. Quoting
993 type 1 translates all characters outside of 32-126 into octal escape
994 sequences. Quoting type 2 uses C escapes, including "\\" and "\"". */
996 String::printable(int type) const
999 if (!out_of_memory())
1000 for (int i = 0; i < _r.length; i++)
1001 if (_r.data[i] < 32 || _r.data[i] > 126)
1002 return hard_printable(*this, i, type);
1006 String String::to_hex() const {
1008 static const char hexval[] = "0123456789ABCDEF";
1009 char* x = sa.extend(2 * _r.length);
1010 for (int i = 0; i != _r.length; ++i) {
1011 *x++ = hexval[(unsigned char) _r.data[i] >> 4];
1012 *x++ = hexval[_r.data[i] & 15];
1014 return sa.take_string();
1017 /** @brief Return the substring with left whitespace removed. */
1019 String::ltrim() const
1021 return String_generic::ltrim(*this);
1024 /** @brief Return the substring with right whitespace removed. */
1026 String::rtrim() const
1028 return String_generic::rtrim(*this);
1031 /** @brief Return the substring with left and right whitespace removed. */
1033 String::trim() const
1035 return String_generic::trim(*this);
1039 String::align(int n)
1041 int offset = reinterpret_cast<uintptr_t>(_r.data) % n;
1044 s.append_uninitialized(_r.length + n + 1);
1045 offset = reinterpret_cast<uintptr_t>(s._r.data) % n;
1046 memcpy((char *)s._r.data + n - offset, _r.data, _r.length);
1047 s._r.data += n - offset;
1048 s._r.length = _r.length;
1053 /** @brief Return the pointer to the next character in UTF-8 encoding. */
1054 const unsigned char *
1055 String::skip_utf8_char(const unsigned char *s, const unsigned char *end)
1058 if (c > 0 && c < 0x80)
1061 /* zero, or bad/overlong encoding */;
1062 else if (c < 0xE0) { // 2 bytes: U+80-U+7FF
1063 if (likely(s + 1 < end
1064 && s[1] >= 0x80 && s[1] < 0xC0))
1066 } else if (c < 0xF0) { // 3 bytes: U+800-U+FFFF
1067 if (likely(s + 2 < end
1068 && s[1] >= 0x80 && s[1] < 0xC0
1069 && s[2] >= 0x80 && s[2] < 0xC0
1070 && (c != 0xE0 || s[1] >= 0xA0) /* not overlong encoding */
1071 && (c != 0xED || s[1] < 0xA0) /* not surrogate */))
1073 } else if (c < 0xF5) { // 4 bytes: U+10000-U+10FFFF
1074 if (likely(s + 3 < end
1075 && s[1] >= 0x80 && s[1] < 0xC0
1076 && s[2] >= 0x80 && s[2] < 0xC0
1077 && s[3] >= 0x80 && s[3] < 0xC0
1078 && (c != 0xF0 || s[1] >= 0x90) /* not overlong encoding */
1079 && (c != 0xF4 || s[1] < 0x90) /* not >U+10FFFF */))
1086 String::parse_cesu8_char(const unsigned char *s, const unsigned char *end)
1090 && s[1] >= 0xA0 && s[1] < 0xB0
1091 && s[2] >= 0x80 && s[2] < 0xC0
1093 && s[4] >= 0xB0 && s[4] < 0xC0
1094 && s[5] >= 0x80 && s[5] < 0xC0)
1096 + ((s[1] & 0x0F) << 16) + ((s[2] & 0x3F) << 10)
1097 + ((s[4] & 0x0F) << 6) + (s[5] & 0x3F);
1102 static const uint16_t windows1252_c1_mapping[] = {
1103 0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
1104 0x20C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F,
1105 0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
1106 0x20DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178
1110 String::windows1252_to_utf8() const
1112 const unsigned char *s = ubegin(), *last = s, *ends = uend();
1114 for (; s != ends; ++s) {
1116 if (unlikely(c == 0)) {
1119 } else if (likely(c < 128))
1123 if (unlikely(c < 160))
1124 c = windows1252_c1_mapping[c - 128];
1129 if (last == ubegin())
1133 return sa.take_string();
1138 String::utf16be_to_utf8(int flags) const
1140 const unsigned char *s = ubegin(), *ends = uend();
1143 if ((flags & utf_strip_bom) && s != ends
1144 && s[0] == 0xFE && s[1] == 0xFF)
1147 sa.reserve((ends - s) / 2);
1148 for (; s != ends; s += 2) {
1149 int c = s[0] * 256 + s[1];
1150 if (likely(c < 0xD800) || c >= 0xE000)
1152 else if (c < 0xDC00 && s + 2 != ends
1153 && s[2] >= 0xDC && s[2] < 0xE0) {
1154 c = 0x10000 + ((c - 0xD800) << 10)
1155 + ((s[2] & 0x03) << 8) + s[3];
1158 } else if (c && (flags & utf_replacement))
1159 sa.append_utf8(u_replacement);
1161 return sa.take_string();
1165 String::utf16le_to_utf8(int flags) const
1167 const unsigned char *s = ubegin(), *ends = uend();
1170 if ((flags & utf_strip_bom) && s != ends
1171 && s[1] == 0xFE && s[0] == 0xFF)
1174 sa.reserve((ends - s) / 2);
1175 for (; s != ends; s += 2) {
1176 int c = s[1] * 256 + s[0];
1177 if (likely(c < 0xD800) || c >= 0xE000)
1179 else if (c < 0xDC00 && s + 2 != ends
1180 && s[3] >= 0xDC && s[3] < 0xE0) {
1181 c = 0x10000 + ((c - 0xD800) << 10)
1182 + ((s[3] & 0x03) << 8) + s[2];
1185 } else if (c && (flags & utf_replacement))
1186 sa.append_utf8(u_replacement);
1188 return sa.take_string();
1192 String::utf16_to_utf8(int flags) const
1196 const unsigned char *s = ubegin(), *ends = uend();
1199 int c = s[0] * 256 + s[1];
1200 if (c == 0xFEFF || (c != 0xFFFE && !(flags & utf_prefer_le)))
1201 return utf16be_to_utf8(flags);
1203 return utf16le_to_utf8(flags);
1207 String::cesu8_to_utf8(int flags) const
1209 const unsigned char *s = ubegin(), *last = s, *t, *ends = uend();
1211 if (flags & utf_strip_bom)
1212 s = last = skip_utf8_bom(s, ends);
1215 if (likely(c > 0 && c < 0x7F))
1217 else if (likely((t = skip_utf8_char(s, ends)) != s))
1219 else if ((c = parse_cesu8_char(s, ends))) {
1226 if ((flags & utf_replacement) && c != 0)
1227 sa.append_utf8(u_replacement);
1228 for (++s; s != ends && *s >= 0x80 && *s < 0xC0; ++s)
1233 if (last == ubegin())
1237 return sa.take_string();
1242 String::utf8_to_utf8(int flags) const
1244 const unsigned char *s = ubegin(), *last = s, *t, *ends = uend();
1246 if (flags & utf_strip_bom)
1247 s = last = skip_utf8_bom(s, ends);
1250 if (likely(c > 0 && c < 0x7F))
1252 else if (likely((t = skip_utf8_char(s, ends)) != s))
1256 if ((flags & utf_replacement) && c != 0)
1257 sa.append_utf8(u_replacement);
1258 for (++s; s != ends && *s >= 0x80 && *s < 0xC0; ++s)
1263 if (last == ubegin())
1267 return sa.take_string();
1271 /** @brief Return a version of the string converted to UTF-8.
1273 Null characters are dropped. Then, if the result contains invalid
1274 UTF-8, the string is assumed to be Windows-1252 encoded, and is
1275 converted to UTF-8 accordingly. */
1277 String::to_utf8(int flags) const
1279 const unsigned char *s = ubegin(), *ends = uend();
1281 if ((s[0] == 0xFF && s[1] == 0xFE)
1282 || (s[0] > 0 && s[0] < 0x80 && s[1] == 0))
1283 return utf16le_to_utf8(flags);
1284 else if ((s[0] == 0xFE && s[1] == 0xFF)
1285 || (s[0] == 0 && s[1] > 0 && s[1] < 0x80))
1286 return utf16be_to_utf8(flags);
1289 const unsigned char *last = s, *t;
1290 if (ends - s > 3 && (flags & utf_strip_bom))
1291 s = last = skip_utf8_bom(s, ends);
1293 bool any_long_utf8 = false;
1296 if (likely(*s > 0 && *s < 128))
1298 else if (likely((t = skip_utf8_char(s, ends)) != s)) {
1299 any_long_utf8 = true;
1301 } else if ((c = parse_cesu8_char(s, ends)) != 0) {
1302 any_long_utf8 = true;
1307 } else if (*s == 0) {
1311 } else if (!any_long_utf8)
1317 c = windows1252_c1_mapping[c - 128];
1325 if (last == ubegin())
1328 sa.append(last, ends);
1329 return sa.take_string();
1334 if (likely(*s > 0 && *s < 128))
1341 c = windows1252_c1_mapping[c - 128];
1351 /** @brief Return this string's contents encoded for JSON.
1352 @pre *this is encoded in UTF-8.
1354 For instance, String("a\"").encode_json() == "a\\\"". Note that the
1355 double-quote characters that usually surround a JSON string are not
1357 String String::encode_json() const {
1359 const char* last = encode_json_partial(sa);
1360 if (last == begin())
1363 sa.append(last, end());
1364 return sa.take_string();
1368 String String::encode_base64(bool pad) const {
1370 encode_base64(sa, pad);
1371 return sa.take_string();
1374 String String::decode_base64() const {
1376 if (!decode_base64(sa))
1378 return sa.take_string();