--- /dev/null
+/* Masstree
+ * Eddie Kohler, Yandong Mao, Robert Morris
+ * Copyright (c) 2012-2013 President and Fellows of Harvard College
+ * Copyright (c) 2012-2013 Massachusetts Institute of Technology
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, subject to the conditions
+ * listed in the Masstree LICENSE file. These conditions include: you must
+ * preserve this copyright notice, and you cannot mention the copyright
+ * holders in advertising related to the Software without their permission.
+ * The Software is provided WITHOUT ANY WARRANTY, EXPRESS OR IMPLIED. This
+ * notice is a summary of the Masstree LICENSE file; the license in that file
+ * is legally binding.
+ */
+/*
+ * string.{cc,hh} -- a String class with shared substrings
+ * Eddie Kohler
+ *
+ * Copyright (c) 1999-2000 Massachusetts Institute of Technology
+ * Copyright (c) 2001-2013 Eddie Kohler
+ * Copyright (c) 2008-2009 Meraki, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, subject to the conditions
+ * listed in the Click LICENSE file. These conditions include: you must
+ * preserve this copyright notice, and you cannot mention the copyright
+ * holders in advertising related to the Software without their permission.
+ * The Software is provided WITHOUT ANY WARRANTY, EXPRESS OR IMPLIED. This
+ * notice is a summary of the Click LICENSE file; the license in that file
+ * is legally binding.
+ */
+
+#include "string.hh"
+#include "straccum.hh"
+#include <stdio.h>
+#include <string.h>
+#include <ctype.h>
+#include <inttypes.h>
+#include <vector>
+namespace lcdf {
+
+/** @file string.hh
+ * @brief The LCDF String class.
+ */
+
+/** @class String
+ * @brief A string of characters.
+ *
+ * The String class represents a string of characters. Strings may be
+ * constructed from C strings, characters, numbers, and so forth. They may
+ * also be added together. The underlying character arrays are dynamically
+ * allocated; String operations allocate and free memory as needed. A String
+ * and its substrings generally share memory. Accessing a character by index
+ * takes O(1) time; so does creating a substring.
+ *
+ * <h3>Out-of-memory strings</h3>
+ *
+ * When there is not enough memory to create a particular string, a special
+ * "out-of-memory" string is returned instead. Out-of-memory strings are
+ * contagious: the result of any concatenation operation involving an
+ * out-of-memory string is another out-of-memory string. Thus, the final
+ * result of a series of String operations will be an out-of-memory string,
+ * even if the out-of-memory condition occurs in the middle.
+ *
+ * The canonical out-of-memory string is 14 bytes long, and equals the UTF-8
+ * encoding of "\U0001F4A3ENOMEM\U0001F4A3" (that is, U+1F4A3 BOMB +
+ * "ENOMEM" + U+1F4A3 BOMB). This sequence is unlikely to show up in normal
+ * text, compares high relative to most other textual strings, and is valid
+ * UTF-8.
+ *
+ * All canonical out-of-memory strings are equal and share the same data(),
+ * which is different from the data() of any other string. See
+ * String::out_of_memory_data(). The String::make_out_of_memory() function
+ * returns a canonical out-of-memory string.
+ *
+ * Other strings may also be out-of-memory strings. For example,
+ * String::make_stable(String::out_of_memory_data()) ==
+ * String::make_out_of_memory(), and some (but not all) substrings of
+ * out-of-memory strings are also out-of-memory strings.
+ */
+
+const char String_generic::empty_data[] = "";
+// oom_data is the UTF-8 encoding of U+1F4A3 BOMB + "ENOMEM" + U+1F4A3 BOMB
+const char String_generic::out_of_memory_data[] = "\360\237\222\243ENOMEM\360\237\222\243";
+const char String_generic::bool_data[] = "false\0true";
+const char String_generic::base64_encoding_table[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
+const unsigned char String_generic::base64_decoding_map[] =
+ "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
+ "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
+ "\0\0\0\0\0\0\0\0\0\0\0\x3F\0\0\0\x40"
+ "\x35\x36\x37\x38\x39\x3A\x3B\x3C\x3D\x3E\0\0\0\0\0\0"
+ "\0\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0A\x0B\x0C\x0D\x0E\x0F"
+ "\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\0\0\0\0\0"
+ "\0\x1B\x1C\x1D\x1E\x1F\x20\x21\x22\x23\x24\x25\x26\x27\x28\x29"
+ "\x2A\x2B\x2C\x2D\x2E\x2F\x30\x31\x32\x33\x34\0\0\0\0\0"
+ "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
+ "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
+ "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
+ "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
+ "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
+ "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
+ "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
+ "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0";
+const char String::int_data[] = "0\0001\0002\0003\0004\0005\0006\0007\0008\0009";
+
+#if HAVE_STRING_PROFILING > 1
+# define MEMO_INITIALIZER_TAIL , 0, 0
+#else
+# define MEMO_INITIALIZER_TAIL
+#endif
+
+const String::rep_type String::null_string_rep = {
+ String_generic::empty_data, 0, 0
+};
+const String::rep_type String::oom_string_rep = {
+ String_generic::out_of_memory_data, String_generic::out_of_memory_length, 0
+};
+const String::rep_type String::zero_string_rep = {
+ &int_data[0], 0, 0
+};
+
+#if HAVE_STRING_PROFILING
+uint64_t String::live_memo_count;
+uint64_t String::memo_sizes[55];
+uint64_t String::live_memo_sizes[55];
+uint64_t String::live_memo_bytes[55];
+# if HAVE_STRING_PROFILING > 1
+String::memo_type *String::live_memos[55];
+# endif
+#endif
+
+int
+String_generic::compare(const char* a, int a_len, const char* b, int b_len)
+{
+ if (a != b) {
+ int len = a_len < b_len ? a_len : b_len;
+ int cmp = memcmp(a, b, len);
+ if (cmp != 0)
+ return cmp;
+ }
+ return a_len - b_len;
+}
+
+int String_generic::natural_compare(const char* a, int a_len,
+ const char* b, int b_len) {
+ const char* ae = a + a_len;
+ const char* be = b + b_len;
+ const char* aperiod = 0;
+ bool aperiod_negative = false;
+ int raw_compare = 0;
+
+ while (a < ae && b < be) {
+ if (isdigit((unsigned char) *a) && isdigit((unsigned char) *b)) {
+ // compare the two numbers, but treat them as strings
+ // (a decimal conversion might cause overflow)
+ bool potential_decimal = (a == aperiod);
+
+ // check if both are negative (note that if we get here, entire
+ // string prefixes are identical)
+ bool negative = false;
+ if (a > ae - a_len && a[-1] == '-'
+ && (a == ae - a_len + 1
+ || isspace((unsigned char) a[-2])))
+ negative = true;
+
+ // skip initial '0's, but remember any difference in length
+ const char *ia = a, *ib = b;
+ while (a < ae && *a == '0')
+ ++a;
+ while (b < be && *b == '0')
+ ++b;
+ int longer_zeros = (a - ia) - (b - ib);
+
+ // walk over digits, remembering first nonidentical digit comparison
+ int digit_compare = 0;
+ bool a_good, b_good;
+ while (1) {
+ a_good = a < ae && isdigit((unsigned char) *a);
+ b_good = b < be && isdigit((unsigned char) *b);
+ if (!a_good || !b_good)
+ break;
+ if (digit_compare == 0)
+ digit_compare = *a - *b;
+ ++a;
+ ++b;
+ }
+
+ // real number comparison: leading zeros are significant,
+ // digit comparisons take precedence
+ if (potential_decimal) {
+ const char *ax = a, *bx = b;
+ while (ax < ae && isdigit((unsigned char) *ax))
+ ++ax;
+ while (bx < be && isdigit((unsigned char) *bx))
+ ++bx;
+ // watch for IP addresses: don't treat "0.2." like a decimal
+ if (!(ax + 1 < ae && *ax == '.' && !isspace((unsigned char) ax[1]))
+ && !(bx + 1 < be && *bx == '.' && !isspace((unsigned char) bx[1]))) {
+ negative = aperiod_negative;
+ if (longer_zeros)
+ return negative ? 1 : -1;
+ if (digit_compare)
+ a_good = b_good;
+ }
+ }
+ // if one number is longer, it must also be larger
+ if (a_good != b_good)
+ return negative == a_good ? -1 : 1;
+ // otherwise, digit comparisons take precedence
+ if (digit_compare)
+ return negative == (digit_compare > 0) ? -1 : 1;
+ // as a last resort, the longer string of zeros is greater
+ if (longer_zeros)
+ return longer_zeros;
+ // prepare for potential decimal comparison later
+ if (!aperiod) {
+ a_good = a + 1 < ae && *a == '.'
+ && isdigit((unsigned char) a[1]);
+ b_good = b + 1 < be && *b == '.'
+ && isdigit((unsigned char) b[1]);
+ if (a_good != b_good)
+ return negative == b_good ? 1 : -1;
+ else if (a_good) {
+ aperiod = a + 1;
+ aperiod_negative = negative;
+ }
+ }
+
+ // if we get here, the numeric portions were byte-for-byte
+ // identical; move on
+ } else if (isdigit((unsigned char) *a))
+ return isalpha((unsigned char) *b) ? -1 : 1;
+ else if (isdigit((unsigned char) *b))
+ return isalpha((unsigned char) *a) ? 1 : -1;
+ else {
+ int alower = (unsigned char) tolower((unsigned char) *a);
+ int blower = (unsigned char) tolower((unsigned char) *b);
+ if (alower != blower)
+ return alower - blower;
+ if (raw_compare == 0)
+ raw_compare = (unsigned char) *a - (unsigned char) *b;
+ if (*a != '.')
+ aperiod = 0;
+ ++a;
+ ++b;
+ }
+ }
+
+ if ((ae - a) != (be - b))
+ return (ae - a) - (be - b);
+ else
+ return raw_compare;
+}
+
+hashcode_t
+String_generic::hashcode(const char *s, int len)
+{
+ if (len <= 0)
+ return 0;
+
+ uint32_t hash = len;
+ int rem = hash & 3;
+ const char *end = s + len - rem;
+ uint32_t last16;
+
+#if !HAVE_INDIFFERENT_ALIGNMENT
+ if (!(reinterpret_cast<uintptr_t>(s) & 1)) {
+#endif
+#define get16(p) (*reinterpret_cast<const uint16_t *>((p)))
+ for (; s != end; s += 4) {
+ hash += get16(s);
+ uint32_t tmp = (get16(s + 2) << 11) ^ hash;
+ hash = (hash << 16) ^ tmp;
+ hash += hash >> 11;
+ }
+ if (rem >= 2) {
+ last16 = get16(s);
+ goto rem2;
+ }
+#undef get16
+#if !HAVE_INDIFFERENT_ALIGNMENT
+ } else {
+# if !__i386__ && !__x86_64__ && !__arch_um__
+# define get16(p) (((unsigned char) (p)[0] << 8) + (unsigned char) (p)[1])
+# else
+# define get16(p) ((unsigned char) (p)[0] + ((unsigned char) (p)[1] << 8))
+# endif
+ // should be exactly the same as the code above
+ for (; s != end; s += 4) {
+ hash += get16(s);
+ uint32_t tmp = (get16(s + 2) << 11) ^ hash;
+ hash = (hash << 16) ^ tmp;
+ hash += hash >> 11;
+ }
+ if (rem >= 2) {
+ last16 = get16(s);
+ goto rem2;
+ }
+# undef get16
+ }
+#endif
+
+ /* Handle end cases */
+ if (0) { // weird organization avoids uninitialized
+ rem2: // variable warnings
+ if (rem == 3) {
+ hash += last16;
+ hash ^= hash << 16;
+ hash ^= ((unsigned char) s[2]) << 18;
+ hash += hash >> 11;
+ } else {
+ hash += last16;
+ hash ^= hash << 11;
+ hash += hash >> 17;
+ }
+ } else if (rem == 1) {
+ hash += (unsigned char) *s;
+ hash ^= hash << 10;
+ hash += hash >> 1;
+ }
+
+ /* Force "avalanching" of final 127 bits */
+ hash ^= hash << 3;
+ hash += hash >> 5;
+ hash ^= hash << 4;
+ hash += hash >> 17;
+ hash ^= hash << 25;
+ hash += hash >> 6;
+
+ return hash;
+}
+
+int
+String_generic::find_left(const char *s, int len, int start,
+ char x)
+{
+ if (start < 0)
+ start = 0;
+ for (int i = start; i < len; ++i)
+ if (s[i] == x)
+ return i;
+ return -1;
+}
+
+int
+String_generic::find_left(const char *s, int len, int start,
+ const char *x, int x_len)
+{
+ if (start < 0)
+ start = 0;
+ if (x_len == 0)
+ return start <= len ? start : -1;
+ int max_pos = len - x_len;
+ for (int i = start; i <= max_pos; ++i)
+ if (memcmp(s + i, x, x_len) == 0)
+ return i;
+ return -1;
+}
+
+int
+String_generic::find_right(const char *s, int len, int start,
+ char x)
+{
+ if (start >= len)
+ start = len - 1;
+ for (int i = start; i >= 0; --i)
+ if (s[i] == x)
+ return i;
+ return -1;
+}
+
+int
+String_generic::find_right(const char *s, int len, int start,
+ const char *x, int x_len)
+{
+ if (start >= len)
+ start = len - x_len;
+ if (x_len == 0)
+ return start >= 0 ? start : -1;
+ for (int i = start; i >= 0; --i)
+ if (memcmp(s + i, x, x_len) == 0)
+ return i;
+ return -1;
+}
+
+long String_generic::to_i(const char* s, const char* ends) {
+ bool neg;
+ if (s != ends && (s[0] == '-' || s[0] == '+')) {
+ neg = s[0] == '-';
+ ++s;
+ } else
+ neg = false;
+ if (s == ends || !isdigit((unsigned char) *s))
+ return 0;
+ unsigned long x = (unsigned char) *s - '0';
+ for (++s; s != ends && isdigit((unsigned char) *s); ++s)
+ x = x * 10 + *s - '0'; // XXX overflow
+ return neg ? -x : x;
+}
+
+bool String_generic::glob_match(const char* sbegin, int slen,
+ const char* pbegin, int plen) {
+ const char* send = sbegin + slen;
+ const char* pend = pbegin + plen;
+
+ // quick common-case check for suffix matches
+ while (pbegin < pend && sbegin < send
+ && pend[-1] != '*' && pend[-1] != '?' && pend[-1] != ']'
+ && (pbegin + 1 == pend || pend[-2] != '\\'))
+ if (pend[-1] == send[-1])
+ --pend, --send;
+ else
+ return false;
+
+ std::vector<const char*> state, nextstate;
+ state.push_back(pbegin);
+
+ for (const char* s = sbegin; s != send && state.size(); ++s) {
+ nextstate.clear();
+ for (const char** pp = state.data(); pp != state.data() + state.size(); ++pp)
+ if (*pp != pend) {
+ reswitch:
+ switch (**pp) {
+ case '?':
+ nextstate.push_back(*pp + 1);
+ break;
+ case '*':
+ if (*pp + 1 == pend)
+ return true;
+ if (nextstate.empty() || nextstate.back() != *pp)
+ nextstate.push_back(*pp);
+ ++*pp;
+ goto reswitch;
+ case '\\':
+ if (*pp + 1 != pend)
+ ++*pp;
+ goto normal_char;
+ case '[': {
+ const char *ec = *pp + 1;
+ bool negated;
+ if (ec != pend && *ec == '^') {
+ negated = true;
+ ++ec;
+ } else
+ negated = false;
+ if (ec == pend)
+ goto normal_char;
+
+ bool found = false;
+ do {
+ if (*++ec == *s)
+ found = true;
+ } while (ec != pend && *ec != ']');
+ if (ec == pend)
+ goto normal_char;
+
+ if (found == !negated)
+ nextstate.push_back(ec + 1);
+ break;
+ }
+ normal_char:
+ default:
+ if (**pp == *s)
+ nextstate.push_back(*pp + 1);
+ break;
+ }
+ }
+ state.swap(nextstate);
+ }
+
+ for (const char** pp = state.data(); pp != state.data() + state.size(); ++pp) {
+ while (*pp != pend && **pp == '*')
+ ++*pp;
+ if (*pp == pend)
+ return true;
+ }
+ return false;
+}
+
+
+/** @cond never */
+#if HAVE_STRING_PROFILING
+void String::memo_type::account_new() {
+ int bucket = profile_memo_size_bucket(this->dirty, this->capacity);
+ ++memo_sizes[bucket];
+ ++live_memo_sizes[bucket];
+ live_memo_bytes[bucket] += this->capacity;
+ ++live_memo_count;
+# if HAVE_STRING_PROFILING > 1
+ this->pprev = &live_memos[bucket];
+ if ((this->next = *this->pprev))
+ this->next->pprev = &this->next;
+ *this->pprev = memo;
+# endif
+}
+
+void String::memo_type::account_destroy() {
+ int bucket = profile_memo_size_bucket(this->dirty, this->capacity);
+ --live_memo_sizes[bucket];
+ live_memo_bytes[bucket] -= this->capacity;
+ --live_memo_count;
+# if HAVE_STRING_PROFILING > 1
+ if ((*this->pprev = this->next))
+ this->next->pprev = this->pprev;
+# endif
+}
+#endif
+
+inline String::memo_type* String::create_memo(int capacity, int dirty) {
+ assert(capacity > 0 && capacity >= dirty);
+ memo_type *memo =
+ reinterpret_cast<memo_type *>(new char[capacity + MEMO_SPACE]);
+ if (memo)
+ memo->initialize(capacity, dirty);
+ return memo;
+}
+
+void
+String::delete_memo(memo_type *memo)
+{
+ assert(memo->capacity > 0);
+ assert(memo->capacity >= memo->dirty);
+ memo->account_destroy();
+ delete[] reinterpret_cast<char*>(memo);
+}
+
+
+#if HAVE_STRING_PROFILING
+void
+String::one_profile_report(StringAccum &sa, int i, int examples)
+{
+ if (i <= 16)
+ sa << "memo_dirty_" << i;
+ else if (i < 25) {
+ uint32_t s = (i - 17) * 2 + 17;
+ sa << "memo_cap_" << s << '_' << (s + 1);
+ } else if (i < 29) {
+ uint32_t s = (i - 25) * 8 + 33;
+ sa << "memo_cap_" << s << '_' << (s + 7);
+ } else {
+ uint32_t s1 = (1U << (i - 23)) + 1;
+ uint32_t s2 = (s1 - 1) << 1;
+ sa << "memo_cap_" << s1 << '_' << s2;
+ }
+ sa << '\t' << live_memo_sizes[i] << '\t' << memo_sizes[i] << '\t' << live_memo_bytes[i] << '\n';
+ if (examples) {
+# if HAVE_STRING_PROFILING > 1
+ for (memo_type *m = live_memos[i]; m; m = m->next) {
+ sa << " [" << m->dirty << "] ";
+ uint32_t dirty = m->dirty;
+ if (dirty > 0 && m->real_data[dirty - 1] == '\0')
+ --dirty;
+ sa.append(m->real_data, dirty > 128 ? 128 : dirty);
+ sa << '\n';
+ }
+# endif
+ }
+}
+
+void
+String::profile_report(StringAccum &sa, int examples)
+{
+ uint64_t all_live_sizes = 0, all_sizes = 0, all_live_bytes = 0;
+ for (int i = 0; i < 55; ++i) {
+ if (memo_sizes[i])
+ one_profile_report(sa, i, examples);
+ all_live_sizes += live_memo_sizes[i];
+ all_sizes += memo_sizes[i];
+ all_live_bytes += live_memo_bytes[i];
+ }
+ sa << "memo_total\t" << all_live_sizes << '\t' << all_sizes << '\t' << all_live_bytes << '\n';
+}
+#endif
+
+/** @endcond never */
+
+
+/** @brief Construct a base-10 string representation of @a x. */
+String::String(int x)
+{
+ if (x >= 0 && x < 10)
+ _r.assign(int_data + 2 * x, 1, 0);
+ else {
+ char buf[128];
+ sprintf(buf, "%d", x);
+ assign(buf, -1, false);
+ }
+}
+
+/** @overload */
+String::String(unsigned x)
+{
+ if (x < 10)
+ _r.assign(int_data + 2 * x, 1, 0);
+ else {
+ char buf[128];
+ sprintf(buf, "%u", x);
+ assign(buf, -1, false);
+ }
+}
+
+/** @overload */
+String::String(long x)
+{
+ if (x >= 0 && x < 10)
+ _r.assign(int_data + 2 * x, 1, 0);
+ else {
+ char buf[128];
+ sprintf(buf, "%ld", x);
+ assign(buf, -1, false);
+ }
+}
+
+/** @overload */
+String::String(unsigned long x)
+{
+ if (x < 10)
+ _r.assign(int_data + 2 * x, 1, 0);
+ else {
+ char buf[128];
+ sprintf(buf, "%lu", x);
+ assign(buf, -1, false);
+ }
+}
+
+/** @overload */
+String::String(long long x)
+{
+ if (x >= 0 && x < 10)
+ _r.assign(int_data + 2 * x, 1, 0);
+ else {
+ char buf[128];
+ sprintf(buf, "%lld", x);
+ assign(buf, -1, false);
+ }
+}
+
+/** @overload */
+String::String(unsigned long long x)
+{
+ if (x < 10)
+ _r.assign(int_data + 2 * x, 1, 0);
+ else {
+ char buf[128];
+ sprintf(buf, "%llu", x);
+ assign(buf, -1, false);
+ }
+}
+
+String::String(double x)
+{
+ char buf[128];
+ int len = sprintf(buf, "%.12g", x);
+ assign(buf, len, false);
+}
+
+String
+String::hard_make_stable(const char *s, int len)
+{
+ if (len < 0)
+ len = strlen(s);
+ return String(s, len, 0);
+}
+
+String
+String::make_fill(int c, int len)
+{
+ String s;
+ s.append_fill(c, len);
+ return s;
+}
+
+void
+String::assign_out_of_memory()
+{
+ _r.deref();
+ _r = oom_string_rep;
+}
+
+void
+String::assign(const char *s, int len, bool need_deref)
+{
+ if (!s) {
+ assert(len <= 0);
+ len = 0;
+ } else if (len < 0)
+ len = strlen(s);
+
+ // need to start with dereference
+ memo_type* m;
+ if (need_deref) {
+ if (unlikely((m = _r.memo())
+ && s >= m->real_data
+ && s + len <= m->real_data + m->capacity)) {
+ // Be careful about "String s = ...; s = s.c_str();"
+ _r.assign_noref(s, len, m);
+ return;
+ } else
+ deref();
+ }
+
+ if (len == 0) {
+ m = 0;
+ s = String_generic::empty_data;
+
+ } else {
+ // Make the memo a multiple of 16 characters and bigger than 'len'.
+ int memo_capacity = (len + 15 + MEMO_SPACE) & ~15;
+ m = create_memo(memo_capacity - MEMO_SPACE, len);
+ if (!m) {
+ _r.reset_ref();
+ assign_out_of_memory();
+ return;
+ }
+ memcpy(m->real_data, s, len);
+ s = m->real_data;
+ }
+
+ _r.assign_noref(s, len, m);
+}
+
+/** @brief Append @a len unknown characters to this string.
+ * @return Modifiable pointer to the appended characters.
+ *
+ * The caller may safely modify the returned memory. Null is returned if
+ * the string becomes out-of-memory. */
+char *
+String::append_uninitialized(int len)
+{
+ // Appending anything to "out of memory" leaves it as "out of memory"
+ if (unlikely(len <= 0) || out_of_memory())
+ return 0;
+
+ // If we can, append into unused space. First, we check that there's
+ // enough unused space for 'len' characters to fit; then, we check
+ // that the unused space immediately follows the data in '*this'.
+ uint32_t dirty;
+ memo_type* m = _r.memo();
+ if (m && ((dirty = m->dirty), m->capacity > dirty + len)) {
+ char *real_dirty = m->real_data + dirty;
+ if (real_dirty == _r.data + _r.length) {
+ m->dirty = dirty + len;
+ _r.length += len;
+ assert(m->dirty < m->capacity);
+#if HAVE_STRING_PROFILING
+ profile_update_memo_dirty(m, dirty, dirty + len, m->capacity);
+#endif
+ return real_dirty;
+ }
+ }
+
+ // Now we have to make new space. Make sure the memo is a multiple of 16
+ // bytes and that it is at least 16. But for large strings, allocate a
+ // power of 2, since power-of-2 sizes minimize waste in frequently-used
+ // allocators, like Linux kmalloc.
+ int want_memo_len = _r.length + len + MEMO_SPACE;
+ int memo_capacity;
+ if (want_memo_len <= 1024)
+ memo_capacity = (want_memo_len + 15) & ~15;
+ else
+ for (memo_capacity = 2048; memo_capacity < want_memo_len; )
+ memo_capacity *= 2;
+
+ m = create_memo(memo_capacity - MEMO_SPACE, _r.length + len);
+ if (!m) {
+ assign_out_of_memory();
+ return 0;
+ }
+
+ char *new_data = m->real_data;
+ memcpy(new_data, _r.data, _r.length);
+
+ deref();
+ _r.assign_noref(new_data, _r.length + len, m);
+ return const_cast<char*>(_r.data + _r.length - len);
+}
+
+void
+String::append(const char* s, int len, memo_type* memo)
+{
+ if (!s) {
+ assert(len <= 0);
+ len = 0;
+ } else if (len < 0)
+ len = strlen(s);
+
+ memo_type* my_memo;
+ if (unlikely(len == 0) || out_of_memory())
+ /* do nothing */;
+ else if (unlikely(s == String_generic::out_of_memory_data) && !memo)
+ // Appending "out of memory" to a regular string makes it "out of
+ // memory"
+ assign_out_of_memory();
+ else if (_r.length == 0 && reinterpret_cast<uintptr_t>(memo) > 1) {
+ deref();
+ _r.assign(s, len, memo);
+ } else if (likely(!((my_memo = _r.memo())
+ && s >= my_memo->real_data
+ && s + len <= my_memo->real_data + my_memo->capacity))) {
+ if (char *space = append_uninitialized(len))
+ memcpy(space, s, len);
+ } else {
+ String preserve_s(*this);
+ if (char *space = append_uninitialized(len))
+ memcpy(space, s, len);
+ }
+}
+
+/** @brief Append @a len copies of character @a c to this string. */
+void
+String::append_fill(int c, int len)
+{
+ assert(len >= 0);
+ if (char *space = append_uninitialized(len))
+ memset(space, c, len);
+}
+
+/** @brief Ensure the string's data is unshared and return a mutable
+ pointer to it. */
+char *
+String::mutable_data()
+{
+ // If _memo has a capacity (it's not one of the special strings) and it's
+ // uniquely referenced, return _data right away.
+ if (!is_shared())
+ return const_cast<char *>(_r.data);
+
+ // Otherwise, make a copy of it. Rely on: deref() doesn't change _data or
+ // _length; and if _capacity == 0, then deref() doesn't free _real_data.
+ // But in multithreaded situations we must hold a local copy of memo!
+ String do_not_delete_underlying_memo(*this);
+ deref();
+ assign(_r.data, _r.length, false);
+ return const_cast<char *>(_r.data);
+}
+
+const char *
+String::hard_c_str() const
+{
+ // See also c_str().
+ // We may already have a '\0' in the right place. If _memo has no
+ // capacity, then this is one of the special strings (null or
+ // stable). We are guaranteed, in these strings, that _data[_length]
+ // exists. Otherwise must check that _data[_length] exists.
+ const char *end_data = _r.data + _r.length;
+ memo_type* m = _r.memo();
+ if ((m && end_data >= m->real_data + m->dirty)
+ || *end_data != '\0') {
+ if (char *x = const_cast<String *>(this)->append_uninitialized(1)) {
+ *x = '\0';
+ --_r.length;
+ }
+ }
+ return _r.data;
+}
+
+/** @brief Null-terminate the string and return a mutable pointer to its
+ data.
+ @sa String::c_str */
+char *
+String::mutable_c_str()
+{
+ (void) mutable_data();
+ (void) c_str();
+ return const_cast<char *>(_r.data);
+}
+
+/** @brief Return a substring of this string, consisting of the @a len
+ characters starting at index @a pos.
+ @param pos substring's first position relative to the string
+ @param len length of substring
+
+ If @a pos is negative, starts that far from the end of the string. If @a
+ len is negative, leaves that many characters off the end of the string.
+ If @a pos and @a len specify a substring that is partly outside the
+ string, only the part within the string is returned. If the substring is
+ beyond either end of the string, returns an empty string (but this
+ should be considered a programming error; a future version may generate
+ a warning for this case).
+
+ @note String::substr() is intended to behave like Perl's substr(). */
+String
+String::substr(int pos, int len) const
+{
+ if (pos < 0)
+ pos += _r.length;
+
+ int pos2;
+ if (len < 0)
+ pos2 = _r.length + len;
+ else if (pos >= 0 && len >= _r.length) // avoid integer overflow
+ pos2 = _r.length;
+ else
+ pos2 = pos + len;
+
+ if (pos < 0)
+ pos = 0;
+ if (pos2 > _r.length)
+ pos2 = _r.length;
+
+ if (pos >= pos2)
+ return String();
+ else {
+ _r.ref();
+ return String(_r.data + pos, pos2 - pos, _r.memo());
+ }
+}
+
+static String
+hard_lower(const String &s, int pos)
+{
+ String new_s(s.data(), s.length());
+ char *x = const_cast<char *>(new_s.data()); // know it's mutable
+ int len = s.length();
+ for (; pos < len; pos++)
+ x[pos] = tolower((unsigned char) x[pos]);
+ return new_s;
+}
+
+/** @brief Return a lowercased version of this string.
+
+ Translates the ASCII characters 'A' through 'Z' into their lowercase
+ equivalents. */
+String
+String::lower() const
+{
+ // avoid copies
+ if (!out_of_memory())
+ for (int i = 0; i < _r.length; i++)
+ if (_r.data[i] >= 'A' && _r.data[i] <= 'Z')
+ return hard_lower(*this, i);
+ return *this;
+}
+
+static String
+hard_upper(const String &s, int pos)
+{
+ String new_s(s.data(), s.length());
+ char *x = const_cast<char *>(new_s.data()); // know it's mutable
+ int len = s.length();
+ for (; pos < len; pos++)
+ x[pos] = toupper((unsigned char) x[pos]);
+ return new_s;
+}
+
+/** @brief Return an uppercased version of this string.
+
+ Translates the ASCII characters 'a' through 'z' into their uppercase
+ equivalents. */
+String
+String::upper() const
+{
+ // avoid copies
+ for (int i = 0; i < _r.length; i++)
+ if (_r.data[i] >= 'a' && _r.data[i] <= 'z')
+ return hard_upper(*this, i);
+ return *this;
+}
+
+static String
+hard_printable(const String &s, int pos, int type)
+{
+ StringAccum sa(s.length() * 2);
+ sa.append(s.data(), pos);
+ const unsigned char *x = reinterpret_cast<const unsigned char *>(s.data());
+ int len = s.length();
+ for (; pos < len; pos++) {
+ if (type == 2 && (x[pos] == '\\' || x[pos] == '\"'))
+ sa << '\\' << x[pos];
+ else if (x[pos] >= 32 && x[pos] < 127)
+ sa << x[pos];
+ else if (x[pos] < 32 && type == 2) {
+ if (x[pos] >= 9 && x[pos] <= 13)
+ sa << '\\' << ("tnvfr"[x[pos] - 9]);
+ else if (char *buf = sa.extend(4, 1))
+ sprintf(buf, "\\%03o", x[pos]);
+ } else if (x[pos] < 32 && type != 1)
+ sa << '^' << (unsigned char)(x[pos] + 64);
+ else if (char *buf = sa.extend(4, 1))
+ sprintf(buf, "\\%03o", x[pos]);
+ }
+ return sa.take_string();
+}
+
+/** @brief Return a "printable" version of this string.
+ @param type quoting type
+
+ The default quoting type (0) translates control characters 0-31 into
+ "control" sequences, such as "^@" for the null character, and characters
+ 127-255 into octal escape sequences, such as "\377" for 255. Quoting
+ type 1 translates all characters outside of 32-126 into octal escape
+ sequences. Quoting type 2 uses C escapes, including "\\" and "\"". */
+String
+String::printable(int type) const
+{
+ // avoid copies
+ if (!out_of_memory())
+ for (int i = 0; i < _r.length; i++)
+ if (_r.data[i] < 32 || _r.data[i] > 126)
+ return hard_printable(*this, i, type);
+ return *this;
+}
+
+String String::to_hex() const {
+ StringAccum sa;
+ static const char hexval[] = "0123456789ABCDEF";
+ char* x = sa.extend(2 * _r.length);
+ for (int i = 0; i != _r.length; ++i) {
+ *x++ = hexval[(unsigned char) _r.data[i] >> 4];
+ *x++ = hexval[_r.data[i] & 15];
+ }
+ return sa.take_string();
+}
+
+/** @brief Return the substring with left whitespace removed. */
+String
+String::ltrim() const
+{
+ return String_generic::ltrim(*this);
+}
+
+/** @brief Return the substring with right whitespace removed. */
+String
+String::rtrim() const
+{
+ return String_generic::rtrim(*this);
+}
+
+/** @brief Return the substring with left and right whitespace removed. */
+String
+String::trim() const
+{
+ return String_generic::trim(*this);
+}
+
+void
+String::align(int n)
+{
+ int offset = reinterpret_cast<uintptr_t>(_r.data) % n;
+ if (offset) {
+ String s;
+ s.append_uninitialized(_r.length + n + 1);
+ offset = reinterpret_cast<uintptr_t>(s._r.data) % n;
+ memcpy((char *)s._r.data + n - offset, _r.data, _r.length);
+ s._r.data += n - offset;
+ s._r.length = _r.length;
+ *this = s;
+ }
+}
+
+/** @brief Return the pointer to the next character in UTF-8 encoding. */
+const unsigned char *
+String::skip_utf8_char(const unsigned char *s, const unsigned char *end)
+{
+ int c = *s;
+ if (c > 0 && c < 0x80)
+ return s + 1;
+ else if (c < 0xC2)
+ /* zero, or bad/overlong encoding */;
+ else if (c < 0xE0) { // 2 bytes: U+80-U+7FF
+ if (likely(s + 1 < end
+ && s[1] >= 0x80 && s[1] < 0xC0))
+ return s + 2;
+ } else if (c < 0xF0) { // 3 bytes: U+800-U+FFFF
+ if (likely(s + 2 < end
+ && s[1] >= 0x80 && s[1] < 0xC0
+ && s[2] >= 0x80 && s[2] < 0xC0
+ && (c != 0xE0 || s[1] >= 0xA0) /* not overlong encoding */
+ && (c != 0xED || s[1] < 0xA0) /* not surrogate */))
+ return s + 3;
+ } else if (c < 0xF5) { // 4 bytes: U+10000-U+10FFFF
+ if (likely(s + 3 < end
+ && s[1] >= 0x80 && s[1] < 0xC0
+ && s[2] >= 0x80 && s[2] < 0xC0
+ && s[3] >= 0x80 && s[3] < 0xC0
+ && (c != 0xF0 || s[1] >= 0x90) /* not overlong encoding */
+ && (c != 0xF4 || s[1] < 0x90) /* not >U+10FFFF */))
+ return s + 4;
+ }
+ return s;
+}
+
+int
+String::parse_cesu8_char(const unsigned char *s, const unsigned char *end)
+{
+ if (s + 5 < end
+ && s[0] == 0xED
+ && s[1] >= 0xA0 && s[1] < 0xB0
+ && s[2] >= 0x80 && s[2] < 0xC0
+ && s[3] == 0xED
+ && s[4] >= 0xB0 && s[4] < 0xC0
+ && s[5] >= 0x80 && s[5] < 0xC0)
+ return 0x10000
+ + ((s[1] & 0x0F) << 16) + ((s[2] & 0x3F) << 10)
+ + ((s[4] & 0x0F) << 6) + (s[5] & 0x3F);
+ else
+ return 0;
+}
+
+static const uint16_t windows1252_c1_mapping[] = {
+ 0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
+ 0x20C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F,
+ 0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
+ 0x20DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178
+};
+
+String
+String::windows1252_to_utf8() const
+{
+ const unsigned char *s = ubegin(), *last = s, *ends = uend();
+ StringAccum sa;
+ for (; s != ends; ++s) {
+ int c = *s;
+ if (unlikely(c == 0)) {
+ sa.append(last, s);
+ last = s + 1;
+ } else if (likely(c < 128))
+ /* do nothing */;
+ else {
+ sa.append(last, s);
+ if (unlikely(c < 160))
+ c = windows1252_c1_mapping[c - 128];
+ sa.append_utf8(c);
+ last = s + 1;
+ }
+ }
+ if (last == ubegin())
+ return *this;
+ else {
+ sa.append(last, s);
+ return sa.take_string();
+ }
+}
+
+String
+String::utf16be_to_utf8(int flags) const
+{
+ const unsigned char *s = ubegin(), *ends = uend();
+ if ((ends - s) & 1)
+ --ends;
+ if ((flags & utf_strip_bom) && s != ends
+ && s[0] == 0xFE && s[1] == 0xFF)
+ s += 2;
+ StringAccum sa;
+ sa.reserve((ends - s) / 2);
+ for (; s != ends; s += 2) {
+ int c = s[0] * 256 + s[1];
+ if (likely(c < 0xD800) || c >= 0xE000)
+ sa.append_utf8(c);
+ else if (c < 0xDC00 && s + 2 != ends
+ && s[2] >= 0xDC && s[2] < 0xE0) {
+ c = 0x10000 + ((c - 0xD800) << 10)
+ + ((s[2] & 0x03) << 8) + s[3];
+ sa.append_utf8(c);
+ s += 2;
+ } else if (c && (flags & utf_replacement))
+ sa.append_utf8(u_replacement);
+ }
+ return sa.take_string();
+}
+
+String
+String::utf16le_to_utf8(int flags) const
+{
+ const unsigned char *s = ubegin(), *ends = uend();
+ if ((ends - s) & 1)
+ --ends;
+ if ((flags & utf_strip_bom) && s != ends
+ && s[1] == 0xFE && s[0] == 0xFF)
+ s += 2;
+ StringAccum sa;
+ sa.reserve((ends - s) / 2);
+ for (; s != ends; s += 2) {
+ int c = s[1] * 256 + s[0];
+ if (likely(c < 0xD800) || c >= 0xE000)
+ sa.append_utf8(c);
+ else if (c < 0xDC00 && s + 2 != ends
+ && s[3] >= 0xDC && s[3] < 0xE0) {
+ c = 0x10000 + ((c - 0xD800) << 10)
+ + ((s[3] & 0x03) << 8) + s[2];
+ sa.append_utf8(c);
+ s += 2;
+ } else if (c && (flags & utf_replacement))
+ sa.append_utf8(u_replacement);
+ }
+ return sa.take_string();
+}
+
+String
+String::utf16_to_utf8(int flags) const
+{
+ if (length() < 2)
+ return String();
+ const unsigned char *s = ubegin(), *ends = uend();
+ if (ends - s < 2)
+ return String();
+ int c = s[0] * 256 + s[1];
+ if (c == 0xFEFF || (c != 0xFFFE && !(flags & utf_prefer_le)))
+ return utf16be_to_utf8(flags);
+ else
+ return utf16le_to_utf8(flags);
+}
+
+String
+String::cesu8_to_utf8(int flags) const
+{
+ const unsigned char *s = ubegin(), *last = s, *t, *ends = uend();
+ StringAccum sa;
+ if (flags & utf_strip_bom)
+ s = last = skip_utf8_bom(s, ends);
+ while (s != ends) {
+ int c = s[0];
+ if (likely(c > 0 && c < 0x7F))
+ ++s;
+ else if (likely((t = skip_utf8_char(s, ends)) != s))
+ s = t;
+ else if ((c = parse_cesu8_char(s, ends))) {
+ sa.append(last, s);
+ sa.append_utf8(c);
+ s += 6;
+ last = s;
+ } else {
+ sa.append(last, s);
+ if ((flags & utf_replacement) && c != 0)
+ sa.append_utf8(u_replacement);
+ for (++s; s != ends && *s >= 0x80 && *s < 0xC0; ++s)
+ /* do nothing */;
+ last = s;
+ }
+ }
+ if (last == ubegin())
+ return *this;
+ else {
+ sa.append(last, s);
+ return sa.take_string();
+ }
+}
+
+String
+String::utf8_to_utf8(int flags) const
+{
+ const unsigned char *s = ubegin(), *last = s, *t, *ends = uend();
+ StringAccum sa;
+ if (flags & utf_strip_bom)
+ s = last = skip_utf8_bom(s, ends);
+ while (s != ends) {
+ int c = s[0];
+ if (likely(c > 0 && c < 0x7F))
+ ++s;
+ else if (likely((t = skip_utf8_char(s, ends)) != s))
+ s = t;
+ else {
+ sa.append(last, s);
+ if ((flags & utf_replacement) && c != 0)
+ sa.append_utf8(u_replacement);
+ for (++s; s != ends && *s >= 0x80 && *s < 0xC0; ++s)
+ /* do nothing */;
+ last = s;
+ }
+ }
+ if (last == ubegin())
+ return *this;
+ else {
+ sa.append(last, s);
+ return sa.take_string();
+ }
+}
+
+/** @brief Return a version of the string converted to UTF-8.
+
+ Null characters are dropped. Then, if the result contains invalid
+ UTF-8, the string is assumed to be Windows-1252 encoded, and is
+ converted to UTF-8 accordingly. */
+String
+String::to_utf8(int flags) const
+{
+ const unsigned char *s = ubegin(), *ends = uend();
+ if (ends - s > 2) {
+ if ((s[0] == 0xFF && s[1] == 0xFE)
+ || (s[0] > 0 && s[0] < 0x80 && s[1] == 0))
+ return utf16le_to_utf8(flags);
+ else if ((s[0] == 0xFE && s[1] == 0xFF)
+ || (s[0] == 0 && s[1] > 0 && s[1] < 0x80))
+ return utf16be_to_utf8(flags);
+ }
+
+ const unsigned char *last = s, *t;
+ if (ends - s > 3 && (flags & utf_strip_bom))
+ s = last = skip_utf8_bom(s, ends);
+ StringAccum sa;
+ bool any_long_utf8 = false;
+ int c;
+ while (s != ends) {
+ if (likely(*s > 0 && *s < 128))
+ ++s;
+ else if (likely((t = skip_utf8_char(s, ends)) != s)) {
+ any_long_utf8 = true;
+ s = t;
+ } else if ((c = parse_cesu8_char(s, ends)) != 0) {
+ any_long_utf8 = true;
+ sa.append(last, s);
+ sa.append_utf8(c);
+ s += 6;
+ last = s;
+ } else if (*s == 0) {
+ sa.append(last, s);
+ ++s;
+ last = s;
+ } else if (!any_long_utf8)
+ goto windows1252;
+ else {
+ sa.append(last, s);
+ int c = *s;
+ if (c < 160)
+ c = windows1252_c1_mapping[c - 128];
+ sa.append_utf8(c);
+ ++s;
+ last = s;
+ }
+ }
+
+ exit:
+ if (last == ubegin())
+ return *this;
+ else {
+ sa.append(last, ends);
+ return sa.take_string();
+ }
+
+ windows1252:
+ while (s != ends) {
+ if (likely(*s > 0 && *s < 128))
+ ++s;
+ else {
+ sa.append(last, s);
+ if (*s != 0) {
+ int c = *s;
+ if (c < 160)
+ c = windows1252_c1_mapping[c - 128];
+ sa.append_utf8(c);
+ }
+ ++s;
+ last = s;
+ }
+ }
+ goto exit;
+}
+
+/** @brief Return this string's contents encoded for JSON.
+ @pre *this is encoded in UTF-8.
+
+ For instance, String("a\"").encode_json() == "a\\\"". Note that the
+ double-quote characters that usually surround a JSON string are not
+ included. */
+String String::encode_json() const {
+ StringAccum sa;
+ const char* last = encode_json_partial(sa);
+ if (last == begin())
+ return *this;
+ else {
+ sa.append(last, end());
+ return sa.take_string();
+ }
+}
+
+String String::encode_base64(bool pad) const {
+ StringAccum sa;
+ encode_base64(sa, pad);
+ return sa.take_string();
+}
+
+String String::decode_base64() const {
+ StringAccum sa;
+ if (!decode_base64(sa))
+ return String();
+ return sa.take_string();
+}
+
+} // namespace lcdf