benchmark silo added

[c11concurrency-benchmarks.git] / silo / masstree / string.cc
diff --git a/silo/masstree/string.cc b/silo/masstree/string.cc

new file mode 100644 (file)

index 0000000..2ebe7e8
--- /dev/null
+++ b/silo/masstree/string.cc
@@ -0,0 +1,1381 @@
+/* Masstree
+ * Eddie Kohler, Yandong Mao, Robert Morris
+ * Copyright (c) 2012-2013 President and Fellows of Harvard College
+ * Copyright (c) 2012-2013 Massachusetts Institute of Technology
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, subject to the conditions
+ * listed in the Masstree LICENSE file. These conditions include: you must
+ * preserve this copyright notice, and you cannot mention the copyright
+ * holders in advertising related to the Software without their permission.
+ * The Software is provided WITHOUT ANY WARRANTY, EXPRESS OR IMPLIED. This
+ * notice is a summary of the Masstree LICENSE file; the license in that file
+ * is legally binding.
+ */
+/*
+ * string.{cc,hh} -- a String class with shared substrings
+ * Eddie Kohler
+ *
+ * Copyright (c) 1999-2000 Massachusetts Institute of Technology
+ * Copyright (c) 2001-2013 Eddie Kohler
+ * Copyright (c) 2008-2009 Meraki, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, subject to the conditions
+ * listed in the Click LICENSE file. These conditions include: you must
+ * preserve this copyright notice, and you cannot mention the copyright
+ * holders in advertising related to the Software without their permission.
+ * The Software is provided WITHOUT ANY WARRANTY, EXPRESS OR IMPLIED. This
+ * notice is a summary of the Click LICENSE file; the license in that file
+ * is legally binding.
+ */
+
+#include "string.hh"
+#include "straccum.hh"
+#include <stdio.h>
+#include <string.h>
+#include <ctype.h>
+#include <inttypes.h>
+#include <vector>
+namespace lcdf {
+
+/** @file string.hh
+ * @brief The LCDF String class.
+ */
+
+/** @class String
+ * @brief A string of characters.
+ *
+ * The String class represents a string of characters.  Strings may be
+ * constructed from C strings, characters, numbers, and so forth.  They may
+ * also be added together.  The underlying character arrays are dynamically
+ * allocated; String operations allocate and free memory as needed.  A String
+ * and its substrings generally share memory.  Accessing a character by index
+ * takes O(1) time; so does creating a substring.
+ *
+ * <h3>Out-of-memory strings</h3>
+ *
+ * When there is not enough memory to create a particular string, a special
+ * "out-of-memory" string is returned instead. Out-of-memory strings are
+ * contagious: the result of any concatenation operation involving an
+ * out-of-memory string is another out-of-memory string. Thus, the final
+ * result of a series of String operations will be an out-of-memory string,
+ * even if the out-of-memory condition occurs in the middle.
+ *
+ * The canonical out-of-memory string is 14 bytes long, and equals the UTF-8
+ * encoding of "\U0001F4A3ENOMEM\U0001F4A3" (that is, U+1F4A3 BOMB +
+ * "ENOMEM" + U+1F4A3 BOMB). This sequence is unlikely to show up in normal
+ * text, compares high relative to most other textual strings, and is valid
+ * UTF-8.
+ *
+ * All canonical out-of-memory strings are equal and share the same data(),
+ * which is different from the data() of any other string. See
+ * String::out_of_memory_data(). The String::make_out_of_memory() function
+ * returns a canonical out-of-memory string.
+ *
+ * Other strings may also be out-of-memory strings. For example,
+ * String::make_stable(String::out_of_memory_data()) ==
+ * String::make_out_of_memory(), and some (but not all) substrings of
+ * out-of-memory strings are also out-of-memory strings.
+ */
+
+const char String_generic::empty_data[] = "";
+// oom_data is the UTF-8 encoding of U+1F4A3 BOMB + "ENOMEM" + U+1F4A3 BOMB
+const char String_generic::out_of_memory_data[] = "\360\237\222\243ENOMEM\360\237\222\243";
+const char String_generic::bool_data[] = "false\0true";
+const char String_generic::base64_encoding_table[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
+const unsigned char String_generic::base64_decoding_map[] =
+        "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
+        "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
+        "\0\0\0\0\0\0\0\0\0\0\0\x3F\0\0\0\x40"
+        "\x35\x36\x37\x38\x39\x3A\x3B\x3C\x3D\x3E\0\0\0\0\0\0"
+        "\0\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0A\x0B\x0C\x0D\x0E\x0F"
+        "\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\0\0\0\0\0"
+        "\0\x1B\x1C\x1D\x1E\x1F\x20\x21\x22\x23\x24\x25\x26\x27\x28\x29"
+        "\x2A\x2B\x2C\x2D\x2E\x2F\x30\x31\x32\x33\x34\0\0\0\0\0"
+        "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
+        "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
+        "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
+        "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
+        "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
+        "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
+        "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
+        "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0";
+const char String::int_data[] = "0\0001\0002\0003\0004\0005\0006\0007\0008\0009";
+
+#if HAVE_STRING_PROFILING > 1
+# define MEMO_INITIALIZER_TAIL , 0, 0
+#else
+# define MEMO_INITIALIZER_TAIL
+#endif
+
+const String::rep_type String::null_string_rep = {
+    String_generic::empty_data, 0, 0
+};
+const String::rep_type String::oom_string_rep = {
+    String_generic::out_of_memory_data, String_generic::out_of_memory_length, 0
+};
+const String::rep_type String::zero_string_rep = {
+    &int_data[0], 0, 0
+};
+
+#if HAVE_STRING_PROFILING
+uint64_t String::live_memo_count;
+uint64_t String::memo_sizes[55];
+uint64_t String::live_memo_sizes[55];
+uint64_t String::live_memo_bytes[55];
+# if HAVE_STRING_PROFILING > 1
+String::memo_type *String::live_memos[55];
+# endif
+#endif
+
+int
+String_generic::compare(const char* a, int a_len, const char* b, int b_len)
+{
+    if (a != b) {
+        int len = a_len < b_len ? a_len : b_len;
+        int cmp = memcmp(a, b, len);
+        if (cmp != 0)
+            return cmp;
+    }
+    return a_len - b_len;
+}
+
+int String_generic::natural_compare(const char* a, int a_len,
+                                    const char* b, int b_len) {
+    const char* ae = a + a_len;
+    const char* be = b + b_len;
+    const char* aperiod = 0;
+    bool aperiod_negative = false;
+    int raw_compare = 0;
+
+    while (a < ae && b < be) {
+        if (isdigit((unsigned char) *a) && isdigit((unsigned char) *b)) {
+            // compare the two numbers, but treat them as strings
+            // (a decimal conversion might cause overflow)
+            bool potential_decimal = (a == aperiod);
+
+            // check if both are negative (note that if we get here, entire
+            // string prefixes are identical)
+            bool negative = false;
+            if (a > ae - a_len && a[-1] == '-'
+                && (a == ae - a_len + 1
+                    || isspace((unsigned char) a[-2])))
+                negative = true;
+
+            // skip initial '0's, but remember any difference in length
+            const char *ia = a, *ib = b;
+            while (a < ae && *a == '0')
+                ++a;
+            while (b < be && *b == '0')
+                ++b;
+            int longer_zeros = (a - ia) - (b - ib);
+
+            // walk over digits, remembering first nonidentical digit comparison
+            int digit_compare = 0;
+            bool a_good, b_good;
+            while (1) {
+                a_good = a < ae && isdigit((unsigned char) *a);
+                b_good = b < be && isdigit((unsigned char) *b);
+                if (!a_good || !b_good)
+                    break;
+                if (digit_compare == 0)
+                    digit_compare = *a - *b;
+                ++a;
+                ++b;
+            }
+
+            // real number comparison: leading zeros are significant,
+            // digit comparisons take precedence
+            if (potential_decimal) {
+                const char *ax = a, *bx = b;
+                while (ax < ae && isdigit((unsigned char) *ax))
+                    ++ax;
+                while (bx < be && isdigit((unsigned char) *bx))
+                    ++bx;
+                // watch for IP addresses: don't treat "0.2." like a decimal
+                if (!(ax + 1 < ae && *ax == '.' && !isspace((unsigned char) ax[1]))
+                    && !(bx + 1 < be && *bx == '.' && !isspace((unsigned char) bx[1]))) {
+                    negative = aperiod_negative;
+                    if (longer_zeros)
+                        return negative ? 1 : -1;
+                    if (digit_compare)
+                        a_good = b_good;
+                }
+            }
+            // if one number is longer, it must also be larger
+            if (a_good != b_good)
+                return negative == a_good ? -1 : 1;
+            // otherwise, digit comparisons take precedence
+            if (digit_compare)
+                return negative == (digit_compare > 0) ? -1 : 1;
+            // as a last resort, the longer string of zeros is greater
+            if (longer_zeros)
+                return longer_zeros;
+            // prepare for potential decimal comparison later
+            if (!aperiod) {
+                a_good = a + 1 < ae && *a == '.'
+                    && isdigit((unsigned char) a[1]);
+                b_good = b + 1 < be && *b == '.'
+                    && isdigit((unsigned char) b[1]);
+                if (a_good != b_good)
+                    return negative == b_good ? 1 : -1;
+                else if (a_good) {
+                    aperiod = a + 1;
+                    aperiod_negative = negative;
+                }
+            }
+
+            // if we get here, the numeric portions were byte-for-byte
+            // identical; move on
+        } else if (isdigit((unsigned char) *a))
+            return isalpha((unsigned char) *b) ? -1 : 1;
+        else if (isdigit((unsigned char) *b))
+            return isalpha((unsigned char) *a) ? 1 : -1;
+        else {
+            int alower = (unsigned char) tolower((unsigned char) *a);
+            int blower = (unsigned char) tolower((unsigned char) *b);
+            if (alower != blower)
+                return alower - blower;
+            if (raw_compare == 0)
+                raw_compare = (unsigned char) *a - (unsigned char) *b;
+            if (*a != '.')
+                aperiod = 0;
+            ++a;
+            ++b;
+        }
+    }
+
+    if ((ae - a) != (be - b))
+        return (ae - a) - (be - b);
+    else
+        return raw_compare;
+}
+
+hashcode_t
+String_generic::hashcode(const char *s, int len)
+{
+    if (len <= 0)
+        return 0;
+
+    uint32_t hash = len;
+    int rem = hash & 3;
+    const char *end = s + len - rem;
+    uint32_t last16;
+
+#if !HAVE_INDIFFERENT_ALIGNMENT
+    if (!(reinterpret_cast<uintptr_t>(s) & 1)) {
+#endif
+#define get16(p) (*reinterpret_cast<const uint16_t *>((p)))
+        for (; s != end; s += 4) {
+            hash += get16(s);
+            uint32_t tmp = (get16(s + 2) << 11) ^ hash;
+            hash = (hash << 16) ^ tmp;
+            hash += hash >> 11;
+        }
+        if (rem >= 2) {
+            last16 = get16(s);
+            goto rem2;
+        }
+#undef get16
+#if !HAVE_INDIFFERENT_ALIGNMENT
+    } else {
+# if !__i386__ && !__x86_64__ && !__arch_um__
+#  define get16(p) (((unsigned char) (p)[0] << 8) + (unsigned char) (p)[1])
+# else
+#  define get16(p) ((unsigned char) (p)[0] + ((unsigned char) (p)[1] << 8))
+# endif
+        // should be exactly the same as the code above
+        for (; s != end; s += 4) {
+            hash += get16(s);
+            uint32_t tmp = (get16(s + 2) << 11) ^ hash;
+            hash = (hash << 16) ^ tmp;
+            hash += hash >> 11;
+        }
+        if (rem >= 2) {
+            last16 = get16(s);
+            goto rem2;
+        }
+# undef get16
+    }
+#endif
+
+    /* Handle end cases */
+    if (0) {                    // weird organization avoids uninitialized
+      rem2:                     // variable warnings
+        if (rem == 3) {
+            hash += last16;
+            hash ^= hash << 16;
+            hash ^= ((unsigned char) s[2]) << 18;
+            hash += hash >> 11;
+        } else {
+            hash += last16;
+            hash ^= hash << 11;
+            hash += hash >> 17;
+        }
+    } else if (rem == 1) {
+        hash += (unsigned char) *s;
+        hash ^= hash << 10;
+        hash += hash >> 1;
+    }
+
+    /* Force "avalanching" of final 127 bits */
+    hash ^= hash << 3;
+    hash += hash >> 5;
+    hash ^= hash << 4;
+    hash += hash >> 17;
+    hash ^= hash << 25;
+    hash += hash >> 6;
+
+    return hash;
+}
+
+int
+String_generic::find_left(const char *s, int len, int start,
+                          char x)
+{
+    if (start < 0)
+        start = 0;
+    for (int i = start; i < len; ++i)
+        if (s[i] == x)
+            return i;
+    return -1;
+}
+
+int
+String_generic::find_left(const char *s, int len, int start,
+                          const char *x, int x_len)
+{
+    if (start < 0)
+        start = 0;
+    if (x_len == 0)
+        return start <= len ? start : -1;
+    int max_pos = len - x_len;
+    for (int i = start; i <= max_pos; ++i)
+        if (memcmp(s + i, x, x_len) == 0)
+            return i;
+    return -1;
+}
+
+int
+String_generic::find_right(const char *s, int len, int start,
+                           char x)
+{
+    if (start >= len)
+        start = len - 1;
+    for (int i = start; i >= 0; --i)
+        if (s[i] == x)
+            return i;
+    return -1;
+}
+
+int
+String_generic::find_right(const char *s, int len, int start,
+                           const char *x, int x_len)
+{
+    if (start >= len)
+        start = len - x_len;
+    if (x_len == 0)
+        return start >= 0 ? start : -1;
+    for (int i = start; i >= 0; --i)
+        if (memcmp(s + i, x, x_len) == 0)
+            return i;
+    return -1;
+}
+
+long String_generic::to_i(const char* s, const char* ends) {
+    bool neg;
+    if (s != ends && (s[0] == '-' || s[0] == '+')) {
+        neg = s[0] == '-';
+        ++s;
+    } else
+        neg = false;
+    if (s == ends || !isdigit((unsigned char) *s))
+        return 0;
+    unsigned long x = (unsigned char) *s - '0';
+    for (++s; s != ends && isdigit((unsigned char) *s); ++s)
+        x = x * 10 + *s - '0';  // XXX overflow
+    return neg ? -x : x;
+}
+
+bool String_generic::glob_match(const char* sbegin, int slen,
+                                const char* pbegin, int plen) {
+    const char* send = sbegin + slen;
+    const char* pend = pbegin + plen;
+
+    // quick common-case check for suffix matches
+    while (pbegin < pend && sbegin < send
+           && pend[-1] != '*' && pend[-1] != '?' && pend[-1] != ']'
+           && (pbegin + 1 == pend || pend[-2] != '\\'))
+        if (pend[-1] == send[-1])
+            --pend, --send;
+        else
+            return false;
+
+    std::vector<const char*> state, nextstate;
+    state.push_back(pbegin);
+
+    for (const char* s = sbegin; s != send && state.size(); ++s) {
+        nextstate.clear();
+        for (const char** pp = state.data(); pp != state.data() + state.size(); ++pp)
+            if (*pp != pend) {
+              reswitch:
+                switch (**pp) {
+                  case '?':
+                    nextstate.push_back(*pp + 1);
+                    break;
+                  case '*':
+                    if (*pp + 1 == pend)
+                        return true;
+                    if (nextstate.empty() || nextstate.back() != *pp)
+                        nextstate.push_back(*pp);
+                    ++*pp;
+                    goto reswitch;
+                  case '\\':
+                    if (*pp + 1 != pend)
+                        ++*pp;
+                    goto normal_char;
+                  case '[': {
+                      const char *ec = *pp + 1;
+                      bool negated;
+                      if (ec != pend && *ec == '^') {
+                          negated = true;
+                          ++ec;
+                      } else
+                          negated = false;
+                      if (ec == pend)
+                          goto normal_char;
+
+                      bool found = false;
+                      do {
+                          if (*++ec == *s)
+                              found = true;
+                      } while (ec != pend && *ec != ']');
+                      if (ec == pend)
+                          goto normal_char;
+
+                      if (found == !negated)
+                          nextstate.push_back(ec + 1);
+                      break;
+                  }
+                  normal_char:
+                  default:
+                    if (**pp == *s)
+                        nextstate.push_back(*pp + 1);
+                    break;
+                }
+            }
+        state.swap(nextstate);
+    }
+
+    for (const char** pp = state.data(); pp != state.data() + state.size(); ++pp) {
+        while (*pp != pend && **pp == '*')
+            ++*pp;
+        if (*pp == pend)
+            return true;
+    }
+    return false;
+}
+
+
+/** @cond never */
+#if HAVE_STRING_PROFILING
+void String::memo_type::account_new() {
+    int bucket = profile_memo_size_bucket(this->dirty, this->capacity);
+    ++memo_sizes[bucket];
+    ++live_memo_sizes[bucket];
+    live_memo_bytes[bucket] += this->capacity;
+    ++live_memo_count;
+# if HAVE_STRING_PROFILING > 1
+    this->pprev = &live_memos[bucket];
+    if ((this->next = *this->pprev))
+        this->next->pprev = &this->next;
+    *this->pprev = memo;
+# endif
+}
+
+void String::memo_type::account_destroy() {
+    int bucket = profile_memo_size_bucket(this->dirty, this->capacity);
+    --live_memo_sizes[bucket];
+    live_memo_bytes[bucket] -= this->capacity;
+    --live_memo_count;
+# if HAVE_STRING_PROFILING > 1
+    if ((*this->pprev = this->next))
+        this->next->pprev = this->pprev;
+# endif
+}
+#endif
+
+inline String::memo_type* String::create_memo(int capacity, int dirty) {
+    assert(capacity > 0 && capacity >= dirty);
+    memo_type *memo =
+        reinterpret_cast<memo_type *>(new char[capacity + MEMO_SPACE]);
+    if (memo)
+        memo->initialize(capacity, dirty);
+    return memo;
+}
+
+void
+String::delete_memo(memo_type *memo)
+{
+    assert(memo->capacity > 0);
+    assert(memo->capacity >= memo->dirty);
+    memo->account_destroy();
+    delete[] reinterpret_cast<char*>(memo);
+}
+
+
+#if HAVE_STRING_PROFILING
+void
+String::one_profile_report(StringAccum &sa, int i, int examples)
+{
+    if (i <= 16)
+        sa << "memo_dirty_" << i;
+    else if (i < 25) {
+        uint32_t s = (i - 17) * 2 + 17;
+        sa << "memo_cap_" << s << '_' << (s + 1);
+    } else if (i < 29) {
+        uint32_t s = (i - 25) * 8 + 33;
+        sa << "memo_cap_" << s << '_' << (s + 7);
+    } else {
+        uint32_t s1 = (1U << (i - 23)) + 1;
+        uint32_t s2 = (s1 - 1) << 1;
+        sa << "memo_cap_" << s1 << '_' << s2;
+    }
+    sa << '\t' << live_memo_sizes[i] << '\t' << memo_sizes[i] << '\t' << live_memo_bytes[i] << '\n';
+    if (examples) {
+# if HAVE_STRING_PROFILING > 1
+        for (memo_type *m = live_memos[i]; m; m = m->next) {
+            sa << "    [" << m->dirty << "] ";
+            uint32_t dirty = m->dirty;
+            if (dirty > 0 && m->real_data[dirty - 1] == '\0')
+                --dirty;
+            sa.append(m->real_data, dirty > 128 ? 128 : dirty);
+            sa << '\n';
+        }
+# endif
+    }
+}
+
+void
+String::profile_report(StringAccum &sa, int examples)
+{
+    uint64_t all_live_sizes = 0, all_sizes = 0, all_live_bytes = 0;
+    for (int i = 0; i < 55; ++i) {
+        if (memo_sizes[i])
+            one_profile_report(sa, i, examples);
+        all_live_sizes += live_memo_sizes[i];
+        all_sizes += memo_sizes[i];
+        all_live_bytes += live_memo_bytes[i];
+    }
+    sa << "memo_total\t" << all_live_sizes << '\t' << all_sizes << '\t' << all_live_bytes << '\n';
+}
+#endif
+
+/** @endcond never */
+
+
+/** @brief Construct a base-10 string representation of @a x. */
+String::String(int x)
+{
+    if (x >= 0 && x < 10)
+        _r.assign(int_data + 2 * x, 1, 0);
+    else {
+        char buf[128];
+        sprintf(buf, "%d", x);
+        assign(buf, -1, false);
+    }
+}
+
+/** @overload */
+String::String(unsigned x)
+{
+    if (x < 10)
+        _r.assign(int_data + 2 * x, 1, 0);
+    else {
+        char buf[128];
+        sprintf(buf, "%u", x);
+        assign(buf, -1, false);
+    }
+}
+
+/** @overload */
+String::String(long x)
+{
+    if (x >= 0 && x < 10)
+        _r.assign(int_data + 2 * x, 1, 0);
+    else {
+        char buf[128];
+        sprintf(buf, "%ld", x);
+        assign(buf, -1, false);
+    }
+}
+
+/** @overload */
+String::String(unsigned long x)
+{
+    if (x < 10)
+        _r.assign(int_data + 2 * x, 1, 0);
+    else {
+        char buf[128];
+        sprintf(buf, "%lu", x);
+        assign(buf, -1, false);
+    }
+}
+
+/** @overload */
+String::String(long long x)
+{
+    if (x >= 0 && x < 10)
+        _r.assign(int_data + 2 * x, 1, 0);
+    else {
+        char buf[128];
+        sprintf(buf, "%lld", x);
+        assign(buf, -1, false);
+    }
+}
+
+/** @overload */
+String::String(unsigned long long x)
+{
+    if (x < 10)
+        _r.assign(int_data + 2 * x, 1, 0);
+    else {
+        char buf[128];
+        sprintf(buf, "%llu", x);
+        assign(buf, -1, false);
+    }
+}
+
+String::String(double x)
+{
+    char buf[128];
+    int len = sprintf(buf, "%.12g", x);
+    assign(buf, len, false);
+}
+
+String
+String::hard_make_stable(const char *s, int len)
+{
+    if (len < 0)
+        len = strlen(s);
+    return String(s, len, 0);
+}
+
+String
+String::make_fill(int c, int len)
+{
+    String s;
+    s.append_fill(c, len);
+    return s;
+}
+
+void
+String::assign_out_of_memory()
+{
+    _r.deref();
+    _r = oom_string_rep;
+}
+
+void
+String::assign(const char *s, int len, bool need_deref)
+{
+    if (!s) {
+        assert(len <= 0);
+        len = 0;
+    } else if (len < 0)
+        len = strlen(s);
+
+    // need to start with dereference
+    memo_type* m;
+    if (need_deref) {
+        if (unlikely((m = _r.memo())
+                     && s >= m->real_data
+                     && s + len <= m->real_data + m->capacity)) {
+            // Be careful about "String s = ...; s = s.c_str();"
+            _r.assign_noref(s, len, m);
+            return;
+        } else
+            deref();
+    }
+
+    if (len == 0) {
+        m = 0;
+        s = String_generic::empty_data;
+
+    } else {
+        // Make the memo a multiple of 16 characters and bigger than 'len'.
+        int memo_capacity = (len + 15 + MEMO_SPACE) & ~15;
+        m = create_memo(memo_capacity - MEMO_SPACE, len);
+        if (!m) {
+            _r.reset_ref();
+            assign_out_of_memory();
+            return;
+        }
+        memcpy(m->real_data, s, len);
+        s = m->real_data;
+    }
+
+    _r.assign_noref(s, len, m);
+}
+
+/** @brief Append @a len unknown characters to this string.
+ * @return Modifiable pointer to the appended characters.
+ *
+ * The caller may safely modify the returned memory. Null is returned if
+ * the string becomes out-of-memory. */
+char *
+String::append_uninitialized(int len)
+{
+    // Appending anything to "out of memory" leaves it as "out of memory"
+    if (unlikely(len <= 0) || out_of_memory())
+        return 0;
+
+    // If we can, append into unused space. First, we check that there's
+    // enough unused space for 'len' characters to fit; then, we check
+    // that the unused space immediately follows the data in '*this'.
+    uint32_t dirty;
+    memo_type* m = _r.memo();
+    if (m && ((dirty = m->dirty), m->capacity > dirty + len)) {
+        char *real_dirty = m->real_data + dirty;
+        if (real_dirty == _r.data + _r.length) {
+            m->dirty = dirty + len;
+            _r.length += len;
+            assert(m->dirty < m->capacity);
+#if HAVE_STRING_PROFILING
+            profile_update_memo_dirty(m, dirty, dirty + len, m->capacity);
+#endif
+            return real_dirty;
+        }
+    }
+
+    // Now we have to make new space. Make sure the memo is a multiple of 16
+    // bytes and that it is at least 16. But for large strings, allocate a
+    // power of 2, since power-of-2 sizes minimize waste in frequently-used
+    // allocators, like Linux kmalloc.
+    int want_memo_len = _r.length + len + MEMO_SPACE;
+    int memo_capacity;
+    if (want_memo_len <= 1024)
+        memo_capacity = (want_memo_len + 15) & ~15;
+    else
+        for (memo_capacity = 2048; memo_capacity < want_memo_len; )
+            memo_capacity *= 2;
+
+    m = create_memo(memo_capacity - MEMO_SPACE, _r.length + len);
+    if (!m) {
+        assign_out_of_memory();
+        return 0;
+    }
+
+    char *new_data = m->real_data;
+    memcpy(new_data, _r.data, _r.length);
+
+    deref();
+    _r.assign_noref(new_data, _r.length + len, m);
+    return const_cast<char*>(_r.data + _r.length - len);
+}
+
+void
+String::append(const char* s, int len, memo_type* memo)
+{
+    if (!s) {
+        assert(len <= 0);
+        len = 0;
+    } else if (len < 0)
+        len = strlen(s);
+
+    memo_type* my_memo;
+    if (unlikely(len == 0) || out_of_memory())
+        /* do nothing */;
+    else if (unlikely(s == String_generic::out_of_memory_data) && !memo)
+        // Appending "out of memory" to a regular string makes it "out of
+        // memory"
+        assign_out_of_memory();
+    else if (_r.length == 0 && reinterpret_cast<uintptr_t>(memo) > 1) {
+        deref();
+        _r.assign(s, len, memo);
+    } else if (likely(!((my_memo = _r.memo())
+                        && s >= my_memo->real_data
+                        && s + len <= my_memo->real_data + my_memo->capacity))) {
+        if (char *space = append_uninitialized(len))
+            memcpy(space, s, len);
+    } else {
+        String preserve_s(*this);
+        if (char *space = append_uninitialized(len))
+            memcpy(space, s, len);
+    }
+}
+
+/** @brief Append @a len copies of character @a c to this string. */
+void
+String::append_fill(int c, int len)
+{
+    assert(len >= 0);
+    if (char *space = append_uninitialized(len))
+        memset(space, c, len);
+}
+
+/** @brief Ensure the string's data is unshared and return a mutable
+    pointer to it. */
+char *
+String::mutable_data()
+{
+    // If _memo has a capacity (it's not one of the special strings) and it's
+    // uniquely referenced, return _data right away.
+    if (!is_shared())
+        return const_cast<char *>(_r.data);
+
+    // Otherwise, make a copy of it. Rely on: deref() doesn't change _data or
+    // _length; and if _capacity == 0, then deref() doesn't free _real_data.
+    // But in multithreaded situations we must hold a local copy of memo!
+    String do_not_delete_underlying_memo(*this);
+    deref();
+    assign(_r.data, _r.length, false);
+    return const_cast<char *>(_r.data);
+}
+
+const char *
+String::hard_c_str() const
+{
+    // See also c_str().
+    // We may already have a '\0' in the right place.  If _memo has no
+    // capacity, then this is one of the special strings (null or
+    // stable). We are guaranteed, in these strings, that _data[_length]
+    // exists. Otherwise must check that _data[_length] exists.
+    const char *end_data = _r.data + _r.length;
+    memo_type* m = _r.memo();
+    if ((m && end_data >= m->real_data + m->dirty)
+        || *end_data != '\0') {
+        if (char *x = const_cast<String *>(this)->append_uninitialized(1)) {
+            *x = '\0';
+            --_r.length;
+        }
+    }
+    return _r.data;
+}
+
+/** @brief Null-terminate the string and return a mutable pointer to its
+    data.
+    @sa String::c_str */
+char *
+String::mutable_c_str()
+{
+    (void) mutable_data();
+    (void) c_str();
+    return const_cast<char *>(_r.data);
+}
+
+/** @brief Return a substring of this string, consisting of the @a len
+    characters starting at index @a pos.
+    @param pos substring's first position relative to the string
+    @param len length of substring
+
+    If @a pos is negative, starts that far from the end of the string. If @a
+    len is negative, leaves that many characters off the end of the string.
+    If @a pos and @a len specify a substring that is partly outside the
+    string, only the part within the string is returned. If the substring is
+    beyond either end of the string, returns an empty string (but this
+    should be considered a programming error; a future version may generate
+    a warning for this case).
+
+    @note String::substr() is intended to behave like Perl's substr(). */
+String
+String::substr(int pos, int len) const
+{
+    if (pos < 0)
+        pos += _r.length;
+
+    int pos2;
+    if (len < 0)
+        pos2 = _r.length + len;
+    else if (pos >= 0 && len >= _r.length) // avoid integer overflow
+        pos2 = _r.length;
+    else
+        pos2 = pos + len;
+
+    if (pos < 0)
+        pos = 0;
+    if (pos2 > _r.length)
+        pos2 = _r.length;
+
+    if (pos >= pos2)
+        return String();
+    else {
+        _r.ref();
+        return String(_r.data + pos, pos2 - pos, _r.memo());
+    }
+}
+
+static String
+hard_lower(const String &s, int pos)
+{
+    String new_s(s.data(), s.length());
+    char *x = const_cast<char *>(new_s.data()); // know it's mutable
+    int len = s.length();
+    for (; pos < len; pos++)
+        x[pos] = tolower((unsigned char) x[pos]);
+    return new_s;
+}
+
+/** @brief Return a lowercased version of this string.
+
+    Translates the ASCII characters 'A' through 'Z' into their lowercase
+    equivalents. */
+String
+String::lower() const
+{
+    // avoid copies
+    if (!out_of_memory())
+        for (int i = 0; i < _r.length; i++)
+            if (_r.data[i] >= 'A' && _r.data[i] <= 'Z')
+                return hard_lower(*this, i);
+    return *this;
+}
+
+static String
+hard_upper(const String &s, int pos)
+{
+    String new_s(s.data(), s.length());
+    char *x = const_cast<char *>(new_s.data()); // know it's mutable
+    int len = s.length();
+    for (; pos < len; pos++)
+        x[pos] = toupper((unsigned char) x[pos]);
+    return new_s;
+}
+
+/** @brief Return an uppercased version of this string.
+
+    Translates the ASCII characters 'a' through 'z' into their uppercase
+    equivalents. */
+String
+String::upper() const
+{
+    // avoid copies
+    for (int i = 0; i < _r.length; i++)
+        if (_r.data[i] >= 'a' && _r.data[i] <= 'z')
+            return hard_upper(*this, i);
+    return *this;
+}
+
+static String
+hard_printable(const String &s, int pos, int type)
+{
+    StringAccum sa(s.length() * 2);
+    sa.append(s.data(), pos);
+    const unsigned char *x = reinterpret_cast<const unsigned char *>(s.data());
+    int len = s.length();
+    for (; pos < len; pos++) {
+        if (type == 2 && (x[pos] == '\\' || x[pos] == '\"'))
+            sa << '\\' << x[pos];
+        else if (x[pos] >= 32 && x[pos] < 127)
+            sa << x[pos];
+        else if (x[pos] < 32 && type == 2) {
+            if (x[pos] >= 9 && x[pos] <= 13)
+                sa << '\\' << ("tnvfr"[x[pos] - 9]);
+            else if (char *buf = sa.extend(4, 1))
+                sprintf(buf, "\\%03o", x[pos]);
+        } else if (x[pos] < 32 && type != 1)
+            sa << '^' << (unsigned char)(x[pos] + 64);
+        else if (char *buf = sa.extend(4, 1))
+            sprintf(buf, "\\%03o", x[pos]);
+    }
+    return sa.take_string();
+}
+
+/** @brief Return a "printable" version of this string.
+    @param type quoting type
+
+    The default quoting type (0) translates control characters 0-31 into
+    "control" sequences, such as "^@" for the null character, and characters
+    127-255 into octal escape sequences, such as "\377" for 255. Quoting
+    type 1 translates all characters outside of 32-126 into octal escape
+    sequences. Quoting type 2 uses C escapes, including "\\" and "\"". */
+String
+String::printable(int type) const
+{
+    // avoid copies
+    if (!out_of_memory())
+        for (int i = 0; i < _r.length; i++)
+            if (_r.data[i] < 32 || _r.data[i] > 126)
+                return hard_printable(*this, i, type);
+    return *this;
+}
+
+String String::to_hex() const {
+    StringAccum sa;
+    static const char hexval[] = "0123456789ABCDEF";
+    char* x = sa.extend(2 * _r.length);
+    for (int i = 0; i != _r.length; ++i) {
+        *x++ = hexval[(unsigned char) _r.data[i] >> 4];
+        *x++ = hexval[_r.data[i] & 15];
+    }
+    return sa.take_string();
+}
+
+/** @brief Return the substring with left whitespace removed. */
+String
+String::ltrim() const
+{
+    return String_generic::ltrim(*this);
+}
+
+/** @brief Return the substring with right whitespace removed. */
+String
+String::rtrim() const
+{
+    return String_generic::rtrim(*this);
+}
+
+/** @brief Return the substring with left and right whitespace removed. */
+String
+String::trim() const
+{
+    return String_generic::trim(*this);
+}
+
+void
+String::align(int n)
+{
+    int offset = reinterpret_cast<uintptr_t>(_r.data) % n;
+    if (offset) {
+        String s;
+        s.append_uninitialized(_r.length + n + 1);
+        offset = reinterpret_cast<uintptr_t>(s._r.data) % n;
+        memcpy((char *)s._r.data + n - offset, _r.data, _r.length);
+        s._r.data += n - offset;
+        s._r.length = _r.length;
+        *this = s;
+    }
+}
+
+/** @brief Return the pointer to the next character in UTF-8 encoding. */
+const unsigned char *
+String::skip_utf8_char(const unsigned char *s, const unsigned char *end)
+{
+    int c = *s;
+    if (c > 0 && c < 0x80)
+        return s + 1;
+    else if (c < 0xC2)
+        /* zero, or bad/overlong encoding */;
+    else if (c < 0xE0) {        // 2 bytes: U+80-U+7FF
+        if (likely(s + 1 < end
+                   && s[1] >= 0x80 && s[1] < 0xC0))
+            return s + 2;
+    } else if (c < 0xF0) {      // 3 bytes: U+800-U+FFFF
+        if (likely(s + 2 < end
+                   && s[1] >= 0x80 && s[1] < 0xC0
+                   && s[2] >= 0x80 && s[2] < 0xC0
+                   && (c != 0xE0 || s[1] >= 0xA0) /* not overlong encoding */
+                   && (c != 0xED || s[1] < 0xA0) /* not surrogate */))
+            return s + 3;
+    } else if (c < 0xF5) {      // 4 bytes: U+10000-U+10FFFF
+        if (likely(s + 3 < end
+                   && s[1] >= 0x80 && s[1] < 0xC0
+                   && s[2] >= 0x80 && s[2] < 0xC0
+                   && s[3] >= 0x80 && s[3] < 0xC0
+                   && (c != 0xF0 || s[1] >= 0x90) /* not overlong encoding */
+                   && (c != 0xF4 || s[1] < 0x90) /* not >U+10FFFF */))
+            return s + 4;
+    }
+    return s;
+}
+
+int
+String::parse_cesu8_char(const unsigned char *s, const unsigned char *end)
+{
+    if (s + 5 < end
+        && s[0] == 0xED
+        && s[1] >= 0xA0 && s[1] < 0xB0
+        && s[2] >= 0x80 && s[2] < 0xC0
+        && s[3] == 0xED
+        && s[4] >= 0xB0 && s[4] < 0xC0
+        && s[5] >= 0x80 && s[5] < 0xC0)
+        return 0x10000
+            + ((s[1] & 0x0F) << 16) + ((s[2] & 0x3F) << 10)
+            + ((s[4] & 0x0F) << 6) + (s[5] & 0x3F);
+    else
+        return 0;
+}
+
+static const uint16_t windows1252_c1_mapping[] = {
+    0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
+    0x20C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F,
+    0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
+    0x20DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178
+};
+
+String
+String::windows1252_to_utf8() const
+{
+    const unsigned char *s = ubegin(), *last = s, *ends = uend();
+    StringAccum sa;
+    for (; s != ends; ++s) {
+        int c = *s;
+        if (unlikely(c == 0)) {
+            sa.append(last, s);
+            last = s + 1;
+        } else if (likely(c < 128))
+            /* do nothing */;
+        else {
+            sa.append(last, s);
+            if (unlikely(c < 160))
+                c = windows1252_c1_mapping[c - 128];
+            sa.append_utf8(c);
+            last = s + 1;
+        }
+    }
+    if (last == ubegin())
+        return *this;
+    else {
+        sa.append(last, s);
+        return sa.take_string();
+    }
+}
+
+String
+String::utf16be_to_utf8(int flags) const
+{
+    const unsigned char *s = ubegin(), *ends = uend();
+    if ((ends - s) & 1)
+        --ends;
+    if ((flags & utf_strip_bom) && s != ends
+        && s[0] == 0xFE && s[1] == 0xFF)
+        s += 2;
+    StringAccum sa;
+    sa.reserve((ends - s) / 2);
+    for (; s != ends; s += 2) {
+        int c = s[0] * 256 + s[1];
+        if (likely(c < 0xD800) || c >= 0xE000)
+            sa.append_utf8(c);
+        else if (c < 0xDC00 && s + 2 != ends
+                 && s[2] >= 0xDC && s[2] < 0xE0) {
+            c = 0x10000 + ((c - 0xD800) << 10)
+                + ((s[2] & 0x03) << 8) + s[3];
+            sa.append_utf8(c);
+            s += 2;
+        } else if (c && (flags & utf_replacement))
+            sa.append_utf8(u_replacement);
+    }
+    return sa.take_string();
+}
+
+String
+String::utf16le_to_utf8(int flags) const
+{
+    const unsigned char *s = ubegin(), *ends = uend();
+    if ((ends - s) & 1)
+        --ends;
+    if ((flags & utf_strip_bom) && s != ends
+        && s[1] == 0xFE && s[0] == 0xFF)
+        s += 2;
+    StringAccum sa;
+    sa.reserve((ends - s) / 2);
+    for (; s != ends; s += 2) {
+        int c = s[1] * 256 + s[0];
+        if (likely(c < 0xD800) || c >= 0xE000)
+            sa.append_utf8(c);
+        else if (c < 0xDC00 && s + 2 != ends
+                 && s[3] >= 0xDC && s[3] < 0xE0) {
+            c = 0x10000 + ((c - 0xD800) << 10)
+                + ((s[3] & 0x03) << 8) + s[2];
+            sa.append_utf8(c);
+            s += 2;
+        } else if (c && (flags & utf_replacement))
+            sa.append_utf8(u_replacement);
+    }
+    return sa.take_string();
+}
+
+String
+String::utf16_to_utf8(int flags) const
+{
+    if (length() < 2)
+        return String();
+    const unsigned char *s = ubegin(), *ends = uend();
+    if (ends - s < 2)
+        return String();
+    int c = s[0] * 256 + s[1];
+    if (c == 0xFEFF || (c != 0xFFFE && !(flags & utf_prefer_le)))
+        return utf16be_to_utf8(flags);
+    else
+        return utf16le_to_utf8(flags);
+}
+
+String
+String::cesu8_to_utf8(int flags) const
+{
+    const unsigned char *s = ubegin(), *last = s, *t, *ends = uend();
+    StringAccum sa;
+    if (flags & utf_strip_bom)
+        s = last = skip_utf8_bom(s, ends);
+    while (s != ends) {
+        int c = s[0];
+        if (likely(c > 0 && c < 0x7F))
+            ++s;
+        else if (likely((t = skip_utf8_char(s, ends)) != s))
+            s = t;
+        else if ((c = parse_cesu8_char(s, ends))) {
+            sa.append(last, s);
+            sa.append_utf8(c);
+            s += 6;
+            last = s;
+        } else {
+            sa.append(last, s);
+            if ((flags & utf_replacement) && c != 0)
+                sa.append_utf8(u_replacement);
+            for (++s; s != ends && *s >= 0x80 && *s < 0xC0; ++s)
+                /* do nothing */;
+            last = s;
+        }
+    }
+    if (last == ubegin())
+        return *this;
+    else {
+        sa.append(last, s);
+        return sa.take_string();
+    }
+}
+
+String
+String::utf8_to_utf8(int flags) const
+{
+    const unsigned char *s = ubegin(), *last = s, *t, *ends = uend();
+    StringAccum sa;
+    if (flags & utf_strip_bom)
+        s = last = skip_utf8_bom(s, ends);
+    while (s != ends) {
+        int c = s[0];
+        if (likely(c > 0 && c < 0x7F))
+            ++s;
+        else if (likely((t = skip_utf8_char(s, ends)) != s))
+            s = t;
+        else {
+            sa.append(last, s);
+            if ((flags & utf_replacement) && c != 0)
+                sa.append_utf8(u_replacement);
+            for (++s; s != ends && *s >= 0x80 && *s < 0xC0; ++s)
+                /* do nothing */;
+            last = s;
+        }
+    }
+    if (last == ubegin())
+        return *this;
+    else {
+        sa.append(last, s);
+        return sa.take_string();
+    }
+}
+
+/** @brief Return a version of the string converted to UTF-8.
+
+    Null characters are dropped.  Then, if the result contains invalid
+    UTF-8, the string is assumed to be Windows-1252 encoded, and is
+    converted to UTF-8 accordingly. */
+String
+String::to_utf8(int flags) const
+{
+    const unsigned char *s = ubegin(), *ends = uend();
+    if (ends - s > 2) {
+        if ((s[0] == 0xFF && s[1] == 0xFE)
+            || (s[0] > 0 && s[0] < 0x80 && s[1] == 0))
+            return utf16le_to_utf8(flags);
+        else if ((s[0] == 0xFE && s[1] == 0xFF)
+                 || (s[0] == 0 && s[1] > 0 && s[1] < 0x80))
+            return utf16be_to_utf8(flags);
+    }
+
+    const unsigned char *last = s, *t;
+    if (ends - s > 3 && (flags & utf_strip_bom))
+        s = last = skip_utf8_bom(s, ends);
+    StringAccum sa;
+    bool any_long_utf8 = false;
+    int c;
+    while (s != ends) {
+        if (likely(*s > 0 && *s < 128))
+            ++s;
+        else if (likely((t = skip_utf8_char(s, ends)) != s)) {
+            any_long_utf8 = true;
+            s = t;
+        } else if ((c = parse_cesu8_char(s, ends)) != 0) {
+            any_long_utf8 = true;
+            sa.append(last, s);
+            sa.append_utf8(c);
+            s += 6;
+            last = s;
+        } else if (*s == 0) {
+            sa.append(last, s);
+            ++s;
+            last = s;
+        } else if (!any_long_utf8)
+            goto windows1252;
+        else {
+            sa.append(last, s);
+            int c = *s;
+            if (c < 160)
+                c = windows1252_c1_mapping[c - 128];
+            sa.append_utf8(c);
+            ++s;
+            last = s;
+        }
+    }
+
+ exit:
+    if (last == ubegin())
+        return *this;
+    else {
+        sa.append(last, ends);
+        return sa.take_string();
+    }
+
+ windows1252:
+    while (s != ends) {
+        if (likely(*s > 0 && *s < 128))
+            ++s;
+        else {
+            sa.append(last, s);
+            if (*s != 0) {
+                int c = *s;
+                if (c < 160)
+                    c = windows1252_c1_mapping[c - 128];
+                sa.append_utf8(c);
+            }
+            ++s;
+            last = s;
+        }
+    }
+    goto exit;
+}
+
+/** @brief Return this string's contents encoded for JSON.
+    @pre *this is encoded in UTF-8.
+
+    For instance, String("a\"").encode_json() == "a\\\"". Note that the
+    double-quote characters that usually surround a JSON string are not
+    included. */
+String String::encode_json() const {
+    StringAccum sa;
+    const char* last = encode_json_partial(sa);
+    if (last == begin())
+        return *this;
+    else {
+        sa.append(last, end());
+        return sa.take_string();
+    }
+}
+
+String String::encode_base64(bool pad) const {
+    StringAccum sa;
+    encode_base64(sa, pad);
+    return sa.take_string();
+}
+
+String String::decode_base64() const {
+    StringAccum sa;
+    if (!decode_base64(sa))
+        return String();
+    return sa.take_string();
+}
+
+} // namespace lcdf