folly: fbstring: conditionally write terminator in c_str()
authorLucian Grijincu <lucian@fb.com>
Thu, 8 May 2014 18:50:50 +0000 (11:50 -0700)
committerDave Watson <davejwatson@fb.com>
Tue, 20 May 2014 19:53:58 +0000 (12:53 -0700)
Summary:
c_str/data writes a terminator '\0' on each call. When
multiple threads call c_str/data on the same string (e.g. small global
constant) the string's cache line will bounce from cpu to cpu.

Cpus report that most instructions are stalled
(stalled-cycles-frontend/stalled-cycles-backend near 100%).

Fix: check before setting '\0'.

Test Plan:
** BEFORE **

- nothing defined  ** gcc-4.8.1-glibc-2.17-fb

$ perf stat --detailed _build/opt/experimental/lucian/bench/string_terminator_benchmark --bm_min_iters=100000000
4.10s experimental/lucian/bench/StringTerminatorBenchmark.cpp
Linking _build/opt/experimental/lucian/bench/string_terminator_benchmark...
2.40s _build/opt/experimental/lucian/bench/string_terminator_benchmark
I0507 17:27:39.373522 28745 StringTerminatorBenchmark.cpp:98]  --bm_min_iters=100000000
============================================================================
experimental/lucian/bench/StringTerminatorBenchmark.cpprelative  time/iter  iters/s
============================================================================
static_std_1t                                              934.10ps    1.07G
privat_std_1t                                    100.32%   931.08ps    1.07G
static_fbs_1t                                    100.30%   931.33ps    1.07G
privat_fbs_1t                                    100.42%   930.22ps    1.08G
static_sp__1t                                   86986.52%     1.07ps  931.24G
privat_sp__1t                                   81459.50%     1.15ps  872.07G
static_std_32t                                             392.41ns    2.55M
privat_std_32t                                  15072.69%     2.60ns  384.10M
static_fbs_32t                                   102.41%   383.17ns    2.61M
privat_fbs_32t                                  13643.61%     2.88ns  347.68M
static_sp__32t                                  61032.25%   642.96ps    1.56G
privat_sp__32t                                  74172.72%   529.06ps    1.89G
============================================================================

Performance counter stats for '_build/opt/experimental/lucian/bench/string_terminator_benchmark --bm_min_iters=100000000':

2417478.451740 task-clock                #   27.005 CPUs utilized
65,374 context-switches          #    0.027 K/sec
4,270 CPU-migrations            #    0.002 K/sec
2,594 page-faults               #    0.001 K/sec
5,261,195,623,299 cycles                    #    2.176 GHz                     [25.09%]
5,072,634,235,091 stalled-cycles-frontend   #   96.42% frontend cycles idle    [30.01%]
3,848,759,938,052 stalled-cycles-backend    #   73.15% backend  cycles idle    [30.01%]
614,983,033,652 instructions              #    0.12  insns per cycle
#    8.25  stalled cycles per insn [25.09%]
152,995,596,390 branches                  #   63.287 M/sec                   [30.01%]
24,501,018 branch-misses             #    0.02% of all branches         [25.94%]
66,298,370,215 L1-dcache-loads           #   27.425 M/sec                   [30.03%]
1,642,188,850 L1-dcache-load-misses     #    2.48% of all L1-dcache hits   [30.02%]
968,142,977 LLC-loads                 #    0.400 M/sec                   [30.02%]
480,786,261 LLC-load-misses           #   49.66% of all LL-cache hits    [30.02%]

89.520960614 seconds time elapsed

- #define FBSTRING_PERVERSE ** gcc-4.8.1-glibc-2.17-fb

$ perf stat --detailed _build/opt/experimental/lucian/bench/string_terminator_benchmark --bm_min_iters=100000000
4.25s experimental/lucian/bench/StringTerminatorBenchmark.cpp
Linking _build/opt/experimental/lucian/bench/string_terminator_benchmark...
2.45s _build/opt/experimental/lucian/bench/string_terminator_benchmark
I0507 17:24:26.487848 23720 StringTerminatorBenchmark.cpp:98]  --bm_min_iters=100000000
============================================================================
experimental/lucian/bench/StringTerminatorBenchmark.cpprelative  time/iter  iters/s
============================================================================
static_std_1t                                              932.63ps    1.07G
privat_std_1t                                    100.95%   923.85ps    1.08G
static_fbs_1t                                    100.04%   932.23ps    1.07G
privat_fbs_1t                                    101.14%   922.07ps    1.08G
static_sp__1t                                   30418.15%     3.07ps  326.16G
privat_sp__1t                                   56775.39%     1.64ps  608.77G
static_std_32t                                             388.57ns    2.57M
privat_std_32t                                  16465.70%     2.36ns  423.75M
static_fbs_32t                                    96.19%   403.98ns    2.48M
privat_fbs_32t                                  16248.69%     2.39ns  418.16M
static_sp__32t                                  81185.06%   478.63ps    2.09G
privat_sp__32t                                  81470.69%   476.95ps    2.10G
============================================================================

Performance counter stats for '_build/opt/experimental/lucian/bench/string_terminator_benchmark --bm_min_iters=100000000':

2486009.171842 task-clock                #   27.241 CPUs utilized
64,538 context-switches          #    0.026 K/sec
4,501 CPU-migrations            #    0.002 K/sec
2,863 page-faults               #    0.001 K/sec
5,413,507,764,726 cycles                    #    2.178 GHz                     [25.62%]
5,207,550,928,974 stalled-cycles-frontend   #   96.20% frontend cycles idle    [30.01%]
3,919,627,007,209 stalled-cycles-backend    #   72.40% backend  cycles idle    [30.01%]
685,365,706,158 instructions              #    0.13  insns per cycle
#    7.60  stalled cycles per insn [25.62%]
148,798,151,350 branches                  #   59.854 M/sec                   [30.01%]
24,386,492 branch-misses             #    0.02% of all branches         [26.08%]
72,683,962,699 L1-dcache-loads           #   29.237 M/sec                   [30.02%]
1,687,684,775 L1-dcache-load-misses     #    2.32% of all L1-dcache hits   [30.01%]
989,352,938 LLC-loads                 #    0.398 M/sec                   [30.02%]
484,825,665 LLC-load-misses           #   49.00% of all LL-cache hits    [30.02%]

91.259265010 seconds time elapsed

- #define FBSTRING_CONSERVATIVE ** gcc-4.8.1-glibc-2.17-fb

$ perf stat --detailed _build/opt/experimental/lucian/bench/string_terminator_benchmark --bm_min_iters=100000000
0.18s experimental/lucian/bench/StringTerminatorBenchmark.cpp
Linking _build/opt/experimental/lucian/bench/string_terminator_benchmark...
2.43s _build/opt/experimental/lucian/bench/string_terminator_benchmark
I0507 17:30:58.246834  1571 StringTerminatorBenchmark.cpp:98]  --bm_min_iters=100000000
============================================================================
experimental/lucian/bench/StringTerminatorBenchmark.cpprelative  time/iter  iters/s
============================================================================
static_std_1t                                                1.44ps  692.61G
privat_std_1t                                    193.01%   748.06fs    1.34T
static_fbs_1t                                    399.47%   361.43fs    2.77T
privat_fbs_1t                                    129.29%     1.12ps  895.50G
static_sp__1t                                     78.99%     1.83ps  547.10G
privat_sp__1t                                     56.22%     2.57ps  389.36G
static_std_32t                                             478.67ps    2.09G
privat_std_32t                                   100.30%   477.26ps    2.10G
static_fbs_32t                                   100.52%   476.21ps    2.10G
privat_fbs_32t                                   100.52%   476.22ps    2.10G
static_sp__32t                                    99.57%   480.76ps    2.08G
privat_sp__32t                                   100.06%   478.40ps    2.09G
============================================================================

Performance counter stats for '_build/opt/experimental/lucian/bench/string_terminator_benchmark --bm_min_iters=100000000':

198000.887222 task-clock                #   14.738 CPUs utilized
5,016 context-switches          #    0.025 K/sec
2,536 CPU-migrations            #    0.013 K/sec
4,717 page-faults               #    0.024 K/sec
421,850,242,358 cycles                    #    2.131 GHz                     [25.31%]
209,633,195,159 stalled-cycles-frontend   #   49.69% frontend cycles idle    [30.32%]
6,252,451,713 stalled-cycles-backend    #    1.48% backend  cycles idle    [30.21%]
874,096,942,441 instructions              #    2.07  insns per cycle
#    0.24  stalled cycles per insn [25.37%]
218,114,341,006 branches                  # 1101.583 M/sec                   [30.34%]
3,080,267 branch-misses             #    0.00% of all branches         [26.23%]
240,532,859 L1-dcache-loads           #    1.215 M/sec                   [30.73%]
23,151,832 L1-dcache-load-misses     #    9.63% of all L1-dcache hits   [30.61%]
5,283,803 LLC-loads                 #    0.027 M/sec                   [30.60%]
1,079,973 LLC-load-misses           #   20.44% of all LL-cache hits    [30.60%]

13.434898734 seconds time elapsed

- #define FBSTRING_PERVERSE ** gcc-4.8.1-glibc-2.17

$ perf stat --detailed _build/opt/experimental/lucian/bench/string_terminator_benchmark --bm_min_iters=100000000
I0507 17:37:01.095785 28744 StringTerminatorBenchmark.cpp:98]  --bm_min_iters=100000000
============================================================================
experimental/lucian/bench/StringTerminatorBenchmark.cpprelative  time/iter  iters/s
============================================================================
static_std_1t                                                2.80ps  357.44G
privat_std_1t                                    118.65%     2.36ps  424.09G
static_fbs_1t                                      0.30%   937.44ps    1.07G
privat_fbs_1t                                      0.30%   924.33ps    1.08G
static_sp__1t                                    229.14%     1.22ps  819.03G
privat_sp__1t                                    212.22%     1.32ps  758.55G
static_std_32t                                             662.00ps    1.51G
privat_std_32t                                   134.15%   493.47ps    2.03G
static_fbs_32t                                     1.27%    52.24ns   19.14M
privat_fbs_32t                                    28.02%     2.36ns  423.31M
static_sp__32t                                   138.51%   477.94ps    2.09G
privat_sp__32t                                   138.39%   478.38ps    2.09G
============================================================================

Performance counter stats for '_build/opt/experimental/lucian/bench/string_terminator_benchmark --bm_min_iters=100000000':

316918.699522 task-clock                #   17.561 CPUs utilized
8,761 context-switches          #    0.028 K/sec
2,190 CPU-migrations            #    0.007 K/sec
5,361 page-faults               #    0.017 K/sec
683,482,498,044 cycles                    #    2.157 GHz                     [25.13%]
488,046,572,692 stalled-cycles-frontend   #   71.41% frontend cycles idle    [30.18%]
269,765,079,972 stalled-cycles-backend    #   39.47% backend  cycles idle    [30.17%]
756,213,442,723 instructions              #    1.11  insns per cycle
#    0.65  stalled cycles per insn [25.17%]
175,643,190,326 branches                  #  554.222 M/sec                   [30.14%]
3,737,621 branch-misses             #    0.00% of all branches         [25.63%]
35,169,313,417 L1-dcache-loads           #  110.973 M/sec                   [30.31%]
215,454,174 L1-dcache-load-misses     #    0.61% of all L1-dcache hits   [30.25%]
148,883,234 LLC-loads                 #    0.470 M/sec                   [30.25%]
48,768,585 LLC-load-misses           #   32.76% of all LL-cache hits    [30.26%]

18.046902132 seconds time elapsed

** AFTER **

- define nothing ** gcc-4.8.1-glibc-2.17-fb

$ perf stat --detailed _build/opt/experimental/lucian/bench/string_terminator_benchmark --bm_min_iters=100000000
I0507 20:34:15.943284 16852 StringTerminatorBenchmark.cpp:98]  --bm_min_iters=100000000
============================================================================
experimental/lucian/bench/StringTerminatorBenchmark.cpprelative  time/iter  iters/s
============================================================================
static_std_1t                                              936.05ps    1.07G
privat_std_1t                                    100.67%   929.82ps    1.08G
static_fbs_1t                                     98.63%   949.07ps    1.05G
privat_fbs_1t                                    101.56%   921.69ps    1.08G
static_sp__1t                                   40342.12%     2.32ps  430.98G
privat_sp__1t                                   16370.35%     5.72ps  174.89G
static_std_32t                                             409.26ns    2.44M
privat_std_32t                                  16777.41%     2.44ns  409.95M
static_fbs_32t                                  17398.58%     2.35ns  425.13M
privat_fbs_32t                                  17475.47%     2.34ns  427.01M
static_sp__32t                                  85426.96%   479.07ps    2.09G
privat_sp__32t                                  85033.14%   481.29ps    2.08G
============================================================================

Performance counter stats for '_build/opt/experimental/lucian/bench/string_terminator_benchmark --bm_min_iters=100000000':

1387268.222731 task-clock                #   25.553 CPUs utilized
34,698 context-switches          #    0.025 K/sec
2,770 CPU-migrations            #    0.002 K/sec
3,039 page-faults               #    0.002 K/sec
3,019,637,816,074 cycles                    #    2.177 GHz                     [25.21%]
2,801,192,631,479 stalled-cycles-frontend   #   92.77% frontend cycles idle    [30.02%]
2,020,857,118,698 stalled-cycles-backend    #   66.92% backend  cycles idle    [30.00%]
801,090,224,478 instructions              #    0.27  insns per cycle
#    3.50  stalled cycles per insn [25.20%]
206,099,842,649 branches                  #  148.565 M/sec                   [30.03%]
12,954,936 branch-misses             #    0.01% of all branches         [25.64%]
104,129,435,254 L1-dcache-loads           #   75.061 M/sec                   [30.07%]
883,390,641 L1-dcache-load-misses     #    0.85% of all L1-dcache hits   [30.06%]
516,975,218 LLC-loads                 #    0.373 M/sec                   [30.04%]
255,887,523 LLC-load-misses           #   49.50% of all LL-cache hits    [30.04%]

54.289185806 seconds time elapsed

- #define FBSTRING_PERVERSE ** gcc-4.8.1-glibc-2.17-fb

$ perf stat --detailed _build/opt/experimental/lucian/bench/string_terminator_benchmark --bm_min_iters=100000000
4.20s experimental/lucian/bench/StringTerminatorBenchmark.cpp
Linking _build/opt/experimental/lucian/bench/string_terminator_benchmark...
2.47s _build/opt/experimental/lucian/bench/string_terminator_benchmark
I0507 20:19:07.272021 18610 StringTerminatorBenchmark.cpp:98]  --bm_min_iters=100000000
============================================================================
experimental/lucian/bench/StringTerminatorBenchmark.cpprelative  time/iter  iters/s
============================================================================
static_std_1t                                              936.15ps    1.07G
privat_std_1t                                    100.84%   928.34ps    1.08G
static_fbs_1t                                     97.92%   956.06ps    1.05G
privat_fbs_1t                                    101.41%   923.14ps    1.08G
static_sp__1t                                       inf%     0.00fs  infinity
privat_sp__1t                                       inf%     0.00fs  infinity
static_std_32t                                             413.49ns    2.42M
privat_std_32t                                  17375.60%     2.38ns  420.22M
static_fbs_32t                                  17443.07%     2.37ns  421.85M
privat_fbs_32t                                  15354.32%     2.69ns  371.33M
static_sp__32t                                  82034.91%   504.05ps    1.98G
privat_sp__32t                                  63367.22%   652.53ps    1.53G
============================================================================

Performance counter stats for '_build/opt/experimental/lucian/bench/string_terminator_benchmark --bm_min_iters=100000000':

1390538.848103 task-clock                #   25.373 CPUs utilized
37,417 context-switches          #    0.027 K/sec
3,147 CPU-migrations            #    0.002 K/sec
2,876 page-faults               #    0.002 K/sec
3,024,513,016,946 cycles                    #    2.175 GHz                     [25.08%]
2,813,021,108,191 stalled-cycles-frontend   #   93.01% frontend cycles idle    [30.02%]
2,043,124,392,473 stalled-cycles-backend    #   67.55% backend  cycles idle    [30.01%]
774,663,686,661 instructions              #    0.26  insns per cycle
#    3.63  stalled cycles per insn [25.09%]
197,666,485,664 branches                  #  142.151 M/sec                   [30.03%]
15,077,576 branch-misses             #    0.01% of all branches         [25.73%]
104,720,369,589 L1-dcache-loads           #   75.309 M/sec                   [30.05%]
886,090,434 L1-dcache-load-misses     #    0.85% of all L1-dcache hits   [30.04%]
520,015,584 LLC-loads                 #    0.374 M/sec                   [30.03%]
256,990,100 LLC-load-misses           #   49.42% of all LL-cache hits    [30.04%]

54.804099454 seconds time elapsed

- #define FBSTRING_CONSERVATIVE ** gcc-4.8.1-glibc-2.17-fb

$ perf stat --detailed _build/opt/experimental/lucian/bench/string_terminator_benchmark --bm_min_iters=100000000
4.02s experimental/lucian/bench/StringTerminatorBenchmark.cpp
Linking _build/opt/experimental/lucian/bench/string_terminator_benchmark...
2.45s _build/opt/experimental/lucian/bench/string_terminator_benchmark
I0507 20:31:31.120209 10543 StringTerminatorBenchmark.cpp:98]  --bm_min_iters=100000000
============================================================================
experimental/lucian/bench/StringTerminatorBenchmark.cpprelative  time/iter  iters/s
============================================================================
static_std_1t                                              929.96ps    1.08G
privat_std_1t                                    100.37%   926.53ps    1.08G
static_fbs_1t                                       inf%     0.00fs  infinity
privat_fbs_1t                                       inf%     0.00fs  infinity
static_sp__1t                                       inf%     0.00fs  infinity
privat_sp__1t                                       inf%     0.00fs  infinity
static_std_32t                                             381.41ns    2.62M
privat_std_32t                                  16080.74%     2.37ns  421.62M
static_fbs_32t                                  80498.05%   473.81ps    2.11G
privat_fbs_32t                                  80368.84%   474.57ps    2.11G
static_sp__32t                                  80287.07%   475.05ps    2.11G
privat_sp__32t                                  80410.51%   474.33ps    2.11G
============================================================================

Performance counter stats for '_build/opt/experimental/lucian/bench/string_terminator_benchmark --bm_min_iters=100000000':

1293727.223839 task-clock                #   25.403 CPUs utilized
33,881 context-switches          #    0.026 K/sec
3,718 CPU-migrations            #    0.003 K/sec
3,829 page-faults               #    0.003 K/sec
2,813,078,184,066 cycles                    #    2.174 GHz                     [25.11%]
2,597,561,032,630 stalled-cycles-frontend   #   92.34% frontend cycles idle    [30.04%]
1,883,652,860,583 stalled-cycles-backend    #   66.96% backend  cycles idle    [30.02%]
800,150,465,648 instructions              #    0.28  insns per cycle
#    3.25  stalled cycles per insn [25.10%]
197,970,559,157 branches                  #  153.023 M/sec                   [30.02%]
15,385,773 branch-misses             #    0.01% of all branches         [25.95%]
36,541,946,374 L1-dcache-loads           #   28.245 M/sec                   [30.08%]
813,994,917 L1-dcache-load-misses     #    2.23% of all L1-dcache hits   [30.07%]
467,241,616 LLC-loads                 #    0.361 M/sec                   [30.07%]
247,980,582 LLC-load-misses           #   53.07% of all LL-cache hits    [30.08%]

50.928965545 seconds time elapsed

Reviewed By: njormrod@fb.com

FB internal diff: D1318048

folly/FBString.h
folly/test/FBStringTerminatorBenchmark.cpp [new file with mode: 0644]

index dd5b0642c267c59ca017bb9826bab56b99061feb..09407bda6fd9d2af5b6c8d661bf3fb8ec5211966 100644 (file)
@@ -528,12 +528,16 @@ public:
     if (c == isSmall) {
       assert(small_[smallSize()] == TERMINATOR || smallSize() == maxSmallSize
              || small_[smallSize()] == '\0');
-      small_[smallSize()] = '\0';
+      if (small_[smallSize()] != '\0') {
+        small_[smallSize()] = '\0';
+      }
       return small_;
     }
     assert(c == isMedium || c == isLarge);
     assert(ml_.data_[ml_.size_] == TERMINATOR || ml_.data_[ml_.size_] == '\0');
-    ml_.data_[ml_.size_] = '\0';
+    if (ml_.data_[ml_.size_] != '\0') {
+      ml_.data_[ml_.size_] = '\0';
+    }
 #elif defined(FBSTRING_CONSERVATIVE)
     if (c == isSmall) {
       assert(small_[smallSize()] == '\0');
@@ -543,11 +547,15 @@ public:
     assert(ml_.data_[ml_.size_] == '\0');
 #else
     if (c == isSmall) {
-      small_[smallSize()] = '\0';
+      if (small_[smallSize()] != '\0') {
+        small_[smallSize()] = '\0';
+      }
       return small_;
     }
     assert(c == isMedium || c == isLarge);
-    ml_.data_[ml_.size_] = '\0';
+    if (ml_.data_[ml_.size_] != '\0') {
+      ml_.data_[ml_.size_] = '\0';
+    }
 #endif
     return ml_.data_;
   }
diff --git a/folly/test/FBStringTerminatorBenchmark.cpp b/folly/test/FBStringTerminatorBenchmark.cpp
new file mode 100644 (file)
index 0000000..cbd0d59
--- /dev/null
@@ -0,0 +1,134 @@
+/*
+ * Copyright 2014 Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <functional>
+#include <string>
+#include <thread>
+#include <vector>
+
+#include <gflags/gflags.h>
+
+/**
+ * fbstring::c_str/data used to always write '\0' if
+ * FBSTRING_CONSERVATIVE was not defined.
+ *
+ * Multiple threads calling c_str/data on the same fbstring leads to
+ * cache thrashing.
+ *
+ * fbstring::c_str was changed to conditionally write '\0'.
+ *
+ */
+
+// #define FBSTRING_PERVERSE
+// #define FBSTRING_CONSERVATIVE
+#include "folly/FBString.h"
+#include "folly/Benchmark.h"
+#include "folly/Range.h"
+
+
+template <class S>
+class Bench {
+ public:
+  void benchStatic(size_t n) const {
+    static S what = "small string";
+    for (size_t i = 0; i < n; i++) {
+      folly::doNotOptimizeAway(what.data());
+    }
+  }
+
+  void benchPrivate(size_t n) const {
+    S what = "small string";
+    for (size_t i = 0; i < n; i++) {
+      folly::doNotOptimizeAway(what.data());
+    }
+  }
+};
+
+void threadify(std::function<void()> fn, int nrThreads) {
+  std::vector<std::thread> threads;
+  for (int i = 0; i < nrThreads; i++) {
+    threads.emplace_back(fn);
+  }
+  for (auto& t : threads) { t.join(); }
+}
+
+/*
+ * Private/static benchmark pairs should give the same numbers if run
+ * in single threaded mode (regardless of '\0' being conditionally
+ * written or not).
+ */
+BENCHMARK(static_std_1t, n) {
+  threadify([n] { Bench<std::string>().benchStatic(n); }, 1);
+}
+
+BENCHMARK_RELATIVE(privat_std_1t, n) {
+  threadify([n] { Bench<std::string>().benchPrivate(n); }, 1);
+}
+
+BENCHMARK_RELATIVE(static_fbs_1t, n) {
+  threadify([n] { Bench<folly::fbstring>().benchStatic(n); }, 1);
+}
+
+BENCHMARK_RELATIVE(privat_fbs_1t, n) {
+  threadify([n] { Bench<folly::fbstring>().benchPrivate(n); }, 1);
+}
+
+BENCHMARK_RELATIVE(static_sp__1t, n) {
+  threadify([n] { Bench<folly::StringPiece>().benchStatic(n); }, 1);
+}
+
+BENCHMARK_RELATIVE(privat_sp__1t, n) {
+  threadify([n] { Bench<folly::StringPiece>().benchPrivate(n); }, 1);
+}
+
+/*
+ * Private/static benchmark pairs when run on multiple threads:
+ *
+ * - should be similar if '\0' is conditionally written or when
+ *   FBSTRING_CONSERVATIVE is defined (in this case '\0' is not written at all)
+ *
+ * - static used to be significanlty slower than private for fbstring
+ *   with unconditional '\0' written at the end.
+ */
+BENCHMARK(static_std_32t, n) {
+  threadify([n] { Bench<std::string>().benchStatic(n); }, 32);
+}
+
+BENCHMARK_RELATIVE(privat_std_32t, n) {
+  threadify([n] { Bench<std::string>().benchPrivate(n); }, 32);
+}
+
+BENCHMARK_RELATIVE(static_fbs_32t, n) {
+  threadify([n] { Bench<folly::fbstring>().benchStatic(n); }, 32);
+}
+
+BENCHMARK_RELATIVE(privat_fbs_32t, n) {
+  threadify([n] { Bench<folly::fbstring>().benchPrivate(n); }, 32);
+}
+
+BENCHMARK_RELATIVE(static_sp__32t, n) {
+  threadify([n] { Bench<folly::StringPiece>().benchStatic(n); }, 32);
+}
+
+BENCHMARK_RELATIVE(privat_sp__32t, n) {
+  threadify([n] { Bench<folly::StringPiece>().benchPrivate(n); }, 32);
+}
+
+int main(int argc, char *argv[]) {
+  google::ParseCommandLineFlags(&argc, &argv, true);
+  folly::runBenchmarks();
+  return 0;
+}