folly: fbstring: conditionally write terminator in c_str()
authorLucian Grijincu <lucian@fb.com>
Thu, 8 May 2014 18:50:50 +0000 (11:50 -0700)
committerDave Watson <davejwatson@fb.com>
Tue, 20 May 2014 19:53:58 +0000 (12:53 -0700)
commit9529709d994bdd827e81a269ed5417a6aabaae3f
tree55aa31fc3183ef011a59ab713d04ecd20dc6ec15
parentf67e9f29f1c7a608c0939118f4117eb58015e3d6
folly: fbstring: conditionally write terminator in c_str()

Summary:
c_str/data writes a terminator '\0' on each call. When
multiple threads call c_str/data on the same string (e.g. small global
constant) the string's cache line will bounce from cpu to cpu.

Cpus report that most instructions are stalled
(stalled-cycles-frontend/stalled-cycles-backend near 100%).

Fix: check before setting '\0'.

Test Plan:
** BEFORE **

- nothing defined  ** gcc-4.8.1-glibc-2.17-fb

$ perf stat --detailed _build/opt/experimental/lucian/bench/string_terminator_benchmark --bm_min_iters=100000000
4.10s experimental/lucian/bench/StringTerminatorBenchmark.cpp
Linking _build/opt/experimental/lucian/bench/string_terminator_benchmark...
2.40s _build/opt/experimental/lucian/bench/string_terminator_benchmark
I0507 17:27:39.373522 28745 StringTerminatorBenchmark.cpp:98]  --bm_min_iters=100000000
============================================================================
experimental/lucian/bench/StringTerminatorBenchmark.cpprelative  time/iter  iters/s
============================================================================
static_std_1t                                              934.10ps    1.07G
privat_std_1t                                    100.32%   931.08ps    1.07G
static_fbs_1t                                    100.30%   931.33ps    1.07G
privat_fbs_1t                                    100.42%   930.22ps    1.08G
static_sp__1t                                   86986.52%     1.07ps  931.24G
privat_sp__1t                                   81459.50%     1.15ps  872.07G
static_std_32t                                             392.41ns    2.55M
privat_std_32t                                  15072.69%     2.60ns  384.10M
static_fbs_32t                                   102.41%   383.17ns    2.61M
privat_fbs_32t                                  13643.61%     2.88ns  347.68M
static_sp__32t                                  61032.25%   642.96ps    1.56G
privat_sp__32t                                  74172.72%   529.06ps    1.89G
============================================================================

Performance counter stats for '_build/opt/experimental/lucian/bench/string_terminator_benchmark --bm_min_iters=100000000':

2417478.451740 task-clock                #   27.005 CPUs utilized
65,374 context-switches          #    0.027 K/sec
4,270 CPU-migrations            #    0.002 K/sec
2,594 page-faults               #    0.001 K/sec
5,261,195,623,299 cycles                    #    2.176 GHz                     [25.09%]
5,072,634,235,091 stalled-cycles-frontend   #   96.42% frontend cycles idle    [30.01%]
3,848,759,938,052 stalled-cycles-backend    #   73.15% backend  cycles idle    [30.01%]
614,983,033,652 instructions              #    0.12  insns per cycle
#    8.25  stalled cycles per insn [25.09%]
152,995,596,390 branches                  #   63.287 M/sec                   [30.01%]
24,501,018 branch-misses             #    0.02% of all branches         [25.94%]
66,298,370,215 L1-dcache-loads           #   27.425 M/sec                   [30.03%]
1,642,188,850 L1-dcache-load-misses     #    2.48% of all L1-dcache hits   [30.02%]
968,142,977 LLC-loads                 #    0.400 M/sec                   [30.02%]
480,786,261 LLC-load-misses           #   49.66% of all LL-cache hits    [30.02%]

89.520960614 seconds time elapsed

- #define FBSTRING_PERVERSE ** gcc-4.8.1-glibc-2.17-fb

$ perf stat --detailed _build/opt/experimental/lucian/bench/string_terminator_benchmark --bm_min_iters=100000000
4.25s experimental/lucian/bench/StringTerminatorBenchmark.cpp
Linking _build/opt/experimental/lucian/bench/string_terminator_benchmark...
2.45s _build/opt/experimental/lucian/bench/string_terminator_benchmark
I0507 17:24:26.487848 23720 StringTerminatorBenchmark.cpp:98]  --bm_min_iters=100000000
============================================================================
experimental/lucian/bench/StringTerminatorBenchmark.cpprelative  time/iter  iters/s
============================================================================
static_std_1t                                              932.63ps    1.07G
privat_std_1t                                    100.95%   923.85ps    1.08G
static_fbs_1t                                    100.04%   932.23ps    1.07G
privat_fbs_1t                                    101.14%   922.07ps    1.08G
static_sp__1t                                   30418.15%     3.07ps  326.16G
privat_sp__1t                                   56775.39%     1.64ps  608.77G
static_std_32t                                             388.57ns    2.57M
privat_std_32t                                  16465.70%     2.36ns  423.75M
static_fbs_32t                                    96.19%   403.98ns    2.48M
privat_fbs_32t                                  16248.69%     2.39ns  418.16M
static_sp__32t                                  81185.06%   478.63ps    2.09G
privat_sp__32t                                  81470.69%   476.95ps    2.10G
============================================================================

Performance counter stats for '_build/opt/experimental/lucian/bench/string_terminator_benchmark --bm_min_iters=100000000':

2486009.171842 task-clock                #   27.241 CPUs utilized
64,538 context-switches          #    0.026 K/sec
4,501 CPU-migrations            #    0.002 K/sec
2,863 page-faults               #    0.001 K/sec
5,413,507,764,726 cycles                    #    2.178 GHz                     [25.62%]
5,207,550,928,974 stalled-cycles-frontend   #   96.20% frontend cycles idle    [30.01%]
3,919,627,007,209 stalled-cycles-backend    #   72.40% backend  cycles idle    [30.01%]
685,365,706,158 instructions              #    0.13  insns per cycle
#    7.60  stalled cycles per insn [25.62%]
148,798,151,350 branches                  #   59.854 M/sec                   [30.01%]
24,386,492 branch-misses             #    0.02% of all branches         [26.08%]
72,683,962,699 L1-dcache-loads           #   29.237 M/sec                   [30.02%]
1,687,684,775 L1-dcache-load-misses     #    2.32% of all L1-dcache hits   [30.01%]
989,352,938 LLC-loads                 #    0.398 M/sec                   [30.02%]
484,825,665 LLC-load-misses           #   49.00% of all LL-cache hits    [30.02%]

91.259265010 seconds time elapsed

- #define FBSTRING_CONSERVATIVE ** gcc-4.8.1-glibc-2.17-fb

$ perf stat --detailed _build/opt/experimental/lucian/bench/string_terminator_benchmark --bm_min_iters=100000000
0.18s experimental/lucian/bench/StringTerminatorBenchmark.cpp
Linking _build/opt/experimental/lucian/bench/string_terminator_benchmark...
2.43s _build/opt/experimental/lucian/bench/string_terminator_benchmark
I0507 17:30:58.246834  1571 StringTerminatorBenchmark.cpp:98]  --bm_min_iters=100000000
============================================================================
experimental/lucian/bench/StringTerminatorBenchmark.cpprelative  time/iter  iters/s
============================================================================
static_std_1t                                                1.44ps  692.61G
privat_std_1t                                    193.01%   748.06fs    1.34T
static_fbs_1t                                    399.47%   361.43fs    2.77T
privat_fbs_1t                                    129.29%     1.12ps  895.50G
static_sp__1t                                     78.99%     1.83ps  547.10G
privat_sp__1t                                     56.22%     2.57ps  389.36G
static_std_32t                                             478.67ps    2.09G
privat_std_32t                                   100.30%   477.26ps    2.10G
static_fbs_32t                                   100.52%   476.21ps    2.10G
privat_fbs_32t                                   100.52%   476.22ps    2.10G
static_sp__32t                                    99.57%   480.76ps    2.08G
privat_sp__32t                                   100.06%   478.40ps    2.09G
============================================================================

Performance counter stats for '_build/opt/experimental/lucian/bench/string_terminator_benchmark --bm_min_iters=100000000':

198000.887222 task-clock                #   14.738 CPUs utilized
5,016 context-switches          #    0.025 K/sec
2,536 CPU-migrations            #    0.013 K/sec
4,717 page-faults               #    0.024 K/sec
421,850,242,358 cycles                    #    2.131 GHz                     [25.31%]
209,633,195,159 stalled-cycles-frontend   #   49.69% frontend cycles idle    [30.32%]
6,252,451,713 stalled-cycles-backend    #    1.48% backend  cycles idle    [30.21%]
874,096,942,441 instructions              #    2.07  insns per cycle
#    0.24  stalled cycles per insn [25.37%]
218,114,341,006 branches                  # 1101.583 M/sec                   [30.34%]
3,080,267 branch-misses             #    0.00% of all branches         [26.23%]
240,532,859 L1-dcache-loads           #    1.215 M/sec                   [30.73%]
23,151,832 L1-dcache-load-misses     #    9.63% of all L1-dcache hits   [30.61%]
5,283,803 LLC-loads                 #    0.027 M/sec                   [30.60%]
1,079,973 LLC-load-misses           #   20.44% of all LL-cache hits    [30.60%]

13.434898734 seconds time elapsed

- #define FBSTRING_PERVERSE ** gcc-4.8.1-glibc-2.17

$ perf stat --detailed _build/opt/experimental/lucian/bench/string_terminator_benchmark --bm_min_iters=100000000
I0507 17:37:01.095785 28744 StringTerminatorBenchmark.cpp:98]  --bm_min_iters=100000000
============================================================================
experimental/lucian/bench/StringTerminatorBenchmark.cpprelative  time/iter  iters/s
============================================================================
static_std_1t                                                2.80ps  357.44G
privat_std_1t                                    118.65%     2.36ps  424.09G
static_fbs_1t                                      0.30%   937.44ps    1.07G
privat_fbs_1t                                      0.30%   924.33ps    1.08G
static_sp__1t                                    229.14%     1.22ps  819.03G
privat_sp__1t                                    212.22%     1.32ps  758.55G
static_std_32t                                             662.00ps    1.51G
privat_std_32t                                   134.15%   493.47ps    2.03G
static_fbs_32t                                     1.27%    52.24ns   19.14M
privat_fbs_32t                                    28.02%     2.36ns  423.31M
static_sp__32t                                   138.51%   477.94ps    2.09G
privat_sp__32t                                   138.39%   478.38ps    2.09G
============================================================================

Performance counter stats for '_build/opt/experimental/lucian/bench/string_terminator_benchmark --bm_min_iters=100000000':

316918.699522 task-clock                #   17.561 CPUs utilized
8,761 context-switches          #    0.028 K/sec
2,190 CPU-migrations            #    0.007 K/sec
5,361 page-faults               #    0.017 K/sec
683,482,498,044 cycles                    #    2.157 GHz                     [25.13%]
488,046,572,692 stalled-cycles-frontend   #   71.41% frontend cycles idle    [30.18%]
269,765,079,972 stalled-cycles-backend    #   39.47% backend  cycles idle    [30.17%]
756,213,442,723 instructions              #    1.11  insns per cycle
#    0.65  stalled cycles per insn [25.17%]
175,643,190,326 branches                  #  554.222 M/sec                   [30.14%]
3,737,621 branch-misses             #    0.00% of all branches         [25.63%]
35,169,313,417 L1-dcache-loads           #  110.973 M/sec                   [30.31%]
215,454,174 L1-dcache-load-misses     #    0.61% of all L1-dcache hits   [30.25%]
148,883,234 LLC-loads                 #    0.470 M/sec                   [30.25%]
48,768,585 LLC-load-misses           #   32.76% of all LL-cache hits    [30.26%]

18.046902132 seconds time elapsed

** AFTER **

- define nothing ** gcc-4.8.1-glibc-2.17-fb

$ perf stat --detailed _build/opt/experimental/lucian/bench/string_terminator_benchmark --bm_min_iters=100000000
I0507 20:34:15.943284 16852 StringTerminatorBenchmark.cpp:98]  --bm_min_iters=100000000
============================================================================
experimental/lucian/bench/StringTerminatorBenchmark.cpprelative  time/iter  iters/s
============================================================================
static_std_1t                                              936.05ps    1.07G
privat_std_1t                                    100.67%   929.82ps    1.08G
static_fbs_1t                                     98.63%   949.07ps    1.05G
privat_fbs_1t                                    101.56%   921.69ps    1.08G
static_sp__1t                                   40342.12%     2.32ps  430.98G
privat_sp__1t                                   16370.35%     5.72ps  174.89G
static_std_32t                                             409.26ns    2.44M
privat_std_32t                                  16777.41%     2.44ns  409.95M
static_fbs_32t                                  17398.58%     2.35ns  425.13M
privat_fbs_32t                                  17475.47%     2.34ns  427.01M
static_sp__32t                                  85426.96%   479.07ps    2.09G
privat_sp__32t                                  85033.14%   481.29ps    2.08G
============================================================================

Performance counter stats for '_build/opt/experimental/lucian/bench/string_terminator_benchmark --bm_min_iters=100000000':

1387268.222731 task-clock                #   25.553 CPUs utilized
34,698 context-switches          #    0.025 K/sec
2,770 CPU-migrations            #    0.002 K/sec
3,039 page-faults               #    0.002 K/sec
3,019,637,816,074 cycles                    #    2.177 GHz                     [25.21%]
2,801,192,631,479 stalled-cycles-frontend   #   92.77% frontend cycles idle    [30.02%]
2,020,857,118,698 stalled-cycles-backend    #   66.92% backend  cycles idle    [30.00%]
801,090,224,478 instructions              #    0.27  insns per cycle
#    3.50  stalled cycles per insn [25.20%]
206,099,842,649 branches                  #  148.565 M/sec                   [30.03%]
12,954,936 branch-misses             #    0.01% of all branches         [25.64%]
104,129,435,254 L1-dcache-loads           #   75.061 M/sec                   [30.07%]
883,390,641 L1-dcache-load-misses     #    0.85% of all L1-dcache hits   [30.06%]
516,975,218 LLC-loads                 #    0.373 M/sec                   [30.04%]
255,887,523 LLC-load-misses           #   49.50% of all LL-cache hits    [30.04%]

54.289185806 seconds time elapsed

- #define FBSTRING_PERVERSE ** gcc-4.8.1-glibc-2.17-fb

$ perf stat --detailed _build/opt/experimental/lucian/bench/string_terminator_benchmark --bm_min_iters=100000000
4.20s experimental/lucian/bench/StringTerminatorBenchmark.cpp
Linking _build/opt/experimental/lucian/bench/string_terminator_benchmark...
2.47s _build/opt/experimental/lucian/bench/string_terminator_benchmark
I0507 20:19:07.272021 18610 StringTerminatorBenchmark.cpp:98]  --bm_min_iters=100000000
============================================================================
experimental/lucian/bench/StringTerminatorBenchmark.cpprelative  time/iter  iters/s
============================================================================
static_std_1t                                              936.15ps    1.07G
privat_std_1t                                    100.84%   928.34ps    1.08G
static_fbs_1t                                     97.92%   956.06ps    1.05G
privat_fbs_1t                                    101.41%   923.14ps    1.08G
static_sp__1t                                       inf%     0.00fs  infinity
privat_sp__1t                                       inf%     0.00fs  infinity
static_std_32t                                             413.49ns    2.42M
privat_std_32t                                  17375.60%     2.38ns  420.22M
static_fbs_32t                                  17443.07%     2.37ns  421.85M
privat_fbs_32t                                  15354.32%     2.69ns  371.33M
static_sp__32t                                  82034.91%   504.05ps    1.98G
privat_sp__32t                                  63367.22%   652.53ps    1.53G
============================================================================

Performance counter stats for '_build/opt/experimental/lucian/bench/string_terminator_benchmark --bm_min_iters=100000000':

1390538.848103 task-clock                #   25.373 CPUs utilized
37,417 context-switches          #    0.027 K/sec
3,147 CPU-migrations            #    0.002 K/sec
2,876 page-faults               #    0.002 K/sec
3,024,513,016,946 cycles                    #    2.175 GHz                     [25.08%]
2,813,021,108,191 stalled-cycles-frontend   #   93.01% frontend cycles idle    [30.02%]
2,043,124,392,473 stalled-cycles-backend    #   67.55% backend  cycles idle    [30.01%]
774,663,686,661 instructions              #    0.26  insns per cycle
#    3.63  stalled cycles per insn [25.09%]
197,666,485,664 branches                  #  142.151 M/sec                   [30.03%]
15,077,576 branch-misses             #    0.01% of all branches         [25.73%]
104,720,369,589 L1-dcache-loads           #   75.309 M/sec                   [30.05%]
886,090,434 L1-dcache-load-misses     #    0.85% of all L1-dcache hits   [30.04%]
520,015,584 LLC-loads                 #    0.374 M/sec                   [30.03%]
256,990,100 LLC-load-misses           #   49.42% of all LL-cache hits    [30.04%]

54.804099454 seconds time elapsed

- #define FBSTRING_CONSERVATIVE ** gcc-4.8.1-glibc-2.17-fb

$ perf stat --detailed _build/opt/experimental/lucian/bench/string_terminator_benchmark --bm_min_iters=100000000
4.02s experimental/lucian/bench/StringTerminatorBenchmark.cpp
Linking _build/opt/experimental/lucian/bench/string_terminator_benchmark...
2.45s _build/opt/experimental/lucian/bench/string_terminator_benchmark
I0507 20:31:31.120209 10543 StringTerminatorBenchmark.cpp:98]  --bm_min_iters=100000000
============================================================================
experimental/lucian/bench/StringTerminatorBenchmark.cpprelative  time/iter  iters/s
============================================================================
static_std_1t                                              929.96ps    1.08G
privat_std_1t                                    100.37%   926.53ps    1.08G
static_fbs_1t                                       inf%     0.00fs  infinity
privat_fbs_1t                                       inf%     0.00fs  infinity
static_sp__1t                                       inf%     0.00fs  infinity
privat_sp__1t                                       inf%     0.00fs  infinity
static_std_32t                                             381.41ns    2.62M
privat_std_32t                                  16080.74%     2.37ns  421.62M
static_fbs_32t                                  80498.05%   473.81ps    2.11G
privat_fbs_32t                                  80368.84%   474.57ps    2.11G
static_sp__32t                                  80287.07%   475.05ps    2.11G
privat_sp__32t                                  80410.51%   474.33ps    2.11G
============================================================================

Performance counter stats for '_build/opt/experimental/lucian/bench/string_terminator_benchmark --bm_min_iters=100000000':

1293727.223839 task-clock                #   25.403 CPUs utilized
33,881 context-switches          #    0.026 K/sec
3,718 CPU-migrations            #    0.003 K/sec
3,829 page-faults               #    0.003 K/sec
2,813,078,184,066 cycles                    #    2.174 GHz                     [25.11%]
2,597,561,032,630 stalled-cycles-frontend   #   92.34% frontend cycles idle    [30.04%]
1,883,652,860,583 stalled-cycles-backend    #   66.96% backend  cycles idle    [30.02%]
800,150,465,648 instructions              #    0.28  insns per cycle
#    3.25  stalled cycles per insn [25.10%]
197,970,559,157 branches                  #  153.023 M/sec                   [30.02%]
15,385,773 branch-misses             #    0.01% of all branches         [25.95%]
36,541,946,374 L1-dcache-loads           #   28.245 M/sec                   [30.08%]
813,994,917 L1-dcache-load-misses     #    2.23% of all L1-dcache hits   [30.07%]
467,241,616 LLC-loads                 #    0.361 M/sec                   [30.07%]
247,980,582 LLC-load-misses           #   53.07% of all LL-cache hits    [30.08%]

50.928965545 seconds time elapsed

Reviewed By: njormrod@fb.com

FB internal diff: D1318048
folly/FBString.h
folly/test/FBStringTerminatorBenchmark.cpp [new file with mode: 0644]