split()
author Tom Jackson <tjackson@fb.com>
Tue, 13 Nov 2012 23:12:04 +0000 (15:12 -0800)
committer Jordan DeLong <jdelong@fb.com>
Sun, 16 Dec 2012 22:44:41 +0000 (14:44 -0800)
Summary: Adds a plain split() generator to complement resplit(); it is also a bit faster than folly::split().

Test Plan: Unit tests, Benchmarks (result in code comment)

Reviewed By: tudorb@fb.com

FB internal diff: D629998

folly/experimental/StringGen-inl.h
folly/experimental/StringGen.h
folly/experimental/test/GenBenchmark.cpp
folly/experimental/test/GenTest.cpp

index 3330e8b5c53df696a1157c00977f9a7fef026824..10ae814a3d29fd443442e1d958ecbc22b9f007a4 100644 (file)
@@ -124,6 +124,34 @@ class StringResplitter : public Operator<StringResplitter> {
   }
 };
 
+// Generator source producing the pieces of a string split on a delimiter.
+// An empty remainder after the last delimiter is not produced as a piece.
+class SplitStringSource : public GenImpl<StringPiece, SplitStringSource> {
+  StringPiece source_;  // input text to split
+  char delimiter_;      // character that separates the pieces
+
+ public:
+  SplitStringSource(const StringPiece& source, char delimiter)
+    : source_(source), delimiter_(delimiter) {}
+
+  // Hands each piece to body in order. Returns false as soon as body returns
+  // false (consumer asked to stop), true after all pieces were handed over.
+  template <class Body>
+  bool apply(Body&& body) const {
+    StringPiece remaining(source_);
+    // NOTE(review): splitPrefix is defined outside this hunk; it appears to
+    // strip the next delimited piece off the front of remaining into piece.
+    for (StringPiece piece; splitPrefix(remaining, piece, delimiter_);) {
+      if (!body(piece)) {
+        return false;
+      }
+    }
+    // Text after the final delimiter, if any, is the last piece.
+    return remaining.empty() || body(remaining);
+  }
+};
+
+
 }  // namespace detail
 }  // namespace gen
 }  // namespace folly
index 8eba76466b0ce3c800e6afa0fa3c61db8edeb7b4..c614cce56b5428a100e423b4506f48e010773a88 100644 (file)
@@ -24,6 +24,7 @@ namespace gen {
 
 namespace detail {
 class StringResplitter;
+class SplitStringSource;
 }  // namespace detail
 
 /**
@@ -40,6 +41,11 @@ S resplit(char delimiter) {
   return S(delimiter);
 }
 
+template <class S=detail::SplitStringSource>  // S: source type to construct
+S split(const StringPiece& source, char delimiter) {
+  return S(source, delimiter);  // by default a SplitStringSource over source
+}
+
 }  // namespace gen
 }  // namespace folly
 
index 70db7be7a853ce7e74afa125f0de80e29da7a3f6..f8ee2b899d2afce6e62d046d587837c4ebbae90f 100644 (file)
@@ -17,6 +17,7 @@
 #include "folly/experimental/Gen.h"
 #include "folly/experimental/StringGen.h"
 #include "folly/experimental/FileGen.h"
+#include "folly/String.h"
 
 #include <atomic>
 #include <thread>
@@ -318,6 +319,73 @@ BENCHMARK_RELATIVE(StringResplitter_Small, iters) {
 
 BENCHMARK_DRAW_LINE()
 
+BENCHMARK(StringSplit_Old, iters) {  // baseline for StringSplit_Gen_Vector
+  size_t s = 0;  // running total of piece counts across iterations
+  std::string line(kLine);
+  while (iters--) {
+    std::vector<StringPiece> parts;  // fresh vector every iteration
+    split(' ', line, parts);  // presumably folly::split (folly/String.h)
+    s += parts.size();
+  }
+  folly::doNotOptimizeAway(s);  // hand s to the benchmark framework
+}
+
+
+BENCHMARK_RELATIVE(StringSplit_Gen_Vector, iters) {  // vs. StringSplit_Old
+  size_t s = 0;  // running total of piece counts
+  StringPiece line(kLine);
+  while (iters--) {
+    s += (split(line, ' ') | as<vector>()).size();  // new vector per iteration
+  }
+  folly::doNotOptimizeAway(s);
+}
+
+BENCHMARK_DRAW_LINE()
+
+BENCHMARK(StringSplit_Old_ReuseVector, iters) {  // baseline for this group
+  size_t s = 0;  // running total of piece counts
+  std::string line(kLine);
+  std::vector<StringPiece> parts;  // declared once, outside the loop
+  while (iters--) {
+    parts.clear();  // empty the vector but keep using the same object
+    split(' ', line, parts);  // presumably folly::split (folly/String.h)
+    s += parts.size();
+  }
+  folly::doNotOptimizeAway(s);
+}
+
+BENCHMARK_RELATIVE(StringSplit_Gen_ReuseVector, iters) {
+  size_t s = 0;  // running total of piece counts
+  StringPiece line(kLine);
+  std::vector<StringPiece> parts;  // shared across iterations
+  while (iters--) {
+    parts.clear();
+    split(line, ' ') | appendTo(parts);  // generator pieces go into parts
+    s += parts.size();
+  }
+  folly::doNotOptimizeAway(s);
+}
+
+BENCHMARK_RELATIVE(StringSplit_Gen, iters) {  // counts pieces, stores none
+  size_t s = 0;  // running total of piece counts
+  StringPiece line(kLine);
+  while (iters--) {
+    s += split(line, ' ') | count;
+  }
+  folly::doNotOptimizeAway(s);
+}
+
+BENCHMARK_RELATIVE(StringSplit_Gen_Take, iters) {
+  size_t s = 0;  // running total of piece counts
+  StringPiece line(kLine);
+  while (iters--) {
+    s += split(line, ' ') | take(10) | count;  // at most the first 10 pieces
+  }
+  folly::doNotOptimizeAway(s);
+}
+
+BENCHMARK_DRAW_LINE()
+
 BENCHMARK(ByLine_Pipes, iters) {
   std::thread thread;
   int rfd;
@@ -356,30 +424,43 @@ BENCHMARK(ByLine_Pipes, iters) {
 // ============================================================================
 // folly/experimental/test/GenBenchmark.cpp        relative  time/iter  iters/s
 // ============================================================================
-// Sum_Basic_NoGen                                            301.60ns    3.32M
-// Sum_Basic_Gen                                    104.27%   289.24ns    3.46M
+// Sum_Basic_NoGen                                            293.77ns    3.40M
+// Sum_Basic_Gen                                    100.24%   293.08ns    3.41M
+// ----------------------------------------------------------------------------
+// Sum_Vector_NoGen                                           199.09ns    5.02M
+// Sum_Vector_Gen                                    98.57%   201.98ns    4.95M
+// ----------------------------------------------------------------------------
+// Count_Vector_NoGen                                          12.40us   80.66K
+// Count_Vector_Gen                                 103.07%    12.03us   83.13K
+// ----------------------------------------------------------------------------
+// Fib_Sum_NoGen                                                3.65us  274.29K
+// Fib_Sum_Gen                                       41.95%     8.69us  115.06K
+// Fib_Sum_Gen_Static                                86.10%     4.23us  236.15K
+// ----------------------------------------------------------------------------
+// VirtualGen_0Virtual                                         10.10us   99.03K
+// VirtualGen_1Virtual                               29.67%    34.04us   29.38K
+// VirtualGen_2Virtual                               20.53%    49.19us   20.33K
+// VirtualGen_3Virtual                               15.22%    66.36us   15.07K
 // ----------------------------------------------------------------------------
-// Sum_Vector_NoGen                                           200.33ns    4.99M
-// Sum_Vector_Gen                                    99.81%   200.70ns    4.98M
+// Concat_NoGen                                                 2.33us  428.35K
+// Concat_Gen                                        85.36%     2.74us  365.62K
 // ----------------------------------------------------------------------------
-// Count_Vector_NoGen                                          12.37us   80.84K
-// Count_Vector_Gen                                 103.09%    12.00us   83.33K
+// Composed_NoGen                                             552.78ns    1.81M
+// Composed_Gen                                     100.48%   550.14ns    1.82M
+// Composed_GenRegular                              100.60%   549.50ns    1.82M
 // ----------------------------------------------------------------------------
-// Fib_Sum_NoGen                                                3.66us  273.21K
-// Fib_Sum_Gen                                       43.06%     8.50us  117.65K
-// Fib_Sum_Gen_Static                                87.81%     4.17us  239.89K
+// StringResplitter_Big                                       118.40us    8.45K
+// StringResplitter_Small                            12.96%   913.23us    1.10K
 // ----------------------------------------------------------------------------
-// VirtualGen_0Virtual                                         10.04us   99.61K
-// VirtualGen_1Virtual                               29.59%    33.93us   29.47K
-// VirtualGen_2Virtual                               20.45%    49.10us   20.37K
-// VirtualGen_3Virtual                               15.49%    64.82us   15.43K
+// StringSplit_Old                                            567.61ns    1.76M
+// StringSplit_Gen_Vector                           146.52%   387.41ns    2.58M
 // ----------------------------------------------------------------------------
-// Concat_NoGen                                                 2.50us  400.37K
-// Concat_Gen                                       102.50%     2.44us  410.37K
+// StringSplit_Old_ReuseVector                                 74.90ns   13.35M
+// StringSplit_Gen_ReuseVector                      112.29%    66.71ns   14.99M
+// StringSplit_Gen                                  122.42%    61.18ns   16.34M
+// StringSplit_Gen_Take                             134.49%    55.70ns   17.95M
 // ----------------------------------------------------------------------------
-// Composed_NoGen                                             549.54ns    1.82M
-// Composed_Gen                                     101.39%   542.00ns    1.85M
-// Composed_GenRegular                               99.66%   551.40ns    1.81M
+// ByLine_Pipes                                               131.18ns    7.62M
 // ============================================================================
 
 int main(int argc, char *argv[]) {
index c362783165e1cdd1a3a4b077cc208aaac522e555..64a738432ff1d11ccef67e446b2d9a6378a52dc5 100644 (file)
@@ -596,6 +596,65 @@ TEST(Gen, Dynamic) {
 }
 
 TEST(StringGen, EmptySplit) {
+  auto collect = eachTo<std::string>() | as<vector>();
+  {
+    auto pieces = split("", ',') | collect;
+    EXPECT_EQ(0, pieces.size());
+  }
+
+  // The last delimiter is eaten, just like std::getline
+  {
+    auto pieces = split(",", ',') | collect;
+    EXPECT_EQ(1, pieces.size());
+    EXPECT_EQ("", pieces[0]);
+  }
+
+  {
+    auto pieces = split(",,", ',') | collect;
+    EXPECT_EQ(2, pieces.size());
+    EXPECT_EQ("", pieces[0]);
+    EXPECT_EQ("", pieces[1]);
+  }
+
+  {
+    auto pieces = split(",,", ',') | take(1) | collect;
+    EXPECT_EQ(1, pieces.size());
+    EXPECT_EQ("", pieces[1]);
+  }
+}
+
+TEST(StringGen, Split) {
+  auto collect = eachTo<std::string>() | as<vector>();
+  {
+    auto pieces = split("hello,, world, goodbye, meow", ',') | collect;
+    EXPECT_EQ(5, pieces.size());
+    EXPECT_EQ("hello", pieces[0]);
+    EXPECT_EQ("", pieces[1]);
+    EXPECT_EQ(" world", pieces[2]);
+    EXPECT_EQ(" goodbye", pieces[3]);
+    EXPECT_EQ(" meow", pieces[4]);
+  }
+
+  {
+    auto pieces = split("hello,, world, goodbye, meow", ',')
+                | take(3) | collect;
+    EXPECT_EQ(3, pieces.size());
+    EXPECT_EQ("hello", pieces[0]);
+    EXPECT_EQ("", pieces[1]);
+    EXPECT_EQ(" world", pieces[2]);
+  }
+
+  {
+    // Taking exactly as many pieces as exist must still yield all of them.
+    auto pieces = split("hello,, world, goodbye, meow", ',')
+                | take(5) | collect;
+    EXPECT_EQ(5, pieces.size());
+    const vector<std::string> want{"hello", "", " world", " goodbye", " meow"};
+    EXPECT_EQ(want, pieces);
+  }
+}
+
+TEST(StringGen, EmptyResplit) {
   auto collect = eachTo<std::string>() | as<vector>();
   {
     auto pieces = from({""}) | resplit(',') | collect;
@@ -617,7 +676,7 @@ TEST(StringGen, EmptySplit) {
   }
 }
 
-TEST(StringGen, Split) {
+TEST(StringGen, Resplit) {
   auto collect = eachTo<std::string>() | as<vector>();
   {
     auto pieces = from({"hello,, world, goodbye, meow"}) |