5cac88875ee1003a4256d5c062c5f51ff3feb3b2
[folly.git] / folly / io / test / CompressionTest.cpp
1 /*
2  * Copyright 2017 Facebook, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *   http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16
17 #include <folly/io/Compression.h>
18
19 #include <random>
20 #include <set>
21 #include <thread>
22 #include <unordered_map>
23
24 #include <boost/noncopyable.hpp>
25 #include <glog/logging.h>
26
27 #include <folly/Benchmark.h>
28 #include <folly/Hash.h>
29 #include <folly/Random.h>
30 #include <folly/Varint.h>
31 #include <folly/io/IOBufQueue.h>
32 #include <folly/portability/GTest.h>
33
34 namespace folly { namespace io { namespace test {
35
36 class DataHolder : private boost::noncopyable {
37  public:
38   uint64_t hash(size_t size) const;
39   ByteRange data(size_t size) const;
40
41  protected:
42   explicit DataHolder(size_t sizeLog2);
43   const size_t size_;
44   std::unique_ptr<uint8_t[]> data_;
45   mutable std::unordered_map<uint64_t, uint64_t> hashCache_;
46 };
47
48 DataHolder::DataHolder(size_t sizeLog2)
49   : size_(size_t(1) << sizeLog2),
50     data_(new uint8_t[size_]) {
51 }
52
53 uint64_t DataHolder::hash(size_t size) const {
54   CHECK_LE(size, size_);
55   auto p = hashCache_.find(size);
56   if (p != hashCache_.end()) {
57     return p->second;
58   }
59
60   uint64_t h = folly::hash::fnv64_buf(data_.get(), size);
61   hashCache_[size] = h;
62   return h;
63 }
64
65 ByteRange DataHolder::data(size_t size) const {
66   CHECK_LE(size, size_);
67   return ByteRange(data_.get(), size);
68 }
69
70 uint64_t hashIOBuf(const IOBuf* buf) {
71   uint64_t h = folly::hash::FNV_64_HASH_START;
72   for (auto& range : *buf) {
73     h = folly::hash::fnv64_buf(range.data(), range.size(), h);
74   }
75   return h;
76 }
77
78 class RandomDataHolder : public DataHolder {
79  public:
80   explicit RandomDataHolder(size_t sizeLog2);
81 };
82
83 RandomDataHolder::RandomDataHolder(size_t sizeLog2)
84   : DataHolder(sizeLog2) {
85   constexpr size_t numThreadsLog2 = 3;
86   constexpr size_t numThreads = size_t(1) << numThreadsLog2;
87
88   uint32_t seed = randomNumberSeed();
89
90   std::vector<std::thread> threads;
91   threads.reserve(numThreads);
92   for (size_t t = 0; t < numThreads; ++t) {
93     threads.emplace_back(
94         [this, seed, t, numThreadsLog2, sizeLog2] () {
95           std::mt19937 rng(seed + t);
96           size_t countLog2 = sizeLog2 - numThreadsLog2;
97           size_t start = size_t(t) << countLog2;
98           for (size_t i = 0; i < countLog2; ++i) {
99             this->data_[start + i] = rng();
100           }
101         });
102   }
103
104   for (auto& t : threads) {
105     t.join();
106   }
107 }
108
109 class ConstantDataHolder : public DataHolder {
110  public:
111   explicit ConstantDataHolder(size_t sizeLog2);
112 };
113
114 ConstantDataHolder::ConstantDataHolder(size_t sizeLog2)
115   : DataHolder(sizeLog2) {
116   memset(data_.get(), 'a', size_);
117 }
118
119 constexpr size_t dataSizeLog2 = 27;  // 128MiB
120 RandomDataHolder randomDataHolder(dataSizeLog2);
121 ConstantDataHolder constantDataHolder(dataSizeLog2);
122
123 // The intersection of the provided codecs & those that are compiled in.
124 static std::vector<CodecType> supportedCodecs(std::vector<CodecType> const& v) {
125   std::vector<CodecType> supported;
126
127   std::copy_if(
128       std::begin(v),
129       std::end(v),
130       std::back_inserter(supported),
131       hasCodec);
132
133   return supported;
134 }
135
136 // All compiled-in compression codecs.
137 static std::vector<CodecType> availableCodecs() {
138   std::vector<CodecType> codecs;
139
140   for (size_t i = 0; i < static_cast<size_t>(CodecType::NUM_CODEC_TYPES); ++i) {
141     auto type = static_cast<CodecType>(i);
142     if (hasCodec(type)) {
143       codecs.push_back(type);
144     }
145   }
146
147   return codecs;
148 }
149
150 TEST(CompressionTestNeedsUncompressedLength, Simple) {
151   static const struct { CodecType type; bool needsUncompressedLength; }
152     expectations[] = {
153       { CodecType::NO_COMPRESSION, false },
154       { CodecType::LZ4, true },
155       { CodecType::SNAPPY, false },
156       { CodecType::ZLIB, false },
157       { CodecType::LZ4_VARINT_SIZE, false },
158       { CodecType::LZMA2, false },
159       { CodecType::LZMA2_VARINT_SIZE, false },
160       { CodecType::ZSTD, false },
161       { CodecType::GZIP, false },
162       { CodecType::LZ4_FRAME, false },
163     };
164
165   for (auto const& test : expectations) {
166     if (hasCodec(test.type)) {
167       EXPECT_EQ(getCodec(test.type)->needsUncompressedLength(),
168                 test.needsUncompressedLength);
169     }
170   }
171 }
172
173 class CompressionTest
174     : public testing::TestWithParam<std::tr1::tuple<int, int, CodecType>> {
175  protected:
176   void SetUp() override {
177     auto tup = GetParam();
178     uncompressedLength_ = uint64_t(1) << std::tr1::get<0>(tup);
179     chunks_ = std::tr1::get<1>(tup);
180     codec_ = getCodec(std::tr1::get<2>(tup));
181   }
182
183   void runSimpleIOBufTest(const DataHolder& dh);
184
185   void runSimpleStringTest(const DataHolder& dh);
186
187  private:
188   std::unique_ptr<IOBuf> split(std::unique_ptr<IOBuf> data) const;
189
190   uint64_t uncompressedLength_;
191   size_t chunks_;
192   std::unique_ptr<Codec> codec_;
193 };
194
195 void CompressionTest::runSimpleIOBufTest(const DataHolder& dh) {
196   const auto original = split(IOBuf::wrapBuffer(dh.data(uncompressedLength_)));
197   const auto compressed = split(codec_->compress(original.get()));
198   if (!codec_->needsUncompressedLength()) {
199     auto uncompressed = codec_->uncompress(compressed.get());
200     EXPECT_EQ(uncompressedLength_, uncompressed->computeChainDataLength());
201     EXPECT_EQ(dh.hash(uncompressedLength_), hashIOBuf(uncompressed.get()));
202   }
203   {
204     auto uncompressed = codec_->uncompress(compressed.get(),
205                                            uncompressedLength_);
206     EXPECT_EQ(uncompressedLength_, uncompressed->computeChainDataLength());
207     EXPECT_EQ(dh.hash(uncompressedLength_), hashIOBuf(uncompressed.get()));
208   }
209 }
210
211 void CompressionTest::runSimpleStringTest(const DataHolder& dh) {
212   const auto original = std::string(
213       reinterpret_cast<const char*>(dh.data(uncompressedLength_).data()),
214       uncompressedLength_);
215   const auto compressed = codec_->compress(original);
216   if (!codec_->needsUncompressedLength()) {
217     auto uncompressed = codec_->uncompress(compressed);
218     EXPECT_EQ(uncompressedLength_, uncompressed.length());
219     EXPECT_EQ(uncompressed, original);
220   }
221   {
222     auto uncompressed = codec_->uncompress(compressed, uncompressedLength_);
223     EXPECT_EQ(uncompressedLength_, uncompressed.length());
224     EXPECT_EQ(uncompressed, original);
225   }
226 }
227
228 // Uniformly split data into (potentially empty) chunks.
229 std::unique_ptr<IOBuf> CompressionTest::split(
230     std::unique_ptr<IOBuf> data) const {
231   if (data->isChained()) {
232     data->coalesce();
233   }
234
235   const size_t size = data->computeChainDataLength();
236
237   std::multiset<size_t> splits;
238   for (size_t i = 1; i < chunks_; ++i) {
239     splits.insert(Random::rand64(size));
240   }
241
242   folly::IOBufQueue result;
243
244   size_t offset = 0;
245   for (size_t split : splits) {
246     result.append(IOBuf::copyBuffer(data->data() + offset, split - offset));
247     offset = split;
248   }
249   result.append(IOBuf::copyBuffer(data->data() + offset, size - offset));
250
251   return result.move();
252 }
253
254 TEST_P(CompressionTest, RandomData) {
255   runSimpleIOBufTest(randomDataHolder);
256 }
257
258 TEST_P(CompressionTest, ConstantData) {
259   runSimpleIOBufTest(constantDataHolder);
260 }
261
262 TEST_P(CompressionTest, RandomDataString) {
263   runSimpleStringTest(randomDataHolder);
264 }
265
266 TEST_P(CompressionTest, ConstantDataString) {
267   runSimpleStringTest(constantDataHolder);
268 }
269
270 INSTANTIATE_TEST_CASE_P(
271     CompressionTest,
272     CompressionTest,
273     testing::Combine(
274         testing::Values(0, 1, 12, 22, 25, 27),
275         testing::Values(1, 2, 3, 8, 65),
276         testing::ValuesIn(availableCodecs())));
277
278 class CompressionVarintTest
279     : public testing::TestWithParam<std::tr1::tuple<int, CodecType>> {
280  protected:
281   void SetUp() override {
282     auto tup = GetParam();
283     uncompressedLength_ = uint64_t(1) << std::tr1::get<0>(tup);
284     codec_ = getCodec(std::tr1::get<1>(tup));
285   }
286
287   void runSimpleTest(const DataHolder& dh);
288
289   uint64_t uncompressedLength_;
290   std::unique_ptr<Codec> codec_;
291 };
292
293 inline uint64_t oneBasedMsbPos(uint64_t number) {
294   uint64_t pos = 0;
295   for (; number > 0; ++pos, number >>= 1) {
296   }
297   return pos;
298 }
299
300 void CompressionVarintTest::runSimpleTest(const DataHolder& dh) {
301   auto original = IOBuf::wrapBuffer(dh.data(uncompressedLength_));
302   auto compressed = codec_->compress(original.get());
303   auto breakPoint =
304       1UL +
305       Random::rand64(
306           std::max(uint64_t(9), oneBasedMsbPos(uncompressedLength_)) / 9UL);
307   auto tinyBuf = IOBuf::copyBuffer(compressed->data(),
308                                    std::min(compressed->length(), breakPoint));
309   compressed->trimStart(breakPoint);
310   tinyBuf->prependChain(std::move(compressed));
311   compressed = std::move(tinyBuf);
312
313   auto uncompressed = codec_->uncompress(compressed.get());
314
315   EXPECT_EQ(uncompressedLength_, uncompressed->computeChainDataLength());
316   EXPECT_EQ(dh.hash(uncompressedLength_), hashIOBuf(uncompressed.get()));
317 }
318
319 TEST_P(CompressionVarintTest, RandomData) {
320   runSimpleTest(randomDataHolder);
321 }
322
323 TEST_P(CompressionVarintTest, ConstantData) {
324   runSimpleTest(constantDataHolder);
325 }
326
327 INSTANTIATE_TEST_CASE_P(
328     CompressionVarintTest,
329     CompressionVarintTest,
330     testing::Combine(
331         testing::Values(0, 1, 12, 22, 25, 27),
332         testing::ValuesIn(supportedCodecs({
333             CodecType::LZ4_VARINT_SIZE,
334             CodecType::LZMA2_VARINT_SIZE,
335             }))));
336
337 class CompressionCorruptionTest : public testing::TestWithParam<CodecType> {
338  protected:
339   void SetUp() override { codec_ = getCodec(GetParam()); }
340
341   void runSimpleTest(const DataHolder& dh);
342
343   std::unique_ptr<Codec> codec_;
344 };
345
346 void CompressionCorruptionTest::runSimpleTest(const DataHolder& dh) {
347   constexpr uint64_t uncompressedLength = 42;
348   auto original = IOBuf::wrapBuffer(dh.data(uncompressedLength));
349   auto compressed = codec_->compress(original.get());
350
351   if (!codec_->needsUncompressedLength()) {
352     auto uncompressed = codec_->uncompress(compressed.get());
353     EXPECT_EQ(uncompressedLength, uncompressed->computeChainDataLength());
354     EXPECT_EQ(dh.hash(uncompressedLength), hashIOBuf(uncompressed.get()));
355   }
356   {
357     auto uncompressed = codec_->uncompress(compressed.get(),
358                                            uncompressedLength);
359     EXPECT_EQ(uncompressedLength, uncompressed->computeChainDataLength());
360     EXPECT_EQ(dh.hash(uncompressedLength), hashIOBuf(uncompressed.get()));
361   }
362
363   EXPECT_THROW(codec_->uncompress(compressed.get(), uncompressedLength + 1),
364                std::runtime_error);
365
366   // Corrupt the first character
367   ++(compressed->writableData()[0]);
368
369   if (!codec_->needsUncompressedLength()) {
370     EXPECT_THROW(codec_->uncompress(compressed.get()),
371                  std::runtime_error);
372   }
373
374   EXPECT_THROW(codec_->uncompress(compressed.get(), uncompressedLength),
375                std::runtime_error);
376 }
377
378 TEST_P(CompressionCorruptionTest, RandomData) {
379   runSimpleTest(randomDataHolder);
380 }
381
382 TEST_P(CompressionCorruptionTest, ConstantData) {
383   runSimpleTest(constantDataHolder);
384 }
385
386 INSTANTIATE_TEST_CASE_P(
387     CompressionCorruptionTest,
388     CompressionCorruptionTest,
389     testing::ValuesIn(
390         // NO_COMPRESSION can't detect corruption
391         // LZ4 can't detect corruption reliably (sigh)
392         supportedCodecs({
393             CodecType::SNAPPY,
394             CodecType::ZLIB,
395             CodecType::LZMA2,
396             CodecType::ZSTD,
397             CodecType::LZ4_FRAME,
398         })));
399 }}}  // namespaces
400
401 int main(int argc, char *argv[]) {
402   testing::InitGoogleTest(&argc, argv);
403   gflags::ParseCommandLineFlags(&argc, &argv, true);
404
405   auto ret = RUN_ALL_TESTS();
406   if (!ret) {
407     folly::runBenchmarksOnFlag();
408   }
409   return ret;
410 }