AutomaticCodec
[folly.git] / folly / io / test / CompressionTest.cpp
index 020d74876ae3d7464902abee12638af37efcf3d1..197d50fdf408f0fc4daacdb4092545f9cb35a773 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright 2016 Facebook, Inc.
+ * Copyright 2017 Facebook, Inc.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
 #include <folly/io/Compression.h>
 
 #include <random>
+#include <set>
 #include <thread>
 #include <unordered_map>
 
 #include <boost/noncopyable.hpp>
 #include <glog/logging.h>
-#include <gtest/gtest.h>
 
 #include <folly/Benchmark.h>
 #include <folly/Hash.h>
+#include <folly/Memory.h>
 #include <folly/Random.h>
 #include <folly/Varint.h>
 #include <folly/io/IOBufQueue.h>
+#include <folly/portability/GTest.h>
 
 namespace folly { namespace io { namespace test {
 
@@ -92,7 +94,7 @@ RandomDataHolder::RandomDataHolder(size_t sizeLog2)
     threads.emplace_back(
         [this, seed, t, numThreadsLog2, sizeLog2] () {
           std::mt19937 rng(seed + t);
-          size_t countLog2 = size_t(1) << (sizeLog2 - numThreadsLog2);
+          size_t countLog2 = sizeLog2 - numThreadsLog2;
           size_t start = size_t(t) << countLog2;
           for (size_t i = 0; i < countLog2; ++i) {
             this->data_[start + i] = rng();
@@ -119,40 +121,83 @@ constexpr size_t dataSizeLog2 = 27;  // 128MiB
 RandomDataHolder randomDataHolder(dataSizeLog2);
 ConstantDataHolder constantDataHolder(dataSizeLog2);
 
+// Returns the entries of `v`, in their original order, for which hasCodec()
+// reports true.
+static std::vector<CodecType> supportedCodecs(std::vector<CodecType> const& v) {
+  std::vector<CodecType> compiledIn;
+
+  for (auto const type : v) {
+    if (hasCodec(type)) {
+      compiledIn.push_back(type);
+    }
+  }
+
+  return compiledIn;
+}
+
+// Collects every CodecType value in [0, NUM_CODEC_TYPES) for which hasCodec()
+// reports true, in increasing enum order.
+static std::vector<CodecType> availableCodecs() {
+  const auto numTypes = static_cast<size_t>(CodecType::NUM_CODEC_TYPES);
+
+  std::vector<CodecType> present;
+  for (size_t raw = 0; raw != numTypes; ++raw) {
+    const auto candidate = static_cast<CodecType>(raw);
+    if (!hasCodec(candidate)) {
+      continue;
+    }
+    present.push_back(candidate);
+  }
+
+  return present;
+}
+
 TEST(CompressionTestNeedsUncompressedLength, Simple) {
-  EXPECT_FALSE(getCodec(CodecType::NO_COMPRESSION)->needsUncompressedLength());
-  EXPECT_TRUE(getCodec(CodecType::LZ4)->needsUncompressedLength());
-  EXPECT_FALSE(getCodec(CodecType::SNAPPY)->needsUncompressedLength());
-  EXPECT_FALSE(getCodec(CodecType::ZLIB)->needsUncompressedLength());
-  EXPECT_FALSE(getCodec(CodecType::LZ4_VARINT_SIZE)->needsUncompressedLength());
-  EXPECT_TRUE(getCodec(CodecType::LZMA2)->needsUncompressedLength());
-  EXPECT_FALSE(getCodec(CodecType::LZMA2_VARINT_SIZE)
-    ->needsUncompressedLength());
-  EXPECT_TRUE(getCodec(CodecType::ZSTD_BETA)->needsUncompressedLength());
-  EXPECT_FALSE(getCodec(CodecType::GZIP)->needsUncompressedLength());
+  static const struct { CodecType type; bool needsUncompressedLength; }
+    expectations[] = {
+      { CodecType::NO_COMPRESSION, false },
+      { CodecType::LZ4, true },
+      { CodecType::SNAPPY, false },
+      { CodecType::ZLIB, false },
+      { CodecType::LZ4_VARINT_SIZE, false },
+      { CodecType::LZMA2, false },
+      { CodecType::LZMA2_VARINT_SIZE, false },
+      { CodecType::ZSTD, false },
+      { CodecType::GZIP, false },
+      { CodecType::LZ4_FRAME, false },
+    };
+
+  for (auto const& test : expectations) {
+    if (hasCodec(test.type)) {
+      EXPECT_EQ(getCodec(test.type)->needsUncompressedLength(),
+                test.needsUncompressedLength);
+    }
+  }
 }
 
 class CompressionTest
-    : public testing::TestWithParam<std::tr1::tuple<int, CodecType>> {
-  protected:
-   void SetUp() override {
-     auto tup = GetParam();
-     uncompressedLength_ = uint64_t(1) << std::tr1::get<0>(tup);
-     codec_ = getCodec(std::tr1::get<1>(tup));
-   }
+    : public testing::TestWithParam<std::tr1::tuple<int, int, CodecType>> {
+ protected:
+  void SetUp() override {
+    auto tup = GetParam();
+    uncompressedLength_ = uint64_t(1) << std::tr1::get<0>(tup);
+    chunks_ = std::tr1::get<1>(tup);
+    codec_ = getCodec(std::tr1::get<2>(tup));
+  }
+
+  void runSimpleIOBufTest(const DataHolder& dh);
 
-   void runSimpleTest(const DataHolder& dh);
+  void runSimpleStringTest(const DataHolder& dh);
 
-   uint64_t uncompressedLength_;
-   std::unique_ptr<Codec> codec_;
+ private:
+  std::unique_ptr<IOBuf> split(std::unique_ptr<IOBuf> data) const;
+
+  uint64_t uncompressedLength_;
+  size_t chunks_;
+  std::unique_ptr<Codec> codec_;
 };
 
-void CompressionTest::runSimpleTest(const DataHolder& dh) {
-  auto original = IOBuf::wrapBuffer(dh.data(uncompressedLength_));
-  auto compressed = codec_->compress(original.get());
+void CompressionTest::runSimpleIOBufTest(const DataHolder& dh) {
+  const auto original = split(IOBuf::wrapBuffer(dh.data(uncompressedLength_)));
+  const auto compressed = split(codec_->compress(original.get()));
   if (!codec_->needsUncompressedLength()) {
     auto uncompressed = codec_->uncompress(compressed.get());
-
     EXPECT_EQ(uncompressedLength_, uncompressed->computeChainDataLength());
     EXPECT_EQ(dh.hash(uncompressedLength_), hashIOBuf(uncompressed.get()));
   }
@@ -164,27 +209,72 @@ void CompressionTest::runSimpleTest(const DataHolder& dh) {
   }
 }
 
+// Round-trips `uncompressedLength_` bytes of `dh` through the std::string
+// compress/uncompress overloads of `codec_`.  Decompression is checked with
+// the explicit length, and additionally without it when the codec does not
+// need the uncompressed length.
+void CompressionTest::runSimpleStringTest(const DataHolder& dh) {
+  const auto bytes = dh.data(uncompressedLength_);
+  const std::string original(
+      reinterpret_cast<const char*>(bytes.data()), uncompressedLength_);
+  const auto packed = codec_->compress(original);
+
+  // Both decompression paths must reproduce the input exactly.
+  const auto verify = [&](const std::string& uncompressed) {
+    EXPECT_EQ(uncompressedLength_, uncompressed.length());
+    EXPECT_EQ(uncompressed, original);
+  };
+
+  if (!codec_->needsUncompressedLength()) {
+    verify(codec_->uncompress(packed));
+  }
+  verify(codec_->uncompress(packed, uncompressedLength_));
+}
+
+// Re-chunks `data`: coalesces it if chained, draws `chunks_ - 1` cut offsets
+// with Random::rand64(size), and appends the pieces between consecutive cuts
+// to an IOBufQueue.  Equal cut offsets produce empty pieces.  Each piece is
+// built with IOBuf::copyBuffer.
+std::unique_ptr<IOBuf> CompressionTest::split(
+    std::unique_ptr<IOBuf> data) const {
+  if (data->isChained()) {
+    data->coalesce();
+  }
+  const size_t size = data->computeChainDataLength();
+
+  // Sorted cut points; a multiset keeps repeated offsets.
+  std::multiset<size_t> cuts;
+  for (size_t remaining = chunks_; remaining > 1; --remaining) {
+    cuts.insert(Random::rand64(size));
+  }
+
+  folly::IOBufQueue pieces;
+  size_t begin = 0;
+  const auto emit = [&](size_t end) {
+    pieces.append(IOBuf::copyBuffer(data->data() + begin, end - begin));
+    begin = end;
+  };
+  for (const size_t cut : cuts) {
+    emit(cut);
+  }
+  // Whatever is left after the last cut.
+  emit(size);
+
+  return pieces.move();
+}
+
 TEST_P(CompressionTest, RandomData) {
-  runSimpleTest(randomDataHolder);
+  runSimpleIOBufTest(randomDataHolder);
 }
 
 TEST_P(CompressionTest, ConstantData) {
-  runSimpleTest(constantDataHolder);
+  runSimpleIOBufTest(constantDataHolder);
+}
+
+TEST_P(CompressionTest, RandomDataString) {
+  runSimpleStringTest(randomDataHolder);
+}
+
+TEST_P(CompressionTest, ConstantDataString) {
+  runSimpleStringTest(constantDataHolder);
 }
 
 INSTANTIATE_TEST_CASE_P(
     CompressionTest,
     CompressionTest,
-    testing::Combine(testing::Values(0, 1, 12, 22, 25, 27),
-                     testing::Values(CodecType::NO_COMPRESSION,
-                                     CodecType::LZ4,
-                                     CodecType::SNAPPY,
-                                     CodecType::ZLIB,
-                                     CodecType::LZ4_VARINT_SIZE,
-                                     CodecType::LZMA2,
-                                     CodecType::LZMA2_VARINT_SIZE,
-                                     CodecType::ZSTD_BETA,
-                                     CodecType::GZIP)));
+    testing::Combine(
+        testing::Values(0, 1, 12, 22, 25, 27),
+        testing::Values(1, 2, 3, 8, 65),
+        testing::ValuesIn(availableCodecs())));
 
 class CompressionVarintTest
     : public testing::TestWithParam<std::tr1::tuple<int, CodecType>> {
@@ -213,7 +303,8 @@ void CompressionVarintTest::runSimpleTest(const DataHolder& dh) {
   auto compressed = codec_->compress(original.get());
   auto breakPoint =
       1UL +
-      Random::rand64(std::max(9UL, oneBasedMsbPos(uncompressedLength_)) / 9UL);
+      Random::rand64(
+          std::max(uint64_t(9), oneBasedMsbPos(uncompressedLength_)) / 9UL);
   auto tinyBuf = IOBuf::copyBuffer(compressed->data(),
                                    std::min(compressed->length(), breakPoint));
   compressed->trimStart(breakPoint);
@@ -226,7 +317,9 @@ void CompressionVarintTest::runSimpleTest(const DataHolder& dh) {
   EXPECT_EQ(dh.hash(uncompressedLength_), hashIOBuf(uncompressed.get()));
 }
 
-TEST_P(CompressionVarintTest, RandomData) { runSimpleTest(randomDataHolder); }
+TEST_P(CompressionVarintTest, RandomData) {
+  runSimpleTest(randomDataHolder);
+}
 
 TEST_P(CompressionVarintTest, ConstantData) {
   runSimpleTest(constantDataHolder);
@@ -235,9 +328,12 @@ TEST_P(CompressionVarintTest, ConstantData) {
 INSTANTIATE_TEST_CASE_P(
     CompressionVarintTest,
     CompressionVarintTest,
-    testing::Combine(testing::Values(0, 1, 12, 22, 25, 27),
-                     testing::Values(CodecType::LZ4_VARINT_SIZE,
-                                     CodecType::LZMA2_VARINT_SIZE)));
+    testing::Combine(
+        testing::Values(0, 1, 12, 22, 25, 27),
+        testing::ValuesIn(supportedCodecs({
+            CodecType::LZ4_VARINT_SIZE,
+            CodecType::LZMA2_VARINT_SIZE,
+            }))));
 
 class CompressionCorruptionTest : public testing::TestWithParam<CodecType> {
  protected:
@@ -291,12 +387,266 @@ TEST_P(CompressionCorruptionTest, ConstantData) {
 INSTANTIATE_TEST_CASE_P(
     CompressionCorruptionTest,
     CompressionCorruptionTest,
-    testing::Values(
+    testing::ValuesIn(
         // NO_COMPRESSION can't detect corruption
         // LZ4 can't detect corruption reliably (sigh)
-        CodecType::SNAPPY,
-        CodecType::ZLIB));
+        supportedCodecs({
+            CodecType::SNAPPY,
+            CodecType::ZLIB,
+            CodecType::LZMA2,
+            CodecType::ZSTD,
+            CodecType::LZ4_FRAME,
+        })));
+
+// Fixture parameterized on a CodecType.  SetUp() creates the codec for the
+// test parameter and a codec from getAutoUncompressionCodec() called with no
+// arguments.
+class AutomaticCodecTest : public testing::TestWithParam<CodecType> {
+ protected:
+  void SetUp() override {
+    codec_ = getCodec(GetParam());
+    auto_ = getAutoUncompressionCodec();
+  }
+
+  // Shared check used by the RandomData and ConstantData tests.
+  void runSimpleTest(const DataHolder& dh);
+
+  // Codec selected by the test parameter.
+  std::unique_ptr<Codec> codec_;
+  // Codec returned by getAutoUncompressionCodec().
+  std::unique_ptr<Codec> auto_;
+};
+
+// Compresses 1000 bytes of `dh` with `codec_` and checks that `auto_`
+// uncompresses the result:
+//  - without a length, when `codec_` does not need the uncompressed length;
+//  - with the exact length;
+//  - with the compressed data split into a two-buffer chain after each of
+//    the offsets 0..7.
+void AutomaticCodecTest::runSimpleTest(const DataHolder& dh) {
+  constexpr uint64_t uncompressedLength = 1000;
+  auto original = IOBuf::wrapBuffer(dh.data(uncompressedLength));
+  auto compressed = codec_->compress(original.get());
+
+  if (!codec_->needsUncompressedLength()) {
+    auto uncompressed = auto_->uncompress(compressed.get());
+    EXPECT_EQ(uncompressedLength, uncompressed->computeChainDataLength());
+    EXPECT_EQ(dh.hash(uncompressedLength), hashIOBuf(uncompressed.get()));
+  }
+  {
+    auto uncompressed = auto_->uncompress(compressed.get(), uncompressedLength);
+    EXPECT_EQ(uncompressedLength, uncompressed->computeChainDataLength());
+    EXPECT_EQ(dh.hash(uncompressedLength), hashIOBuf(uncompressed.get()));
+  }
+
+  // The splitting below relies on length()/trimEnd()/trimStart(), which act
+  // on the head buffer only, while clone() copies the whole chain.  Flatten
+  // the data first so a codec that returns a chained buffer does not end up
+  // with duplicated or misplaced bytes.
+  if (compressed->isChained()) {
+    compressed->coalesce();
+  }
+  // Unsigned literal: the left-hand side is a size_t.
+  ASSERT_GE(compressed->computeChainDataLength(), 8u);
+  for (size_t i = 0; i < 8; ++i) {
+    // `split` keeps the first i bytes, `rest` everything after them.
+    auto split = compressed->clone();
+    auto rest = compressed->clone();
+    split->trimEnd(split->length() - i);
+    rest->trimStart(i);
+    split->appendChain(std::move(rest));
+    auto uncompressed = auto_->uncompress(split.get(), uncompressedLength);
+    EXPECT_EQ(uncompressedLength, uncompressed->computeChainDataLength());
+    EXPECT_EQ(dh.hash(uncompressedLength), hashIOBuf(uncompressed.get()));
+  }
+}
+
+// Runs the shared auto-uncompression check on randomDataHolder.
+TEST_P(AutomaticCodecTest, RandomData) {
+  runSimpleTest(randomDataHolder);
+}
+
+// Runs the shared auto-uncompression check on constantDataHolder.
+TEST_P(AutomaticCodecTest, ConstantData) {
+  runSimpleTest(constantDataHolder);
+}
+
+// Every prefix advertised by the parameterized codec must be non-empty and
+// must be accepted by canUncompress() of both that codec and the auto codec.
+TEST_P(AutomaticCodecTest, ValidPrefixes) {
+  for (const auto& prefix : codec_->validPrefixes()) {
+    EXPECT_FALSE(prefix.empty());
+    // Copy the prefix with 8 bytes of tailroom and grow the buffer by those 8
+    // bytes, so the data is at least 8 bytes long (needed for LZMA2).
+    // `canUncompress()` should ignore the bytes after the prefix.
+    IOBuf data{IOBuf::COPY_BUFFER, prefix, 0, 8};
+    data.append(8);
+    EXPECT_TRUE(codec_->canUncompress(&data));
+    EXPECT_TRUE(auto_->canUncompress(&data));
+  }
+}
+
+// If the parameterized codec needs the uncompressed length, the auto codec
+// must report that it needs it too.  Nothing is checked otherwise.
+TEST_P(AutomaticCodecTest, NeedsUncompressedLength) {
+  if (!codec_->needsUncompressedLength()) {
+    return;
+  }
+  EXPECT_TRUE(auto_->needsUncompressedLength());
+}
+
+// The auto codec's maxUncompressedLength() must be at least as large as the
+// parameterized codec's.
+TEST_P(AutomaticCodecTest, maxUncompressedLength) {
+  EXPECT_LE(codec_->maxUncompressedLength(), auto_->maxUncompressedLength());
+}
+
+// An auto codec built from an explicit list containing only ZSTD must still
+// uncompress data produced by the parameterized codec.
+TEST_P(AutomaticCodecTest, DefaultCodec) {
+  const uint64_t length = 42;
+
+  std::vector<std::unique_ptr<Codec>> customCodecs;
+  customCodecs.push_back(getCodec(CodecType::ZSTD));
+  const auto automatic = getAutoUncompressionCodec(std::move(customCodecs));
+
+  const auto source = IOBuf::wrapBuffer(constantDataHolder.data(length));
+  const auto packed = codec_->compress(source.get());
+  const auto decompressed = automatic->uncompress(packed.get());
+
+  EXPECT_EQ(constantDataHolder.hash(length), hashIOBuf(decompressed.get()));
+}
+
+namespace {
+// Test-only codec of type USER_DEFINED that wraps another codec: compressed
+// output is `prefix_` followed by the wrapped codec's output, and data is
+// considered uncompressible by this codec iff it starts with `prefix_`.
+class CustomCodec : public Codec {
+ public:
+  // Convenience factory returning the codec through the base-class pointer.
+  static std::unique_ptr<Codec> create(std::string prefix, CodecType type) {
+    return make_unique<CustomCodec>(std::move(prefix), type);
+  }
+  explicit CustomCodec(std::string prefix, CodecType type)
+      : Codec(CodecType::USER_DEFINED),
+        prefix_(std::move(prefix)),
+        codec_(getCodec(type)) {}
+
+ private:
+  // The single prefix this codec was constructed with.
+  std::vector<std::string> validPrefixes() const override {
+    return std::vector<std::string>{prefix_};
+  }
+
+  // True iff the coalesced data is at least as long as `prefix_` and starts
+  // with it.  The uncompressed length argument is ignored.
+  bool canUncompress(const IOBuf* data, uint64_t) const override {
+    const auto flat = data->cloneCoalescedAsValue();
+    return flat.length() >= prefix_.size() &&
+        memcmp(flat.data(), prefix_.data(), prefix_.size()) == 0;
+  }
+
+  // Emits `prefix_` followed by the wrapped codec's compressed output.
+  std::unique_ptr<IOBuf> doCompress(const IOBuf* data) override {
+    auto result = IOBuf::copyBuffer(prefix_);
+    result->appendChain(codec_->compress(data));
+    EXPECT_TRUE(canUncompress(result.get(), data->computeChainDataLength()));
+    return result;
+  }
 
+  // Strips `prefix_` from a coalesced copy of `data` and hands the rest to
+  // the wrapped codec.
+  std::unique_ptr<IOBuf> doUncompress(
+      const IOBuf* data,
+      uint64_t uncompressedLength) override {
+    EXPECT_TRUE(canUncompress(data, uncompressedLength));
+    auto payload = data->cloneCoalescedAsValue();
+    payload.trimStart(prefix_.size());
+    return codec_->uncompress(&payload, uncompressedLength);
+  }
+
+  std::string prefix_;
+  std::unique_ptr<Codec> codec_;
+};
+}
+
+// An auto codec extended with a CustomCodec("ab", ZSTD) must handle both data
+// from that custom codec and data from the parameterized codec, while the
+// default auto codec (auto_) must not accept the custom codec's output.
+TEST_P(AutomaticCodecTest, CustomCodec) {
+  const uint64_t length = 42;
+  // Two identically configured instances: one to compress with, one that is
+  // moved into the auto codec.
+  auto prefixed = CustomCodec::create("ab", CodecType::ZSTD);
+  std::vector<std::unique_ptr<Codec>> extras;
+  extras.push_back(CustomCodec::create("ab", CodecType::ZSTD));
+  auto automatic = getAutoUncompressionCodec(std::move(extras));
+  auto source = IOBuf::wrapBuffer(constantDataHolder.data(length));
+
+  // Output of the custom codec.
+  auto abCompressed = prefixed->compress(source.get());
+  auto abDecompressed = automatic->uncompress(abCompressed.get());
+  EXPECT_TRUE(automatic->canUncompress(abCompressed.get()));
+  EXPECT_FALSE(auto_->canUncompress(abCompressed.get()));
+  EXPECT_EQ(constantDataHolder.hash(length), hashIOBuf(abDecompressed.get()));
+
+  // Output of the parameterized codec.
+  auto builtin = codec_->compress(source.get());
+  auto decompressed = automatic->uncompress(builtin.get());
+  EXPECT_EQ(constantDataHolder.hash(length), hashIOBuf(decompressed.get()));
+}
+
+// Same as the CustomCodec test, but the auto codec is built from a
+// CustomCodec("none", NO_COMPRESSION) plus an explicit LZ4_FRAME codec.
+TEST_P(AutomaticCodecTest, CustomDefaultCodec) {
+  const uint64_t length = 42;
+  // One instance to compress with; an identically configured one is moved
+  // into the auto codec.
+  auto plain = CustomCodec::create("none", CodecType::NO_COMPRESSION);
+  std::vector<std::unique_ptr<Codec>> extras;
+  extras.push_back(CustomCodec::create("none", CodecType::NO_COMPRESSION));
+  extras.push_back(getCodec(CodecType::LZ4_FRAME));
+  auto automatic = getAutoUncompressionCodec(std::move(extras));
+  auto source = IOBuf::wrapBuffer(constantDataHolder.data(length));
+
+  // Output of the custom codec.
+  auto noneCompressed = plain->compress(source.get());
+  auto noneDecompressed = automatic->uncompress(noneCompressed.get());
+  EXPECT_TRUE(automatic->canUncompress(noneCompressed.get()));
+  EXPECT_FALSE(auto_->canUncompress(noneCompressed.get()));
+  EXPECT_EQ(constantDataHolder.hash(length), hashIOBuf(noneDecompressed.get()));
+
+  // Output of the parameterized codec.
+  auto builtin = codec_->compress(source.get());
+  auto decompressed = automatic->uncompress(builtin.get());
+  EXPECT_EQ(constantDataHolder.hash(length), hashIOBuf(decompressed.get()));
+}
+
+TEST_P(AutomaticCodecTest, canUncompressOneBytes) {
+  // No default codec can uncompress a single byte, whether the uncompressed
+  // length is given as 1 or as unknown.
+  IOBuf buf{IOBuf::CREATE, 1};
+  buf.append(1);
+  EXPECT_FALSE(codec_->canUncompress(&buf, 1));
+  EXPECT_FALSE(codec_->canUncompress(&buf, Codec::UNKNOWN_UNCOMPRESSED_LENGTH));
+  EXPECT_FALSE(auto_->canUncompress(&buf, 1));
+  EXPECT_FALSE(auto_->canUncompress(&buf, Codec::UNKNOWN_UNCOMPRESSED_LENGTH));
+}
+
+// Instantiate only for the listed codecs that are compiled in, like the other
+// instantiations in this file that go through supportedCodecs().  With a
+// fixed testing::Values() list, SetUp() would call getCodec() for codecs
+// that hasCodec() reports as unavailable.
+INSTANTIATE_TEST_CASE_P(
+    AutomaticCodecTest,
+    AutomaticCodecTest,
+    testing::ValuesIn(supportedCodecs({
+        CodecType::LZ4_FRAME,
+        CodecType::ZSTD,
+        CodecType::ZLIB,
+        CodecType::GZIP,
+        CodecType::LZMA2,
+    })));
+
+// The prefix of a custom codec passed to getAutoUncompressionCodec() must
+// show up in the resulting codec's validPrefixes().
+TEST(ValidPrefixesTest, CustomCodec) {
+  std::vector<std::unique_ptr<Codec>> extras;
+  extras.push_back(CustomCodec::create("none", CodecType::NO_COMPRESSION));
+  const auto automatic = getAutoUncompressionCodec(std::move(extras));
+
+  const auto prefixes = automatic->validPrefixes();
+  const auto it = std::find(prefixes.begin(), prefixes.end(), "none");
+  EXPECT_TRUE(it != prefixes.end());
+}
+
+// Expects `statement` to throw `expected_exception` when kIsDebug is true,
+// and to throw nothing otherwise.  Wrapped in do/while(false) so the macro
+// expands to a single statement.
+#define EXPECT_THROW_IF_DEBUG(statement, expected_exception) \
+  do {                                                       \
+    if (kIsDebug) {                                          \
+      EXPECT_THROW((statement), expected_exception);         \
+    } else {                                                 \
+      EXPECT_NO_THROW((statement));                          \
+    }                                                        \
+  } while (false)
+
+// Two custom codecs where the second prefix ("ab") is a prefix of the first
+// ("abc"): std::invalid_argument is expected when kIsDebug is true.
+TEST(CheckCompatibleTest, SimplePrefixSecond) {
+  std::vector<std::unique_ptr<Codec>> codecs;
+  codecs.push_back(CustomCodec::create("abc", CodecType::NO_COMPRESSION));
+  codecs.push_back(CustomCodec::create("ab", CodecType::NO_COMPRESSION));
+  EXPECT_THROW_IF_DEBUG(
+      getAutoUncompressionCodec(std::move(codecs)), std::invalid_argument);
+}
+
+// Two custom codecs where the first prefix ("ab") is a prefix of the second
+// ("abc"): std::invalid_argument is expected when kIsDebug is true.
+TEST(CheckCompatibleTest, SimplePrefixFirst) {
+  std::vector<std::unique_ptr<Codec>> codecs;
+  codecs.push_back(CustomCodec::create("ab", CodecType::NO_COMPRESSION));
+  codecs.push_back(CustomCodec::create("abc", CodecType::NO_COMPRESSION));
+  EXPECT_THROW_IF_DEBUG(
+      getAutoUncompressionCodec(std::move(codecs)), std::invalid_argument);
+}
+
+// A custom codec with an empty prefix: std::invalid_argument is expected
+// when kIsDebug is true.
+TEST(CheckCompatibleTest, Empty) {
+  std::vector<std::unique_ptr<Codec>> codecs;
+  codecs.push_back(CustomCodec::create("", CodecType::NO_COMPRESSION));
+  EXPECT_THROW_IF_DEBUG(
+      getAutoUncompressionCodec(std::move(codecs)), std::invalid_argument);
+}
+
+// A custom codec with the three-byte prefix 28 B5 2F: std::invalid_argument
+// is expected when kIsDebug is true.
+// NOTE(review): presumably these are the leading bytes of the built-in ZSTD
+// codec's prefix -- confirm against its validPrefixes().
+TEST(CheckCompatibleTest, ZstdPrefix) {
+  std::vector<std::unique_ptr<Codec>> codecs;
+  codecs.push_back(CustomCodec::create("\x28\xB5\x2F", CodecType::ZSTD));
+  EXPECT_THROW_IF_DEBUG(
+      getAutoUncompressionCodec(std::move(codecs)), std::invalid_argument);
+}
+
+// A custom codec with the four-byte prefix 28 B5 2F FD: std::invalid_argument
+// is expected when kIsDebug is true.
+// NOTE(review): presumably this duplicates the built-in ZSTD codec's prefix
+// -- confirm against its validPrefixes().
+TEST(CheckCompatibleTest, ZstdDuplicate) {
+  std::vector<std::unique_ptr<Codec>> codecs;
+  codecs.push_back(CustomCodec::create("\x28\xB5\x2F\xFD", CodecType::ZSTD));
+  EXPECT_THROW_IF_DEBUG(
+      getAutoUncompressionCodec(std::move(codecs)), std::invalid_argument);
+}
+
+// A custom codec whose prefix starts with the bytes 18 76 followed by
+// "zzasdf": std::invalid_argument is expected when kIsDebug is true.
+// NOTE(review): presumably 18 76 is one of the built-in ZLIB codec's
+// prefixes, making it a prefix of this one -- confirm against its
+// validPrefixes().
+TEST(CheckCompatibleTest, ZlibIsPrefix) {
+  std::vector<std::unique_ptr<Codec>> codecs;
+  codecs.push_back(CustomCodec::create("\x18\x76zzasdf", CodecType::ZSTD));
+  EXPECT_THROW_IF_DEBUG(
+      getAutoUncompressionCodec(std::move(codecs)), std::invalid_argument);
+}
 }}}  // namespaces
 
 int main(int argc, char *argv[]) {