X-Git-Url: http://plrg.eecs.uci.edu/git/?p=folly.git;a=blobdiff_plain;f=folly%2Fio%2FCompression.h;h=0a2423bf7d115c305d2ca463a687c5b6e19f0237;hp=3d46d4c5a0b2e91ddea14902769b4986ce6d7a73;hb=b367f0fada0a53564f83e5072bbd45994a1c0795;hpb=c78f92a01dfdd0cc4eff81e90dd9e0832fd77bfc diff --git a/folly/io/Compression.h b/folly/io/Compression.h index 3d46d4c5..0a2423bf 100644 --- a/folly/io/Compression.h +++ b/folly/io/Compression.h @@ -1,5 +1,5 @@ /* - * Copyright 2016 Facebook, Inc. + * Copyright 2017 Facebook, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,7 +19,11 @@ #include #include #include +#include +#include +#include +#include #include /** @@ -84,13 +88,26 @@ enum class CodecType { */ GZIP = 9, - NUM_CODEC_TYPES = 10, + /** + * Use LZ4 frame compression. + * Levels supported: 0 = fast, 16 = best; default = 0 + */ + LZ4_FRAME = 10, + + /** + * Use bzip2 compression. + * Levels supported: 1 = fast, 9 = best; default = 9 + */ + BZIP2 = 11, + + NUM_CODEC_TYPES = 12, }; class Codec { public: virtual ~Codec() { } + static constexpr uint64_t UNLIMITED_UNCOMPRESSED_LENGTH = uint64_t(-1); /** * Return the maximum length of data that may be compressed with this codec. * NO_COMPRESSION and ZLIB support arbitrary lengths; @@ -119,6 +136,13 @@ class Codec { */ std::unique_ptr compress(const folly::IOBuf* data); + /** + * Compresses data. May involve additional copies compared to the overload + * that takes and returns IOBufs. Has the same error semantics as the IOBuf + * version. + */ + std::string compress(StringPiece data); + /** * Uncompress data. Throws std::runtime_error on decompression error. * @@ -131,28 +155,242 @@ class Codec { * Regardless of the behavior of the underlying compressor, uncompressing * an empty IOBuf chain will return an empty IOBuf chain. 
*/ - static constexpr uint64_t UNKNOWN_UNCOMPRESSED_LENGTH = uint64_t(-1); - static constexpr uint64_t UNLIMITED_UNCOMPRESSED_LENGTH = uint64_t(-2); - std::unique_ptr uncompress( const IOBuf* data, - uint64_t uncompressedLength = UNKNOWN_UNCOMPRESSED_LENGTH); + folly::Optional uncompressedLength = folly::none); + + /** + * Uncompresses data. May involve additional copies compared to the overload + * that takes and returns IOBufs. Has the same error semantics as the IOBuf + * version. + */ + std::string uncompress( + StringPiece data, + folly::Optional uncompressedLength = folly::none); + + /** + * Returns a bound on the maximum compressed length when compressing data with + * the given uncompressed length. + */ + uint64_t maxCompressedLength(uint64_t uncompressedLength) const; + + /** + * Extracts the uncompressed length from the compressed data if possible. + * If the codec doesn't store the uncompressed length, or the data is + * corrupted it returns the given uncompressedLength. + * If the uncompressed length is stored in the compressed data and + * uncompressedLength is not none and they do not match a std::runtime_error + * is thrown. + */ + folly::Optional getUncompressedLength( + const folly::IOBuf* data, + folly::Optional uncompressedLength = folly::none) const; protected: explicit Codec(CodecType type); + public: + /** + * Returns a superset of the set of prefixes for which canUncompress() will + * return true. A superset is allowed for optimizations in canUncompress() + * based on other knowledge such as length. None of the prefixes may be empty. + * default: No prefixes. + */ + virtual std::vector validPrefixes() const; + + /** + * Returns true if the codec thinks it can uncompress the data. + * If a codec doesn't have magic bytes at the beginning, like LZ4 and Snappy, + * it can always return false. + * default: Returns false. 
+ */ + virtual bool canUncompress( + const folly::IOBuf* data, + folly::Optional uncompressedLength = folly::none) const; + private: // default: no limits (save for special value UNKNOWN_UNCOMPRESSED_LENGTH) virtual uint64_t doMaxUncompressedLength() const; // default: doesn't need uncompressed length virtual bool doNeedsUncompressedLength() const; virtual std::unique_ptr doCompress(const folly::IOBuf* data) = 0; - virtual std::unique_ptr doUncompress(const folly::IOBuf* data, - uint64_t uncompressedLength) = 0; + virtual std::unique_ptr doUncompress( + const folly::IOBuf* data, + folly::Optional uncompressedLength) = 0; + // default: an implementation is provided by default to wrap the strings into + // IOBufs and delegate to the IOBuf methods. This incurs a copy of the output + // from IOBuf to string. Implementers, at their discretion, can override + // these methods to avoid the copy. + virtual std::string doCompressString(StringPiece data); + virtual std::string doUncompressString( + StringPiece data, + folly::Optional uncompressedLength); + + virtual uint64_t doMaxCompressedLength(uint64_t uncompressedLength) const = 0; + // default: returns the passed uncompressedLength. + virtual folly::Optional doGetUncompressedLength( + const folly::IOBuf* data, + folly::Optional uncompressedLength) const; CodecType type_; }; +class StreamCodec : public Codec { + public: + ~StreamCodec() override {} + + /** + * Does the codec need the data length before compression streaming? + */ + bool needsDataLength() const; + + /***************************************************************************** + * Streaming API + ***************************************************************************** + * A low-level stateful streaming API. + * Streaming operations can be started in two ways: + * 1. From a clean Codec on which no non-const methods have been called. + * 2. A call to resetStream(), which will reset any codec to a clean state. 
+ * After a streaming operation has begun, either compressStream() or + * uncompressStream() must be called until the streaming operation ends. + * compressStream() ends when it returns true with flushOp END. + * uncompressStream() ends when it returns true. At this point the codec + * may be reused by calling resetStream(). + * + * compress() and uncompress() can be called at any time, but they interrupt + * any ongoing streaming operations (state is lost and resetStream() must be + * called before another streaming operation). + */ + + /** + * Reset the state of the codec, and set the uncompressed length for the next + * streaming operation. If uncompressedLength is not none it must be exactly + * the uncompressed length. compressStream() must be passed exactly + * uncompressedLength input bytes before the stream is ended. + * uncompressStream() must be passed a compressed frame that uncompresses to + * uncompressedLength. + */ + void resetStream(folly::Optional uncompressedLength = folly::none); + + enum class FlushOp { NONE, FLUSH, END }; + + /** + * Compresses some data from the input buffer and writes the compressed data + * into the output buffer. It may read input without producing any output, + * except when forced to flush. + * + * The input buffer is advanced to point to the range of data that hasn't yet + * been read. Compression will resume at this point for the next call to + * compressStream(). The output buffer is advanced one byte past the last byte + * written. + * + * The default flushOp is NONE, which allows compressStream() complete + * discretion in how much data to gather before writing any output. + * + * If flushOp is END, all pending and input data is flushed to the output + * buffer, and the frame is ended. compressStream() must be called with the + * same input and flushOp END until it returns true. At this point the caller + * must call resetStream() to use the codec again. 
+ * + * If flushOp is FLUSH, all pending and input data is flushed to the output + * buffer, but the frame is not ended. compressStream() must be called with + * the same input and flushOp FLUSH until it returns true. At this point the + * caller can continue to compressStream() with any input data and flushOp. + * The uncompressor, if passed all the produced output data, will be able to + * uncompress all the input data passed to compressStream() so far. Excessive + * use of flushOp FLUSH will deteriorate compression ratio. This is useful for + * stateful streaming across a network. Most users don't need to use this + * flushOp. + * + * A std::logic_error is thrown on incorrect usage of the API. + * A std::runtime_error is thrown upon error conditions. + */ + bool compressStream( + folly::ByteRange& input, + folly::MutableByteRange& output, + FlushOp flushOp = StreamCodec::FlushOp::NONE); + + /** + * Uncompresses some data from the input buffer and writes the uncompressed + * data into the output buffer. It may read input without producing any + * output. + * + * The input buffer is advanced to point to the range of data that hasn't yet + * been read. Uncompression will resume at this point for the next call to + * uncompressStream(). The output buffer is advanced one byte past the last + * byte written. + * + * The default flushOp is NONE, which allows uncompressStream() complete + * discretion in how much output data to flush. The uncompressor may not make + * maximum forward progress, but will make some forward progress when + * possible. + * + * If flushOp is END, the caller guarantees that no more input will be + * presented to uncompressStream(). uncompressStream() must be called with the + * same input and flushOp END until it returns true. This is not mandatory, + * but if the input is all available in one buffer, and there is enough output + * space to write the entire frame, codecs can uncompress faster. 
+ * + * If flushOp is FLUSH, uncompressStream() is guaranteed to make the maximum + * amount of forward progress possible. When using this flushOp and + * uncompressStream() returns with `!output.empty()` the caller knows that all + * pending output has been flushed. This is useful for stateful streaming + * across a network, and it should be used in conjunction with + * compressStream() with flushOp FLUSH. Most users don't need to use this + * flushOp. + * + * Returns true at the end of a frame. At this point resetStream() must be + * called to reuse the codec. + */ + bool uncompressStream( + folly::ByteRange& input, + folly::MutableByteRange& output, + FlushOp flushOp = StreamCodec::FlushOp::NONE); + + protected: + explicit StreamCodec(CodecType type) : Codec(type) {} + + // Returns the uncompressed length last passed to resetStream() or none if it + // hasn't been called yet. + folly::Optional uncompressedLength() const { + return uncompressedLength_; + } + + private: + // default: Implemented using the streaming API. 
+ std::unique_ptr doCompress(const folly::IOBuf* data) override; + std::unique_ptr doUncompress( + const folly::IOBuf* data, + folly::Optional uncompressedLength) override; + + // default: Returns false + virtual bool doNeedsDataLength() const; + virtual void doResetStream() = 0; + virtual bool doCompressStream( + folly::ByteRange& input, + folly::MutableByteRange& output, + FlushOp flushOp) = 0; + virtual bool doUncompressStream( + folly::ByteRange& input, + folly::MutableByteRange& output, + FlushOp flushOp) = 0; + + enum class State { + RESET, + COMPRESS, + COMPRESS_FLUSH, + COMPRESS_END, + UNCOMPRESS, + END, + }; + void assertStateIs(State expected) const; + + CodecType type_; + State state_{State::RESET}; + ByteRange previousInput_{}; + folly::Optional uncompressedLength_{}; +}; + constexpr int COMPRESSION_LEVEL_FASTEST = -1; constexpr int COMPRESSION_LEVEL_DEFAULT = -2; constexpr int COMPRESSION_LEVEL_BEST = -3; @@ -173,7 +411,59 @@ constexpr int COMPRESSION_LEVEL_BEST = -3; * decompress all data compressed with a codec of the same type, regardless * of compression level. */ -std::unique_ptr getCodec(CodecType type, - int level = COMPRESSION_LEVEL_DEFAULT); +std::unique_ptr getCodec( + CodecType type, + int level = COMPRESSION_LEVEL_DEFAULT); -}} // namespaces +/** + * Return a codec for the given type. Throws on error. The level + * is a non-negative codec-dependent integer indicating the level of + * compression desired, or one of the following constants: + * + * COMPRESSION_LEVEL_FASTEST is fastest (uses least CPU / memory, + * worst compression) + * COMPRESSION_LEVEL_DEFAULT is the default (likely a tradeoff between + * FASTEST and BEST) + * COMPRESSION_LEVEL_BEST is the best compression (uses most CPU / memory, + * best compression) + * + * When decompressing, the compression level is ignored. All codecs will + * decompress all data compressed with a codec of the same type, regardless + * of compression level. 
+ */ +std::unique_ptr getStreamCodec( + CodecType type, + int level = COMPRESSION_LEVEL_DEFAULT); + +/** + * Returns a codec that can uncompress any of the given codec types as well as + * {LZ4_FRAME, ZSTD, ZLIB, GZIP, LZMA2, BZIP2}. Appends each default codec to + * customCodecs in order, so long as a codec with the same type() isn't already + * present. When uncompress() is called, each codec's canUncompress() is called + * in the order that they are given. Appended default codecs are checked last. + * uncompress() is called on the first codec whose canUncompress() returns true. + * An exception is thrown if no codec canUncompress() the data. + * An exception is thrown if the chosen codec's uncompress() throws on the data. + * An exception is thrown if compress() is called on the returned codec. + * + * Requirements are checked in debug mode and are as follows: + * Let headers be the concatenation of every codec's validPrefixes(). + * 1. Each codec must override validPrefixes() and canUncompress(). + * 2. No codec's validPrefixes() may be empty. + * 3. No header in headers may be empty. + * 4. headers must not contain any duplicate elements. + * 5. No strict non-empty prefix of any header in headers may be in headers. + */ +std::unique_ptr getAutoUncompressionCodec( + std::vector> customCodecs = {}); + +/** + * Check if a specified codec is supported. + */ +bool hasCodec(CodecType type); + +/** + * Check if a specified codec is supported and supports streaming. + */ +bool hasStreamCodec(CodecType type); +}} // namespaces