2 * Copyright 2017 Facebook, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
25 #include <folly/Optional.h>
26 #include <folly/Range.h>
27 #include <folly/io/IOBuf.h>
30 * Compression / decompression over IOBufs
36 enum class CodecType {
38 * This codec type is not defined; getCodec() will throw an exception
39 * if used. Useful if deriving your own classes from Codec without
40 * going through the getCodec() interface.
51 * Use LZ4 compression.
52 * Levels supported: 1 = fast, 2 = best; default = 1
57 * Use Snappy compression.
63 * Use zlib compression.
64 * Levels supported: 0 = no compression, 1 = fast, ..., 9 = best; default = 6
69 * Use LZ4 compression, prefixed with size (as Varint).
74 * Use LZMA2 compression.
75 * Levels supported: 0 = no compression, 1 = fast, ..., 9 = best; default = 6
78 LZMA2_VARINT_SIZE = 7,
81 * Use ZSTD compression.
86 * Use gzip compression. This is the same compression algorithm as ZLIB but
87 * gzip-compressed files tend to be easier to work with from the command line.
88 * Levels supported: 0 = no compression, 1 = fast, ..., 9 = best; default = 6
93 * Use LZ4 frame compression.
94 * Levels supported: 0 = fast, 16 = best; default = 0
99 * Use bzip2 compression.
100 * Levels supported: 1 = fast, 9 = best; default = 9
104 NUM_CODEC_TYPES = 12,
// Sentinel meaning "no limit": maxUncompressedLength() may return this value
// when the codec accepts input of any length. uint64_t(-1) is the largest
// value representable in a uint64_t.
111 static constexpr uint64_t UNLIMITED_UNCOMPRESSED_LENGTH = uint64_t(-1);
113 * Return the maximum length of data that may be compressed with this codec.
114 * NO_COMPRESSION and ZLIB support arbitrary lengths;
115 * LZ4 supports up to 1.9GiB; SNAPPY supports up to 4GiB.
116 * May return UNLIMITED_UNCOMPRESSED_LENGTH if unlimited.
118 uint64_t maxUncompressedLength() const;
121 * Return the codec's type.
123 CodecType type() const { return type_; }
126 * Does this codec need the exact uncompressed length on decompression?
128 bool needsUncompressedLength() const;
131 * Compress data, returning an IOBuf (which may share storage with data).
132 * Throws std::invalid_argument if data is larger than
133 * maxUncompressedLength().
135 std::unique_ptr<IOBuf> compress(const folly::IOBuf* data);
138 * Compresses data. May involve additional copies compared to the overload
139 * that takes and returns IOBufs. Has the same error semantics as the IOBuf
142 std::string compress(StringPiece data);
145 * Uncompress data. Throws std::runtime_error on decompression error.
147 * Some codecs (LZ4) require the exact uncompressed length; this is indicated
148 * by needsUncompressedLength().
150 * For other codecs (zlib), knowing the exact uncompressed length ahead of
151 * time might be faster.
153 * Regardless of the behavior of the underlying compressor, uncompressing
154 * an empty IOBuf chain will return an empty IOBuf chain.
156 std::unique_ptr<IOBuf> uncompress(
158 folly::Optional<uint64_t> uncompressedLength = folly::none);
161 * Uncompresses data. May involve additional copies compared to the overload
162 * that takes and returns IOBufs. Has the same error semantics as the IOBuf
165 std::string uncompress(
167 folly::Optional<uint64_t> uncompressedLength = folly::none);
170 * Returns a bound on the maximum compressed length when compressing data with
171 * the given uncompressed length.
173 uint64_t maxCompressedLength(uint64_t uncompressedLength) const;
176 * Extracts the uncompressed length from the compressed data if possible.
177 * If the codec doesn't store the uncompressed length, or the data is
178 * corrupted it returns the given uncompressedLength.
179 * If the uncompressed length is stored in the compressed data and
180 * uncompressedLength is not none and they do not match a std::runtime_error
183 folly::Optional<uint64_t> getUncompressedLength(
184 const folly::IOBuf* data,
185 folly::Optional<uint64_t> uncompressedLength = folly::none) const;
188 explicit Codec(CodecType type);
192 * Returns a superset of the set of prefixes for which canUncompress() will
193 * return true. A superset is allowed for optimizations in canUncompress()
194 * based on other knowledge such as length. None of the prefixes may be empty.
195 * default: No prefixes.
197 virtual std::vector<std::string> validPrefixes() const;
200 * Returns true if the codec thinks it can uncompress the data.
201 * If a codec doesn't have magic bytes at the beginning, like LZ4 and Snappy,
202 * it can always return false.
203 * default: Returns false.
205 virtual bool canUncompress(
206 const folly::IOBuf* data,
207 folly::Optional<uint64_t> uncompressedLength = folly::none) const;
210 // default: no limits (returns the special value UNLIMITED_UNCOMPRESSED_LENGTH)
211 virtual uint64_t doMaxUncompressedLength() const;
212 // default: doesn't need uncompressed length
213 virtual bool doNeedsUncompressedLength() const;
214 virtual std::unique_ptr<IOBuf> doCompress(const folly::IOBuf* data) = 0;
215 virtual std::unique_ptr<IOBuf> doUncompress(
216 const folly::IOBuf* data,
217 folly::Optional<uint64_t> uncompressedLength) = 0;
218 // default: an implementation is provided by default to wrap the strings into
219 // IOBufs and delegate to the IOBuf methods. This incurs a copy of the output
220 // from IOBuf to string. Implementers, at their discretion, can override
221 // these methods to avoid the copy.
222 virtual std::string doCompressString(StringPiece data);
223 virtual std::string doUncompressString(
225 folly::Optional<uint64_t> uncompressedLength);
227 virtual uint64_t doMaxCompressedLength(uint64_t uncompressedLength) const = 0;
228 // default: returns the passed uncompressedLength.
229 virtual folly::Optional<uint64_t> doGetUncompressedLength(
230 const folly::IOBuf* data,
231 folly::Optional<uint64_t> uncompressedLength) const;
236 class StreamCodec : public Codec {
238 ~StreamCodec() override {}
241 * Does the codec need the data length before compression streaming?
243 bool needsDataLength() const;
245 /*****************************************************************************
247 *****************************************************************************
248 * A low-level stateful streaming API.
249 * Streaming operations can be started in two ways:
250 * 1. From a clean Codec on which no non-const methods have been called.
251 * 2. A call to resetStream(), which will reset any codec to a clean state.
252 * After a streaming operation has begun, either compressStream() or
253 * uncompressStream() must be called until the streaming operation ends.
254 * compressStream() ends when it returns true with flushOp END.
255 * uncompressStream() ends when it returns true. At this point the codec
256 * may be reused by calling resetStream().
258 * compress() and uncompress() can be called at any time, but they interrupt
259 * any ongoing streaming operations (state is lost and resetStream() must be
260 * called before another streaming operation).
264 * Reset the state of the codec, and set the uncompressed length for the next
265 * streaming operation. If uncompressedLength is not none it must be exactly
266 * the uncompressed length. compressStream() must be passed exactly
267 * uncompressedLength input bytes before the stream is ended.
268 * uncompressStream() must be passed a compressed frame that uncompresses to
269 * uncompressedLength.
271 void resetStream(folly::Optional<uint64_t> uncompressedLength = folly::none);
273 enum class FlushOp { NONE, FLUSH, END };
276 * Compresses some data from the input buffer and writes the compressed data
277 * into the output buffer. It may read input without producing any output,
278 * except when forced to flush.
280 * The input buffer is advanced to point to the range of data that hasn't yet
281 * been read. Compression will resume at this point for the next call to
282 * compressStream(). The output buffer is advanced one byte past the last byte
285 * The default flushOp is NONE, which allows compressStream() complete
286 * discretion in how much data to gather before writing any output.
288 * If flushOp is END, all pending and input data is flushed to the output
289 * buffer, and the frame is ended. compressStream() must be called with the
290 * same input and flushOp END until it returns true. At this point the caller
291 * must call resetStream() to use the codec again.
293 * If flushOp is FLUSH, all pending and input data is flushed to the output
294 * buffer, but the frame is not ended. compressStream() must be called with
295 * the same input and flushOp FLUSH until it returns true. At this point the
296 * caller can continue to compressStream() with any input data and flushOp.
297 * The uncompressor, if passed all the produced output data, will be able to
298 * uncompress all the input data passed to compressStream() so far. Excessive
299 * use of flushOp FLUSH will deteriorate compression ratio. This is useful for
300 * stateful streaming across a network. Most users don't need to use this
303 * A std::logic_error is thrown on incorrect usage of the API.
304 * A std::runtime_error is thrown upon error conditions or if no forward
305 * progress could be made twice in a row.
308 folly::ByteRange& input,
309 folly::MutableByteRange& output,
310 FlushOp flushOp = StreamCodec::FlushOp::NONE);
313 * Uncompresses some data from the input buffer and writes the uncompressed
314 * data into the output buffer. It may read input without producing any
317 * The input buffer is advanced to point to the range of data that hasn't yet
318 * been read. Uncompression will resume at this point for the next call to
319 * uncompressStream(). The output buffer is advanced one byte past the last
322 * The default flushOp is NONE, which allows uncompressStream() complete
323 * discretion in how much output data to flush. The uncompressor may not make
324 * maximum forward progress, but will make some forward progress when
327 * If flushOp is END, the caller guarantees that no more input will be
328 * presented to uncompressStream(). uncompressStream() must be called with the
329 * same input and flushOp END until it returns true. This is not mandatory,
330 * but if the input is all available in one buffer, and there is enough output
331 * space to write the entire frame, codecs can uncompress faster.
333 * If flushOp is FLUSH, uncompressStream() is guaranteed to make the maximum
334 * amount of forward progress possible. When using this flushOp and
335 * uncompressStream() returns with `!output.empty()` the caller knows that all
336 * pending output has been flushed. This is useful for stateful streaming
337 * across a network, and it should be used in conjunction with
338 * compressStream() with flushOp FLUSH. Most users don't need to use this
341 * A std::runtime_error is thrown upon error conditions or if no forward
342 * progress could be made upon two consecutive calls to the function (only the
343 * second call will throw an exception).
345 * Returns true at the end of a frame. At this point resetStream() must be
346 * called to reuse the codec.
348 bool uncompressStream(
349 folly::ByteRange& input,
350 folly::MutableByteRange& output,
351 FlushOp flushOp = StreamCodec::FlushOp::NONE);
354 explicit StreamCodec(CodecType type) : Codec(type) {}
356 // Returns the uncompressed length last passed to resetStream() or none if it
357 // hasn't been called yet.
358 folly::Optional<uint64_t> uncompressedLength() const {
359 return uncompressedLength_;
363 // default: Implemented using the streaming API.
364 std::unique_ptr<IOBuf> doCompress(const folly::IOBuf* data) override;
365 std::unique_ptr<IOBuf> doUncompress(
366 const folly::IOBuf* data,
367 folly::Optional<uint64_t> uncompressedLength) override;
369 // default: Returns false
370 virtual bool doNeedsDataLength() const;
371 virtual void doResetStream() = 0;
372 virtual bool doCompressStream(
373 folly::ByteRange& input,
374 folly::MutableByteRange& output,
375 FlushOp flushOp) = 0;
376 virtual bool doUncompressStream(
377 folly::ByteRange& input,
378 folly::MutableByteRange& output,
379 FlushOp flushOp) = 0;
389 void assertStateIs(State expected) const;
392 State state_{State::RESET};
393 ByteRange previousInput_{};
394 folly::Optional<uint64_t> uncompressedLength_{};
395 bool progressMade_{true};
// Symbolic compression levels. getCodec() and getStreamCodec() accept one of
// these in place of an explicit codec-dependent level, and use
// COMPRESSION_LEVEL_DEFAULT when no level is given. All three are negative,
// whereas an explicit level is documented as non-negative.
398 constexpr int COMPRESSION_LEVEL_FASTEST = -1;
399 constexpr int COMPRESSION_LEVEL_DEFAULT = -2;
400 constexpr int COMPRESSION_LEVEL_BEST = -3;
403 * Return a codec for the given type. Throws on error. The level
404 * is a non-negative codec-dependent integer indicating the level of
405 * compression desired, or one of the following constants:
407 * COMPRESSION_LEVEL_FASTEST is fastest (uses least CPU / memory,
409 * COMPRESSION_LEVEL_DEFAULT is the default (likely a tradeoff between
411 * COMPRESSION_LEVEL_BEST is the best compression (uses most CPU / memory,
414 * When decompressing, the compression level is ignored. All codecs will
415 * decompress all data compressed with a codec of the same type, regardless
416 * of compression level.
418 std::unique_ptr<Codec> getCodec(
420 int level = COMPRESSION_LEVEL_DEFAULT);
423 * Return a codec for the given type. Throws on error. The level
424 * is a non-negative codec-dependent integer indicating the level of
425 * compression desired, or one of the following constants:
427 * COMPRESSION_LEVEL_FASTEST is fastest (uses least CPU / memory,
429 * COMPRESSION_LEVEL_DEFAULT is the default (likely a tradeoff between
431 * COMPRESSION_LEVEL_BEST is the best compression (uses most CPU / memory,
434 * When decompressing, the compression level is ignored. All codecs will
435 * decompress all data compressed with a codec of the same type, regardless
436 * of compression level.
438 std::unique_ptr<StreamCodec> getStreamCodec(
440 int level = COMPRESSION_LEVEL_DEFAULT);
443 * Returns a codec that can uncompress any of the given codec types as well as
444 * {LZ4_FRAME, ZSTD, ZLIB, GZIP, LZMA2, BZIP2}. Appends each default codec to
445 * customCodecs in order, so long as a codec with the same type() isn't already
446 * present in customCodecs or as the terminalCodec. When uncompress() is called,
447 * each codec's canUncompress() is called in the order that they are given.
448 * Appended default codecs are checked last. uncompress() is called on the
449 * first codec whose canUncompress() returns true.
451 * In addition, an optional `terminalCodec` can be provided. This codec's
452 * uncompress() will be called either when no other codec canUncompress() the
453 * data or the chosen codec throws an exception on the data. The terminalCodec
454 * is intended for ambiguous headers, when canUncompress() is false for some
455 * data it can actually uncompress. The terminalCodec does not need to override
456 * validPrefixes() or canUncompress() and overriding these functions will have
457 * no effect on the returned codec's validPrefixes() or canUncompress()
458 * functions. The terminalCodec's needsUncompressedLength() and
459 * maxUncompressedLength() will affect the returned codec's respective
460 * functions. The terminalCodec must not be duplicated in customCodecs.
462 * An exception is thrown if no codec canUncompress() the data and either no
463 * terminal codec was provided or a terminal codec was provided and it throws on
465 * An exception is thrown if the chosen codec's uncompress() throws on the data
466 * and either no terminal codec was provided or a terminal codec was provided
467 * and it also throws on the data.
468 * An exception is thrown if compress() is called on the returned codec.
470 * Requirements are checked in debug mode and are as follows:
471 * Let headers be the concatenation of every codec's validPrefixes().
472 * 1. Each codec must override validPrefixes() and canUncompress().
473 * 2. No codec's validPrefixes() may be empty.
474 * 3. No header in headers may be empty.
475 * 4. headers must not contain any duplicate elements.
476 * 5. No strict non-empty prefix of any header in headers may be in headers.
477 * 6. The terminalCodec's type must not be the same as any other codec's type
478 * (with USER_DEFINED being the exception).
480 std::unique_ptr<Codec> getAutoUncompressionCodec(
481 std::vector<std::unique_ptr<Codec>> customCodecs = {},
482 std::unique_ptr<Codec> terminalCodec = {});
485 * Check if a specified codec is supported.
487 bool hasCodec(CodecType type);
490 * Check if a specified codec is supported and supports streaming.
492 bool hasStreamCodec(CodecType type);