folly/GroupVarint.h

   1 /*
   2  * Copyright 2012 Facebook, Inc.
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at
   7  *
   8  *   http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  */
  16
  17 #ifndef FOLLY_GROUPVARINT_H_
  18 #define FOLLY_GROUPVARINT_H_
  19
  20 #ifndef __GNUC__
  21 #error GroupVarint.h requires GCC
  22 #endif
  23
  24 #if !defined(__x86_64__) && !defined(__i386__)
  25 #error GroupVarint.h requires x86_64 or i386
  26 #endif
  27
  28 #include <cstdint>
  29 #include <limits>
  30 #include "folly/detail/GroupVarintDetail.h"
  31 #include "folly/Range.h"
  32 #include <glog/logging.h>
  33
  34 #ifdef __SSSE3__
  35 #include <x86intrin.h>
  36 namespace folly {
  37 namespace detail {
  38 extern const __m128i groupVarintSSEMasks[];
  39 }  // namespace detail
  40 }  // namespace folly
  41 #endif
  42
  43 namespace folly {
  44 namespace detail {
  45 extern const uint8_t groupVarintLengths[];
  46 }  // namespace detail
  47 }  // namespace folly
  48
  49 namespace folly {
  50
  51 template <typename T>
  52 class GroupVarint;
  53
  54 /**
  55  * GroupVarint encoding for 32-bit values.
  56  *
  57  * Encodes 4 32-bit integers at once, each using 1-4 bytes depending on size.
  58  * There is one byte of overhead.  (The first byte contains the lengths of
  59  * the four integers encoded as two bits each; 00=1 byte .. 11=4 bytes)
  60  *
  61  * This implementation assumes little-endian and does unaligned 32-bit
  62  * accesses, so it's basically not portable outside of the x86[_64] world.
  63  */
  64 template <>
  65 class GroupVarint<uint32_t> : public detail::GroupVarintBase<uint32_t> {
  66  public:
  67
  68   /**
  69    * Return the number of bytes used to encode these four values.
  70    */
  71   static size_t size(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
  72     return kHeaderSize + kGroupSize + key(a) + key(b) + key(c) + key(d);
  73   }
  74
  75   /**
  76    * Return the number of bytes used to encode four uint32_t values stored
  77    * at consecutive positions in an array.
  78    */
  79   static size_t size(const uint32_t* p) {
  80     return size(p[0], p[1], p[2], p[3]);
  81   }
  82
  83   /**
  84    * Return the number of bytes used to encode count (<= 4) values.
  85    * If you clip a buffer after these many bytes, you can still decode
  86    * the first "count" values correctly (if the remaining size() -
  87    * partialSize() bytes are filled with garbage).
  88    */
  89   static size_t partialSize(const type* p, size_t count) {
  90     DCHECK_LE(count, kGroupSize);
  91     size_t s = kHeaderSize + count;
  92     for (; count; --count, ++p) {
  93       s += key(*p);
  94     }
  95     return s;
  96   }
  97
  98   /**
  99    * Return the number of values from *p that are valid from an encoded
 100    * buffer of size bytes.
 101    */
 102   static size_t partialCount(const char* p, size_t size) {
 103     char v = *p;
 104     size_t s = kHeaderSize;
 105     s += 1 + b0key(v);
 106     if (s > size) return 0;
 107     s += 1 + b1key(v);
 108     if (s > size) return 1;
 109     s += 1 + b2key(v);
 110     if (s > size) return 2;
 111     s += 1 + b3key(v);
 112     if (s > size) return 3;
 113     return 4;
 114   }
 115
 116   /**
 117    * Given a pointer to the beginning of an GroupVarint32-encoded block,
 118    * return the number of bytes used by the encoding.
 119    */
 120   static size_t encodedSize(const char* p) {
 121     return (kHeaderSize + kGroupSize +
 122             b0key(*p) + b1key(*p) + b2key(*p) + b3key(*p));
 123   }
 124
 125   /**
 126    * Encode four uint32_t values into the buffer pointed-to by p, and return
 127    * the next position in the buffer (that is, one character past the last
 128    * encoded byte).  p needs to have at least size()+4 bytes available.
 129    */
 130   static char* encode(char* p, uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
 131     uint8_t b0key = key(a);
 132     uint8_t b1key = key(b);
 133     uint8_t b2key = key(c);
 134     uint8_t b3key = key(d);
 135     *p++ = (b3key << 6) | (b2key << 4) | (b1key << 2) | b0key;
 136     *reinterpret_cast<uint32_t*>(p) = a;
 137     p += b0key+1;
 138     *reinterpret_cast<uint32_t*>(p) = b;
 139     p += b1key+1;
 140     *reinterpret_cast<uint32_t*>(p) = c;
 141     p += b2key+1;
 142     *reinterpret_cast<uint32_t*>(p) = d;
 143     p += b3key+1;
 144     return p;
 145   }
 146
 147   /**
 148    * Encode four uint32_t values from the array pointed-to by src into the
 149    * buffer pointed-to by p, similar to encode(p,a,b,c,d) above.
 150    */
 151   static char* encode(char* p, const uint32_t* src) {
 152     return encode(p, src[0], src[1], src[2], src[3]);
 153   }
 154
 155   /**
 156    * Decode four uint32_t values from a buffer, and return the next position
 157    * in the buffer (that is, one character past the last encoded byte).
 158    * The buffer needs to have at least 3 extra bytes available (they
 159    * may be read but ignored).
 160    */
 161   static const char* decode_simple(const char* p, uint32_t* a, uint32_t* b,
 162                                    uint32_t* c, uint32_t* d) {
 163     size_t k = *reinterpret_cast<const uint8_t*>(p);
 164     const char* end = p + detail::groupVarintLengths[k];
 165     ++p;
 166     size_t k0 = b0key(k);
 167     *a = *reinterpret_cast<const uint32_t*>(p) & kMask[k0];
 168     p += k0+1;
 169     size_t k1 = b1key(k);
 170     *b = *reinterpret_cast<const uint32_t*>(p) & kMask[k1];
 171     p += k1+1;
 172     size_t k2 = b2key(k);
 173     *c = *reinterpret_cast<const uint32_t*>(p) & kMask[k2];
 174     p += k2+1;
 175     size_t k3 = b3key(k);
 176     *d = *reinterpret_cast<const uint32_t*>(p) & kMask[k3];
 177     p += k3+1;
 178     return end;
 179   }
 180
 181   /**
 182    * Decode four uint32_t values from a buffer and store them in the array
 183    * pointed-to by dest, similar to decode(p,a,b,c,d) above.
 184    */
 185   static const char* decode_simple(const char* p, uint32_t* dest) {
 186     return decode_simple(p, dest, dest+1, dest+2, dest+3);
 187   }
 188
 189 #ifdef __SSSE3__
 190   static const char* decode(const char* p, uint32_t* dest) {
 191     uint8_t key = p[0];
 192     __m128i val = _mm_loadu_si128((const __m128i*)(p+1));
 193     __m128i mask = detail::groupVarintSSEMasks[key];
 194     __m128i r = _mm_shuffle_epi8(val, mask);
 195     _mm_storeu_si128((__m128i*)dest, r);
 196     return p + detail::groupVarintLengths[key];
 197   }
 198
 199   static const char* decode(const char* p, uint32_t* a, uint32_t* b,
 200                             uint32_t* c, uint32_t* d) {
 201     uint8_t key = p[0];
 202     __m128i val = _mm_loadu_si128((const __m128i*)(p+1));
 203     __m128i mask = detail::groupVarintSSEMasks[key];
 204     __m128i r = _mm_shuffle_epi8(val, mask);
 205
 206     // Extracting 32 bits at a time out of an XMM register is a SSE4 feature
 207 #ifdef __SSE4__
 208     *a = _mm_extract_epi32(r, 0);
 209     *b = _mm_extract_epi32(r, 1);
 210     *c = _mm_extract_epi32(r, 2);
 211     *d = _mm_extract_epi32(r, 3);
 212 #else  /* !__SSE4__ */
 213     *a = _mm_extract_epi16(r, 0) + (_mm_extract_epi16(r, 1) << 16);
 214     *b = _mm_extract_epi16(r, 2) + (_mm_extract_epi16(r, 3) << 16);
 215     *c = _mm_extract_epi16(r, 4) + (_mm_extract_epi16(r, 5) << 16);
 216     *d = _mm_extract_epi16(r, 6) + (_mm_extract_epi16(r, 7) << 16);
 217 #endif  /* __SSE4__ */
 218
 219     return p + detail::groupVarintLengths[key];
 220   }
 221
 222 #else  /* !__SSSE3__ */
 223   static const char* decode(const char* p, uint32_t* a, uint32_t* b,
 224                             uint32_t* c, uint32_t* d) {
 225     return decode_simple(p, a, b, c, d);
 226   }
 227
 228   static const char* decode(const char* p, uint32_t* dest) {
 229     return decode_simple(p, dest);
 230   }
 231 #endif  /* __SSSE3__ */
 232
 233  private:
 234   static uint8_t key(uint32_t x) {
 235     // __builtin_clz is undefined for the x==0 case
 236     return 3 - (__builtin_clz(x|1) / 8);
 237   }
 238   static size_t b0key(size_t x) { return x & 3; }
 239   static size_t b1key(size_t x) { return (x >> 2) & 3; }
 240   static size_t b2key(size_t x) { return (x >> 4) & 3; }
 241   static size_t b3key(size_t x) { return (x >> 6) & 3; }
 242
 243   static const uint32_t kMask[];
 244 };
 245
 246
 247 /**
 248  * GroupVarint encoding for 64-bit values.
 249  *
 250  * Encodes 5 64-bit integers at once, each using 1-8 bytes depending on size.
 251  * There are two bytes of overhead.  (The first two bytes contain the lengths
 252  * of the five integers encoded as three bits each; 000=1 byte .. 111 = 8 bytes)
 253  *
 254  * This implementation assumes little-endian and does unaligned 64-bit
 255  * accesses, so it's basically not portable outside of the x86[_64] world.
 256  */
 257 template <>
 258 class GroupVarint<uint64_t> : public detail::GroupVarintBase<uint64_t> {
 259  public:
 260   /**
 261    * Return the number of bytes used to encode these five values.
 262    */
 263   static size_t size(uint64_t a, uint64_t b, uint64_t c, uint64_t d,
 264                      uint64_t e) {
 265     return (kHeaderSize + kGroupSize +
 266             key(a) + key(b) + key(c) + key(d) + key(e));
 267   }
 268
 269   /**
 270    * Return the number of bytes used to encode five uint64_t values stored
 271    * at consecutive positions in an array.
 272    */
 273   static size_t size(const uint64_t* p) {
 274     return size(p[0], p[1], p[2], p[3], p[4]);
 275   }
 276
 277   /**
 278    * Return the number of bytes used to encode count (<= 4) values.
 279    * If you clip a buffer after these many bytes, you can still decode
 280    * the first "count" values correctly (if the remaining size() -
 281    * partialSize() bytes are filled with garbage).
 282    */
 283   static size_t partialSize(const type* p, size_t count) {
 284     DCHECK_LE(count, kGroupSize);
 285     size_t s = kHeaderSize + count;
 286     for (; count; --count, ++p) {
 287       s += key(*p);
 288     }
 289     return s;
 290   }
 291
 292   /**
 293    * Return the number of values from *p that are valid from an encoded
 294    * buffer of size bytes.
 295    */
 296   static size_t partialCount(const char* p, size_t size) {
 297     uint16_t v = *reinterpret_cast<const uint16_t*>(p);
 298     size_t s = kHeaderSize;
 299     s += 1 + b0key(v);
 300     if (s > size) return 0;
 301     s += 1 + b1key(v);
 302     if (s > size) return 1;
 303     s += 1 + b2key(v);
 304     if (s > size) return 2;
 305     s += 1 + b3key(v);
 306     if (s > size) return 3;
 307     s += 1 + b4key(v);
 308     if (s > size) return 4;
 309     return 5;
 310   }
 311
 312   /**
 313    * Given a pointer to the beginning of an GroupVarint64-encoded block,
 314    * return the number of bytes used by the encoding.
 315    */
 316   static size_t encodedSize(const char* p) {
 317     uint16_t n = *reinterpret_cast<const uint16_t*>(p);
 318     return (kHeaderSize + kGroupSize +
 319             b0key(n) + b1key(n) + b2key(n) + b3key(n) + b4key(n));
 320   }
 321
 322   /**
 323    * Encode five uint64_t values into the buffer pointed-to by p, and return
 324    * the next position in the buffer (that is, one character past the last
 325    * encoded byte).  p needs to have at least size()+8 bytes available.
 326    */
 327   static char* encode(char* p, uint64_t a, uint64_t b, uint64_t c,
 328                       uint64_t d, uint64_t e) {
 329     uint8_t b0key = key(a);
 330     uint8_t b1key = key(b);
 331     uint8_t b2key = key(c);
 332     uint8_t b3key = key(d);
 333     uint8_t b4key = key(e);
 334     *reinterpret_cast<uint16_t*>(p) =
 335       (b4key << 12) | (b3key << 9) | (b2key << 6) | (b1key << 3) | b0key;
 336     p += 2;
 337     *reinterpret_cast<uint64_t*>(p) = a;
 338     p += b0key+1;
 339     *reinterpret_cast<uint64_t*>(p) = b;
 340     p += b1key+1;
 341     *reinterpret_cast<uint64_t*>(p) = c;
 342     p += b2key+1;
 343     *reinterpret_cast<uint64_t*>(p) = d;
 344     p += b3key+1;
 345     *reinterpret_cast<uint64_t*>(p) = e;
 346     p += b4key+1;
 347     return p;
 348   }
 349
 350   /**
 351    * Encode five uint64_t values from the array pointed-to by src into the
 352    * buffer pointed-to by p, similar to encode(p,a,b,c,d,e) above.
 353    */
 354   static char* encode(char* p, const uint64_t* src) {
 355     return encode(p, src[0], src[1], src[2], src[3], src[4]);
 356   }
 357
 358   /**
 359    * Decode five uint64_t values from a buffer, and return the next position
 360    * in the buffer (that is, one character past the last encoded byte).
 361    * The buffer needs to have at least 7 bytes available (they may be read
 362    * but ignored).
 363    */
 364   static const char* decode(const char* p, uint64_t* a, uint64_t* b,
 365                             uint64_t* c, uint64_t* d, uint64_t* e) {
 366     uint16_t k = *reinterpret_cast<const uint16_t*>(p);
 367     p += 2;
 368     uint8_t k0 = b0key(k);
 369     *a = *reinterpret_cast<const uint64_t*>(p) & kMask[k0];
 370     p += k0+1;
 371     uint8_t k1 = b1key(k);
 372     *b = *reinterpret_cast<const uint64_t*>(p) & kMask[k1];
 373     p += k1+1;
 374     uint8_t k2 = b2key(k);
 375     *c = *reinterpret_cast<const uint64_t*>(p) & kMask[k2];
 376     p += k2+1;
 377     uint8_t k3 = b3key(k);
 378     *d = *reinterpret_cast<const uint64_t*>(p) & kMask[k3];
 379     p += k3+1;
 380     uint8_t k4 = b4key(k);
 381     *e = *reinterpret_cast<const uint64_t*>(p) & kMask[k4];
 382     p += k4+1;
 383     return p;
 384   }
 385
 386   /**
 387    * Decode five uint64_t values from a buffer and store them in the array
 388    * pointed-to by dest, similar to decode(p,a,b,c,d,e) above.
 389    */
 390   static const char* decode(const char* p, uint64_t* dest) {
 391     return decode(p, dest, dest+1, dest+2, dest+3, dest+4);
 392   }
 393
 394  private:
 395   enum { kHeaderBytes = 2 };
 396
 397   static uint8_t key(uint64_t x) {
 398     // __builtin_clzll is undefined for the x==0 case
 399     return 7 - (__builtin_clzll(x|1) / 8);
 400   }
 401
 402   static uint8_t b0key(uint16_t x) { return x & 7; }
 403   static uint8_t b1key(uint16_t x) { return (x >> 3) & 7; }
 404   static uint8_t b2key(uint16_t x) { return (x >> 6) & 7; }
 405   static uint8_t b3key(uint16_t x) { return (x >> 9) & 7; }
 406   static uint8_t b4key(uint16_t x) { return (x >> 12) & 7; }
 407
 408   static const uint64_t kMask[];
 409 };
 410
 411 typedef GroupVarint<uint32_t> GroupVarint32;
 412 typedef GroupVarint<uint64_t> GroupVarint64;
 413
 414 /**
 415  * Simplify use of GroupVarint* for the case where data is available one
 416  * entry at a time (instead of one group at a time).  Handles buffering
 417  * and an incomplete last chunk.
 418  *
 419  * Output is a function object that accepts character ranges:
 420  * out(StringPiece) appends the given character range to the output.
 421  */
 422 template <class T, class Output>
 423 class GroupVarintEncoder {
 424  public:
 425   typedef GroupVarint<T> Base;
 426   typedef T type;
 427
 428   explicit GroupVarintEncoder(Output out)
 429     : out_(out),
 430       count_(0) {
 431   }
 432
 433   ~GroupVarintEncoder() {
 434     finish();
 435   }
 436
 437   /**
 438    * Add a value to the encoder.
 439    */
 440   void add(type val) {
 441     buf_[count_++] = val;
 442     if (count_ == Base::kGroupSize) {
 443       char* p = Base::encode(tmp_, buf_);
 444       out_(StringPiece(tmp_, p));
 445       count_ = 0;
 446     }
 447   }
 448
 449   /**
 450    * Finish encoding, flushing any buffered values if necessary.
 451    * After finish(), the encoder is immediately ready to encode more data
 452    * to the same output.
 453    */
 454   void finish() {
 455     if (count_) {
 456       // This is not strictly necessary, but it makes testing easy;
 457       // uninitialized bytes are guaranteed to be recorded as taking one byte
 458       // (not more).
 459       for (size_t i = count_; i < Base::kGroupSize; i++) {
 460         buf_[i] = 0;
 461       }
 462       Base::encode(tmp_, buf_);
 463       out_(StringPiece(tmp_, Base::partialSize(buf_, count_)));
 464       count_ = 0;
 465     }
 466   }
 467
 468   /**
 469    * Return the appender that was used.
 470    */
 471   Output& output() {
 472     return out_;
 473   }
 474   const Output& output() const {
 475     return out_;
 476   }
 477
 478   /**
 479    * Reset the encoder, disregarding any state (except what was already
 480    * flushed to the output, of course).
 481    */
 482   void clear() {
 483     count_ = 0;
 484   }
 485
 486  private:
 487   Output out_;
 488   char tmp_[Base::kMaxSize];
 489   type buf_[Base::kGroupSize];
 490   size_t count_;
 491 };
 492
 493 /**
 494  * Simplify use of GroupVarint* for the case where the last group in the
 495  * input may be incomplete (but the exact size of the input is known).
 496  * Allows for extracting values one at a time.
 497  */
 498 template <typename T>
 499 class GroupVarintDecoder {
 500  public:
 501   typedef GroupVarint<T> Base;
 502   typedef T type;
 503
 504   GroupVarintDecoder() { }
 505
 506   explicit GroupVarintDecoder(StringPiece data,
 507                               size_t maxCount = (size_t)-1)
 508     : p_(data.data()),
 509       end_(data.data() + data.size()),
 510       pos_(0),
 511       count_(0),
 512       remaining_(maxCount) {
 513   }
 514
 515   void reset(StringPiece data, size_t maxCount=(size_t)-1) {
 516     p_ = data.data();
 517     end_ = data.data() + data.size();
 518     pos_ = 0;
 519     count_ = 0;
 520     remaining_ = maxCount;
 521   }
 522
 523   /**
 524    * Read and return the next value.
 525    */
 526   bool next(type* val) {
 527     if (pos_ == count_) {
 528       // refill
 529       size_t rem = end_ - p_;
 530       if (rem == 0 || remaining_ == 0) {
 531         return false;
 532       }
 533       // next() attempts to read one full group at a time, and so we must have
 534       // at least enough bytes readable after its end to handle the case if the
 535       // last group is full.
 536       //
 537       // The best way to ensure this is to ensure that data has at least
 538       // Base::kMaxSize - 1 bytes readable *after* the end, otherwise we'll copy
 539       // into a temporary buffer.
 540       if (rem < Base::kMaxSize) {
 541         memcpy(tmp_, p_, rem);
 542         p_ = tmp_;
 543         end_ = p_ + rem;
 544       }
 545       pos_ = 0;
 546       const char* n = Base::decode(p_, buf_);
 547       if (n <= end_) {
 548         // Full group could be decoded
 549         if (remaining_ >= Base::kGroupSize) {
 550           remaining_ -= Base::kGroupSize;
 551           count_ = Base::kGroupSize;
 552           p_ = n;
 553         } else {
 554           count_ = remaining_;
 555           remaining_ = 0;
 556           p_ += Base::partialSize(buf_, count_);
 557         }
 558       } else {
 559         // Can't decode a full group
 560         count_ = Base::partialCount(p_, end_ - p_);
 561         if (remaining_ >= count_) {
 562           remaining_ -= count_;
 563           p_ = end_;
 564         } else {
 565           count_ = remaining_;
 566           remaining_ = 0;
 567           p_ += Base::partialSize(buf_, count_);
 568         }
 569         if (count_ == 0) {
 570           return false;
 571         }
 572       }
 573     }
 574     *val = buf_[pos_++];
 575     return true;
 576   }
 577
 578   StringPiece rest() const {
 579     // This is only valid after next() returned false
 580     CHECK(pos_ == count_ && (p_ == end_ || remaining_ == 0));
 581     return StringPiece(p_, end_ - p_);
 582   }
 583
 584  private:
 585   const char* p_;
 586   const char* end_;
 587   char tmp_[Base::kMaxSize];
 588   type buf_[Base::kGroupSize];
 589   size_t pos_;
 590   size_t count_;
 591   size_t remaining_;
 592 };
 593
 594 typedef GroupVarintDecoder<uint32_t> GroupVarint32Decoder;
 595 typedef GroupVarintDecoder<uint64_t> GroupVarint64Decoder;
 596
 597 }  // namespace folly
 598
 599 #endif /* FOLLY_GROUPVARINT_H_ */
 600