folly/experimental/EliasFanoCoding.h

   1 /*
   2  * Copyright 2015 Facebook, Inc.
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at
   7  *
   8  *   http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  */
  16
  17 /**
  18  * @author Philip Pronin (philipp@fb.com)
  19  *
  20  * Based on the paper by Sebastiano Vigna,
  21  * "Quasi-succinct indices" (arxiv:1206.4300).
  22  */
  23
  24 #ifndef FOLLY_EXPERIMENTAL_ELIAS_FANO_CODING_H
  25 #define FOLLY_EXPERIMENTAL_ELIAS_FANO_CODING_H
  26
  27 #ifndef __GNUC__
  28 #error EliasFanoCoding.h requires GCC
  29 #endif
  30
  31 #if !FOLLY_X64
  32 #error EliasFanoCoding.h requires x86_64
  33 #endif
  34
  35 #include <cstdlib>
  36 #include <limits>
  37 #include <type_traits>
  38 #include <boost/noncopyable.hpp>
  39 #include <glog/logging.h>
  40
  41 #include <folly/Bits.h>
  42 #include <folly/CpuId.h>
  43 #include <folly/Likely.h>
  44 #include <folly/Range.h>
  45 #include <folly/experimental/Select64.h>
  46
  47 #if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__
  48 #error EliasFanoCoding.h requires little endianness
  49 #endif
  50
  51 namespace folly { namespace compression {
  52
  53 struct EliasFanoCompressedList {
  54   EliasFanoCompressedList() { }
  55
  56   void free() {
  57     ::free(const_cast<unsigned char*>(lower.data()));
  58     ::free(const_cast<unsigned char*>(upper.data()));
  59     ::free(const_cast<unsigned char*>(skipPointers.data()));
  60     ::free(const_cast<unsigned char*>(forwardPointers.data()));
  61   }
  62
  63   size_t size = 0;
  64   uint8_t numLowerBits = 0;
  65
  66   // WARNING: EliasFanoCompressedList has no ownership of
  67   // lower, upper, skipPointers and forwardPointers.
  68   // The 7 bytes following the last byte of lower and upper
  69   // sequences should be readable.
  70   folly::ByteRange lower;
  71   folly::ByteRange upper;
  72
  73   folly::ByteRange skipPointers;
  74   folly::ByteRange forwardPointers;
  75 };
  76
  77 template <class Value,
  78           class SkipValue = size_t,
  79           size_t kSkipQuantum = 0,     // 0 = disabled
  80           size_t kForwardQuantum = 0>  // 0 = disabled
  81 struct EliasFanoEncoderV2 {
  82   static_assert(std::is_integral<Value>::value &&
  83                 std::is_unsigned<Value>::value,
  84                 "Value should be unsigned integral");
  85
  86   typedef EliasFanoCompressedList CompressedList;
  87
  88   typedef Value ValueType;
  89   typedef SkipValue SkipValueType;
  90
  91   static constexpr size_t skipQuantum = kSkipQuantum;
  92   static constexpr size_t forwardQuantum = kForwardQuantum;
  93
  94   static uint8_t defaultNumLowerBits(size_t upperBound, size_t size) {
  95     if (size == 0 || upperBound < size) {
  96       return 0;
  97     }
  98     // floor(log(upperBound / size));
  99     return folly::findLastSet(upperBound / size) - 1;
 100   }
 101
 102   // Requires: input range (begin, end) is sorted (encoding
 103   // crashes if it's not).
 104   // WARNING: encode() mallocates lower, upper, skipPointers
 105   // and forwardPointers. As EliasFanoCompressedList has
 106   // no ownership of them, you need to call free() explicitly.
 107   template <class RandomAccessIterator>
 108   static EliasFanoCompressedList encode(RandomAccessIterator begin,
 109                                         RandomAccessIterator end) {
 110     if (begin == end) {
 111       return EliasFanoCompressedList();
 112     }
 113     EliasFanoEncoderV2 encoder(end - begin, *(end - 1));
 114     for (; begin != end; ++begin) {
 115       encoder.add(*begin);
 116     }
 117     return encoder.finish();
 118   }
 119
 120   EliasFanoEncoderV2(size_t size, ValueType upperBound) {
 121     if (size == 0) {
 122       return;
 123     }
 124
 125     uint8_t numLowerBits = defaultNumLowerBits(upperBound, size);
 126
 127     // This is detail::writeBits56 limitation.
 128     numLowerBits = std::min<uint8_t>(numLowerBits, 56);
 129     CHECK_LT(numLowerBits, 8 * sizeof(Value));  // As we shift by numLowerBits.
 130
 131     // WARNING: Current read/write logic assumes that the 7 bytes
 132     // following the last byte of lower and upper sequences are
 133     // readable (stored value doesn't matter and won't be changed),
 134     // so we allocate additional 7B, but do not include them in size
 135     // of returned value.
 136
 137     // *** Lower bits.
 138     const size_t lowerSize = (numLowerBits * size + 7) / 8;
 139     if (lowerSize > 0) {  // numLowerBits != 0
 140       lower_ = static_cast<unsigned char*>(calloc(lowerSize + 7, 1));
 141     }
 142
 143     // *** Upper bits.
 144     // Upper bits are stored using unary delta encoding.
 145     // For example, (3 5 5 9) will be encoded as 1000011001000_2.
 146     const size_t upperSizeBits =
 147       (upperBound >> numLowerBits) +  // Number of 0-bits to be stored.
 148       size;                           // 1-bits.
 149     const size_t upperSize = (upperSizeBits + 7) / 8;
 150     upper_ = static_cast<unsigned char*>(calloc(upperSize + 7, 1));
 151
 152     // *** Skip pointers.
 153     // Store (1-indexed) position of every skipQuantum-th
 154     // 0-bit in upper bits sequence.
 155     size_t numSkipPointers = 0;
 156     /* static */ if (skipQuantum != 0) {
 157       CHECK_LT(size, std::numeric_limits<SkipValueType>::max());
 158
 159       // 8 * upperSize is used here instead of upperSizeBits, as that is
 160       // more serialization-friendly way (upperSizeBits isn't known outside of
 161       // this function, unlike upperSize; thus numSkipPointers could easily be
 162       // deduced from upperSize).
 163       numSkipPointers = (8 * upperSize - size) / (skipQuantum ?: 1);
 164       skipPointers_ = static_cast<SkipValueType*>(
 165           numSkipPointers == 0
 166             ? nullptr
 167             : calloc(numSkipPointers, sizeof(SkipValueType)));
 168     }
 169
 170     // *** Forward pointers.
 171     // Store (1-indexed) position of every forwardQuantum-th
 172     // 1-bit in upper bits sequence.
 173     size_t numForwardPointers = 0;
 174     /* static */ if (forwardQuantum != 0) {
 175       CHECK_LT(upperBound >> numLowerBits,
 176                std::numeric_limits<SkipValueType>::max());
 177
 178       // '?: 1' is a workaround for false 'division by zero' compile-time error.
 179       numForwardPointers = size / (forwardQuantum ?: 1);
 180       forwardPointers_ = static_cast<SkipValueType*>(
 181         numForwardPointers == 0
 182           ? nullptr
 183           : malloc(numForwardPointers * sizeof(SkipValueType)));
 184     }
 185
 186     // *** Result.
 187     result_.size = size;
 188     result_.numLowerBits = numLowerBits;
 189     result_.lower.reset(lower_, lowerSize);
 190     result_.upper.reset(upper_, upperSize);
 191     result_.skipPointers.reset(
 192         reinterpret_cast<unsigned char*>(skipPointers_),
 193         numSkipPointers * sizeof(SkipValueType));
 194     result_.forwardPointers.reset(
 195         reinterpret_cast<unsigned char*>(forwardPointers_),
 196         numForwardPointers * sizeof(SkipValueType));
 197   }
 198
 199   void add(ValueType value) {
 200     CHECK_GE(value, lastValue_);
 201
 202     const auto numLowerBits = result_.numLowerBits;
 203     const ValueType upperBits = value >> numLowerBits;
 204
 205     // Upper sequence consists of upperBits 0-bits and (size_ + 1) 1-bits.
 206     const size_t pos = upperBits + size_;
 207     upper_[pos / 8] |= 1U << (pos % 8);
 208     // Append numLowerBits bits to lower sequence.
 209     if (numLowerBits != 0) {
 210       const ValueType lowerBits = value & ((ValueType(1) << numLowerBits) - 1);
 211       writeBits56(lower_, size_ * numLowerBits, numLowerBits, lowerBits);
 212     }
 213
 214     /* static */ if (skipQuantum != 0) {
 215       while ((skipPointersSize_ + 1) * skipQuantum <= upperBits) {
 216         // Store the number of preceding 1-bits.
 217         skipPointers_[skipPointersSize_++] = size_;
 218       }
 219     }
 220
 221     /* static */ if (forwardQuantum != 0) {
 222       if ((size_ + 1) % forwardQuantum == 0) {
 223         const auto pos = size_ / forwardQuantum;
 224         // Store the number of preceding 0-bits.
 225         forwardPointers_[pos] = upperBits;
 226       }
 227     }
 228
 229     lastValue_ = value;
 230     ++size_;
 231   }
 232
 233   const EliasFanoCompressedList& finish() const {
 234     CHECK_EQ(size_, result_.size);
 235     return result_;
 236   }
 237
 238  private:
 239   // Writes value (with len up to 56 bits) to data starting at pos-th bit.
 240   static void writeBits56(unsigned char* data, size_t pos,
 241                           uint8_t len, uint64_t value) {
 242     DCHECK_LE(uint32_t(len), 56);
 243     DCHECK_EQ(0, value & ~((uint64_t(1) << len) - 1));
 244     unsigned char* const ptr = data + (pos / 8);
 245     uint64_t ptrv = folly::loadUnaligned<uint64_t>(ptr);
 246     ptrv |= value << (pos % 8);
 247     folly::storeUnaligned<uint64_t>(ptr, ptrv);
 248   }
 249
 250   unsigned char* lower_ = nullptr;
 251   unsigned char* upper_ = nullptr;
 252   SkipValueType* skipPointers_ = nullptr;
 253   SkipValueType* forwardPointers_ = nullptr;
 254
 255   ValueType lastValue_ = 0;
 256   size_t size_ = 0;
 257   size_t skipPointersSize_ = 0;
 258
 259   EliasFanoCompressedList result_;
 260 };
 261
  // NOTE: It's recommended to compile EF coding with -msse4.2: starting
  // with Nehalem, Intel CPUs support the POPCNT instruction, and gcc will
  // emit it for the __builtin_popcountll intrinsic.
  // Alternatively, client code can select the appropriate version of
  // EliasFanoReader<> at runtime (the client has to implement this switching
  // logic itself) by explicitly specifying the instruction set to use.
 269 namespace instructions {
 270
 271 struct Default {
 272   static bool supported(const folly::CpuId& cpuId = {}) {
 273     return true;
 274   }
 275   static inline uint64_t popcount(uint64_t value) {
 276     return __builtin_popcountll(value);
 277   }
 278   static inline int ctz(uint64_t value) {
 279     DCHECK_GT(value, 0);
 280     return __builtin_ctzll(value);
 281   }
 282   static inline uint64_t blsr(uint64_t value) {
 283     return value & (value - 1);
 284   }
 285 };
 286
 287 struct Nehalem : public Default {
 288   static bool supported(const folly::CpuId& cpuId = {}) {
 289     return cpuId.popcnt();
 290   }
 291   static inline uint64_t popcount(uint64_t value) {
 292     // POPCNT is supported starting with Intel Nehalem, AMD K10.
 293     uint64_t result;
 294     asm ("popcntq %1, %0" : "=r" (result) : "r" (value));
 295     return result;
 296   }
 297 };
 298
 299 struct Haswell : public Nehalem {
 300   static bool supported(const folly::CpuId& cpuId = {}) {
 301     return Nehalem::supported(cpuId) && cpuId.bmi1();
 302   }
 303   static inline uint64_t blsr(uint64_t value) {
 304     // BMI1 is supported starting with Intel Haswell, AMD Piledriver.
 305     // BLSR combines two instuctions into one and reduces register pressure.
 306     uint64_t result;
 307     asm ("blsrq %1, %0" : "=r" (result) : "r" (value));
 308     return result;
 309   }
 310 };
 311
 312 }  // namespace instructions
 313
 314 namespace detail {
 315
 316 template <class Encoder, class Instructions>
 317 class UpperBitsReader {
 318   typedef typename Encoder::SkipValueType SkipValueType;
 319  public:
 320   typedef typename Encoder::ValueType ValueType;
 321
 322   explicit UpperBitsReader(const EliasFanoCompressedList& list)
 323     : forwardPointers_(list.forwardPointers.data()),
 324       skipPointers_(list.skipPointers.data()),
 325       start_(list.upper.data()) {
 326     reset();
 327   }
 328
 329   void reset() {
 330     block_ = start_ != nullptr ? folly::loadUnaligned<block_t>(start_) : 0;
 331     outer_ = 0;
 332     inner_ = -1;
 333     position_ = -1;
 334     value_ = 0;
 335   }
 336
 337   size_t position() const { return position_; }
 338   ValueType value() const { return value_; }
 339
 340   ValueType next() {
 341     // Skip to the first non-zero block.
 342     while (block_ == 0) {
 343       outer_ += sizeof(block_t);
 344       block_ = folly::loadUnaligned<block_t>(start_ + outer_);
 345     }
 346
 347     ++position_;
 348     inner_ = Instructions::ctz(block_);
 349     block_ = Instructions::blsr(block_);
 350
 351     return setValue();
 352   }
 353
 354   ValueType skip(size_t n) {
 355     DCHECK_GT(n, 0);
 356
 357     position_ += n;  // n 1-bits will be read.
 358
 359     // Use forward pointer.
 360     if (Encoder::forwardQuantum > 0 && n > Encoder::forwardQuantum) {
 361       // Workaround to avoid 'division by zero' compile-time error.
 362       constexpr size_t q = Encoder::forwardQuantum ?: 1;
 363
 364       const size_t steps = position_ / q;
 365       const size_t dest =
 366         folly::loadUnaligned<SkipValueType>(
 367             forwardPointers_ + (steps - 1) * sizeof(SkipValueType));
 368
 369       reposition(dest + steps * q);
 370       n = position_ + 1 - steps * q;  // n is > 0.
 371       // Correct inner_ will be set at the end.
 372     }
 373
 374     size_t cnt;
 375     // Find necessary block.
 376     while ((cnt = Instructions::popcount(block_)) < n) {
 377       n -= cnt;
 378       outer_ += sizeof(block_t);
 379       block_ = folly::loadUnaligned<block_t>(start_ + outer_);
 380     }
 381
 382     // Skip to the n-th one in the block.
 383     DCHECK_GT(n, 0);
 384     inner_ = select64<Instructions>(block_, n - 1);
 385     block_ &= (block_t(-1) << inner_) << 1;
 386
 387     return setValue();
 388   }
 389
 390   // Skip to the first element that is >= v and located *after* the current
 391   // one (so even if current value equals v, position will be increased by 1).
 392   ValueType skipToNext(ValueType v) {
 393     DCHECK_GE(v, value_);
 394
 395     // Use skip pointer.
 396     if (Encoder::skipQuantum > 0 && v >= value_ + Encoder::skipQuantum) {
 397       // Workaround to avoid 'division by zero' compile-time error.
 398       constexpr size_t q = Encoder::skipQuantum ?: 1;
 399
 400       const size_t steps = v / q;
 401       const size_t dest =
 402         folly::loadUnaligned<SkipValueType>(
 403             skipPointers_ + (steps - 1) * sizeof(SkipValueType));
 404
 405       reposition(dest + q * steps);
 406       position_ = dest - 1;
 407
 408       // Correct inner_ and value_ will be set during the next()
 409       // call at the end.
 410
 411       // NOTE: Corresponding block of lower bits sequence may be
 412       // prefetched here (via __builtin_prefetch), but experiments
 413       // didn't show any significant improvements.
 414     }
 415
 416     // Skip by blocks.
 417     size_t cnt;
 418     size_t skip = v - (8 * outer_ - position_ - 1);
 419
 420     constexpr size_t kBitsPerBlock = 8 * sizeof(block_t);
 421     while ((cnt = Instructions::popcount(~block_)) < skip) {
 422       skip -= cnt;
 423       position_ += kBitsPerBlock - cnt;
 424       outer_ += sizeof(block_t);
 425       block_ = folly::loadUnaligned<block_t>(start_ + outer_);
 426     }
 427
 428     if (LIKELY(skip)) {
 429       auto inner = select64<Instructions>(~block_, skip - 1);
 430       position_ += inner - skip + 1;
 431       block_ &= block_t(-1) << inner;
 432     }
 433
 434     next();
 435     return value_;
 436   }
 437
 438   ValueType jump(size_t n) {
 439     if (Encoder::forwardQuantum == 0 || n <= Encoder::forwardQuantum) {
 440       reset();
 441     } else {
 442       position_ = -1;  // Avoid reading the head, skip() will reposition.
 443     }
 444     return skip(n);
 445   }
 446
 447   ValueType jumpToNext(ValueType v) {
 448     if (Encoder::skipQuantum == 0 || v < Encoder::skipQuantum) {
 449       reset();
 450     } else {
 451       value_ = 0;  // Avoid reading the head, skipToNext() will reposition.
 452     }
 453     return skipToNext(v);
 454   }
 455
 456  private:
 457   ValueType setValue() {
 458     value_ = static_cast<ValueType>(8 * outer_ + inner_ - position_);
 459     return value_;
 460   }
 461
 462   void reposition(size_t dest) {
 463     outer_ = dest / 8;
 464     block_ = folly::loadUnaligned<block_t>(start_ + outer_);
 465     block_ &= ~((block_t(1) << (dest % 8)) - 1);
 466   }
 467
 468   typedef unsigned long long block_t;
 469   const unsigned char* const forwardPointers_;
 470   const unsigned char* const skipPointers_;
 471   const unsigned char* const start_;
 472   block_t block_;
 473   size_t outer_;  // Outer offset: number of consumed bytes in upper.
 474   size_t inner_;  // Inner offset: (bit) position in current block.
 475   size_t position_;  // Index of current value (= #reads - 1).
 476   ValueType value_;
 477 };
 478
 479 }  // namespace detail
 480
 481 template <class Encoder,
 482           class Instructions = instructions::Default>
 483 class EliasFanoReader : private boost::noncopyable {
 484  public:
 485   typedef Encoder EncoderType;
 486   typedef typename Encoder::ValueType ValueType;
 487
 488   explicit EliasFanoReader(const EliasFanoCompressedList& list)
 489     : list_(list),
 490       lowerMask_((ValueType(1) << list_.numLowerBits) - 1),
 491       upper_(list_) {
 492     DCHECK(Instructions::supported());
 493     // To avoid extra branching during skipTo() while reading
 494     // upper sequence we need to know the last element.
 495     if (UNLIKELY(list_.size == 0)) {
 496       lastValue_ = 0;
 497       return;
 498     }
 499     ValueType lastUpperValue = 8 * list_.upper.size() - list_.size;
 500     auto it = list_.upper.end() - 1;
 501     DCHECK_NE(*it, 0);
 502     lastUpperValue -= 8 - folly::findLastSet(*it);
 503     lastValue_ = readLowerPart(list_.size - 1) |
 504                  (lastUpperValue << list_.numLowerBits);
 505   }
 506
 507   void reset() {
 508     upper_.reset();
 509     progress_ = 0;
 510     value_ = 0;
 511   }
 512
 513   bool next() {
 514     if (UNLIKELY(progress_ >= list_.size)) {
 515       return setDone();
 516     }
 517     value_ = readLowerPart(progress_) |
 518              (upper_.next() << list_.numLowerBits);
 519     ++progress_;
 520     return true;
 521   }
 522
 523   bool skip(size_t n) {
 524     CHECK_GT(n, 0);
 525
 526     progress_ += n;
 527     if (LIKELY(progress_ <= list_.size)) {
 528       if (LIKELY(n < kLinearScanThreshold)) {
 529         for (size_t i = 0; i < n; ++i) upper_.next();
 530       } else {
 531         upper_.skip(n);
 532       }
 533       value_ = readLowerPart(progress_ - 1) |
 534         (upper_.value() << list_.numLowerBits);
 535       return true;
 536     }
 537
 538     return setDone();
 539   }
 540
 541   bool skipTo(ValueType value) {
 542     DCHECK_GE(value, value_);
 543     if (value <= value_) {
 544       return true;
 545     } else if (value > lastValue_) {
 546       return setDone();
 547     }
 548
 549     size_t upperValue = (value >> list_.numLowerBits);
 550     size_t upperSkip = upperValue - upper_.value();
 551     // The average density of ones in upper bits is 1/2.
 552     // LIKELY here seems to make things worse, even for small skips.
 553     if (upperSkip < 2 * kLinearScanThreshold) {
 554       do {
 555         upper_.next();
 556       } while (UNLIKELY(upper_.value() < upperValue));
 557     } else {
 558       upper_.skipToNext(upperValue);
 559     }
 560
 561     iterateTo(value);
 562     return true;
 563   }
 564
 565   bool jump(size_t n) {
 566     if (LIKELY(n - 1 < list_.size)) {  // n > 0 && n <= list_.size
 567       progress_ = n;
 568       value_ = readLowerPart(n - 1) | (upper_.jump(n) << list_.numLowerBits);
 569       return true;
 570     } else if (n == 0) {
 571       reset();
 572       return true;
 573     }
 574     return setDone();
 575   }
 576
 577   bool jumpTo(ValueType value) {
 578     if (value <= 0) {
 579       reset();
 580       return true;
 581     } else if (value > lastValue_) {
 582       return setDone();
 583     }
 584
 585     upper_.jumpToNext(value >> list_.numLowerBits);
 586     iterateTo(value);
 587     return true;
 588   }
 589
 590   size_t size() const { return list_.size; }
 591
 592   size_t position() const { return progress_ - 1; }
 593   ValueType value() const { return value_; }
 594
 595  private:
 596   bool setDone() {
 597     value_ = std::numeric_limits<ValueType>::max();
 598     progress_ = list_.size + 1;
 599     return false;
 600   }
 601
 602   ValueType readLowerPart(size_t i) const {
 603     DCHECK_LT(i, list_.size);
 604     const size_t pos = i * list_.numLowerBits;
 605     const unsigned char* ptr = list_.lower.data() + (pos / 8);
 606     const uint64_t ptrv = folly::loadUnaligned<uint64_t>(ptr);
 607     return lowerMask_ & (ptrv >> (pos % 8));
 608   }
 609
 610   void iterateTo(ValueType value) {
 611     while (true) {
 612       value_ = readLowerPart(upper_.position()) |
 613         (upper_.value() << list_.numLowerBits);
 614       if (LIKELY(value_ >= value)) break;
 615       upper_.next();
 616     }
 617     progress_ = upper_.position() + 1;
 618   }
 619
 620   constexpr static size_t kLinearScanThreshold = 8;
 621
 622   const EliasFanoCompressedList list_;
 623   const ValueType lowerMask_;
 624   detail::UpperBitsReader<Encoder, Instructions> upper_;
 625   size_t progress_ = 0;
 626   ValueType value_ = 0;
 627   ValueType lastValue_;
 628 };
 629
 630 }}  // namespaces
 631
 632 #endif  // FOLLY_EXPERIMENTAL_ELIAS_FANO_CODING_H