folly/detail/ChecksumDetail.cpp

   1 /*
   2  * crc32_impl.h
   3  *
   4  * Copyright 2016 Eric Biggers
   5  *
   6  * Permission is hereby granted, free of charge, to any person
   7  * obtaining a copy of this software and associated documentation
   8  * files (the "Software"), to deal in the Software without
   9  * restriction, including without limitation the rights to use,
  10  * copy, modify, merge, publish, distribute, sublicense, and/or sell
  11  * copies of the Software, and to permit persons to whom the
  12  * Software is furnished to do so, subject to the following
  13  * conditions:
  14  *
  15  * The above copyright notice and this permission notice shall be
  16  * included in all copies or substantial portions of the Software.
  17  *
  18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  19  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
  20  * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  21  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
  22  * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
  23  * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  24  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
  25  * OTHER DEALINGS IN THE SOFTWARE.
  26  */
  27
  28 /*
  29  * CRC-32 folding with PCLMULQDQ.
  30  *
  31  * The basic idea is to repeatedly "fold" each 512 bits into the next
  32  * 512 bits, producing an abbreviated message which is congruent the
  33  * original message modulo the generator polynomial G(x).
  34  *
  35  * Folding each 512 bits is implemented as eight 64-bit folds, each of
  36  * which uses one carryless multiplication instruction.  It's expected
  37  * that CPUs may be able to execute some of these multiplications in
  38  * parallel.
  39  *
  40  * Explanation of "folding": let A(x) be 64 bits from the message, and
  41  * let B(x) be 95 bits from a constant distance D later in the
  42  * message.  The relevant portion of the message can be written as:
  43  *
  44  *      M(x) = A(x)*x^D + B(x)
  45  *
  46  * ... where + and * represent addition and multiplication,
  47  * respectively, of polynomials over GF(2).  Note that when
  48  * implemented on a computer, these operations are equivalent to XOR
  49  * and carryless multiplication, respectively.
  50  *
  51  * For the purpose of CRC calculation, only the remainder modulo the
  52  * generator polynomial G(x) matters:
  53  *
  54  * M(x) mod G(x) = (A(x)*x^D + B(x)) mod G(x)
  55  *
  56  * Since the modulo operation can be applied anywhere in a sequence of
  57  * additions and multiplications without affecting the result, this is
  58  * equivalent to:
  59  *
  60  * M(x) mod G(x) = (A(x)*(x^D mod G(x)) + B(x)) mod G(x)
  61  *
  62  * For any D, 'x^D mod G(x)' will be a polynomial with maximum degree
  63  * 31, i.e.  a 32-bit quantity.  So 'A(x) * (x^D mod G(x))' is
  64  * equivalent to a carryless multiplication of a 64-bit quantity by a
  65  * 32-bit quantity, producing a 95-bit product.  Then, adding
  66  * (XOR-ing) the product to B(x) produces a polynomial with the same
  67  * length as B(x) but with the same remainder as 'A(x)*x^D + B(x)'.
  68  * This is the basic fold operation with 64 bits.
  69  *
  70  * Note that the carryless multiplication instruction PCLMULQDQ
  71  * actually takes two 64-bit inputs and produces a 127-bit product in
  72  * the low-order bits of a 128-bit XMM register.  This works fine, but
  73  * care must be taken to account for "bit endianness".  With the CRC
  74  * version implemented here, bits are always ordered such that the
  75  * lowest-order bit represents the coefficient of highest power of x
  76  * and the highest-order bit represents the coefficient of the lowest
  77  * power of x.  This is backwards from the more intuitive order.
  78  * Still, carryless multiplication works essentially the same either
  79  * way.  It just must be accounted for that when we XOR the 95-bit
  80  * product in the low-order 95 bits of a 128-bit XMM register into
  81  * 128-bits of later data held in another XMM register, we'll really
  82  * be XOR-ing the product into the mathematically higher degree end of
  83  * those later bits, not the lower degree end as may be expected.
  84  *
  85  * So given that caveat and the fact that we process 512 bits per
  86  * iteration, the 'D' values we need for the two 64-bit halves of each
  87  * 128 bits of data are:
  88  *
  89  * D = (512 + 95) - 64 for the higher-degree half of each 128
  90  *                 bits, i.e. the lower order bits in
  91  *                 the XMM register
  92  *
  93  *    D = (512 + 95) - 128 for the lower-degree half of each 128
  94  *                 bits, i.e. the higher order bits in
  95  *                 the XMM register
  96  *
  97  * The required 'x^D mod G(x)' values were precomputed.
  98  *
  99  * When <= 512 bits remain in the message, we finish up by folding
 100  * across smaller distances.  This works similarly; the distance D is
 101  * just different, so different constant multipliers must be used.
 102  * Finally, once the remaining message is just 64 bits, it is is
 103  * reduced to the CRC-32 using Barrett reduction (explained later).
 104  *
 105  * For more information see the original paper from Intel: "Fast CRC
 106  *    Computation for Generic Polynomials Using PCLMULQDQ
 107  *    Instruction" December 2009
 108  *    http://www.intel.com/content/dam/www/public/us/en/documents/
 109  *    white-papers/
 110  *    fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
 111  */
 112
 113 #include <folly/detail/ChecksumDetail.h>
 114
 115 namespace folly {
 116 namespace detail {
 117
 118 #if FOLLY_SSE_PREREQ(4, 2)
 119
 120 uint32_t
 121 crc32_hw_aligned(uint32_t remainder, const __m128i* p, size_t vec_count) {
 122   /* Constants precomputed by gen_crc32_multipliers.c.  Do not edit! */
 123   const __m128i multipliers_4 = _mm_set_epi32(0, 0x1D9513D7, 0, 0x8F352D95);
 124   const __m128i multipliers_2 = _mm_set_epi32(0, 0x81256527, 0, 0xF1DA05AA);
 125   const __m128i multipliers_1 = _mm_set_epi32(0, 0xCCAA009E, 0, 0xAE689191);
 126   const __m128i final_multiplier = _mm_set_epi32(0, 0, 0, 0xB8BC6765);
 127   const __m128i mask32 = _mm_set_epi32(0, 0, 0, 0xFFFFFFFF);
 128   const __m128i barrett_reduction_constants =
 129       _mm_set_epi32(0x1, 0xDB710641, 0x1, 0xF7011641);
 130
 131   const __m128i* const end = p + vec_count;
 132   const __m128i* const end512 = p + (vec_count & ~3);
 133   __m128i x0, x1, x2, x3;
 134
 135   /*
 136    * Account for the current 'remainder', i.e. the CRC of the part of
 137    * the message already processed.  Explanation: rewrite the message
 138    * polynomial M(x) in terms of the first part A(x), the second part
 139    * B(x), and the length of the second part in bits |B(x)| >= 32:
 140    *
 141    *    M(x) = A(x)*x^|B(x)| + B(x)
 142    *
 143    * Then the CRC of M(x) is:
 144    *
 145    *    CRC(M(x)) = CRC(A(x)*x^|B(x)| + B(x))
 146    *              = CRC(A(x)*x^32*x^(|B(x)| - 32) + B(x))
 147    *              = CRC(CRC(A(x))*x^(|B(x)| - 32) + B(x))
 148    *
 149    * Note: all arithmetic is modulo G(x), the generator polynomial; that's
 150    * why A(x)*x^32 can be replaced with CRC(A(x)) = A(x)*x^32 mod G(x).
 151    *
 152    * So the CRC of the full message is the CRC of the second part of the
 153    * message where the first 32 bits of the second part of the message
 154    * have been XOR'ed with the CRC of the first part of the message.
 155    */
 156   x0 = *p++;
 157   x0 = _mm_xor_si128(x0, _mm_set_epi32(0, 0, 0, remainder));
 158
 159   if (p > end512) /* only 128, 256, or 384 bits of input? */
 160     goto _128_bits_at_a_time;
 161   x1 = *p++;
 162   x2 = *p++;
 163   x3 = *p++;
 164
 165   /* Fold 512 bits at a time */
 166   for (; p != end512; p += 4) {
 167     __m128i y0, y1, y2, y3;
 168
 169     y0 = p[0];
 170     y1 = p[1];
 171     y2 = p[2];
 172     y3 = p[3];
 173
 174     /*
 175      * Note: the immediate constant for PCLMULQDQ specifies which
 176      * 64-bit halves of the 128-bit vectors to multiply:
 177      *
 178      * 0x00 means low halves (higher degree polynomial terms for us)
 179      * 0x11 means high halves (lower degree polynomial terms for us)
 180      */
 181     y0 = _mm_xor_si128(y0, _mm_clmulepi64_si128(x0, multipliers_4, 0x00));
 182     y1 = _mm_xor_si128(y1, _mm_clmulepi64_si128(x1, multipliers_4, 0x00));
 183     y2 = _mm_xor_si128(y2, _mm_clmulepi64_si128(x2, multipliers_4, 0x00));
 184     y3 = _mm_xor_si128(y3, _mm_clmulepi64_si128(x3, multipliers_4, 0x00));
 185     y0 = _mm_xor_si128(y0, _mm_clmulepi64_si128(x0, multipliers_4, 0x11));
 186     y1 = _mm_xor_si128(y1, _mm_clmulepi64_si128(x1, multipliers_4, 0x11));
 187     y2 = _mm_xor_si128(y2, _mm_clmulepi64_si128(x2, multipliers_4, 0x11));
 188     y3 = _mm_xor_si128(y3, _mm_clmulepi64_si128(x3, multipliers_4, 0x11));
 189
 190     x0 = y0;
 191     x1 = y1;
 192     x2 = y2;
 193     x3 = y3;
 194   }
 195
 196   /* Fold 512 bits => 128 bits */
 197   x2 = _mm_xor_si128(x2, _mm_clmulepi64_si128(x0, multipliers_2, 0x00));
 198   x3 = _mm_xor_si128(x3, _mm_clmulepi64_si128(x1, multipliers_2, 0x00));
 199   x2 = _mm_xor_si128(x2, _mm_clmulepi64_si128(x0, multipliers_2, 0x11));
 200   x3 = _mm_xor_si128(x3, _mm_clmulepi64_si128(x1, multipliers_2, 0x11));
 201   x3 = _mm_xor_si128(x3, _mm_clmulepi64_si128(x2, multipliers_1, 0x00));
 202   x3 = _mm_xor_si128(x3, _mm_clmulepi64_si128(x2, multipliers_1, 0x11));
 203   x0 = x3;
 204
 205 _128_bits_at_a_time:
 206   while (p != end) {
 207     /* Fold 128 bits into next 128 bits */
 208     x1 = *p++;
 209     x1 = _mm_xor_si128(x1, _mm_clmulepi64_si128(x0, multipliers_1, 0x00));
 210     x1 = _mm_xor_si128(x1, _mm_clmulepi64_si128(x0, multipliers_1, 0x11));
 211     x0 = x1;
 212   }
 213
 214   /* Now there are just 128 bits left, stored in 'x0'. */
 215
 216   /*
 217    * Fold 128 => 96 bits.  This also implicitly appends 32 zero bits,
 218    * which is equivalent to multiplying by x^32.  This is needed because
 219    * the CRC is defined as M(x)*x^32 mod G(x), not just M(x) mod G(x).
 220    */
 221   x0 = _mm_xor_si128(_mm_srli_si128(x0, 8), _mm_clmulepi64_si128(x0, multipliers_1, 0x10));
 222
 223   /* Fold 96 => 64 bits */
 224   x0 = _mm_xor_si128(_mm_srli_si128(x0, 4),
 225       _mm_clmulepi64_si128(_mm_and_si128(x0, mask32), final_multiplier, 0x00));
 226
 227   /*
 228    * Finally, reduce 64 => 32 bits using Barrett reduction.
 229    *
 230    * Let M(x) = A(x)*x^32 + B(x) be the remaining message.  The goal is to
 231    * compute R(x) = M(x) mod G(x).  Since degree(B(x)) < degree(G(x)):
 232    *
 233    *    R(x) = (A(x)*x^32 + B(x)) mod G(x)
 234    *         = (A(x)*x^32) mod G(x) + B(x)
 235    *
 236    * Then, by the Division Algorithm there exists a unique q(x) such that:
 237    *
 238    *    A(x)*x^32 mod G(x) = A(x)*x^32 - q(x)*G(x)
 239    *
 240    * Since the left-hand side is of maximum degree 31, the right-hand side
 241    * must be too.  This implies that we can apply 'mod x^32' to the
 242    * right-hand side without changing its value:
 243    *
 244    *    (A(x)*x^32 - q(x)*G(x)) mod x^32 = q(x)*G(x) mod x^32
 245    *
 246    * Note that '+' is equivalent to '-' in polynomials over GF(2).
 247    *
 248    * We also know that:
 249    *
 250    *                  / A(x)*x^32 \
 251    *    q(x) = floor (  ---------  )
 252    *                  \    G(x)   /
 253    *
 254    * To compute this efficiently, we can multiply the top and bottom by
 255    * x^32 and move the division by G(x) to the top:
 256    *
 257    *                  / A(x) * floor(x^64 / G(x)) \
 258    *    q(x) = floor (  -------------------------  )
 259    *                  \           x^32            /
 260    *
 261    * Note that floor(x^64 / G(x)) is a constant.
 262    *
 263    * So finally we have:
 264    *
 265    *                              / A(x) * floor(x^64 / G(x)) \
 266    *    R(x) = B(x) + G(x)*floor (  -------------------------  )
 267    *                              \           x^32            /
 268    */
 269   x1 = x0;
 270   x0 = _mm_clmulepi64_si128(_mm_and_si128(x0, mask32), barrett_reduction_constants, 0x00);
 271   x0 = _mm_clmulepi64_si128(_mm_and_si128(x0, mask32), barrett_reduction_constants, 0x10);
 272   return _mm_cvtsi128_si32(_mm_srli_si128(_mm_xor_si128(x0, x1), 4));
 273 }
 274
 275 #endif
 276 }
 277 } // namespace