From: Dave Watson
Date: Mon, 8 May 2017 16:44:17 +0000 (-0700)
Subject: Add hardware crc impl
X-Git-Tag: v2017.05.15.00~25
X-Git-Url: http://plrg.eecs.uci.edu/git/?p=folly.git;a=commitdiff_plain;h=d9acfc9e4d3f94c3470df9b45c7f0b6c172e8c23

Add hardware crc impl

Summary: A faster crc32 impl for folly. Similar to crc32c, except Intel
doesn't provide crc32 directly in hardware - instead, pclmul can be used,
which is ~2x slower than crc32c but still ~5-10x faster than a software
implementation.

Reviewed By: Orvid, yfeldblum

Differential Revision: D4994761

fbshipit-source-id: ad8ba856649eea6dc7b541d561329ff7d7fe2d60
---

diff --git a/folly/Checksum.cpp b/folly/Checksum.cpp
index 6ec07893..cbc99913 100644
--- a/folly/Checksum.cpp
+++ b/folly/Checksum.cpp
@@ -15,10 +15,11 @@
  */

 #include <folly/Checksum.h>
-#include <algorithm>
-#include <stdexcept>
 #include <boost/crc.hpp>
 #include <folly/CpuId.h>
+#include <folly/detail/ChecksumDetail.h>
+#include <algorithm>
+#include <stdexcept>

 #if FOLLY_X64 && (__SSE4_2__ || defined(__clang__) || __GNUC_PREREQ(4, 9))
 #include <nmmintrin.h>
@@ -28,6 +29,8 @@ namespace folly {

 namespace detail {

+uint32_t
+crc32c_sw(const uint8_t* data, size_t nbytes, uint32_t startingChecksum);
 #if FOLLY_X64 && (__SSE4_2__ || defined(__clang__) || __GNUC_PREREQ(4, 9))

 // Fast SIMD implementation of CRC-32C for x86 with SSE 4.2
@@ -64,11 +67,43 @@ uint32_t crc32c_hw(const uint8_t *data, size_t nbytes,
   return sum;
 }

+uint32_t
+crc32_sw(const uint8_t* data, size_t nbytes, uint32_t startingChecksum);
+
+// Fast SIMD implementation of CRC-32 for x86 with pclmul
+uint32_t
+crc32_hw(const uint8_t* data, size_t nbytes, uint32_t startingChecksum) {
+  uint32_t sum = startingChecksum;
+  size_t offset = 0;
+
+  // Process unaligned bytes
+  if ((uintptr_t)data & 15) {
+    size_t limit = std::min(nbytes, -(uintptr_t)data & 15);
+    sum = crc32_sw(data, limit, sum);
+    offset += limit;
+    nbytes -= limit;
+  }
+
+  if (nbytes >= 16) {
+    sum = crc32_hw_aligned(sum, (const __m128i*)(data + offset), nbytes / 16);
+    offset += nbytes & ~15;
+    nbytes &= 15;
+  }
+
+  // Remaining unaligned bytes
+  return crc32_sw(data + offset, nbytes, sum);
+}
+
 bool crc32c_hw_supported() {
   static folly::CpuId id;
   return id.sse42();
 }

+bool crc32_hw_supported() {
+  static folly::CpuId id;
+  return id.sse42();
+}
+
 #else

 uint32_t crc32c_hw(const uint8_t *data, size_t nbytes,
@@ -80,11 +115,13 @@ bool crc32c_hw_supported() {
   return false;
 }

+bool crc32_hw_supported() {
+  return false;
+}
 #endif

-uint32_t crc32c_sw(const uint8_t *data, size_t nbytes,
-                   uint32_t startingChecksum) {
-
+template <uint32_t CRC_POLYNOMIAL>
+uint32_t crc_sw(const uint8_t* data, size_t nbytes, uint32_t startingChecksum) {
   // Reverse the bits in the starting checksum so they'll be in the
   // right internal format for Boost's CRC engine.
   // O(1)-time, branchless bit reversal algorithm from
@@ -100,13 +137,24 @@ uint32_t crc32c_sw(const uint8_t *data, size_t nbytes,
   startingChecksum = (startingChecksum >> 16) | (startingChecksum << 16);

-  static const uint32_t CRC32C_POLYNOMIAL = 0x1EDC6F41;
-  boost::crc_optimal<32, CRC32C_POLYNOMIAL, ~0U, 0, true, true> sum(
+  boost::crc_optimal<32, CRC_POLYNOMIAL, ~0U, 0, true, true> sum(
       startingChecksum);
   sum.process_bytes(data, nbytes);
   return sum.checksum();
 }

+uint32_t
+crc32c_sw(const uint8_t* data, size_t nbytes, uint32_t startingChecksum) {
+  constexpr uint32_t CRC32C_POLYNOMIAL = 0x1EDC6F41;
+  return crc_sw<CRC32C_POLYNOMIAL>(data, nbytes, startingChecksum);
+}
+
+uint32_t
+crc32_sw(const uint8_t* data, size_t nbytes, uint32_t startingChecksum) {
+  constexpr uint32_t CRC32_POLYNOMIAL = 0x04C11DB7;
+  return crc_sw<CRC32_POLYNOMIAL>(data, nbytes, startingChecksum);
+}
+
 } // folly::detail

 uint32_t crc32c(const uint8_t *data, size_t nbytes,
@@ -118,4 +166,12 @@ uint32_t crc32c(const uint8_t *data, size_t nbytes,
   }
 }

+uint32_t crc32(const uint8_t* data, size_t nbytes, uint32_t startingChecksum) {
+  if (detail::crc32_hw_supported()) {
+    return detail::crc32_hw(data, nbytes, startingChecksum);
+  } else {
+    return detail::crc32_sw(data, nbytes, startingChecksum);
+  }
+}
+
 } // folly

diff --git a/folly/Checksum.h b/folly/Checksum.h
index 24ce9a47..642ad258 100644
--- a/folly/Checksum.h
+++ b/folly/Checksum.h
@@ -37,4 +37,12 @@ namespace folly {
 uint32_t crc32c(const uint8_t* data, size_t nbytes,
     uint32_t startingChecksum = ~0U);

+/**
+ * Compute the CRC-32 checksum of a buffer, using a hardware-accelerated
+ * implementation if available or a portable software implementation as
+ * a default.
+ */
+uint32_t
+crc32(const uint8_t* data, size_t nbytes, uint32_t startingChecksum = ~0U);
+
 } // folly

diff --git a/folly/Makefile.am b/folly/Makefile.am
index e4532573..52ae0d49 100644
--- a/folly/Makefile.am
+++ b/folly/Makefile.am
@@ -418,6 +418,7 @@ GroupVarintTables.cpp: build/generate_varint_tables.py
 CLEANFILES += GroupVarintTables.cpp

 libfollybasesse42_la_SOURCES = \
+	detail/ChecksumDetail.cpp \
 	detail/RangeSse42.cpp

 libfollybase_la_SOURCES = \
@@ -613,7 +614,7 @@ libfolly_la_SOURCES += \
 endif

 libfollybasesse42_la_LDFLAGS = $(AM_LDFLAGS) -version-info $(LT_VERSION)
-libfollybasesse42_la_CXXFLAGS = -msse4.2
+libfollybasesse42_la_CXXFLAGS = -msse4.2 -mpclmul

 libfollybase_la_LIBADD = libfollybasesse42.la
 libfollybase_la_LDFLAGS = $(AM_LDFLAGS) -version-info $(LT_VERSION)

diff --git a/folly/detail/ChecksumDetail.cpp b/folly/detail/ChecksumDetail.cpp
new file mode 100644
index 00000000..5128c1a4
--- /dev/null
+++ b/folly/detail/ChecksumDetail.cpp
@@ -0,0 +1,273 @@
+/*
+ * crc32_impl.h
+ *
+ * Copyright 2016 Eric Biggers
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * CRC-32 folding with PCLMULQDQ.
+ *
+ * The basic idea is to repeatedly "fold" each 512 bits into the next
+ * 512 bits, producing an abbreviated message which is congruent to the
+ * original message modulo the generator polynomial G(x).
+ *
+ * Folding each 512 bits is implemented as eight 64-bit folds, each of
+ * which uses one carryless multiplication instruction. It's expected
+ * that CPUs may be able to execute some of these multiplications in
+ * parallel.
+ *
+ * Explanation of "folding": let A(x) be 64 bits from the message, and
+ * let B(x) be 95 bits from a constant distance D later in the
+ * message. The relevant portion of the message can be written as:
+ *
+ *    M(x) = A(x)*x^D + B(x)
+ *
+ * ... where + and * represent addition and multiplication,
+ * respectively, of polynomials over GF(2). Note that when
+ * implemented on a computer, these operations are equivalent to XOR
+ * and carryless multiplication, respectively.
+ *
+ * For the purpose of CRC calculation, only the remainder modulo the
+ * generator polynomial G(x) matters:
+ *
+ *    M(x) mod G(x) = (A(x)*x^D + B(x)) mod G(x)
+ *
+ * Since the modulo operation can be applied anywhere in a sequence of
+ * additions and multiplications without affecting the result, this is
+ * equivalent to:
+ *
+ *    M(x) mod G(x) = (A(x)*(x^D mod G(x)) + B(x)) mod G(x)
+ *
+ * For any D, 'x^D mod G(x)' will be a polynomial with maximum degree
+ * 31, i.e. a 32-bit quantity. So 'A(x) * (x^D mod G(x))' is
+ * equivalent to a carryless multiplication of a 64-bit quantity by a
+ * 32-bit quantity, producing a 95-bit product. Then, adding
+ * (XOR-ing) the product to B(x) produces a polynomial with the same
+ * length as B(x) but with the same remainder as 'A(x)*x^D + B(x)'.
+ * This is the basic fold operation with 64 bits.
+ *
+ * Note that the carryless multiplication instruction PCLMULQDQ
+ * actually takes two 64-bit inputs and produces a 127-bit product in
+ * the low-order bits of a 128-bit XMM register. This works fine, but
+ * care must be taken to account for "bit endianness". With the CRC
+ * version implemented here, bits are always ordered such that the
+ * lowest-order bit represents the coefficient of the highest power of x
+ * and the highest-order bit represents the coefficient of the lowest
+ * power of x. This is backwards from the more intuitive order.
+ * Still, carryless multiplication works essentially the same either
+ * way. It just must be accounted for that when we XOR the 95-bit
+ * product in the low-order 95 bits of a 128-bit XMM register into
+ * 128 bits of later data held in another XMM register, we'll really
+ * be XOR-ing the product into the mathematically higher degree end of
+ * those later bits, not the lower degree end as may be expected.
+ *
+ * So given that caveat and the fact that we process 512 bits per
+ * iteration, the 'D' values we need for the two 64-bit halves of each
+ * 128 bits of data are:
+ *
+ *    D = (512 + 95) - 64     for the higher-degree half of each 128
+ *                            bits, i.e. the lower order bits in
+ *                            the XMM register
+ *
+ *    D = (512 + 95) - 128    for the lower-degree half of each 128
+ *                            bits, i.e. the higher order bits in
+ *                            the XMM register
+ *
+ * The required 'x^D mod G(x)' values were precomputed.
+ *
+ * When <= 512 bits remain in the message, we finish up by folding
+ * across smaller distances. This works similarly; the distance D is
+ * just different, so different constant multipliers must be used.
+ * Finally, once the remaining message is just 64 bits, it is
+ * reduced to the CRC-32 using Barrett reduction (explained later).
+ *
+ * For more information see the original paper from Intel: "Fast CRC
+ * Computation for Generic Polynomials Using PCLMULQDQ
+ * Instruction", December 2009
+ * http://www.intel.com/content/dam/www/public/us/en/documents/
+ * white-papers/
+ * fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
+ */
+
+#include <folly/detail/ChecksumDetail.h>
+
+namespace folly {
+namespace detail {
+
+uint32_t
+crc32_hw_aligned(uint32_t remainder, const __m128i* p, size_t vec_count) {
+  /* Constants precomputed by gen_crc32_multipliers.c. Do not edit! */
+  const __m128i multipliers_4 = _mm_set_epi32(0, 0x1D9513D7, 0, 0x8F352D95);
+  const __m128i multipliers_2 = _mm_set_epi32(0, 0x81256527, 0, 0xF1DA05AA);
+  const __m128i multipliers_1 = _mm_set_epi32(0, 0xCCAA009E, 0, 0xAE689191);
+  const __m128i final_multiplier = _mm_set_epi32(0, 0, 0, 0xB8BC6765);
+  const __m128i mask32 = _mm_set_epi32(0, 0, 0, 0xFFFFFFFF);
+  const __m128i barrett_reduction_constants =
+      _mm_set_epi32(0x1, 0xDB710641, 0x1, 0xF7011641);
+
+  const __m128i* const end = p + vec_count;
+  const __m128i* const end512 = p + (vec_count & ~3);
+  __m128i x0, x1, x2, x3;
+
+  /*
+   * Account for the current 'remainder', i.e. the CRC of the part of
+   * the message already processed. Explanation: rewrite the message
+   * polynomial M(x) in terms of the first part A(x), the second part
+   * B(x), and the length of the second part in bits |B(x)| >= 32:
+   *
+   *    M(x) = A(x)*x^|B(x)| + B(x)
+   *
+   * Then the CRC of M(x) is:
+   *
+   *    CRC(M(x)) = CRC(A(x)*x^|B(x)| + B(x))
+   *              = CRC(A(x)*x^32*x^(|B(x)| - 32) + B(x))
+   *              = CRC(CRC(A(x))*x^(|B(x)| - 32) + B(x))
+   *
+   * Note: all arithmetic is modulo G(x), the generator polynomial; that's
+   * why A(x)*x^32 can be replaced with CRC(A(x)) = A(x)*x^32 mod G(x).
+   *
+   * So the CRC of the full message is the CRC of the second part of the
+   * message where the first 32 bits of the second part of the message
+   * have been XOR'ed with the CRC of the first part of the message.
+   */
+  x0 = *p++;
+  x0 ^= _mm_set_epi32(0, 0, 0, remainder);
+
+  if (p > end512) /* only 128, 256, or 384 bits of input? */
+    goto _128_bits_at_a_time;
+  x1 = *p++;
+  x2 = *p++;
+  x3 = *p++;
+
+  /* Fold 512 bits at a time */
+  for (; p != end512; p += 4) {
+    __m128i y0, y1, y2, y3;
+
+    y0 = p[0];
+    y1 = p[1];
+    y2 = p[2];
+    y3 = p[3];
+
+    /*
+     * Note: the immediate constant for PCLMULQDQ specifies which
+     * 64-bit halves of the 128-bit vectors to multiply:
+     *
+     * 0x00 means low halves (higher degree polynomial terms for us)
+     * 0x11 means high halves (lower degree polynomial terms for us)
+     */
+    y0 ^= _mm_clmulepi64_si128(x0, multipliers_4, 0x00);
+    y1 ^= _mm_clmulepi64_si128(x1, multipliers_4, 0x00);
+    y2 ^= _mm_clmulepi64_si128(x2, multipliers_4, 0x00);
+    y3 ^= _mm_clmulepi64_si128(x3, multipliers_4, 0x00);
+    y0 ^= _mm_clmulepi64_si128(x0, multipliers_4, 0x11);
+    y1 ^= _mm_clmulepi64_si128(x1, multipliers_4, 0x11);
+    y2 ^= _mm_clmulepi64_si128(x2, multipliers_4, 0x11);
+    y3 ^= _mm_clmulepi64_si128(x3, multipliers_4, 0x11);
+
+    x0 = y0;
+    x1 = y1;
+    x2 = y2;
+    x3 = y3;
+  }
+
+  /* Fold 512 bits => 128 bits */
+  x2 ^= _mm_clmulepi64_si128(x0, multipliers_2, 0x00);
+  x3 ^= _mm_clmulepi64_si128(x1, multipliers_2, 0x00);
+  x2 ^= _mm_clmulepi64_si128(x0, multipliers_2, 0x11);
+  x3 ^= _mm_clmulepi64_si128(x1, multipliers_2, 0x11);
+  x3 ^= _mm_clmulepi64_si128(x2, multipliers_1, 0x00);
+  x3 ^= _mm_clmulepi64_si128(x2, multipliers_1, 0x11);
+  x0 = x3;
+
+_128_bits_at_a_time:
+  while (p != end) {
+    /* Fold 128 bits into next 128 bits */
+    x1 = *p++;
+    x1 ^= _mm_clmulepi64_si128(x0, multipliers_1, 0x00);
+    x1 ^= _mm_clmulepi64_si128(x0, multipliers_1, 0x11);
+    x0 = x1;
+  }
+
+  /* Now there are just 128 bits left, stored in 'x0'. */
+
+  /*
+   * Fold 128 => 96 bits. This also implicitly appends 32 zero bits,
+   * which is equivalent to multiplying by x^32. This is needed because
+   * the CRC is defined as M(x)*x^32 mod G(x), not just M(x) mod G(x).
+   */
+  x0 = _mm_srli_si128(x0, 8) ^ _mm_clmulepi64_si128(x0, multipliers_1, 0x10);
+
+  /* Fold 96 => 64 bits */
+  x0 = _mm_srli_si128(x0, 4) ^
+      _mm_clmulepi64_si128(x0 & mask32, final_multiplier, 0x00);
+
+  /*
+   * Finally, reduce 64 => 32 bits using Barrett reduction.
+   *
+   * Let M(x) = A(x)*x^32 + B(x) be the remaining message. The goal is to
+   * compute R(x) = M(x) mod G(x). Since degree(B(x)) < degree(G(x)):
+   *
+   *    R(x) = (A(x)*x^32 + B(x)) mod G(x)
+   *         = (A(x)*x^32) mod G(x) + B(x)
+   *
+   * Then, by the Division Algorithm there exists a unique q(x) such that:
+   *
+   *    A(x)*x^32 mod G(x) = A(x)*x^32 - q(x)*G(x)
+   *
+   * Since the left-hand side is of maximum degree 31, the right-hand side
+   * must be too. This implies that we can apply 'mod x^32' to the
+   * right-hand side without changing its value:
+   *
+   *    (A(x)*x^32 - q(x)*G(x)) mod x^32 = q(x)*G(x) mod x^32
+   *
+   * Note that '+' is equivalent to '-' in polynomials over GF(2).
+   *
+   * We also know that:
+   *
+   *                 / A(x)*x^32 \
+   *    q(x) = floor ( --------- )
+   *                 \   G(x)    /
+   *
+   * To compute this efficiently, we can multiply the top and bottom by
+   * x^32 and move the division by G(x) to the top:
+   *
+   *                 / A(x) * floor(x^64 / G(x)) \
+   *    q(x) = floor ( ------------------------- )
+   *                 \            x^32           /
+   *
+   * Note that floor(x^64 / G(x)) is a constant.
+   *
+   * So finally we have:
+   *
+   *                              / A(x) * floor(x^64 / G(x)) \
+   *    R(x) = B(x) + G(x)*floor ( -------------------------  )
+   *                              \            x^32           /
+   */
+  x1 = x0;
+  x0 = _mm_clmulepi64_si128(x0 & mask32, barrett_reduction_constants, 0x00);
+  x0 = _mm_clmulepi64_si128(x0 & mask32, barrett_reduction_constants, 0x10);
+  return _mm_cvtsi128_si32(_mm_srli_si128(x0 ^ x1, 4));
+}
+} // namespace detail
+} // namespace folly

diff --git a/folly/detail/ChecksumDetail.h b/folly/detail/ChecksumDetail.h
index c018c1c8..97ad4519 100644
--- a/folly/detail/ChecksumDetail.h
+++ b/folly/detail/ChecksumDetail.h
@@ -16,6 +16,10 @@

 #pragma once

+#include <immintrin.h>
+#include <stddef.h>
+#include <stdint.h>
+
 namespace folly {
 namespace detail {

 /**
@@ -51,5 +55,39 @@ bool crc32c_hw_supported();
 uint32_t crc32c_sw(const uint8_t* data, size_t nbytes,
     uint32_t startingChecksum = ~0U);

+/**
+ * Compute a CRC-32 checksum of a buffer using a hardware-accelerated
+ * implementation.
+ *
+ * @note This function is exposed to support special cases where the
+ * calling code is absolutely certain it ought to invoke a hardware-
+ * accelerated CRC-32 implementation - unit tests, for example. For
+ * all other scenarios, please call crc32() and let it pick an
+ * implementation based on the capabilities of the underlying CPU.
+ */
+uint32_t
+crc32_hw(const uint8_t* data, size_t nbytes, uint32_t startingChecksum = ~0U);
+
+uint32_t
+crc32_hw_aligned(uint32_t remainder, const __m128i* p, size_t vec_count);
+
+/**
+ * Check whether a hardware-accelerated CRC-32 implementation is
+ * supported on the current CPU.
+ */
+bool crc32_hw_supported();
+
+/**
+ * Compute a CRC-32 checksum of a buffer using a portable,
+ * software-only implementation.
+ *
+ * @note This function is exposed to support special cases where the
+ * calling code is absolutely certain it wants to use the software
+ * implementation instead of the hardware-accelerated code - unit
+ * tests, for example. For all other scenarios, please call crc32()
+ * and let it pick an implementation based on the capabilities of
+ * the underlying CPU.
+ */
+uint32_t
+crc32_sw(const uint8_t* data, size_t nbytes, uint32_t startingChecksum = ~0U);
 }} // folly::detail

diff --git a/folly/test/ChecksumTest.cpp b/folly/test/ChecksumTest.cpp
index 0c4747d7..4c5039a2 100644
--- a/folly/test/ChecksumTest.cpp
+++ b/folly/test/ChecksumTest.cpp
@@ -126,6 +126,49 @@ TEST(Checksum, crc32c_continuation_autodetect) {
   testCRC32CContinuation(folly::crc32c);
 }

+TEST(Checksum, crc32) {
+  if (folly::detail::crc32_hw_supported()) {
+    // Just check that sw and hw match
+    for (auto expected : expectedResults) {
+      uint32_t sw_res =
+          folly::detail::crc32_sw(buffer + expected.offset, expected.length, 0);
+      uint32_t hw_res =
+          folly::detail::crc32_hw(buffer + expected.offset, expected.length, 0);
+      EXPECT_EQ(sw_res, hw_res);
+    }
+  } else {
+    LOG(WARNING) << "skipping hardware-accelerated CRC-32 tests"
+                 << " (not supported on this CPU)";
+  }
+}
+
+TEST(Checksum, crc32_continuation) {
+  if (folly::detail::crc32_hw_supported()) {
+    // Just check that sw and hw match
+    for (auto expected : expectedResults) {
+      auto halflen = expected.length / 2;
+      uint32_t sw_res =
+          folly::detail::crc32_sw(buffer + expected.offset, halflen, 0);
+      sw_res = folly::detail::crc32_sw(
+          buffer + expected.offset + halflen, halflen, sw_res);
+      uint32_t hw_res =
+          folly::detail::crc32_hw(buffer + expected.offset, halflen, 0);
+      hw_res = folly::detail::crc32_hw(
+          buffer + expected.offset + halflen, halflen, hw_res);
+      EXPECT_EQ(sw_res, hw_res);
+      uint32_t sw_res2 =
+          folly::detail::crc32_sw(buffer + expected.offset, halflen * 2, 0);
+      EXPECT_EQ(sw_res, sw_res2);
+      uint32_t hw_res2 =
+          folly::detail::crc32_hw(buffer + expected.offset, halflen * 2, 0);
+      EXPECT_EQ(hw_res, hw_res2);
+    }
+  } else {
+    LOG(WARNING) << "skipping hardware-accelerated CRC-32 tests"
+                 << " (not supported on this CPU)";
+  }
+}
+
 void benchmarkHardwareCRC32C(unsigned long iters, size_t blockSize) {
   if (folly::detail::crc32c_hw_supported()) {
     uint32_t checksum;
@@ -147,6 +190,27 @@ void benchmarkSoftwareCRC32C(unsigned long iters, size_t blockSize) {
   }
 }

+void benchmarkHardwareCRC32(unsigned long iters, size_t blockSize) {
+  if (folly::detail::crc32_hw_supported()) {
+    uint32_t checksum;
+    for (unsigned long i = 0; i < iters; i++) {
+      checksum = folly::detail::crc32_hw(buffer, blockSize);
+      folly::doNotOptimizeAway(checksum);
+    }
+  } else {
+    LOG(WARNING) << "skipping hardware-accelerated CRC-32 benchmarks"
+                 << " (not supported on this CPU)";
+  }
+}
+
+void benchmarkSoftwareCRC32(unsigned long iters, size_t blockSize) {
+  uint32_t checksum;
+  for (unsigned long i = 0; i < iters; i++) {
+    checksum = folly::detail::crc32_sw(buffer, blockSize);
+    folly::doNotOptimizeAway(checksum);
+  }
+}
+
 // This test fits easily in the L1 cache on modern server processors,
 // and thus it mainly measures the speed of the checksum computation.
 BENCHMARK(crc32c_hardware_1KB_block, iters) {
@@ -157,6 +221,14 @@ BENCHMARK(crc32c_software_1KB_block, iters) {
   benchmarkSoftwareCRC32C(iters, 1024);
 }

+BENCHMARK(crc32_hardware_1KB_block, iters) {
+  benchmarkHardwareCRC32(iters, 1024);
+}
+
+BENCHMARK(crc32_software_1KB_block, iters) {
+  benchmarkSoftwareCRC32(iters, 1024);
+}
+
 BENCHMARK_DRAW_LINE();

 // This test is too big for the L1 cache but fits in L2
@@ -168,6 +240,14 @@ BENCHMARK(crc32c_software_64KB_block, iters) {
   benchmarkSoftwareCRC32C(iters, 64 * 1024);
 }

+BENCHMARK(crc32_hardware_64KB_block, iters) {
+  benchmarkHardwareCRC32(iters, 64 * 1024);
+}
+
+BENCHMARK(crc32_software_64KB_block, iters) {
+  benchmarkSoftwareCRC32(iters, 64 * 1024);
+}
+
 BENCHMARK_DRAW_LINE();

 // This test is too big for the L2 cache but fits in L3
@@ -179,6 +259,13 @@ BENCHMARK(crc32c_software_512KB_block, iters) {
   benchmarkSoftwareCRC32C(iters, 512 * 1024);
 }

+BENCHMARK(crc32_hardware_512KB_block, iters) {
+  benchmarkHardwareCRC32(iters, 512 * 1024);
+}
+
+BENCHMARK(crc32_software_512KB_block, iters) {
+  benchmarkSoftwareCRC32(iters, 512 * 1024);
+}
+
 int main(int argc, char** argv) {
   testing::InitGoogleTest(&argc, argv);
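
Usage sketch (not part of the commit): the new folly::crc32() mirrors the
existing folly::crc32c() calling convention, including the continuation idiom
exercised by the crc32_continuation test above, where the previous return
value is passed back in as startingChecksum. A minimal caller, assuming only
the public folly/Checksum.h header; the buffer contents are illustrative:

#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

#include <folly/Checksum.h>

int main() {
  // Illustrative data only.
  std::vector<uint8_t> buf(4096);
  for (size_t i = 0; i < buf.size(); i++) {
    buf[i] = static_cast<uint8_t>(i * 31);
  }

  // One-shot checksum; crc32() selects the pclmul path at runtime when
  // the CPU supports it, and the software path otherwise.
  uint32_t whole = folly::crc32(buf.data(), buf.size());

  // Two-part continuation: feed the first half's result back in as
  // startingChecksum for the second half.
  uint32_t part = folly::crc32(buf.data(), buf.size() / 2);
  part = folly::crc32(buf.data() + buf.size() / 2, buf.size() / 2, part);

  // The two must agree regardless of which implementation ran.
  std::printf("one-shot %08x, continuation %08x\n",
              (unsigned)whole, (unsigned)part);
  return whole == part ? 0 : 1;
}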
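
The folding identity at the heart of ChecksumDetail.cpp, M(x) mod G(x) =
(A(x)*(x^D mod G(x)) + B(x)) mod G(x), can be checked without any SIMD. The
sketch below is editorial, not part of the commit: it models GF(2) polynomials
as plain integers (bit i = coefficient of x^i), picks an illustrative 32-bit A
and B with D = 32, and verifies that folding A through the precomputed
constant x^D mod G(x) preserves the remainder. It checks the algebra only;
the committed pclmul code additionally works in the bit-reflected domain
described in its comments.

#include <cstdint>
#include <cstdio>

// Carryless (GF(2)) multiply of two 32-bit polynomials -> 64-bit product.
// This is the scalar analogue of one PCLMULQDQ half-multiply.
static uint64_t clmul32(uint32_t a, uint32_t b) {
  uint64_t r = 0;
  for (int i = 0; i < 32; i++) {
    if ((b >> i) & 1) {
      r ^= static_cast<uint64_t>(a) << i;
    }
  }
  return r;
}

// Remainder of a 64-bit polynomial modulo x^32 + g, where g holds the
// low 32 coefficients of the degree-32 generator.
static uint32_t polymod(uint64_t m, uint32_t g) {
  for (int i = 63; i >= 32; i--) {
    if ((m >> i) & 1) {
      m ^= (static_cast<uint64_t>(g) << (i - 32)) |
           (static_cast<uint64_t>(1) << i);
    }
  }
  return static_cast<uint32_t>(m);
}

int main() {
  const uint32_t G = 0x04C11DB7; // CRC-32 generator, x^32 term implicit
  const uint32_t A = 0xDEADBEEF; // "earlier" 32 bits of the message
  const uint32_t B = 0x12345678; // "later" 32 bits of the message
  const int D = 32;              // distance; B is exactly 32 bits here

  // Direct reduction of M(x) = A(x)*x^D + B(x).
  uint32_t direct = polymod((static_cast<uint64_t>(A) << D) ^ B, G);

  // Folded: multiply A by the precomputed constant x^D mod G(x) first,
  // exactly as the pclmul code multiplies by its 'multipliers_*' values.
  uint32_t xD_mod_G = polymod(static_cast<uint64_t>(1) << D, G);
  uint32_t folded = polymod(clmul32(A, xD_mod_G) ^ B, G);

  std::printf("direct %08x, folded %08x\n",
              (unsigned)direct, (unsigned)folded); // expected: equal
  return direct == folded ? 0 : 1;
}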