Fix GlogFormatterTest on Windows

[folly.git] / folly / Checksum.cpp
diff --git a/folly/Checksum.cpp b/folly/Checksum.cpp

index d8daf66fcd11bd55efebb48403a2b7cba047d9ab..4e5f7768c1979c8d4b858cd219fd237cc57b6e94 100644 (file)
--- a/folly/Checksum.cpp
+++ b/folly/Checksum.cpp
@@ -1,5 +1,5 @@
  /*
- * Copyright 2015 Facebook, Inc.
+ * Copyright 2017 Facebook, Inc.
   *
   * Licensed under the Apache License, Version 2.0 (the "License");
   * you may not use this file except in compliance with the License.
@@ -15,27 +15,26 @@
   */
  
  #include <folly/Checksum.h>
-#include <algorithm>
-#include <stdexcept>
  #include <boost/crc.hpp>
  #include <folly/CpuId.h>
+#include <folly/detail/ChecksumDetail.h>
+#include <algorithm>
+#include <stdexcept>
+
+#if FOLLY_SSE_PREREQ(4, 2)
+#include <nmmintrin.h>
+#endif
  
  namespace folly {
  
  namespace detail {
  
-#ifndef __has_builtin
-  /* nolint */
-  #define __has_builtin(x) 0
-#endif
-
-#if __SSE4_2__ && \
-    ((__has_builtin(__builtin_ia32_crc32qi) && \
-     __has_builtin(__builtin_ia32_crc32di)) || \
-    (FOLLY_X64 && defined(__GNUC__) && defined(__GNUC_MINOR__) && \
-     (((__GNUC__ * 100) + __GNUC_MINOR__) >= 407)))
+uint32_t
+crc32c_sw(const uint8_t* data, size_t nbytes, uint32_t startingChecksum);
+#if FOLLY_SSE_PREREQ(4, 2)
  
  // Fast SIMD implementation of CRC-32C for x86 with SSE 4.2
+FOLLY_TARGET_ATTRIBUTE("sse4.2")
  uint32_t crc32c_hw(const uint8_t *data, size_t nbytes,
      uint32_t startingChecksum) {
    uint32_t sum = startingChecksum;
@@ -48,7 +47,7 @@ uint32_t crc32c_hw(const uint8_t *data, size_t nbytes,
    if (mask != 0) {
      size_t limit = std::min(nbytes, sizeof(uint64_t) - mask);
      while (offset < limit) {
-      sum = (uint32_t)__builtin_ia32_crc32qi(sum, data[offset]);
+      sum = (uint32_t)_mm_crc32_u8(sum, data[offset]);
        offset++;
      }
    }
@@ -56,23 +55,55 @@ uint32_t crc32c_hw(const uint8_t *data, size_t nbytes,
    // Process 8 bytes at a time until we have fewer than 8 bytes left.
    while (offset + sizeof(uint64_t) <= nbytes) {
      const uint64_t* src = (const uint64_t*)(data + offset);
-    sum = __builtin_ia32_crc32di(sum, *src);
+    sum = uint32_t(_mm_crc32_u64(sum, *src));
      offset += sizeof(uint64_t);
    }
  
    // Process any bytes remaining after the last aligned 8-byte block.
    while (offset < nbytes) {
-    sum = (uint32_t)__builtin_ia32_crc32qi(sum, data[offset]);
+    sum = (uint32_t)_mm_crc32_u8(sum, data[offset]);
      offset++;
    }
    return sum;
  }
  
+uint32_t
+crc32_sw(const uint8_t* data, size_t nbytes, uint32_t startingChecksum);
+
+// Fast SIMD implementation of CRC-32 for x86 with pclmul; unaligned head and tail bytes fall back to crc32_sw
+uint32_t
+crc32_hw(const uint8_t* data, size_t nbytes, uint32_t startingChecksum) {
+  uint32_t sum = startingChecksum;
+  size_t offset = 0; // bytes of data already folded into sum
+
+  // Head: checksum the bytes before the first 16-byte boundary in software
+  if ((uintptr_t)data & 15) {
+    size_t limit = std::min(nbytes, -(uintptr_t)data & 15); // gap to the boundary, capped at nbytes
+    sum = crc32_sw(data, limit, sum);
+    offset += limit;
+    nbytes -= limit;
+  }
+
+  if (nbytes >= 16) { // whole 16-byte blocks go to crc32_hw_aligned, which is given the block count
+    sum = crc32_hw_aligned(sum, (const __m128i*)(data + offset), nbytes / 16);
+    offset += nbytes & ~15; // skip the bytes covered by whole blocks
+    nbytes &= 15; // leftover tail length
+  }
+
+  // Tail: fewer than 16 bytes remain here; checksum them in software
+  return crc32_sw(data + offset, nbytes, sum);
+}
+
  bool crc32c_hw_supported() { // returns the sse42() flag reported by folly::CpuId
    static folly::CpuId id; // function-local static: constructed once, on first call
    return id.sse42();
  }
  
+// Reports whether crc32_hw() may be used on the running CPU.
+// crc32_hw() is described as needing pclmul, which is a separate CPUID
+// feature bit from SSE 4.2, so both are required here; when either is
+// missing, callers fall back to the software implementation.
+// NOTE(review): assumes folly::CpuId exposes pclmuldq() -- confirm.
+bool crc32_hw_supported() {
+  static folly::CpuId id; // constructed once, on first call
+  return id.sse42() && id.pclmuldq();
+}
+
  #else
  
  uint32_t crc32c_hw(const uint8_t *data, size_t nbytes,
@@ -80,15 +111,22 @@ uint32_t crc32c_hw(const uint8_t *data, size_t nbytes,
    throw std::runtime_error("crc32_hw is not implemented on this platform");
  }
  
+uint32_t crc32_hw(const uint8_t *data, size_t nbytes,
+    uint32_t startingChecksum) { // stub for the #else branch of FOLLY_SSE_PREREQ(4, 2)
+  throw std::runtime_error("crc32_hw is not implemented on this platform"); // always throws; parameters unused
+}
+
  bool crc32c_hw_supported() { // fallback build (#else of FOLLY_SSE_PREREQ(4, 2)): no hardware CRC-32C
    return false;
  }
  
+bool crc32_hw_supported() { // fallback build (#else of FOLLY_SSE_PREREQ(4, 2)): no hardware CRC-32
+  return false;
+}
  #endif
  
-uint32_t crc32c_sw(const uint8_t *data, size_t nbytes,
-    uint32_t startingChecksum) {
-
+template <uint32_t CRC_POLYNOMIAL>
+uint32_t crc_sw(const uint8_t* data, size_t nbytes, uint32_t startingChecksum) {
    // Reverse the bits in the starting checksum so they'll be in the
    // right internal format for Boost's CRC engine.
    //     O(1)-time, branchless bit reversal algorithm from
@@ -104,13 +142,24 @@ uint32_t crc32c_sw(const uint8_t *data, size_t nbytes,
    startingChecksum = (startingChecksum >> 16) |
        (startingChecksum << 16);
  
-  static const uint32_t CRC32C_POLYNOMIAL = 0x1EDC6F41;
-  boost::crc_optimal<32, CRC32C_POLYNOMIAL, ~0U, 0, true, true> sum(
+  boost::crc_optimal<32, CRC_POLYNOMIAL, ~0U, 0, true, true> sum(
        startingChecksum);
    sum.process_bytes(data, nbytes);
    return sum.checksum();
  }
  
+uint32_t
+crc32c_sw(const uint8_t* data, size_t nbytes, uint32_t startingChecksum) { // software CRC-32C entry point
+  constexpr uint32_t CRC32C_POLYNOMIAL = 0x1EDC6F41; // template argument selecting the polynomial
+  return crc_sw<CRC32C_POLYNOMIAL>(data, nbytes, startingChecksum); // shared crc_sw implementation
+}
+
+uint32_t
+crc32_sw(const uint8_t* data, size_t nbytes, uint32_t startingChecksum) { // software CRC-32 entry point
+  constexpr uint32_t CRC32_POLYNOMIAL = 0x04C11DB7; // template argument selecting the polynomial
+  return crc_sw<CRC32_POLYNOMIAL>(data, nbytes, startingChecksum); // shared crc_sw implementation
+}
+
  } // folly::detail
  
  uint32_t crc32c(const uint8_t *data, size_t nbytes,
@@ -122,4 +171,12 @@ uint32_t crc32c(const uint8_t *data, size_t nbytes,
    }
  }
  
+uint32_t crc32(const uint8_t* data, size_t nbytes, uint32_t startingChecksum) {
+  if (detail::crc32_hw_supported()) { // runtime check decides which implementation runs
+    return detail::crc32_hw(data, nbytes, startingChecksum);
+  } else { // hardware path unavailable: use the software implementation
+    return detail::crc32_sw(data, nbytes, startingChecksum);
+  }
+}
+
  } // folly