/*
- * Copyright 2015 Facebook, Inc.
+ * Copyright 2017 Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*   http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-
-
#include "RangeSse42.h"
#include <glog/logging.h>
#include <folly/Portability.h>
-
-
// Essentially, two versions of this file: one with an SSE42 implementation
// and one with a fallback implementation. We determine which version to use
// with the FOLLY_SSE_PREREQ(4, 2) check below, i.e. whether the compiler is
// targeting SSE4.2.
//
// TODO: Maybe this should be done by the build system....
#if !FOLLY_SSE_PREREQ(4, 2)
-
-
-
namespace folly {
-
namespace detail {
-
size_t qfind_first_byte_of_sse42(const StringPieceLite haystack,
const StringPieceLite needles) {
- CHECK(false) << "Function " << __func__ << " only works with SSE42!";
return qfind_first_byte_of_nosse(haystack, needles);
}
-
}
-
}
-
-
-
#else
-
-
-
#include <cstdint>
#include <limits>
#include <string>
+
#include <emmintrin.h>
+#include <nmmintrin.h>
#include <smmintrin.h>
+
#include <folly/Likely.h>
-namespace folly {
+// GCC 4.9 with ASAN has a problem: a function with no_sanitize_address calling
+// a function with always_inline fails to build. The _mm_* functions are marked
+// always_inline.
+// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=67368
+#if defined FOLLY_SANITIZE_ADDRESS && \
+ FOLLY_SANITIZE_ADDRESS == 1 && \
+ __GNUC_PREREQ(4, 9)
+# define _mm_load_si128(p) (*(p))
+# define _mm_loadu_si128(p) ((__m128i)__builtin_ia32_loaddqu((const char*)(p)))
+# ifdef _mm_cmpestri
+# undef _mm_cmpestri
+# endif
+# define _mm_cmpestri(a, b, c, d, e) \
+ __builtin_ia32_pcmpestri128((__v16qi)(a), b, (__v16qi)(c), d, e)
+#endif
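// With these definitions, the intrinsics expand to a plain dereference and
// the raw GCC builtins, bypassing the always_inline wrapper functions in
// *mmintrin.h entirely.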
+namespace folly {
namespace detail {
// It's okay if pages are bigger than this (as powers of two), but they should
// not be smaller.
static constexpr size_t kMinPageSize = 4096;
static_assert(kMinPageSize >= 16,
              "kMinPageSize must be at least SSE register size");

template <typename T>
static inline uintptr_t page_for(T* addr) {
  return reinterpret_cast<uintptr_t>(addr) / kMinPageSize;
}

static inline size_t nextAlignedIndex(const char* arr) {
  auto firstPossible = reinterpret_cast<uintptr_t>(arr) + 1;
  return 1 +                        // add 1 because the index starts at 'arr'
      ((firstPossible + 15) & ~0xF) // round up to the next multiple of 16
      - firstPossible;
}

// helper method for case where needles.size() <= 16
size_t qfind_first_byte_of_needles16(const StringPieceLite haystack,
const StringPieceLite needles) {
- DCHECK_GT(haystack.size(), 0);
- DCHECK_GT(needles.size(), 0);
- DCHECK_LE(needles.size(), 16);
+ DCHECK_GT(haystack.size(), 0u);
+ DCHECK_GT(needles.size(), 0u);
+ DCHECK_LE(needles.size(), 16u);
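  // benchmarking shows that memchr beats out SSE for small needle sets with
  // large haystacks, so defer to the nosse fallback in that case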
if ((needles.size() <= 2 && haystack.size() >= 256) ||
// must bail if we can't even SSE-load a single segment of haystack
      (haystack.size() < 16 &&
       page_for(haystack.end() - 1) != page_for(haystack.data() + 15)) ||
      // can't load needles into an SSE register if the load could cross a
      // page boundary
      page_for(needles.end() - 1) != page_for(needles.data() + 15)) {
return detail::qfind_first_byte_of_nosse(haystack, needles);
}
- auto arr2 = ::_mm_loadu_si128(
+ auto arr2 = _mm_loadu_si128(
reinterpret_cast<const __m128i*>(needles.data()));
// do an unaligned load for first block of haystack
- auto arr1 = ::_mm_loadu_si128(
+ auto arr1 = _mm_loadu_si128(
reinterpret_cast<const __m128i*>(haystack.data()));
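  // mode 0 = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_LEAST_SIGNIFICANT:
  // the pcmpestri below yields the index of the first byte of arr1 that
  // equals any byte of arr2, or 16 if no byte of this block matches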
- auto index = __builtin_ia32_pcmpestri128((__v16qi)arr2, needles.size(),
- (__v16qi)arr1, haystack.size(), 0);
+ auto index =
+ _mm_cmpestri(arr2, int(needles.size()), arr1, int(haystack.size()), 0);
if (index < 16) {
- return index;
+ return size_t(index);
}
// Now, we can do aligned loads hereafter...
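  // (an aligned 16-byte load never straddles a page boundary, so reading a
  // little past the end of haystack cannot fault)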
size_t i = nextAlignedIndex(haystack.data());
  for (; i < haystack.size(); i += 16) {
- auto arr1 = ::_mm_load_si128(
- reinterpret_cast<const __m128i*>(haystack.data() + i));
- auto index = __builtin_ia32_pcmpestri128(
- (__v16qi)arr2, needles.size(),
- (__v16qi)arr1, haystack.size() - i, 0);
+ arr1 =
+ _mm_load_si128(reinterpret_cast<const __m128i*>(haystack.data() + i));
+ index = _mm_cmpestri(
+ arr2, int(needles.size()), arr1, int(haystack.size() - i), 0);
if (index < 16) {
return i + index;
}
  }
  return std::string::npos;
}

// Scans a 16-byte block of haystack (starting at blockStartIdx) to find the
// first needle. If HAYSTACK_ALIGNED, then haystack must be 16-byte aligned.
// If !HAYSTACK_ALIGNED, then the caller must ensure that it is safe to load
// the block.
template <bool HAYSTACK_ALIGNED>
size_t scanHaystackBlock(const StringPieceLite haystack,
const StringPieceLite needles,
uint64_t blockStartIdx) {
- DCHECK_GT(needles.size(), 16); // should handled by *needles16() method
+  DCHECK_GT(needles.size(), 16u); // should be handled by *needles16() method
DCHECK(blockStartIdx + 16 <= haystack.size() ||
(page_for(haystack.data() + blockStartIdx) ==
page_for(haystack.data() + blockStartIdx + 15)));
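  // i.e. either the full 16-byte block lies inside haystack, or the block
  // stays within a single page, so a 16-byte load here cannot fault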
__m128i arr1;
if (HAYSTACK_ALIGNED) {
- arr1 = ::_mm_load_si128(
+ arr1 = _mm_load_si128(
reinterpret_cast<const __m128i*>(haystack.data() + blockStartIdx));
} else {
- arr1 = ::_mm_loadu_si128(
+ arr1 = _mm_loadu_si128(
reinterpret_cast<const __m128i*>(haystack.data() + blockStartIdx));
}
  // This load is safe because needles.size() > 16, so at least 16 bytes are
  // readable starting at needles.data()
- auto arr2 = ::_mm_loadu_si128(
+ auto arr2 = _mm_loadu_si128(
reinterpret_cast<const __m128i*>(needles.data()));
- size_t b = __builtin_ia32_pcmpestri128(
- (__v16qi)arr2, 16, (__v16qi)arr1, haystack.size() - blockStartIdx, 0);
+ auto b =
+ _mm_cmpestri(arr2, 16, arr1, int(haystack.size() - blockStartIdx), 0);
size_t j = nextAlignedIndex(needles.data());
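  // b tracks the smallest match index seen across all 16-byte needle blocks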
for (; j < needles.size(); j += 16) {
- arr2 = ::_mm_load_si128(
+ arr2 = _mm_load_si128(
reinterpret_cast<const __m128i*>(needles.data() + j));
- auto index = __builtin_ia32_pcmpestri128(
- (__v16qi)arr2, needles.size() - j,
- (__v16qi)arr1, haystack.size() - blockStartIdx, 0);
- b = std::min<size_t>(index, b);
+ auto index = _mm_cmpestri(
+ arr2,
+ int(needles.size() - j),
+ arr1,
+ int(haystack.size() - blockStartIdx),
+ 0);
+ b = std::min(index, b);
}
  if (b < 16) {
    return blockStartIdx + b;
  }
  return std::string::npos;
}

size_t qfind_first_byte_of_sse42(const StringPieceLite haystack,
                                 const StringPieceLite needles) {
  if (UNLIKELY(needles.empty() || haystack.empty())) {
    return std::string::npos;
  }
  if (needles.size() <= 16) {
    // we can save some unnecessary load instructions by optimizing for
    // the common case of needles.size() <= 16
    return qfind_first_byte_of_needles16(haystack, needles);
  }
  if (haystack.size() < 16 &&
      page_for(haystack.end() - 1) != page_for(haystack.data() + 15)) {
    // can't safely SSE-load the first haystack block; use the fallback
    return qfind_first_byte_of_nosse(haystack, needles);
  }
  auto ret = scanHaystackBlock<false>(haystack, needles, 0);
  if (ret != std::string::npos) {
    return ret;
  }
  size_t i = nextAlignedIndex(haystack.data());
for (; i < haystack.size(); i += 16) {
- auto ret = scanHaystackBlock<true>(haystack, needles, i);
+ ret = scanHaystackBlock<true>(haystack, needles, i);
if (ret != std::string::npos) {
return ret;
}
  }
  return std::string::npos;
}
-
}
-
}
-
-
-
#endif