* limitations under the License.
*/
-#include "RangeSse42.h"
+#include <folly/detail/RangeSse42.h>
#include <glog/logging.h>
+
#include <folly/Portability.h>
// Essentially, two versions of this file: one with an SSE42 implementation
#if !FOLLY_SSE_PREREQ(4, 2)
namespace folly {
namespace detail {
-size_t qfind_first_byte_of_sse42(const StringPieceLite haystack,
- const StringPieceLite needles) {
+size_t qfind_first_byte_of_sse42(
+ const StringPieceLite haystack,
+ const StringPieceLite needles) {
return qfind_first_byte_of_nosse(haystack, needles);
}
-}
-}
-# else
+} // namespace detail
+} // namespace folly
+#else
#include <cstdint>
#include <limits>
#include <string>
// a function with always_inline fails to build. The _mm_* functions are marked
// always_inline.
// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=67368
-#if defined FOLLY_SANITIZE_ADDRESS && \
- FOLLY_SANITIZE_ADDRESS == 1 && \
+#if defined FOLLY_SANITIZE_ADDRESS && FOLLY_SANITIZE_ADDRESS == 1 && \
__GNUC_PREREQ(4, 9)
-# define _mm_load_si128(p) (*(p))
-# define _mm_loadu_si128(p) ((__m128i)__builtin_ia32_loaddqu((const char*)(p)))
-# ifdef _mm_cmpestri
-# undef _mm_cmpestri
-# endif
-# define _mm_cmpestri(a, b, c, d, e) \
+#define _mm_load_si128(p) (*(p))
+#define _mm_loadu_si128(p) ((__m128i)__builtin_ia32_loaddqu((const char*)(p)))
+#ifdef _mm_cmpestri
+#undef _mm_cmpestri
+#endif
+#define _mm_cmpestri(a, b, c, d, e) \
__builtin_ia32_pcmpestri128((__v16qi)(a), b, (__v16qi)(c), d, e)
#endif
// It's okay if pages are bigger than this (as powers of two), but they should
// not be smaller.
static constexpr size_t kMinPageSize = 4096;
-static_assert(kMinPageSize >= 16,
- "kMinPageSize must be at least SSE register size");
+static_assert(
+ kMinPageSize >= 16,
+ "kMinPageSize must be at least SSE register size");
template <typename T>
static inline uintptr_t page_for(T* addr) {
}
static inline size_t nextAlignedIndex(const char* arr) {
- auto firstPossible = reinterpret_cast<uintptr_t>(arr) + 1;
- return 1 + // add 1 because the index starts at 'arr'
- ((firstPossible + 15) & ~0xF) // round up to next multiple of 16
- - firstPossible;
+ auto firstPossible = reinterpret_cast<uintptr_t>(arr) + 1;
+ return 1 + // add 1 because the index starts at 'arr'
+ ((firstPossible + 15) & ~0xF) // round up to next multiple of 16
+ - firstPossible;
}
-static size_t qfind_first_byte_of_needles16(const StringPieceLite haystack,
- const StringPieceLite needles)
- FOLLY_DISABLE_ADDRESS_SANITIZER;
+static size_t qfind_first_byte_of_needles16(
+ const StringPieceLite haystack,
+ const StringPieceLite needles) FOLLY_DISABLE_ADDRESS_SANITIZER;
// helper method for case where needles.size() <= 16
-size_t qfind_first_byte_of_needles16(const StringPieceLite haystack,
- const StringPieceLite needles) {
+size_t qfind_first_byte_of_needles16(
+ const StringPieceLite haystack,
+ const StringPieceLite needles) {
DCHECK_GT(haystack.size(), 0u);
DCHECK_GT(needles.size(), 0u);
DCHECK_LE(needles.size(), 16u);
return detail::qfind_first_byte_of_nosse(haystack, needles);
}
- auto arr2 = _mm_loadu_si128(
- reinterpret_cast<const __m128i*>(needles.data()));
+ auto arr2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(needles.data()));
// do an unaligned load for first block of haystack
- auto arr1 = _mm_loadu_si128(
- reinterpret_cast<const __m128i*>(haystack.data()));
+ auto arr1 =
+ _mm_loadu_si128(reinterpret_cast<const __m128i*>(haystack.data()));
auto index =
_mm_cmpestri(arr2, int(needles.size()), arr1, int(haystack.size()), 0);
if (index < 16) {
// Now, we can do aligned loads hereafter...
size_t i = nextAlignedIndex(haystack.data());
- for (; i < haystack.size(); i+= 16) {
+ for (; i < haystack.size(); i += 16) {
arr1 =
_mm_load_si128(reinterpret_cast<const __m128i*>(haystack.data() + i));
index = _mm_cmpestri(
}
template <bool HAYSTACK_ALIGNED>
-size_t scanHaystackBlock(const StringPieceLite haystack,
- const StringPieceLite needles,
- uint64_t idx)
-// Turn off ASAN because the "arr2 = ..." assignment in the loop below reads
-// up to 15 bytes beyond end of the buffer in #needles#. That is ok because
-// ptr2 is always 16-byte aligned, so the read can never span a page boundary.
-// Also, the extra data that may be read is never actually used.
- FOLLY_DISABLE_ADDRESS_SANITIZER;
+size_t scanHaystackBlock(
+ const StringPieceLite haystack,
+ const StringPieceLite needles,
+ uint64_t idx)
+ // Turn off ASAN because the "arr2 = ..." assignment in the loop below reads
+ // up to 15 bytes beyond end of the buffer in #needles#. That is ok because
+ // ptr2 is always 16-byte aligned, so the read can never span a page
+ // boundary. Also, the extra data that may be read is never actually used.
+ FOLLY_DISABLE_ADDRESS_SANITIZER;
// Scans a 16-byte block of haystack (starting at blockStartIdx) to find first
// needle. If HAYSTACK_ALIGNED, then haystack must be 16byte aligned.
// If !HAYSTACK_ALIGNED, then caller must ensure that it is safe to load the
// block.
template <bool HAYSTACK_ALIGNED>
-size_t scanHaystackBlock(const StringPieceLite haystack,
- const StringPieceLite needles,
- uint64_t blockStartIdx) {
+size_t scanHaystackBlock(
+ const StringPieceLite haystack,
+ const StringPieceLite needles,
+ uint64_t blockStartIdx) {
  DCHECK_GT(needles.size(), 16u); // should be handled by *needles16() method
- DCHECK(blockStartIdx + 16 <= haystack.size() ||
- (page_for(haystack.data() + blockStartIdx) ==
- page_for(haystack.data() + blockStartIdx + 15)));
+ DCHECK(
+ blockStartIdx + 16 <= haystack.size() ||
+ (page_for(haystack.data() + blockStartIdx) ==
+ page_for(haystack.data() + blockStartIdx + 15)));
__m128i arr1;
if (HAYSTACK_ALIGNED) {
}
// This load is safe because needles.size() >= 16
- auto arr2 = _mm_loadu_si128(
- reinterpret_cast<const __m128i*>(needles.data()));
- size_t b =
+ auto arr2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(needles.data()));
+ auto b =
_mm_cmpestri(arr2, 16, arr1, int(haystack.size() - blockStartIdx), 0);
size_t j = nextAlignedIndex(needles.data());
for (; j < needles.size(); j += 16) {
- arr2 = _mm_load_si128(
- reinterpret_cast<const __m128i*>(needles.data() + j));
+ arr2 = _mm_load_si128(reinterpret_cast<const __m128i*>(needles.data() + j));
auto index = _mm_cmpestri(
arr2,
arr1,
int(haystack.size() - blockStartIdx),
0);
- b = std::min<size_t>(index, b);
+ b = std::min(index, b);
}
if (b < 16) {
return std::string::npos;
}
-size_t qfind_first_byte_of_sse42(const StringPieceLite haystack,
- const StringPieceLite needles);
+size_t qfind_first_byte_of_sse42(
+ const StringPieceLite haystack,
+ const StringPieceLite needles);
-size_t qfind_first_byte_of_sse42(const StringPieceLite haystack,
- const StringPieceLite needles) {
+size_t qfind_first_byte_of_sse42(
+ const StringPieceLite haystack,
+ const StringPieceLite needles) {
if (UNLIKELY(needles.empty() || haystack.empty())) {
return std::string::npos;
} else if (needles.size() <= 16) {