2 * Copyright 2015 Facebook, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
17 // @author Mark Rabkin (mrabkin@fb.com)
18 // @author Andrei Alexandrescu (andrei.alexandrescu@fb.com)
20 #include <folly/Range.h>
22 #if FOLLY_HAVE_EMMINTRIN_H
23 #include <emmintrin.h> // __v16qi
32 // It's okay if pages are bigger than this (as powers of two), but they should
// Smallest page size we assume for the platform. Used below to decide whether
// an unaligned 16-byte SSE load could cross a page boundary and fault.
34 constexpr size_t kMinPageSize = 4096;
35 static_assert(kMinPageSize >= 16,
36 "kMinPageSize must be at least SSE register size");
// Maps an address to its page index (in kMinPageSize units); two addresses
// with equal PAGE_FOR values are guaranteed to lie on the same page, so a
// load spanning only them cannot fault.
37 #define PAGE_FOR(addr) \
38 (reinterpret_cast<uintptr_t>(addr) / kMinPageSize)
41 // Earlier versions of GCC (for example, Clang on Mac OS X, which is based on
42 // GCC 4.2) do not have a full complement of SSE builtins.
43 #if FOLLY_HAVE_EMMINTRIN_H && __GNUC_PREREQ(4, 6)
// Returns the smallest index i >= 1 such that (arr + i) is 16-byte aligned,
// i.e. where aligned SSE loads of the buffer may resume.
// NOTE(review): this chunk appears truncated — the return expression and the
// function's closing brace are not visible here; confirm against the full file.
44 inline size_t nextAlignedIndex(const char* arr) {
45 auto firstPossible = reinterpret_cast<uintptr_t>(arr) + 1;
46 return 1 + // add 1 because the index starts at 'arr'
47 ((firstPossible + 15) & ~0xF) // round up to next multiple of 16
51 // build sse4.2-optimized version even if -msse4.2 is not passed to GCC
// Forward declaration carrying the target/noinline attributes and the ASAN
// suppression so they apply to the definition below.
52 size_t qfind_first_byte_of_needles16(const StringPiece haystack,
53 const StringPiece needles)
54 __attribute__ ((__target__("sse4.2"), noinline))
55 FOLLY_DISABLE_ADDRESS_SANITIZER;
57 // helper method for case where needles.size() <= 16
// Finds the first byte of #haystack# that is also present in #needles#,
// using PCMPESTRI with all needles loaded into a single SSE register.
// Falls back to the nosse variant when the heuristics below say SSE loses,
// or when a 16-byte load of haystack/needles could cross a page boundary.
// NOTE(review): this chunk is truncated — the early-return checks on the
// pcmpestri results and several closing braces are not visible here.
58 size_t qfind_first_byte_of_needles16(const StringPiece haystack,
59 const StringPiece needles) {
60 DCHECK(!haystack.empty());
61 DCHECK(!needles.empty());
62 DCHECK_LE(needles.size(), 16);
// Heuristic: for very few needles and a large haystack, the scalar search
// is faster than per-block PCMPESTRI.
63 if ((needles.size() <= 2 && haystack.size() >= 256) ||
64 // must bail if we can't even SSE-load a single segment of haystack
65 (haystack.size() < 16 &&
66 PAGE_FOR(haystack.end() - 1) != PAGE_FOR(haystack.data() + 15)) ||
67 // can't load needles into SSE register if it could cross page boundary
68 PAGE_FOR(needles.end() - 1) != PAGE_FOR(needles.data() + 15)) {
69 return detail::qfind_first_byte_of_nosse(haystack, needles);
// All needles fit in one register; load them once, reuse for every block.
72 auto arr2 = __builtin_ia32_loaddqu(needles.data());
73 // do an unaligned load for first block of haystack
74 auto arr1 = __builtin_ia32_loaddqu(haystack.data());
75 auto index = __builtin_ia32_pcmpestri128(arr2, needles.size(),
76 arr1, haystack.size(), 0);
81 // Now, we can do aligned loads hereafter...
82 size_t i = nextAlignedIndex(haystack.data());
83 for (; i < haystack.size(); i+= 16) {
84 void* ptr1 = __builtin_assume_aligned(haystack.data() + i, 16);
85 auto arr1 = *reinterpret_cast<const __v16qi*>(ptr1);
// The length argument clamps the compare to the valid tail of haystack.
86 auto index = __builtin_ia32_pcmpestri128(arr2, needles.size(),
87 arr1, haystack.size() - i, 0);
// No needle byte found anywhere in haystack.
92 return StringPiece::npos;
94 #endif // FOLLY_HAVE_EMMINTRIN_H && GCC 4.6+
96 // Aho, Hopcroft, and Ullman refer to this trick in "The Design and Analysis
97 // of Computer Algorithms" (1974), but the best description is here:
98 // http://research.swtch.com/sparse
// Sparse-set trick (see the citation above): membership of byte i holds iff
// sparse_[i] indexes a dense_ slot that stores i back — so neither array
// needs to be initialized up front.
// NOTE(review): the class header, add() body, dense_ member, and closing
// brace are not visible in this chunk; confirm against the full file.
101 FastByteSet() : size_(0) { } // no init of arrays required!
103 inline void add(uint8_t i) {
110 inline bool contains(uint8_t i) const {
111 DCHECK_LE(size_, 256);
// Valid iff sparse_[i] points into the populated prefix of dense_ and the
// round trip recovers i (guards against uninitialized garbage).
112 return sparse_[i] < size_ && dense_[sparse_[i]] == i;
116 uint16_t size_; // can't use uint8_t because it would overflow if all
117 // possible values were inserted.
118 uint8_t sparse_[256];
// Scalar fallback: build a FastByteSet of the needles (O(needles)), then scan
// haystack once testing each byte for membership (O(haystack)).
// NOTE(review): this chunk is truncated — the set declaration, the add() call,
// the hit-return, and closing braces are not visible here.
126 size_t qfind_first_byte_of_byteset(const StringPiece haystack,
127 const StringPiece needles) {
129 for (auto needle: needles) {
132 for (size_t index = 0; index < haystack.size(); ++index) {
133 if (s.contains(haystack[index])) {
// No haystack byte matched any needle.
137 return StringPiece::npos;
// Scalar fallback variant using a 256-entry boolean table indexed by byte
// value instead of FastByteSet; same two-phase build-then-scan structure.
// NOTE(review): this chunk is truncated — the table declaration, the
// hit-return, and closing braces are not visible here.
140 size_t qfind_first_byte_of_bitset(const StringPiece haystack,
141 const StringPiece needles) {
// Mark every needle byte present.
143 for (auto needle : needles) {
144 s[(uint8_t)needle] = true;
146 for (size_t index = 0; index < haystack.size(); ++index) {
147 if (s[(uint8_t)haystack[index]]) {
// No haystack byte matched any needle.
151 return StringPiece::npos;
154 #if FOLLY_HAVE_EMMINTRIN_H && __GNUC_PREREQ(4, 6)
// Forward declaration carrying the sse4.2 target attribute and the ASAN
// suppression so they apply to the template definition below.
156 template <bool HAYSTACK_ALIGNED>
157 size_t scanHaystackBlock(const StringPiece haystack,
158 const StringPiece needles,
160 // inline is okay because it's only called from other sse4.2 functions
161 __attribute__ ((__target__("sse4.2")))
162 // Turn off ASAN because the "arr2 = ..." assignment in the loop below reads
163 // up to 15 bytes beyond end of the buffer in #needles#. That is ok because
164 // ptr2 is always 16-byte aligned, so the read can never span a page boundary.
165 // Also, the extra data that may be read is never actually used.
166 FOLLY_DISABLE_ADDRESS_SANITIZER;
168 // Scans a 16-byte block of haystack (starting at blockStartIdx) to find first
169 // needle. If HAYSTACK_ALIGNED, then haystack must be 16byte aligned.
170 // If !HAYSTACK_ALIGNED, then caller must ensure that it is safe to load the
// Scans one 16-byte block of haystack (at blockStartIdx) against ALL needles,
// iterating over the needle set 16 bytes at a time and keeping the minimum
// match position found in this block. Only called when needles.size() > 16.
// NOTE(review): this chunk is truncated — the arr1 declaration, the else of
// the aligned/unaligned load, and several closing braces are not visible here.
172 template <bool HAYSTACK_ALIGNED>
173 size_t scanHaystackBlock(const StringPiece haystack,
174 const StringPiece needles,
175 uint64_t blockStartIdx) {
176 DCHECK_GT(needles.size(), 16); // should be handled by *needles16() method
// Caller must guarantee the 16-byte load at blockStartIdx is safe: either
// fully inside haystack, or confined to a single page.
177 DCHECK(blockStartIdx + 16 <= haystack.size() ||
178 (PAGE_FOR(haystack.data() + blockStartIdx) ==
179 PAGE_FOR(haystack.data() + blockStartIdx + 15)));
// Compile-time selection between aligned and unaligned haystack load.
182 if (HAYSTACK_ALIGNED) {
183 void* ptr1 = __builtin_assume_aligned(haystack.data() + blockStartIdx, 16);
184 arr1 = *reinterpret_cast<const __v16qi*>(ptr1);
186 arr1 = __builtin_ia32_loaddqu(haystack.data() + blockStartIdx);
189 // This load is safe because needles.size() >= 16
190 auto arr2 = __builtin_ia32_loaddqu(needles.data());
// b = best (smallest) match index within this block so far.
191 size_t b = __builtin_ia32_pcmpestri128(
192 arr2, 16, arr1, haystack.size() - blockStartIdx, 0);
// Remaining needles can be loaded aligned from the next 16-byte boundary.
194 size_t j = nextAlignedIndex(needles.data());
195 for (; j < needles.size(); j += 16) {
196 void* ptr2 = __builtin_assume_aligned(needles.data() + j, 16);
197 arr2 = *reinterpret_cast<const __v16qi*>(ptr2);
199 auto index = __builtin_ia32_pcmpestri128(
200 arr2, needles.size() - j, arr1, haystack.size() - blockStartIdx, 0);
// Keep the earliest match position across all needle chunks.
201 b = std::min<size_t>(index, b);
// Convert block-relative match position back to a haystack index.
205 return blockStartIdx + b;
// No needle byte found in this block.
207 return StringPiece::npos;
// Forward declaration carrying the sse4.2 target and noinline attributes so
// they apply to the definition below.
210 size_t qfind_first_byte_of_sse42(const StringPiece haystack,
211 const StringPiece needles)
212 __attribute__ ((__target__("sse4.2"), noinline));
// SSE4.2 entry point: dispatches to the single-register needles16 path, the
// scalar fallbacks when SSE loads are unsafe, or block-wise scanHaystackBlock
// scanning (one unaligned first block, then aligned blocks).
// NOTE(review): this chunk is truncated — several `return ret;` statements
// and closing braces are not visible here.
214 size_t qfind_first_byte_of_sse42(const StringPiece haystack,
215 const StringPiece needles) {
216 if (UNLIKELY(needles.empty() || haystack.empty())) {
217 return StringPiece::npos;
218 } else if (needles.size() <= 16) {
219 // we can save some unnecessary load instructions by optimizing for
220 // the common case of needles.size() <= 16
221 return qfind_first_byte_of_needles16(haystack, needles);
// NOTE(review): the analogous check in *needles16() uses data() + 15 (the
// last byte of a 16-byte load); "+ 16" here looks like an off-by-one that is
// merely over-conservative (may fall back needlessly) — confirm upstream.
224 if (haystack.size() < 16 &&
225 PAGE_FOR(haystack.end() - 1) != PAGE_FOR(haystack.data() + 16)) {
226 // We can't safely SSE-load haystack. Use a different approach.
227 if (haystack.size() <= 2) {
228 return qfind_first_of(haystack, needles, AsciiCaseSensitive());
230 return qfind_first_byte_of_byteset(haystack, needles);
// First block may be unaligned; subsequent blocks start at a 16-byte boundary.
233 auto ret = scanHaystackBlock<false>(haystack, needles, 0);
234 if (ret != StringPiece::npos) {
238 size_t i = nextAlignedIndex(haystack.data());
239 for (; i < haystack.size(); i += 16) {
240 auto ret = scanHaystackBlock<true>(haystack, needles, i);
241 if (ret != StringPiece::npos) {
// No needle byte found anywhere in haystack.
246 return StringPiece::npos;
248 #endif // FOLLY_HAVE_EMMINTRIN_H && GCC 4.6+
// Non-SSE implementation: picks between the byteset table scan and the plain
// scalar qfind_first_of based on empirically tuned size thresholds.
// NOTE(review): this chunk is truncated — at least one closing brace between
// the early return and the threshold check is not visible here.
250 size_t qfind_first_byte_of_nosse(const StringPiece haystack,
251 const StringPiece needles) {
252 if (UNLIKELY(needles.empty() || haystack.empty())) {
253 return StringPiece::npos;
255 // The thresholds below were empirically determined by benchmarking.
256 // This is not an exact science since it depends on the CPU, the size of
257 // needles, and the size of haystack.
// Large needle sets amortize the O(needles) table build; small haystacks
// with many needles also favor the table over repeated scalar comparisons.
258 if ((needles.size() >= 4 && haystack.size() <= 10) ||
259 (needles.size() >= 16 && haystack.size() <= 64) ||
260 needles.size() >= 32) {
261 return qfind_first_byte_of_byteset(haystack, needles);
263 return qfind_first_of(haystack, needles, AsciiCaseSensitive());
266 } // namespace detail