/*
 * Copyright 2015 Facebook, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "RangeSse42.h"

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <string>

#include <glog/logging.h>

#include <folly/Portability.h>
// Essentially, two versions of this file: one with an SSE42 implementation
// and one with a fallback implementation. We determine which version to use by
// testing for the presence of the required headers.
//
// TODO: Maybe this should be done by the build system....
31 #if !FOLLY_SSE_PREREQ(4, 2)
39 size_t qfind_first_byte_of_sse42(const StringPieceLite haystack,
40 const StringPieceLite needles) {
41 CHECK(false) << "Function " << __func__ << " only works with SSE42!";
42 return qfind_first_byte_of_nosse(haystack, needles);
58 #include <emmintrin.h>
59 #include <smmintrin.h>
60 #include <folly/Likely.h>
// Smallest page size we assume for page-boundary safety checks below.
// It's okay if pages are bigger than this (as powers of two), but they should
// not be smaller.
static constexpr size_t kMinPageSize = 4096;
static_assert(kMinPageSize >= 16,
              "kMinPageSize must be at least SSE register size");
73 static inline uintptr_t page_for(T* addr) {
74 return reinterpret_cast<uintptr_t>(addr) / kMinPageSize;
// Returns the smallest index i >= 1 such that arr + i is 16-byte aligned
// (i.e. the offset of the next aligned address strictly after arr).
static inline size_t nextAlignedIndex(const char* arr) {
  auto firstPossible = reinterpret_cast<uintptr_t>(arr) + 1;
  return 1 +                        // add 1 because the index starts at 'arr'
      ((firstPossible + 15) & ~0xF) // round up to next multiple of 16
      - firstPossible;
}
84 static size_t qfind_first_byte_of_needles16(const StringPieceLite haystack,
85 const StringPieceLite needles)
86 FOLLY_DISABLE_ADDRESS_SANITIZER;
88 // helper method for case where needles.size() <= 16
89 size_t qfind_first_byte_of_needles16(const StringPieceLite haystack,
90 const StringPieceLite needles) {
91 DCHECK_GT(haystack.size(), 0);
92 DCHECK_GT(needles.size(), 0);
93 DCHECK_LE(needles.size(), 16);
94 if ((needles.size() <= 2 && haystack.size() >= 256) ||
95 // must bail if we can't even SSE-load a single segment of haystack
96 (haystack.size() < 16 &&
97 page_for(haystack.end() - 1) != page_for(haystack.data() + 15)) ||
98 // can't load needles into SSE register if it could cross page boundary
99 page_for(needles.end() - 1) != page_for(needles.data() + 15)) {
100 return detail::qfind_first_byte_of_nosse(haystack, needles);
103 auto arr2 = ::_mm_loadu_si128(
104 reinterpret_cast<const __m128i*>(needles.data()));
105 // do an unaligned load for first block of haystack
106 auto arr1 = ::_mm_loadu_si128(
107 reinterpret_cast<const __m128i*>(haystack.data()));
108 auto index = __builtin_ia32_pcmpestri128((__v16qi)arr2, needles.size(),
109 (__v16qi)arr1, haystack.size(), 0);
114 // Now, we can do aligned loads hereafter...
115 size_t i = nextAlignedIndex(haystack.data());
116 for (; i < haystack.size(); i+= 16) {
117 auto arr1 = ::_mm_load_si128(
118 reinterpret_cast<const __m128i*>(haystack.data() + i));
119 auto index = __builtin_ia32_pcmpestri128(
120 (__v16qi)arr2, needles.size(),
121 (__v16qi)arr1, haystack.size() - i, 0);
126 return std::string::npos;
129 template <bool HAYSTACK_ALIGNED>
130 size_t scanHaystackBlock(const StringPieceLite haystack,
131 const StringPieceLite needles,
133 // Turn off ASAN because the "arr2 = ..." assignment in the loop below reads
134 // up to 15 bytes beyond end of the buffer in #needles#. That is ok because
135 // ptr2 is always 16-byte aligned, so the read can never span a page boundary.
136 // Also, the extra data that may be read is never actually used.
137 FOLLY_DISABLE_ADDRESS_SANITIZER;
139 // Scans a 16-byte block of haystack (starting at blockStartIdx) to find first
140 // needle. If HAYSTACK_ALIGNED, then haystack must be 16byte aligned.
141 // If !HAYSTACK_ALIGNED, then caller must ensure that it is safe to load the
143 template <bool HAYSTACK_ALIGNED>
144 size_t scanHaystackBlock(const StringPieceLite haystack,
145 const StringPieceLite needles,
146 uint64_t blockStartIdx) {
147 DCHECK_GT(needles.size(), 16); // should handled by *needles16() method
148 DCHECK(blockStartIdx + 16 <= haystack.size() ||
149 (page_for(haystack.data() + blockStartIdx) ==
150 page_for(haystack.data() + blockStartIdx + 15)));
153 if (HAYSTACK_ALIGNED) {
154 arr1 = ::_mm_load_si128(
155 reinterpret_cast<const __m128i*>(haystack.data() + blockStartIdx));
157 arr1 = ::_mm_loadu_si128(
158 reinterpret_cast<const __m128i*>(haystack.data() + blockStartIdx));
161 // This load is safe because needles.size() >= 16
162 auto arr2 = ::_mm_loadu_si128(
163 reinterpret_cast<const __m128i*>(needles.data()));
164 size_t b = __builtin_ia32_pcmpestri128(
165 (__v16qi)arr2, 16, (__v16qi)arr1, haystack.size() - blockStartIdx, 0);
167 size_t j = nextAlignedIndex(needles.data());
168 for (; j < needles.size(); j += 16) {
169 arr2 = ::_mm_load_si128(
170 reinterpret_cast<const __m128i*>(needles.data() + j));
172 auto index = __builtin_ia32_pcmpestri128(
173 (__v16qi)arr2, needles.size() - j,
174 (__v16qi)arr1, haystack.size() - blockStartIdx, 0);
175 b = std::min<size_t>(index, b);
179 return blockStartIdx + b;
181 return std::string::npos;
184 size_t qfind_first_byte_of_sse42(const StringPieceLite haystack,
185 const StringPieceLite needles);
187 size_t qfind_first_byte_of_sse42(const StringPieceLite haystack,
188 const StringPieceLite needles) {
189 if (UNLIKELY(needles.empty() || haystack.empty())) {
190 return std::string::npos;
191 } else if (needles.size() <= 16) {
192 // we can save some unnecessary load instructions by optimizing for
193 // the common case of needles.size() <= 16
194 return qfind_first_byte_of_needles16(haystack, needles);
197 if (haystack.size() < 16 &&
198 page_for(haystack.end() - 1) != page_for(haystack.data() + 16)) {
199 // We can't safely SSE-load haystack. Use a different approach.
200 if (haystack.size() <= 2) {
201 return qfind_first_byte_of_std(haystack, needles);
203 return qfind_first_byte_of_byteset(haystack, needles);
206 auto ret = scanHaystackBlock<false>(haystack, needles, 0);
207 if (ret != std::string::npos) {
211 size_t i = nextAlignedIndex(haystack.data());
212 for (; i < haystack.size(); i += 16) {
213 auto ret = scanHaystackBlock<true>(haystack, needles, i);
214 if (ret != std::string::npos) {
219 return std::string::npos;