2 * Copyright 2013 Facebook, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
18 // @author Mark Rabkin (mrabkin@fb.com)
19 // @author Andrei Alexandrescu (andrei.alexandrescu@fb.com)
22 #include "folly/Range.h"
24 #include "folly/CpuId.h"
25 #include "folly/Likely.h"
30 Predicates that can be used with qfind and startsWith
32 const AsciiCaseSensitive asciiCaseSensitive = AsciiCaseSensitive();
33 const AsciiCaseInsensitive asciiCaseInsensitive = AsciiCaseInsensitive();
35 std::ostream& operator<<(std::ostream& os, const StringPiece& piece) {
36 os.write(piece.start(), piece.size());
41 size_t qfind_first_byte_of_memchr(const StringPiece& haystack,
42 const StringPiece& needles) {
43 size_t best = haystack.size();
44 for (char needle: needles) {
45 const void* ptr = memchr(haystack.data(), needle, best);
47 auto found = static_cast<const char*>(ptr) - haystack.data();
48 best = std::min<size_t>(best, found);
51 if (best == haystack.size()) {
52 return StringPiece::npos;
59 // build sse4.2-optimized version even if -msse4.2 is not passed to GCC
60 size_t qfind_first_byte_of_needles16(const StringPiece& haystack,
61 const StringPiece& needles)
62 __attribute__ ((__target__("sse4.2")));
64 // helper method for case where needles.size() <= 16
65 size_t qfind_first_byte_of_needles16(const StringPiece& haystack,
66 const StringPiece& needles) {
67 DCHECK_LE(needles.size(), 16);
68 if (needles.size() <= 2 && haystack.size() >= 256) {
69 // benchmarking shows that memchr beats out SSE for small needle-sets
70 // with large haystacks.
71 // TODO(mcurtiss): could this be because of unaligned SSE loads?
72 return detail::qfind_first_byte_of_memchr(haystack, needles);
74 auto arr2 = __builtin_ia32_loaddqu(needles.data());
75 for (size_t i = 0; i < haystack.size(); i+= 16) {
76 auto arr1 = __builtin_ia32_loaddqu(haystack.data() + i);
77 auto index = __builtin_ia32_pcmpestri128(arr2, needles.size(),
78 arr1, haystack.size() - i, 0);
83 return StringPiece::npos;
86 size_t qfind_first_byte_of_sse42(const StringPiece& haystack,
87 const StringPiece& needles)
88 __attribute__ ((__target__("sse4.2")));
90 size_t qfind_first_byte_of_sse42(const StringPiece& haystack,
91 const StringPiece& needles) {
92 if (UNLIKELY(needles.empty() || haystack.empty())) {
93 return StringPiece::npos;
94 } else if (needles.size() <= 16) {
95 // we can save some unnecessary load instructions by optimizing for
96 // the common case of needles.size() <= 16
97 return qfind_first_byte_of_needles16(haystack, needles);
100 size_t index = haystack.size();
101 for (size_t i = 0; i < haystack.size(); i += 16) {
103 auto arr1 = __builtin_ia32_loaddqu(haystack.data() + i);
104 for (size_t j = 0; j < needles.size(); j += 16) {
105 auto arr2 = __builtin_ia32_loaddqu(needles.data() + j);
106 auto index = __builtin_ia32_pcmpestri128(arr2, needles.size() - j,
107 arr1, haystack.size() - i, 0);
108 b = std::min<size_t>(index, b);
114 return StringPiece::npos;
117 typedef decltype(qfind_first_byte_of_sse42) Type_qfind_first_byte_of;
119 // Aho, Hopcroft, and Ullman refer to this trick in "The Design and Analysis
120 // of Computer Algorithms" (1974), but the best description is here:
121 // http://research.swtch.com/sparse
124 FastByteSet() : size_(0) { } // no init of arrays required!
126 inline void add(uint8_t i) {
133 inline bool contains(uint8_t i) const {
134 DCHECK_LE(size_, 256);
135 return sparse_[i] < size_ && dense_[sparse_[i]] == i;
139 uint16_t size_; // can't use uint8_t because it would overflow if all
140 // possible values were inserted.
141 uint8_t sparse_[256];
147 size_t qfind_first_byte_of_byteset(const StringPiece& haystack,
148 const StringPiece& needles) {
150 for (auto needle: needles) {
153 for (size_t index = 0; index < haystack.size(); ++index) {
154 if (s.contains(haystack[index])) {
158 return StringPiece::npos;
161 size_t qfind_first_byte_of_nosse(const StringPiece& haystack,
162 const StringPiece& needles) {
163 if (UNLIKELY(needles.empty() || haystack.empty())) {
164 return StringPiece::npos;
166 // The thresholds below were empirically determined by benchmarking.
167 // This is not an exact science since it depends on the CPU, the size of
168 // needles, and the size of haystack.
169 if (haystack.size() == 1 ||
170 (haystack.size() < 4 && needles.size() <= 16)) {
171 return qfind_first_of(haystack, needles, asciiCaseSensitive);
172 } else if ((needles.size() >= 4 && haystack.size() <= 10) ||
173 (needles.size() >= 16 && haystack.size() <= 64) ||
174 needles.size() >= 32) {
175 return qfind_first_byte_of_byteset(haystack, needles);
178 return qfind_first_byte_of_memchr(haystack, needles);
181 // This function is called on startup to resolve folly::qfind_first_byte_of
182 extern "C" Type_qfind_first_byte_of* qfind_first_byte_of_ifunc() {
183 return folly::CpuId().sse42() ? qfind_first_byte_of_sse42 :
184 qfind_first_byte_of_nosse;
187 size_t qfind_first_byte_of(const StringPiece& haystack,
188 const StringPiece& needles)
189 __attribute__((ifunc("qfind_first_byte_of_ifunc")));
191 } // namespace detail