2 * Copyright 2013 Facebook, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
17 // @author Mark Rabkin (mrabkin@fb.com)
18 // @author Andrei Alexandrescu (andrei.alexandrescu@fb.com)
20 #include "folly/Range.h"
22 #include "folly/Likely.h"
27 * Predicates that can be used with qfind and startsWith
29 const AsciiCaseSensitive asciiCaseSensitive = AsciiCaseSensitive();
30 const AsciiCaseInsensitive asciiCaseInsensitive = AsciiCaseInsensitive();
32 std::ostream& operator<<(std::ostream& os, const StringPiece& piece) {
33 os.write(piece.start(), piece.size());
39 size_t qfind_first_byte_of_memchr(const StringPiece& haystack,
40 const StringPiece& needles) {
41 size_t best = haystack.size();
42 for (char needle: needles) {
43 const void* ptr = memchr(haystack.data(), needle, best);
45 auto found = static_cast<const char*>(ptr) - haystack.data();
46 best = std::min<size_t>(best, found);
49 if (best == haystack.size()) {
50 return StringPiece::npos;
59 // build sse4.2-optimized version even if -msse4.2 is not passed to GCC
60 size_t qfind_first_byte_of_needles16(const StringPiece& haystack,
61 const StringPiece& needles)
62 __attribute__ ((__target__("sse4.2"), noinline));
64 // helper method for case where needles.size() <= 16
65 size_t qfind_first_byte_of_needles16(const StringPiece& haystack,
66 const StringPiece& needles) {
67 DCHECK_LE(needles.size(), 16);
68 if (needles.size() <= 2 && haystack.size() >= 256) {
69 // benchmarking shows that memchr beats out SSE for small needle-sets
70 // with large haystacks.
71 // TODO(mcurtiss): could this be because of unaligned SSE loads?
72 return detail::qfind_first_byte_of_memchr(haystack, needles);
74 auto arr2 = __builtin_ia32_loaddqu(needles.data());
75 for (size_t i = 0; i < haystack.size(); i+= 16) {
76 auto arr1 = __builtin_ia32_loaddqu(haystack.data() + i);
77 auto index = __builtin_ia32_pcmpestri128(arr2, needles.size(),
78 arr1, haystack.size() - i, 0);
83 return StringPiece::npos;
86 // Aho, Hopcroft, and Ullman refer to this trick in "The Design and Analysis
87 // of Computer Algorithms" (1974), but the best description is here:
88 // http://research.swtch.com/sparse
91 FastByteSet() : size_(0) { } // no init of arrays required!
93 inline void add(uint8_t i) {
100 inline bool contains(uint8_t i) const {
101 DCHECK_LE(size_, 256);
102 return sparse_[i] < size_ && dense_[sparse_[i]] == i;
106 uint16_t size_; // can't use uint8_t because it would overflow if all
107 // possible values were inserted.
108 uint8_t sparse_[256];
116 size_t qfind_first_byte_of_byteset(const StringPiece& haystack,
117 const StringPiece& needles) {
119 for (auto needle: needles) {
122 for (size_t index = 0; index < haystack.size(); ++index) {
123 if (s.contains(haystack[index])) {
127 return StringPiece::npos;
130 size_t qfind_first_byte_of_sse42(const StringPiece& haystack,
131 const StringPiece& needles)
132 __attribute__ ((__target__("sse4.2"), noinline));
134 size_t qfind_first_byte_of_sse42(const StringPiece& haystack,
135 const StringPiece& needles) {
136 if (UNLIKELY(needles.empty() || haystack.empty())) {
137 return StringPiece::npos;
138 } else if (needles.size() <= 16) {
139 // we can save some unnecessary load instructions by optimizing for
140 // the common case of needles.size() <= 16
141 return qfind_first_byte_of_needles16(haystack, needles);
144 size_t index = haystack.size();
145 for (size_t i = 0; i < haystack.size(); i += 16) {
147 auto arr1 = __builtin_ia32_loaddqu(haystack.data() + i);
148 for (size_t j = 0; j < needles.size(); j += 16) {
149 auto arr2 = __builtin_ia32_loaddqu(needles.data() + j);
150 auto index = __builtin_ia32_pcmpestri128(arr2, needles.size() - j,
151 arr1, haystack.size() - i, 0);
152 b = std::min<size_t>(index, b);
158 return StringPiece::npos;
161 size_t qfind_first_byte_of_nosse(const StringPiece& haystack,
162 const StringPiece& needles) {
163 if (UNLIKELY(needles.empty() || haystack.empty())) {
164 return StringPiece::npos;
166 // The thresholds below were empirically determined by benchmarking.
167 // This is not an exact science since it depends on the CPU, the size of
168 // needles, and the size of haystack.
169 if (haystack.size() == 1 ||
170 (haystack.size() < 4 && needles.size() <= 16)) {
171 return qfind_first_of(haystack, needles, asciiCaseSensitive);
172 } else if ((needles.size() >= 4 && haystack.size() <= 10) ||
173 (needles.size() >= 16 && haystack.size() <= 64) ||
174 needles.size() >= 32) {
175 return qfind_first_byte_of_byteset(haystack, needles);
178 return qfind_first_byte_of_memchr(haystack, needles);
181 } // namespace detail