From 4988b28c014b35129a6e84a47a68b03ce1ed23ca Mon Sep 17 00:00:00 2001 From: Philip Pronin Date: Thu, 24 Jan 2013 16:36:11 -0800 Subject: [PATCH] qfind_first_byte_of may suffer from global initialization order Summary: ##static## handling adds ~2 more ns overhead per call (and the first call is kinda slow), but now the logic is correct now. Also inlined ##qfind_first_byte_of##. Test Plan: unittests Reviewed By: tudorb@fb.com FB internal diff: D687947 --- folly/Range.cpp | 76 ++++++++++++++++------------------------ folly/Range.h | 18 ++++++++-- folly/test/RangeTest.cpp | 6 ++-- 3 files changed, 49 insertions(+), 51 deletions(-) diff --git a/folly/Range.cpp b/folly/Range.cpp index 6149f4ed..94dc4fe9 100644 --- a/folly/Range.cpp +++ b/folly/Range.cpp @@ -14,14 +14,11 @@ * limitations under the License. */ -// // @author Mark Rabkin (mrabkin@fb.com) // @author Andrei Alexandrescu (andrei.alexandrescu@fb.com) -// #include "folly/Range.h" -#include "folly/CpuId.h" #include "folly/Likely.h" namespace folly { @@ -86,39 +83,6 @@ size_t qfind_first_byte_of_needles16(const StringPiece& haystack, return StringPiece::npos; } -size_t qfind_first_byte_of_sse42(const StringPiece& haystack, - const StringPiece& needles) - __attribute__ ((__target__("sse4.2"), noinline)); - -size_t qfind_first_byte_of_sse42(const StringPiece& haystack, - const StringPiece& needles) { - if (UNLIKELY(needles.empty() || haystack.empty())) { - return StringPiece::npos; - } else if (needles.size() <= 16) { - // we can save some unnecessary load instructions by optimizing for - // the common case of needles.size() <= 16 - return qfind_first_byte_of_needles16(haystack, needles); - } - - size_t index = haystack.size(); - for (size_t i = 0; i < haystack.size(); i += 16) { - size_t b = 16; - auto arr1 = __builtin_ia32_loaddqu(haystack.data() + i); - for (size_t j = 0; j < needles.size(); j += 16) { - auto arr2 = __builtin_ia32_loaddqu(needles.data() + j); - auto index = __builtin_ia32_pcmpestri128(arr2, needles.size() - j, - arr1, haystack.size() - i, 0); - b = std::min(index, b); - } - if (b < 16) { - return i + b; - } - }; - return StringPiece::npos; -} - -typedef decltype(qfind_first_byte_of_sse42) Type_qfind_first_byte_of; - // Aho, Hopcroft, and Ullman refer to this trick in "The Design and Analysis // of Computer Algorithms" (1974), but the best description is here: // http://research.swtch.com/sparse @@ -163,6 +127,37 @@ size_t qfind_first_byte_of_byteset(const StringPiece& haystack, return StringPiece::npos; } +size_t qfind_first_byte_of_sse42(const StringPiece& haystack, + const StringPiece& needles) + __attribute__ ((__target__("sse4.2"), noinline)); + +size_t qfind_first_byte_of_sse42(const StringPiece& haystack, + const StringPiece& needles) { + if (UNLIKELY(needles.empty() || haystack.empty())) { + return StringPiece::npos; + } else if (needles.size() <= 16) { + // we can save some unnecessary load instructions by optimizing for + // the common case of needles.size() <= 16 + return qfind_first_byte_of_needles16(haystack, needles); + } + + size_t index = haystack.size(); + for (size_t i = 0; i < haystack.size(); i += 16) { + size_t b = 16; + auto arr1 = __builtin_ia32_loaddqu(haystack.data() + i); + for (size_t j = 0; j < needles.size(); j += 16) { + auto arr2 = __builtin_ia32_loaddqu(needles.data() + j); + auto index = __builtin_ia32_pcmpestri128(arr2, needles.size() - j, + arr1, haystack.size() - i, 0); + b = std::min(index, b); + } + if (b < 16) { + return i + b; + } + }; + return StringPiece::npos; +} + size_t qfind_first_byte_of_nosse(const StringPiece& haystack, const StringPiece& needles) { if (UNLIKELY(needles.empty() || haystack.empty())) { @@ -183,14 +178,5 @@ size_t qfind_first_byte_of_nosse(const StringPiece& haystack, return qfind_first_byte_of_memchr(haystack, needles); } -auto const qfind_first_byte_of_fn = - folly::CpuId().sse42() ? qfind_first_byte_of_sse42 - : qfind_first_byte_of_nosse; - -size_t qfind_first_byte_of(const StringPiece& haystack, - const StringPiece& needles) { - return qfind_first_byte_of_fn(haystack, needles); -} - } // namespace detail } // namespace folly diff --git a/folly/Range.h b/folly/Range.h index 196e8d3d..d69d335e 100644 --- a/folly/Range.h +++ b/folly/Range.h @@ -32,6 +32,7 @@ #include #include #include +#include "folly/CpuId.h" #include "folly/Traits.h" namespace folly { @@ -593,8 +594,21 @@ size_t qfind(const Range& haystack, } namespace detail { -size_t qfind_first_byte_of(const StringPiece& haystack, - const StringPiece& needles); + +size_t qfind_first_byte_of_sse42(const StringPiece& haystack, + const StringPiece& needles); + +size_t qfind_first_byte_of_nosse(const StringPiece& haystack, + const StringPiece& needles); + +inline size_t qfind_first_byte_of(const StringPiece& haystack, + const StringPiece& needles) { + static auto const qfind_first_byte_of_fn = + folly::CpuId().sse42() ? qfind_first_byte_of_sse42 + : qfind_first_byte_of_nosse; + return qfind_first_byte_of_fn(haystack, needles); +} + } // namespace detail template diff --git a/folly/test/RangeTest.cpp b/folly/test/RangeTest.cpp index cb8aec8a..6b297252 100644 --- a/folly/test/RangeTest.cpp +++ b/folly/test/RangeTest.cpp @@ -14,7 +14,6 @@ * limitations under the License. */ -// // @author Kristina Holst (kholst@fb.com) // @author Andrei Alexandrescu (andrei.alexandrescu@fb.com) @@ -25,6 +24,7 @@ #include "folly/Range.h" namespace folly { namespace detail { + // declaration of functions in Range.cpp size_t qfind_first_byte_of_memchr(const StringPiece& haystack, const StringPiece& needles); @@ -32,9 +32,7 @@ size_t qfind_first_byte_of_memchr(const StringPiece& haystack, size_t qfind_first_byte_of_byteset(const StringPiece& haystack, const StringPiece& needles); -size_t qfind_first_byte_of_nosse(const StringPiece& haystack, - const StringPiece& needles); -}} +}} // namespaces using namespace folly; using namespace std; -- 2.34.1