From 7fd87e7e86bb78dc693da2111bf915761346d540 Mon Sep 17 00:00:00 2001 From: Tudor Bosman Date: Wed, 8 Aug 2012 10:39:47 -0700 Subject: [PATCH] Detect popcnt instruction at runtime, use it if available. Summary: If compiled for a popcnt-supporting target (-march=corei7, for example), use __builtin_popcount, as it's presumably inlined. Otherwise, detect on startup (in the same way as glibc dispatches to one of the many flavors of memcpy): GCC allows us to add a resolver function which the dynamic loader will call on startup to resolve a function to one of various alternatives; we check (using the cpuid instruction) whether popcnt is supported, and use it if available. Test Plan: tests added Reviewed By: soren@fb.com FB internal diff: D542977 --- folly/Bits.cpp | 77 ++++++++++++++++++++++++++ folly/Bits.h | 97 ++++++++++----------------------- folly/CpuId.h | 112 ++++++++++++++++++++++++++++++++++++++ folly/detail/BitsDetail.h | 47 ++++++++++++++++ folly/experimental/Bits.h | 25 --------- folly/test/BitsTest.cpp | 26 ++------- folly/test/CpuIdTest.cpp | 29 ++++++++++ 7 files changed, 298 insertions(+), 115 deletions(-) create mode 100644 folly/Bits.cpp create mode 100644 folly/CpuId.h create mode 100644 folly/detail/BitsDetail.h create mode 100644 folly/test/CpuIdTest.cpp diff --git a/folly/Bits.cpp b/folly/Bits.cpp new file mode 100644 index 00000000..41c2bc46 --- /dev/null +++ b/folly/Bits.cpp @@ -0,0 +1,77 @@ +/* + * Copyright 2012 Facebook, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "folly/Bits.h" + +#include "folly/CpuId.h" + +// None of this is necessary if we're compiling for a target that supports +// popcnt +#ifndef __POPCNT__ + +namespace { + +int popcount_inst(unsigned int x) { + asm ("popcntl %0, %0" : "=r" (x) : "0" (x)); + return x; +} + +int popcount_builtin(unsigned int x) { + return __builtin_popcount(x); +} + +int popcountll_inst(unsigned long long x) { + asm ("popcntq %0, %0" : "=r" (x) : "0" (x)); + return x; +} + +int popcountll_builtin(unsigned long long x) { + return __builtin_popcountll(x); +} + +typedef decltype(popcount_builtin) Type_popcount; +typedef decltype(popcountll_builtin) Type_popcountll; + +} // namespace + +// This function is called on startup to resolve folly::detail::popcount +extern "C" Type_popcount* folly_popcount_ifunc() { + return folly::CpuId().popcnt() ? popcount_inst : popcount_builtin; +} + +// This function is called on startup to resolve folly::detail::popcountll +extern "C" Type_popcountll* folly_popcountll_ifunc() { + return folly::CpuId().popcnt() ? popcountll_inst : popcountll_builtin; +} + +namespace folly { +namespace detail { + +// Call folly_popcount_ifunc on startup to resolve to either popcount_inst +// or popcount_builtin +int popcount(unsigned int x) + __attribute__((ifunc("folly_popcount_ifunc"))); + +// Call folly_popcount_ifunc on startup to resolve to either popcountll_inst +// or popcountll_builtin +int popcountll(unsigned long long x) + __attribute__((ifunc("folly_popcountll_ifunc"))); + +} // namespace detail +} // namespace folly + +#endif /* !__POPCNT__ */ + diff --git a/folly/Bits.h b/folly/Bits.h index f97ada8e..4d4656d2 100644 --- a/folly/Bits.h +++ b/folly/Bits.h @@ -26,6 +26,9 @@ * 1-based. 0 = no bits are set (x == 0) * for x != 0, findLastSet(x) == 1 + floor(log2(x)) * + * popcount(x) + * return the number of 1 bits in x + * * nextPowTwo(x) * Finds the next power of two >= x. * @@ -55,6 +58,11 @@ #define _GNU_SOURCE 1 #endif +#ifndef __GNUC__ +#error GCC required +#endif + +#include "folly/detail/BitsDetail.h" #include "folly/detail/BitIteratorDetail.h" #include "folly/Likely.h" @@ -125,21 +133,6 @@ typename std::enable_if< return findFirstSet(static_cast::type>(x)); } -namespace detail { - -// Portable, but likely slow... -inline unsigned int findLastSetPortable(uint64_t x) { - unsigned int r = (x != 0); // 1-based index, except for x==0 - while (x >>= 1) { - ++r; - } - return r; -} - -} // namespace detail - -#ifdef __GNUC__ - // findLastSet: return the 1-based index of the highest bit set // for x > 0, findLastSet(x) == 1 + floor(log2(x)) template @@ -179,19 +172,6 @@ typename std::enable_if< return x ? 8 * sizeof(unsigned long long) - __builtin_clzll(x) : 0; } -#else /* !__GNUC__ */ - -template -typename std::enable_if< - (std::is_integral::value && - std::is_unsigned::value), - unsigned int>::type - findLastSet(T x) { - return detail:findLastSetPortable(x); -} - -#endif - template typename std::enable_if< (std::is_integral::value && @@ -201,36 +181,6 @@ typename std::enable_if< return findLastSet(static_cast::type>(x)); } -namespace detail { - -template -inline -typename std::enable_if< - std::is_integral::value && std::is_unsigned::value, - T>::type -nextPowTwoPortable(T v) { - if (UNLIKELY(v == 0)) { - return 1; - } - - --v; - for (uint32_t i = 1; i < sizeof(T) * 8; i <<= 8) { - v |= (v >> i); - v |= (v >> (i << 1)); - v |= (v >> (i << 2)); - v |= (v >> (i << 3)); - v |= (v >> (i << 4)); - v |= (v >> (i << 5)); - v |= (v >> (i << 6)); - v |= (v >> (i << 7)); - } - return v + 1; -} - -} // namespace detail - -#ifdef __GNUC__ - template inline typename std::enable_if< @@ -243,20 +193,29 @@ nextPowTwo(T v) { return 1ul << findLastSet(v - 1); } -#else /* __GNUC__ */ - +/** + * Population count + */ template -inline -typename std::enable_if< - std::is_integral::value && std::is_unsigned::value, - T>::type -nextPowTwo(T v) { - return detail::nextPowTwoPortable(v); +inline typename std::enable_if< + (std::is_integral::value && + std::is_unsigned::value && + sizeof(T) <= sizeof(unsigned int)), + size_t>::type + popcount(T x) { + return detail::popcount(x); } -#endif /* __GNUC__ */ - - +template +inline typename std::enable_if< + (std::is_integral::value && + std::is_unsigned::value && + sizeof(T) > sizeof(unsigned int) && + sizeof(T) <= sizeof(unsigned long long)), + size_t>::type + popcount(T x) { + return detail::popcountll(x); +} /** * Endianness detection and manipulation primitives. diff --git a/folly/CpuId.h b/folly/CpuId.h new file mode 100644 index 00000000..ef5a3588 --- /dev/null +++ b/folly/CpuId.h @@ -0,0 +1,112 @@ +/* + * Copyright 2012 Facebook, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef FOLLY_CPUID_H_ +#define FOLLY_CPUID_H_ + +#include + +namespace folly { + +/** + * Identification of an Intel CPU. + * Supports CPUID (EAX=1) feature flags. + * Values from http://www.intel.com/content/www/us/en/processors/processor-identification-cpuid-instruction-note.html + */ +class CpuId { + public: + CpuId() { + __asm__("cpuid" : "=c"(c_), "=d"(d_) : "a"(1) : "ebx"); + } +#define X(name, r, bit) bool name() const { return r & (1U << bit); } +#define C(name, bit) X(name, c_, bit) +#define D(name, bit) X(name, d_, bit) + C(sse3, 0) + C(pclmuldq, 1) + C(dtes64, 2) + C(monitor, 3) + C(dscpl, 4) + C(vmx, 5) + C(smx, 6) + C(eist, 7) + C(tm2, 8) + C(ssse3, 9) + C(cnxtid, 10) + // 11 is reserved + C(fma, 12) + C(cx16, 13) + C(xtpr, 14) + C(pdcm, 15) + // 16 is reserved + C(pcid, 17) + C(dca, 18) + C(sse41, 19) + C(sse42, 20) + C(x2apic, 21) + C(movbe, 22) + C(popcnt, 23) + C(tscdeadline, 24) + C(aes, 25) + C(xsave, 26) + C(osxsave, 27) + C(avx, 28) + C(f16c, 29) + C(rdrand, 30) + // 31 is not used + D(fpu, 0) + D(vme, 1) + D(de, 2) + D(pse, 3) + D(tsc, 4) + D(msr, 5) + D(pae, 6) + D(mce, 7) + D(cx8, 8) + D(apic, 9) + // 10 is reserved + D(sep, 11) + D(mtrr, 12) + D(pge, 13) + D(mca, 14) + D(cmov, 15) + D(pat, 16) + D(pse36, 17) + D(psn, 18) + D(clfsh, 19) + // 20 is reserved + D(ds, 21) + D(acpi, 22) + D(mmx, 23) + D(fxsr, 24) + D(sse, 25) + D(sse2, 26) + D(ss, 27) + D(htt, 28) + D(tm, 29) + // 30 is reserved + D(pbe, 31) +#undef D +#undef C +#undef X + private: + uint32_t c_; // ECX + uint32_t d_; // EDX +}; + +} // namespace folly + +#endif /* FOLLY_CPUID_H_ */ + diff --git a/folly/detail/BitsDetail.h b/folly/detail/BitsDetail.h new file mode 100644 index 00000000..504ed96f --- /dev/null +++ b/folly/detail/BitsDetail.h @@ -0,0 +1,47 @@ +/* + * Copyright 2012 Facebook, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef FOLLY_DETAIL_BITSDETAIL_H_ +#define FOLLY_DETAIL_BITSDETAIL_H_ + +namespace folly { +namespace detail { + +// If we're targeting an architecture with popcnt support, use +// __builtin_popcount directly, as it's presumably inlined. +// If not, use runtime detection using __attribute__((ifunc)) +// (see Bits.cpp) +#ifdef __POPCNT__ + +inline int popcount(unsigned int x) { + return __builtin_popcount(x); +} +inline int popcountll(unsigned long long x) { + return __builtin_popcountll(x); +} + +#else /* !__POPCNT__ */ + +int popcount(unsigned int x); +int popcountll(unsigned long long x); + +#endif /* !__POPCNT__ */ + +} // namespace detail +} // namespace folly + +#endif /* FOLLY_DETAIL_BITSDETAIL_H_ */ + diff --git a/folly/experimental/Bits.h b/folly/experimental/Bits.h index a18f4f95..e3ec6eb6 100644 --- a/folly/experimental/Bits.h +++ b/folly/experimental/Bits.h @@ -30,31 +30,6 @@ namespace folly { // right-shift is arithmetic for signed values, and that can lead to // unpleasant bugs. -/** - * Population count (number of bits set), using __builtin_popcount or - * __builtin_popcountll, depending on size. - */ -template -inline typename std::enable_if< - (std::is_integral::value && - std::is_unsigned::value && - sizeof(T) <= sizeof(unsigned int)), - size_t>::type - popcount(T x) { - return __builtin_popcount(x); -} - -template -inline typename std::enable_if< - (std::is_integral::value && - std::is_unsigned::value && - sizeof(T) > sizeof(unsigned int) && - sizeof(T) <= sizeof(unsigned long long)), - size_t>::type - popcount(T x) { - return __builtin_popcountll(x); -} - namespace detail { /** diff --git a/folly/test/BitsTest.cpp b/folly/test/BitsTest.cpp index 8f08a9e5..99bdcb48 100644 --- a/folly/test/BitsTest.cpp +++ b/folly/test/BitsTest.cpp @@ -37,12 +37,6 @@ void testFFS() { } } -template -unsigned int findLastSetPortable(INT x) { - return detail::findLastSetPortable( - static_cast::type>(x)); -} - template void testFLS() { typedef typename std::make_unsigned::type UINT; @@ -51,11 +45,9 @@ void testFLS() { for (size_t i = 0; i < bits; i++) { INT v1 = static_cast(1) << i; EXPECT_EQ(i + 1, findLastSet(v1)); - EXPECT_EQ(i + 1, findLastSetPortable(v1)); INT v2 = (static_cast(1) << i) - 1; EXPECT_EQ(i, findLastSet(v2)); - EXPECT_EQ(i, findLastSetPortable(v2)); } } @@ -113,8 +105,6 @@ TEST(Bits, FindLastSet) { } -#ifdef __GNUC__ - TEST(Bits, nextPowTwoClz) { testPowTwo(nextPowTwo); } @@ -124,18 +114,13 @@ BENCHMARK(nextPowTwoClz, iters) { x = folly::nextPowTwo(iters); } -#endif - -TEST(Bits, nextPowTwoPortable) { - testPowTwo(detail::nextPowTwoPortable); -} - -BENCHMARK(nextPowTwoPortable, iters) { - x = detail::nextPowTwoPortable(iters); +TEST(Bits, popcount) { + EXPECT_EQ(0, popcount(0U)); + EXPECT_EQ(1, popcount(1U)); + EXPECT_EQ(32, popcount(uint32_t(-1))); + EXPECT_EQ(64, popcount(uint64_t(-1))); } -BENCHMARK_DRAW_LINE(); - int main(int argc, char** argv) { testing::InitGoogleTest(&argc, argv); google::ParseCommandLineFlags(&argc, &argv, true); @@ -153,5 +138,4 @@ Benchmarks run on dual Xeon X5650's @ 2.67GHz w/hyperthreading enabled Benchmark Iters Total t t/iter iter/sec ------------------------------------------------------------------------------ * nextPowTwoClz 1000000 1.659 ms 1.659 ns 574.8 M - +66.8% nextPowTwoPortable 1000000 2.767 ms 2.767 ns 344.7 M */ diff --git a/folly/test/CpuIdTest.cpp b/folly/test/CpuIdTest.cpp new file mode 100644 index 00000000..1692f92c --- /dev/null +++ b/folly/test/CpuIdTest.cpp @@ -0,0 +1,29 @@ +/* + * Copyright 2012 Facebook, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "folly/CpuId.h" + +#include +#include + +using namespace folly; + +TEST(CpuId, Simple) { + // All CPUs should support MMX + CpuId id; + EXPECT_TRUE(id.mmx()); +} + -- 2.34.1