From 594d091c1aa21687f898bee9ab8765c9ad5954c5 Mon Sep 17 00:00:00 2001 From: khizmax Date: Sat, 22 Apr 2017 19:31:36 +0300 Subject: [PATCH] [urcu] Replaced RMW atomics with atomic load/store in URCU read-side lock/unlock. Fixing the barriers speeds up a micro-benchmark (16 threads accessing an RCU-protected split-list set) by 30-40%. Thanks to Todd Lipcon, who found this improvement. --- cds/urcu/details/base.h | 4 +++- cds/urcu/details/gp.h | 16 ++++++++++------ cds/urcu/details/sh.h | 21 +++++++++++++-------- change.log | 5 +++-- thanks | 1 + 5 files changed, 30 insertions(+), 17 deletions(-) diff --git a/cds/urcu/details/base.h b/cds/urcu/details/base.h index 5e61254e..599bb2b1 100644 --- a/cds/urcu/details/base.h +++ b/cds/urcu/details/base.h @@ -51,6 +51,8 @@ namespace cds { Chapter 6 "User-Level Implementations of Read-Copy Update" - [2011] M.Desnoyers, P.McKenney, A.Stern, M.Dagenias, J.Walpole "User-Level Implementations of Read-Copy Update" + - [2012] M.Desnoyers, P.McKenney, A.Stern, M.Dagenias, J.Walpole "Supplementary + Material for User-Level Implementations of Read-Copy Update" Informal introduction to user-space %RCU @@ -107,7 +109,7 @@ namespace cds { design, thus being appropriate for use within a general-purpose library, but it has relatively higher read-side overhead. The \p libcds contains several implementations of general-purpose %RCU: \ref general_instant, \ref general_buffered, \ref general_threaded. - - \ref signal_buffered: the signal-handling %RCU presents an implementation having low read-side overhead and + - \p signal_buffered: the signal-handling %RCU presents an implementation having low read-side overhead and requiring only that the application give up one POSIX signal to %RCU update processing. @note The signal-handled %RCU is defined only for UNIX-like systems, not for Windows. 
diff --git a/cds/urcu/details/gp.h b/cds/urcu/details/gp.h index 2bd7e364..41287a7d 100644 --- a/cds/urcu/details/gp.h +++ b/cds/urcu/details/gp.h @@ -68,12 +68,14 @@ namespace cds { namespace urcu { namespace details { uint32_t tmp = pRec->m_nAccessControl.load( atomics::memory_order_relaxed ); if ( (tmp & rcu_class::c_nNestMask) == 0 ) { pRec->m_nAccessControl.store( gp_singleton::instance()->global_control_word(atomics::memory_order_relaxed), - atomics::memory_order_release ); - atomics::atomic_thread_fence( atomics::memory_order_acquire ); - CDS_COMPILER_RW_BARRIER; + atomics::memory_order_relaxed ); + + // acquire barrier + pRec->m_nAccessControl.load( atomics::memory_order_acquire ); } else { - pRec->m_nAccessControl.fetch_add( 1, atomics::memory_order_relaxed ); + // nested lock + pRec->m_nAccessControl.store( tmp + 1, atomics::memory_order_relaxed ); } } @@ -83,8 +85,10 @@ namespace cds { namespace urcu { namespace details { thread_record * pRec = get_thread_record(); assert( pRec != nullptr ); - CDS_COMPILER_RW_BARRIER; - pRec->m_nAccessControl.fetch_sub( 1, atomics::memory_order_release ); + uint32_t tmp = pRec->m_nAccessControl.load( atomics::memory_order_relaxed ); + assert( (tmp & rcu_class::c_nNestMask) > 0 ); + + pRec->m_nAccessControl.store( tmp - 1, atomics::memory_order_release ); } template diff --git a/cds/urcu/details/sh.h b/cds/urcu/details/sh.h index 6de2b70d..fd37602d 100644 --- a/cds/urcu/details/sh.h +++ b/cds/urcu/details/sh.h @@ -69,16 +69,19 @@ namespace cds { namespace urcu { namespace details { assert( pRec != nullptr ); uint32_t tmp = pRec->m_nAccessControl.load( atomics::memory_order_relaxed ); + assert( ( tmp & rcu_class::c_nNestMask ) > 0 ); + if ( (tmp & rcu_class::c_nNestMask) == 0 ) { - pRec->m_nAccessControl.store( - sh_singleton::instance()->global_control_word(atomics::memory_order_acquire), - atomics::memory_order_release - ); + pRec->m_nAccessControl.store( 
sh_singleton::instance()->global_control_word(atomics::memory_order_relaxed), + atomics::memory_order_relaxed ); + + // acquire barrier + pRec->m_nAccessControl.load( atomics::memory_order_acquire ); } else { - pRec->m_nAccessControl.fetch_add( 1, atomics::memory_order_release ); + // nested lock + pRec->m_nAccessControl.store( tmp + 1, atomics::memory_order_relaxed ); } - CDS_COMPILER_RW_BARRIER; } template @@ -87,8 +90,10 @@ namespace cds { namespace urcu { namespace details { thread_record * pRec = get_thread_record(); assert( pRec != nullptr); - CDS_COMPILER_RW_BARRIER; - pRec->m_nAccessControl.fetch_sub( 1, atomics::memory_order_release ); + uint32_t tmp = pRec->m_nAccessControl.load( atomics::memory_order_relaxed ); + assert( ( tmp & rcu_class::c_nNestMask ) > 0 ); + + pRec->m_nAccessControl.store( tmp - 1, atomics::memory_order_release ); } template diff --git a/change.log b/change.log index 4b4c94b0..5c5d5ec6 100644 --- a/change.log +++ b/change.log @@ -16,8 +16,9 @@ - Changed: exception handling. Now, exceptions raise by invoking new cds::throw_exception() function. If you compile your code with exception disabled, the function prints an exception message to stdout and calls abort() - instead of throwing. You can provide your own cds::throw_exception() function - and compile libcds with -DCDS_USER_DEFINED_THROW_EXCEPTION. + instead of throwing. + - Flat Combining: fixed a memory-order bug that can lead to a crash on weakly ordered + architectures like PowerPC or ARM - Added: erase_at( iterator ) function to MichaelHashSet/Map and SplitListSet/Map based on IterableList - Fixed a bug in BronsonAVLTreeMap::extract_min()/extract_max()/clear(). diff --git a/thanks b/thanks index 088a3fa1..5110a641 100644 --- a/thanks +++ b/thanks @@ -16,3 +16,4 @@ Mike Krinkin (https://github.com/krinkinmu) Nikolai Rapotkin rwf (https://github.com/rfw) Tamas Lengyel +Todd Lipcon -- 2.34.1