[urcu] Replaced RMW atomics with atomic load/store in URCU read-side lock/unlock.
author khizmax <libcds.dev@gmail.com>
Sat, 22 Apr 2017 16:31:36 +0000 (19:31 +0300)
committer khizmax <libcds.dev@gmail.com>
Sat, 22 Apr 2017 16:31:36 +0000 (19:31 +0300)
Fixing the barriers speeds up a micro-benchmark (16 threads accessing an RCU-protected split-list set) by 30-40%. Thanks to Todd Lipcon, who found this improvement.

cds/urcu/details/base.h
cds/urcu/details/gp.h
cds/urcu/details/sh.h
change.log
thanks

index 5e61254..599bb2b 100644 (file)
@@ -51,6 +51,8 @@ namespace cds {
           Chapter 6 "User-Level Implementations of Read-Copy Update"
         - [2011] M.Desnoyers, P.McKenney, A.Stern, M.Dagenias, J.Walpole "User-Level
           Implementations of Read-Copy Update"
+        - [2012] M.Desnoyers, P.McKenney, A.Stern, M.Dagenias, J.Walpole "Supplementary
+          Material for User-Level Implementations of Read-Copy Update"
 
         <b>Informal introduction to user-space %RCU</b>
 
@@ -107,7 +109,7 @@ namespace cds {
           design, thus being appropriate for use within a general-purpose library, but it has
           relatively higher read-side overhead. The \p libcds contains several implementations of general-purpose
           %RCU: \ref general_instant, \ref general_buffered, \ref general_threaded.
-        - \ref signal_buffered: the signal-handling %RCU presents an implementation having low read-side overhead and
+        - \p signal_buffered: the signal-handling %RCU presents an implementation having low read-side overhead and
           requiring only that the application give up one POSIX signal to %RCU update processing.
 
         @note The signal-handled %RCU is defined only for UNIX-like systems, not for Windows.
index 2bd7e36..41287a7 100644 (file)
@@ -68,12 +68,14 @@ namespace cds { namespace urcu { namespace details {
         uint32_t tmp = pRec->m_nAccessControl.load( atomics::memory_order_relaxed );
         if ( (tmp & rcu_class::c_nNestMask) == 0 ) {
             pRec->m_nAccessControl.store( gp_singleton<RCUtag>::instance()->global_control_word(atomics::memory_order_relaxed),
-                atomics::memory_order_release );
-            atomics::atomic_thread_fence( atomics::memory_order_acquire );
-            CDS_COMPILER_RW_BARRIER;
+                atomics::memory_order_relaxed );
+
+            // acquire barrier
+            pRec->m_nAccessControl.load( atomics::memory_order_acquire );
         }
         else {
-            pRec->m_nAccessControl.fetch_add( 1, atomics::memory_order_relaxed );
+            // nested lock
+            pRec->m_nAccessControl.store( tmp + 1, atomics::memory_order_relaxed );
         }
     }
 
@@ -83,8 +85,10 @@ namespace cds { namespace urcu { namespace details {
         thread_record * pRec = get_thread_record();
         assert( pRec != nullptr );
 
-        CDS_COMPILER_RW_BARRIER;
-        pRec->m_nAccessControl.fetch_sub( 1, atomics::memory_order_release );
+        uint32_t tmp = pRec->m_nAccessControl.load( atomics::memory_order_relaxed );
+        assert( (tmp & rcu_class::c_nNestMask) > 0 );
+
+        pRec->m_nAccessControl.store( tmp - 1, atomics::memory_order_release );
     }
 
     template <typename RCUtag>
index 6de2b70..fd37602 100644 (file)
@@ -69,16 +69,19 @@ namespace cds { namespace urcu { namespace details {
         assert( pRec != nullptr );
 
         uint32_t tmp = pRec->m_nAccessControl.load( atomics::memory_order_relaxed );
+        // No nesting assert here: the nest counter is 0 on the outermost lock, which the branch below handles
+
         if ( (tmp & rcu_class::c_nNestMask) == 0 ) {
-            pRec->m_nAccessControl.store(
-                sh_singleton<RCUtag>::instance()->global_control_word(atomics::memory_order_acquire),
-                atomics::memory_order_release
-            );
+            pRec->m_nAccessControl.store( sh_singleton<RCUtag>::instance()->global_control_word(atomics::memory_order_relaxed),
+                atomics::memory_order_relaxed );
+
+            // acquire barrier
+            pRec->m_nAccessControl.load( atomics::memory_order_acquire );
         }
         else {
-            pRec->m_nAccessControl.fetch_add( 1, atomics::memory_order_release );
+            // nested lock
+            pRec->m_nAccessControl.store( tmp + 1, atomics::memory_order_relaxed );
         }
-        CDS_COMPILER_RW_BARRIER;
     }
 
     template <typename RCUtag>
@@ -87,8 +90,10 @@ namespace cds { namespace urcu { namespace details {
         thread_record * pRec = get_thread_record();
         assert( pRec != nullptr);
 
-        CDS_COMPILER_RW_BARRIER;
-        pRec->m_nAccessControl.fetch_sub( 1, atomics::memory_order_release );
+        uint32_t tmp = pRec->m_nAccessControl.load( atomics::memory_order_relaxed );
+        assert( ( tmp & rcu_class::c_nNestMask ) > 0 );
+
+        pRec->m_nAccessControl.store( tmp - 1, atomics::memory_order_release );
     }
 
     template <typename RCUtag>
index 4b4c94b..5c5d5ec 100644 (file)
@@ -16,8 +16,9 @@
     - Changed: exception handling. Now, exceptions raise by invoking new 
       cds::throw_exception() function. If you compile your code with exception disabled,
       the function prints an exception message to stdout and calls abort()
-      instead of throwing. You can provide your own cds::throw_exception() function 
-      and compile libcds with -DCDS_USER_DEFINED_THROW_EXCEPTION.
+      instead of throwing.
+    - Flat Combining: fixed a memory-ordering bug that can lead to a crash on weakly
+      ordered architectures like PowerPC or ARM
     - Added: erase_at( iterator ) function to MichaelHashSet/Map and SplitListSet/Map
       based on IterableList
     - Fixed a bug in BronsonAVLTreeMap::extract_min()/extract_max()/clear().
diff --git a/thanks b/thanks
index 088a3fa..5110a64 100644 (file)
--- a/thanks
+++ b/thanks
@@ -16,3 +16,4 @@ Mike Krinkin (https://github.com/krinkinmu)
 Nikolai Rapotkin\r
 rwf (https://github.com/rfw)\r
 Tamas Lengyel\r
+Todd Lipcon\r