 /* SPDX-License-Identifier: GPL-2.0 */
-/* Based on Linux kernel 4.16.10
- * https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable.git/commit/?h=v4.16.10&id=b3fdf8284efbc5020dfbd0a28150637189076115
+
+/*
+ * Based on Linux kernel 4.16.10
+ * https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable.git
+ * /commit/?h=v4.16.10&id=b3fdf8284efbc5020dfbd0a28150637189076115
+ *
+ * Description:
+ *
+ * This workload implements the kernel's 'optimistic spin queue', which is
+ * derived from the MCS lock. A tunable unqueue_retry count and a maximum
+ * backoff sleep duration have also been added to simulate the need_resched()
+ * condition, unqueue the current cpu node from the spinning queue, and put
+ * it to sleep.
+ *
+ * Changes from Linux kernel osq_lock.c:
+ *
+ * The original DEFINE_PER_CPU_SHARED_ALIGNED(struct optimistic_spin_node,
+ * osq_node) was replaced by a 128-byte aligned optimistic_spin_node C array
+ * allocated on the heap by osq_lock_init() in the main thread and referenced
+ * through the global_osq_nodepool_ptr pointer. The osq lock queue struct
+ * itself is also declared as a global variable, substituting for the
+ * upper-level mutex lock struct that the lock pointer would normally
+ * indicate. Therefore we do not need to obtain the lock pointer from the
+ * lock_acquire() and lock_release() interfaces. Each spinning node is
+ * located linearly from global_osq_nodepool_ptr using threadnum/coreid as
+ * the offset, and the tail of the osq lock is accessed through global_osq
+ * directly (a standalone sketch of this layout follows this comment block).
+ *
+ * We have not changed the algorithm except for adding unqueue_retry and
+ * max_sleep_us as an optional backoff sleep to mimic kernel rescheduling
+ * events. By default we essentially disable unqueue_retry and the backoff
+ * sleep, so that osq_lock performance is more stable and similar to the mcs
+ * queue spinlock.
+ *
+ * Internals:
+ *
+ * In order to port osq_lock from kernel space to user space, we added
+ * lk_barrier.h and lk_cmpxchg.h to synchronization-benchmarks/ext/linux/
+ * include. Because arch/arm64/lib/Makefile uses special gcc options to keep
+ * the compiler from allocating the x16/x17 registers when building
+ * atomic_ll_sc.o, and our osq_lock.h included from lockhammer.c does not
+ * generate a separate object file, we had to modify cmpxchg.h and change
+ * the cmpxchg LLSC/LSE implementation for aarch64.
+ *
+ * Kernel arm64 cmpxchg.h supports both LLSC (load-link/store-conditional)
+ * and LSE (Armv8.1 large system extension) via dynamic binary patching.
+ * If CONFIG_AS_LSE and CONFIG_ARM64_LSE_ATOMICS are enabled, the kernel
+ * uses the new Armv8.1 atomic instruction CAS to implement the compare-
+ * and-swap function. This inline function has 3 instructions, mov/cas/mov,
+ * which are overwritten during system boot if the CPU does not support
+ * Armv8.1 LSE. The 3 new instructions are bl/nop/nop. The branch and link
+ * instruction redirects program flow to the Armv8.0 LLSC function without
+ * saving any of the caller's local registers. These registers are
+ * guaranteed to be safe because the LLSC function in atomic_ll_sc.o only
+ * uses x16/x17 and the LSE caller does not use x16/x17.
+ *
+ * Since lockhammer has no runtime cpu detection, whether to use LLSC or
+ * LSE is defined manually in the lockhammer Makefile. Therefore our new
+ * cmpxchg is also statically defined, without branch and link or binary
+ * patching. The LLSC and LSE cmpxchg share the same interface but use
+ * different assembly code and functions (a sketch of the two variants
+ * follows this comment block).
+ *
+ * Workings:
+ *
+ * osq_lock works similarly to the mcs spinlock except for the optional
+ * unqueue path. The Linux kernel qspinlock is slightly different from the
+ * original mcs spinlock.
+ *
+ * Tuning Parameters:
+ *
+ * Optional unqueue and backoff sleep feature, like the kernel mutex:
+ *
+ * [-- [-u unqueue_retry]]: how many spin retries before jumping to the
+ *                          unqueue path and stopping spinning.
+ *
+ * [-- [-s max_sleep_us]]:  how long to sleep after unqueueing from the osq
+ *                          before another osq_lock() acquisition attempt.
+ *                          This parameter only defines the maximum sleep
+ *                          time in microseconds; each thread sleeps for a
+ *                          random time shorter than max_sleep_us. The
+ *                          actual sleep time is predetermined during the
+ *                          main thread initialization phase with the
+ *                          uniform-distribution random function rand().
+ *
 */
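Below is a minimal standalone sketch of the heap-allocated node pool and the linear per-thread node lookup described in the comment above. It is illustrative only, not the workload's actual code: the struct fields, the example_* names, and the use of posix_memalign() are assumptions made for the example.

#include <stdlib.h>

/* one 128-byte aligned spin node per cpu/thread (fields are illustrative) */
struct example_spin_node {
    struct example_spin_node *next;
    struct example_spin_node *prev;
    int locked;
} __attribute__((aligned(128)));

/* plays the role of global_osq_nodepool_ptr */
static struct example_spin_node *example_nodepool;

/* called once from the main thread, in the same spirit as osq_lock_init() */
static int example_pool_init(unsigned long cores)
{
    return posix_memalign((void **)&example_nodepool, 128,
                          cores * sizeof(struct example_spin_node));
}

/* a spinning thread finds its node by a linear offset from the pool base */
static struct example_spin_node *example_node_of(unsigned long threadnum)
{
    return example_nodepool + threadnum;
}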
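As a rough illustration of the two cmpxchg flavors discussed above, here is a user-space sketch of a 64-bit compare-and-swap written once with Armv8.0 LLSC (ldaxr/stlxr) and once with the Armv8.1 LSE casal instruction. This is not the code in lk_cmpxchg.h: the function names are hypothetical, the snippet only builds on aarch64, and the LSE variant additionally requires compiling for armv8.1-a or newer.

#include <stdint.h>

/* Armv8.0 LLSC flavor: retry loop on load-exclusive/store-exclusive */
static inline uint64_t example_cmpxchg64_llsc(uint64_t *ptr, uint64_t old, uint64_t new)
{
    uint64_t oldval;
    uint32_t tmp;

    asm volatile(
    "1: ldaxr   %0, %2\n"
    "   cmp     %0, %3\n"
    "   b.ne    2f\n"
    "   stlxr   %w1, %4, %2\n"
    "   cbnz    %w1, 1b\n"
    "2:"
    : "=&r" (oldval), "=&r" (tmp), "+Q" (*ptr)
    : "r" (old), "r" (new)
    : "cc", "memory");

    return oldval;  /* caller checks whether the returned value equals 'old' */
}

/* Armv8.1 LSE flavor: a single compare-and-swap instruction, no retry loop */
static inline uint64_t example_cmpxchg64_lse(uint64_t *ptr, uint64_t old, uint64_t new)
{
    uint64_t ret = old;

    asm volatile(
    "   casal   %0, %2, %1\n"
    : "+&r" (ret), "+Q" (*ptr)
    : "r" (new)
    : "memory");

    return ret;     /* also the value previously held in *ptr */
}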
 
 #ifndef __LINUX_OSQ_LOCK_H
 #define __LINUX_OSQ_LOCK_H
 
+/* redefine initialize_lock and parse_test_args with local functions */
 #ifdef initialize_lock
 #undef initialize_lock
 #endif
  * An MCS like lock especially tailored for optimistic spinning for sleeping
  * lock implementations (mutex, rwsem, etc).
  *
- * Use 128 bytes alignment to eliminate false sharing for various Armv8 core
+ * Using a single mcs node per CPU is safe because sleeping locks should not be
+ * called from interrupt context and we have preemption disabled while
+ * spinning.
+ *
+ * Use 128-byte alignment to eliminate false sharing for various Armv8 core
  * cache line sizes.
  */
 struct optimistic_spin_node {
@@ -56,28 +140,30 @@ struct optimistic_spin_queue {
  * we need to tune this parameter for different machines.
  * http://www.brendangregg.com/blog/2017-03-16/perf-sched.html
  */
-#define MAX_SLEEP_US 0
+#define MAX_BACKOFF_SLEEP_US 0
 
 /*
- * Default unqueue_retry times, most system spins at least 500~1000 times
+ * Default unqueue retry count; most systems spin at least 500~1000 times
  * before unqueueing from the optimistic_spin_queue. The large default value
  * simply disables the unqueue path and makes osq_lock behave more like
  * mcs_queue_spinlock.
  */
-#define UNQUEUE_RETRY 1000000000
+#define DEFAULT_UNQUEUE_RETRY 2000000000
 
 /* Init macro and function. */
 #define OSQ_LOCK_UNLOCKED { ATOMIC_INIT(OSQ_UNLOCKED_VAL) }
 
-long long unqueue_retry;
-long long max_sleep_us;
-struct optimistic_spin_queue global_osq;
-struct optimistic_spin_node *global_osq_nodepool_ptr;
+/* Newly added global variables used by the osq_lock algorithm */
+static long long unqueue_retry;
+static long long max_sleep_us;
+static struct optimistic_spin_queue global_osq;
+static struct optimistic_spin_node *global_osq_nodepool_ptr;
 
-void osq_parse_args(test_args unused, int argc, char** argv) {
+/* Parse the additional tuning parameters for the optional backoff sleep */
+static void osq_parse_args(test_args unused, int argc, char** argv) {
     int i = 0;
     char *endptr;
-    unqueue_retry = UNQUEUE_RETRY;
-    max_sleep_us = MAX_SLEEP_US;
+    unqueue_retry = DEFAULT_UNQUEUE_RETRY;
+    max_sleep_us = MAX_BACKOFF_SLEEP_US;
 
     /* extended options retrieved after the '--' operator */
     while ((i = getopt(argc, argv, "u:s:")) != -1)
@@ -88,7 +174,8 @@ void osq_parse_args(test_args unused, int argc, char** argv) {
             unqueue_retry = strtoll(optarg, &endptr, 10);
             if ((errno == ERANGE && (unqueue_retry == LONG_LONG_MAX))
                 || (errno != 0 && unqueue_retry == 0) || endptr == optarg) {
-                fprintf(stderr, "unqueue_retry: value unsuitable for 'long long int'\n");
+                fprintf(stderr, "unqueue_retry: value unsuitable "
+                        "for 'long long int'\n");
                 exit(1);
             }
             break;
@@ -98,7 +185,8 @@ void osq_parse_args(test_args unused, int argc, char** argv) {
             max_sleep_us = strtoll(optarg, &endptr, 10);
             if ((errno == ERANGE && (max_sleep_us == LONG_LONG_MAX))
                 || (errno != 0 && max_sleep_us == 0) || endptr == optarg) {
-                fprintf(stderr, "max_sleep_us: value unsuitable for 'long long int'\n");
+                fprintf(stderr, "max_sleep_us: value unsuitable "
+                        "for 'long long int'\n");
                 exit(1);
             } else if (max_sleep_us < 0) {
                 fprintf(stderr, "max_sleep_us must be a non-negative integer.\n");
@@ -110,8 +198,8 @@ void osq_parse_args(test_args unused, int argc, char** argv) {
         fprintf(stderr,
                 "osq_lock additional options after --:\n"
                 "\t[-h print this msg]\n"
-                "\t[-u max spin retries before unqueue, default 1000000]\n"
-                "\t[-s max unqueue sleep in microseconds, default 10]\n");
+                "\t[-u max spin retries before unqueue, default 2 billion]\n"
+                "\t[-s max unqueue sleep in microseconds, default 0]\n");
         exit(2);
     }
 }
@@ -140,8 +228,8 @@ static inline void osq_lock_init(uint64_t *lock, unsigned long cores)
  * If osq spins more than unqueue_retry times, the spinning cpu may back off
  * and sleep for 1 ~ 10 microseconds (on average 5 microseconds). Each spinning
  * thread uses a different backoff sleep time, and we can adjust the maximum
- * sleep time by redefine the default MAX_SLEEP_US or tuning via parameter '-s'
- * By default, we disable this sleep (MAX_SLEEP_US = 0)
+ * sleep time by redefining MAX_BACKOFF_SLEEP_US or tuning it via the '-s'
+ * parameter. By default, we disable this sleep (MAX_BACKOFF_SLEEP_US = 0).
  *
  * Note: Avoid assigning random_sleep a negative value, otherwise usleep would
  * have a very large sleep time after the negative value is implicitly cast to uint32_t.
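A minimal sketch of the per-thread backoff sleep selection that the comment above describes. The helper names and the exact rand()-based formula are assumptions for illustration; the sketch only follows the documented constraints (uniformly distributed, at most max_sleep_us, never negative).

#include <stdint.h>
#include <stdlib.h>
#include <unistd.h>

/* Pick one sleep value per thread during main-thread initialization. */
static uint32_t example_pick_backoff_us(long long max_us)
{
    if (max_us <= 0)
        return 0;                         /* backoff sleep disabled */
    /* uniform in [1, max_us]; kept unsigned so usleep() never receives a
       negative value converted to a huge uint32_t */
    return 1u + (uint32_t)(rand() % max_us);
}

/* On the unqueue path, the spinning thread would then simply sleep. */
static void example_backoff(uint32_t my_sleep_us)
{
    if (my_sleep_us)
        usleep(my_sleep_us);
}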
@@ -156,9 +244,6 @@ static inline void osq_lock_init(uint64_t *lock, unsigned long cores)
     atomic_set(&global_osq.tail, OSQ_UNLOCKED_VAL);
 }
 
-bool osq_lock(uint64_t *osq, unsigned long cpu_number);
-void osq_unlock(uint64_t *osq, unsigned long cpu_number);
-
 static inline bool osq_is_locked(struct optimistic_spin_queue *lock)
 {
     return atomic_read(&lock->tail) != OSQ_UNLOCKED_VAL;
@@ -244,7 +329,7 @@ osq_wait_next(struct optimistic_spin_queue *lock,
 }
 
 /* uint64_t *osq is ignored because we use &global_osq instead */
-bool osq_lock(uint64_t *osq, unsigned long cpu_number)
+static bool osq_lock(uint64_t *osq, unsigned long cpu_number)
 {
     /* each cpu core has only one thread spinning on one optimistic_spin_node */
     struct optimistic_spin_node *node = global_osq_nodepool_ptr + cpu_number;
@@ -309,7 +394,7 @@ bool osq_lock(uint64_t *osq, unsigned long cpu_number)
          * lock holder.
          */
         //if (need_resched() || vcpu_is_preempted(node_to_cpu(node->prev)))
-        if (++back_off > unqueue_retry) /* default UNQUEUE_RETRY 1 billion */
+        if (++back_off > unqueue_retry) /* DEFAULT_UNQUEUE_RETRY is 2 billion */
             goto unqueue;
 
         cpu_relax();
@@ -373,7 +458,7 @@ bool osq_lock(uint64_t *osq, unsigned long cpu_number)
 }
 
 /* uint64_t *osq is ignored because we use &global_osq instead */
-void osq_unlock(uint64_t *osq, unsigned long cpu_number)
+static void osq_unlock(uint64_t *osq, unsigned long cpu_number)
 {
     /* optimistic_spin_queue stores the current osq tail globally */
     struct optimistic_spin_queue *lock = &global_osq;
@@ -408,14 +493,15 @@ void osq_unlock(uint64_t *osq, unsigned long cpu_number)
 
 
 /* standard lockhammer lock_acquire and lock_release interfaces */
-unsigned long __attribute__((noinline)) lock_acquire(uint64_t *lock, unsigned long threadnum)
+static unsigned long __attribute__((noinline))
+lock_acquire(uint64_t *lock, unsigned long threadnum)
 {
     /*
      * Note: The Linux kernel implements an additional mutex slow path in the
      * mutex.c __mutex_lock_common() function. We will create another workload
      * that combines osq_lock and mutex_lock_common. This workload only benchmarks
-     * osq_lock itself. The osq_lock is different from mcs_queue_spinlock because
-     * of tunable unqueue path and backoff sleep time.
+     * osq_lock itself. The osq_lock differs from mcs_queue_spinlock because
+     * of its tunable unqueue path and backoff sleep time.
      */
     while (!osq_lock(lock, threadnum)) {
         /*
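For orientation, a minimal sketch of how a benchmark thread might drive this interface: acquire, touch a protected counter, release. This is not lockhammer's harness code; the names below and the matching lock_release() counterpart are assumed for illustration (the lock pointer argument is ignored by this workload, which always operates on &global_osq).

/* hypothetical: the release half of the interface, defined elsewhere in this header */
static void lock_release(uint64_t *lock, unsigned long threadnum);

static uint64_t example_lock;            /* value unused: the workload uses &global_osq */
static unsigned long example_counter;

static void example_iteration(unsigned long threadnum)
{
    lock_acquire(&example_lock, threadnum);   /* spins in osq_lock(), may unqueue and sleep */
    example_counter++;                        /* critical section */
    lock_release(&example_lock, threadnum);   /* would call osq_unlock() */
}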