Commit d2b704b

Author: zoybai (committed)
Add detailed introduction for osq_lock workload
1. A detailed introduction comment has been added before the osq_lock.h implementation. 2. Redefine all functions and variables as static to limit their scope. 3. Rename variables to better reflect their use.
1 parent 4f32ba4 commit d2b704b

File tree

1 file changed: +110 −24 lines

ext/linux/osq_lock.h

Lines changed: 110 additions & 24 deletions
@@ -1,11 +1,91 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/* Based on Linux kernel 4.16.10
- * https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable.git/commit/?h=v4.16.10&id=b3fdf8284efbc5020dfbd0a28150637189076115
+
+/*
+ * Based on Linux kernel 4.16.10
+ * https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable.git
+ * /commit/?h=v4.16.10&id=b3fdf8284efbc5020dfbd0a28150637189076115
+ *
+ * Description:
+ *
+ * This workload implements the kernel 'optimistic spin queue', which is
+ * derived from the MCS lock. A tunable unqueue_retry count and
+ * max_backoff_sleep duration have also been added to simulate the
+ * need_resched() condition, unqueue the current cpu node from the
+ * spinning queue and put it to sleep.
+ *
+ * Changes from Linux kernel osq_lock.c
+ *
+ * The original DEFINE_PER_CPU_SHARED_ALIGNED(struct optimistic_spin_node,
+ * osq_node) was replaced by a 128-byte aligned optimistic_spin_node C
+ * array allocated on the heap during osq_lock_init() in the main thread
+ * and pointed to by global_osq_nodepool_ptr. The osq lock queue struct
+ * itself is also declared as a global variable, substituting for the
+ * upper level mutex lock struct indicated by the lock pointer. Therefore
+ * we don't need to obtain the lock pointer from the lock_acquire() and
+ * lock_release() interfaces. The spinning node structure can be located
+ * linearly from global_osq_nodepool_ptr with threadnum/coreid as the
+ * offset, and the tail of the osq lock can be accessed through global_osq
+ * directly.
+ *
+ * We haven't changed the algorithm except for adding unqueue_retry and
+ * max_sleep_us as an optional backoff sleep to mimic kernel rescheduling
+ * events. By default we essentially disable unqueue_retry and backoff
+ * sleep so that osq_lock performance is more stable and similar to the
+ * mcs queue spinlock.
+ *
+ * Internals:
+ *
+ * In order to port osq_lock from kernel space to user space, we added
+ * lk_barrier.h and lk_cmpxchg.h to synchronization-benchmarks/ext/linux/
+ * include. Because arch/arm64/lib/Makefile uses special gcc options to
+ * keep the compiler from allocating the x16/x17 registers when building
+ * atomic_ll_sc.o, and because our osq_lock.h is included from
+ * lockhammer.c and does not generate a separate object file, we have to
+ * modify cmpxchg.h and change the cmpxchg LLSC/LSE implementation for
+ * aarch64.
+ *
+ * Kernel arm64 cmpxchg.h supports both LLSC (load-link/store-conditional)
+ * and LSE (Armv8.1 large system extension) via dynamic binary patching.
+ * If CONFIG_AS_LSE and CONFIG_ARM64_LSE_ATOMICS are enabled, the kernel
+ * uses the Armv8.1 atomic instruction CAS to implement the compare and
+ * swap function. This inline function has 3 instructions, mov/cas/mov,
+ * which are overwritten during system boot if the CPU doesn't support
+ * Armv8.1 LSE. The 3 new instructions are bl/nop/nop. The branch and
+ * link instruction redirects program flow to the Armv8.0 LLSC function
+ * without saving any of the caller's local registers. These registers
+ * are guaranteed to be safe because the LLSC function in atomic_ll_sc.o
+ * only uses x16/x17 and the LSE caller doesn't use x16/x17.
+ *
+ * Since lockhammer doesn't have runtime cpu detection, whether to use
+ * LLSC or LSE is manually defined in the lockhammer Makefile. Therefore
+ * our new cmpxchg is also statically defined, without branch and link or
+ * binary patching. LLSC and LSE cmpxchg share the same interface but use
+ * different assembly code and functions.
+ *
+ * Workings:
+ *
+ * osq_lock works similarly to the mcs spinlock except for the optional
+ * unqueue path. The Linux kernel qspinlock is slightly different from the
+ * original mcs spinlock.
+ *
+ * Tuning Parameters
+ *
+ * Optional unqueue and backoff sleep feature like the kernel mutex:
+ *
+ * [-- [-u unqueue_retry]]: how many spin retries before jumping to the
+ *                          unqueue path and stopping spinning.
+ *
+ * [-- [-s max_sleep_us]]: how long to sleep after unqueueing from the osq
+ *                         before another osq_lock() acquisition attempt.
+ *                         This parameter only defines the maximum sleep
+ *                         time in microseconds; each thread sleeps for a
+ *                         random time less than max_sleep_us. The actual
+ *                         sleep time is predetermined during the main
+ *                         thread initialization phase with the uniform
+ *                         distribution random function rand().
+ *
  */
 
 #ifndef __LINUX_OSQ_LOCK_H
 #define __LINUX_OSQ_LOCK_H
 
+/* redefine initialize_lock and parse_test_args with local functions */
 #ifdef initialize_lock
 #undef initialize_lock
 #endif
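
The Internals section of the new header comment describes replacing the kernel's runtime LLSC/LSE patching with a build-time choice in the lockhammer Makefile. A minimal user-space sketch of that idea is shown below; it uses the GCC __atomic builtin rather than the hand-written assembly in lk_cmpxchg.h, and the function name is illustrative only:

#include <stdint.h>

/*
 * Illustrative only, not the repo's implementation. Built with
 * -march=armv8.1-a+lse the compiler can emit a single CAS instruction for
 * this builtin; built for plain Armv8.0 it emits an LDXR/STXR retry loop.
 * Either way the choice is fixed at compile time, mirroring the
 * Makefile-time LLSC/LSE selection described above.
 */
static inline uint64_t example_cmpxchg64(uint64_t *ptr, uint64_t old, uint64_t new)
{
    uint64_t expected = old;

    __atomic_compare_exchange_n(ptr, &expected, new, 0 /* strong */,
                                __ATOMIC_ACQ_REL, __ATOMIC_ACQUIRE);
    return expected;    /* value observed at *ptr, like the kernel cmpxchg() */
}
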
@@ -29,7 +109,11 @@
  * An MCS like lock especially tailored for optimistic spinning for sleeping
  * lock implementations (mutex, rwsem, etc).
  *
- * Use 128 bytes alignment to eliminate false sharing for various Armv8 core
+ * Using a single mcs node per CPU is safe because sleeping locks should not be
+ * called from interrupt context and we have preemption disabled while
+ * spinning.
+ *
+ * Using 128 bytes alignment to eliminate false sharing for various Armv8 core
  * cache line size
  */
 struct optimistic_spin_node {
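
The body of struct optimistic_spin_node falls outside this diff's context. For reference, the kernel's version carries next/prev pointers plus locked and cpu fields; a hedged sketch of what a 128-byte aligned variant could look like (field set taken from the kernel, the alignment attribute and type name here are assumptions):

/* Sketch only; the file's actual struct and padding may differ. */
struct optimistic_spin_node_sketch {
    struct optimistic_spin_node_sketch *next, *prev;
    int locked;     /* 1 if lock acquired */
    int cpu;        /* encoded cpu number + 1; 0 means not queued */
} __attribute__((aligned(128)));   /* matches the 128-byte alignment noted above */
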
@@ -56,28 +140,30 @@ struct optimistic_spin_queue {
  * we need to tune this parameter for different machines.
  * http://www.brendangregg.com/blog/2017-03-16/perf-sched.html
  */
-#define MAX_SLEEP_US 0
+#define MAX_BACKOFF_SLEEP_US 0
 
 /*
- * Default unqueue_retry times, most system spins at least 500~1000 times
+ * Default unqueue retry times, most system spins at least 500~1000 times
  * before unqueue from optimistic_spin_queue. Default large value simply
  * disables unqueue path and make osq_lock more like mcs_queue_spinlock.
  */
-#define UNQUEUE_RETRY 1000000000
+#define DEFAULT_UNQUEUE_RETRY 2000000000
 
 /* Init macro and function. */
 #define OSQ_LOCK_UNLOCKED { ATOMIC_INIT(OSQ_UNLOCKED_VAL) }
 
-long long unqueue_retry;
-long long max_sleep_us;
-struct optimistic_spin_queue global_osq;
-struct optimistic_spin_node *global_osq_nodepool_ptr;
+/* Newly added global variables used by osq_lock algorithm */
+static long long unqueue_retry;
+static long long max_sleep_us;
+static struct optimistic_spin_queue global_osq;
+static struct optimistic_spin_node *global_osq_nodepool_ptr;
 
-void osq_parse_args(test_args unused, int argc, char** argv) {
+/* Newly added additional tuning parameters for optional backoff sleep */
+static void osq_parse_args(test_args unused, int argc, char** argv) {
     int i = 0;
     char *endptr;
-    unqueue_retry = UNQUEUE_RETRY;
-    max_sleep_us = MAX_SLEEP_US;
+    unqueue_retry = DEFAULT_UNQUEUE_RETRY;
+    max_sleep_us = MAX_BACKOFF_SLEEP_US;
 
     /* extended options retrieved after '--' operator */
     while ((i = getopt(argc, argv, "u:s:")) != -1)
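
The header comment explains that the node pool behind global_osq_nodepool_ptr is heap-allocated in osq_lock_init() and that each thread's node is found linearly by thread number. A minimal sketch of such an initialization (assumed code with illustrative names, not the repo's exact osq_lock_init):

#include <stdlib.h>
#include <string.h>

/* Hypothetical illustration of the heap-allocated, 128-byte aligned node pool. */
static void example_nodepool_init(unsigned long cores)
{
    size_t sz = sizeof(struct optimistic_spin_node) * cores;

    sz = (sz + 127) & ~(size_t)127;           /* aligned_alloc needs a multiple of 128 */
    global_osq_nodepool_ptr = aligned_alloc(128, sz);
    if (global_osq_nodepool_ptr == NULL)
        exit(1);
    memset(global_osq_nodepool_ptr, 0, sz);

    atomic_set(&global_osq.tail, OSQ_UNLOCKED_VAL);   /* queue starts empty */
}

/* A thread running as cpu_number then spins on its own node:
 *     struct optimistic_spin_node *node = global_osq_nodepool_ptr + cpu_number;
 */
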
@@ -88,7 +174,8 @@ void osq_parse_args(test_args unused, int argc, char** argv) {
             unqueue_retry = strtoll(optarg, &endptr, 10);
             if ((errno == ERANGE && (unqueue_retry == LONG_LONG_MAX))
                     || (errno != 0 && unqueue_retry == 0) || endptr == optarg) {
-                fprintf(stderr, "unqueue_retry: value unsuitable for 'long long int'\n");
+                fprintf(stderr, "unqueue_retry: value unsuitable "
+                        "for 'long long int'\n");
                 exit(1);
             }
             break;
@@ -98,7 +185,8 @@ void osq_parse_args(test_args unused, int argc, char** argv) {
             max_sleep_us = strtoll(optarg, &endptr, 10);
             if ((errno == ERANGE && (max_sleep_us == LONG_LONG_MAX))
                     || (errno != 0 && max_sleep_us == 0) || endptr == optarg) {
-                fprintf(stderr, "max_sleep_us: value unsuitable for 'long long int'\n");
+                fprintf(stderr, "max_sleep_us: value unsuitable "
+                        "for 'long long int'\n");
                 exit(1);
             } else if (max_sleep_us < 0) {
                 fprintf(stderr, "max_sleep_us must be a positive integer.\n");
@@ -140,8 +228,8 @@ static inline void osq_lock_init(uint64_t *lock, unsigned long cores)
  * If osq spins more than unqueue_retry times, the spinning cpu may backoff
  * and sleep for 1 ~ 10 microseconds (on average 5 microseconds). Each spinning
  * thread uses a different backoff sleep time, and we can adjust the maximum
- * sleep time by redefine the default MAX_SLEEP_US or tuning via parameter '-s'
- * By default, we disable this sleep (MAX_SLEEP_US = 0)
+ * sleep time by redefining MAX_BACKOFF_SLEEP_US or tuning via parameter '-s'.
+ * By default, we disable this sleep (MAX_BACKOFF_SLEEP_US = 0)
  *
  * Note: Avoid assigning random_sleep a negative value, otherwise usleep would
  * have a very large sleep time after implicit casting negative to uint32_t.
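
The comment above says each spinning thread gets its own backoff sleep below the configured maximum, chosen up front with rand(), and that the value must stay non-negative before it reaches usleep(). A hedged sketch of that precomputation (illustrative helper, not the file's actual code):

#include <stdlib.h>
#include <unistd.h>

/*
 * Pick a thread's backoff sleep once, in the main thread, so the spinning
 * hot path never calls rand(). The result is always >= 0, so the implicit
 * conversion to useconds_t inside usleep() cannot turn into a huge sleep.
 */
static long example_pick_backoff_us(long long max_backoff_us)
{
    if (max_backoff_us <= 0)
        return 0;                              /* backoff sleep disabled (default) */
    return (long)(rand() % max_backoff_us);    /* uniform in [0, max_backoff_us) */
}

With the extended options described in the header comment, passing something like "-- -u 1000 -s 5" after the normal lockhammer arguments would take the unqueue path after 1000 spins and cap this backoff sleep at 5 microseconds.
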
@@ -156,9 +244,6 @@ static inline void osq_lock_init(uint64_t *lock, unsigned long cores)
     atomic_set(&global_osq.tail, OSQ_UNLOCKED_VAL);
 }
 
-bool osq_lock(uint64_t *osq, unsigned long cpu_number);
-void osq_unlock(uint64_t *osq, unsigned long cpu_number);
-
 static inline bool osq_is_locked(struct optimistic_spin_queue *lock)
 {
     return atomic_read(&lock->tail) != OSQ_UNLOCKED_VAL;
@@ -244,7 +329,7 @@ osq_wait_next(struct optimistic_spin_queue *lock,
 }
 
 /* uint64_t *osq is ignored because we use &global_osq instead */
-bool osq_lock(uint64_t *osq, unsigned long cpu_number)
+static bool osq_lock(uint64_t *osq, unsigned long cpu_number)
 {
     /* each cpu core has only one thread spinning on one optimistic_spin_node */
     struct optimistic_spin_node *node = global_osq_nodepool_ptr + cpu_number;
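
For readers who want the shape of the algorithm before reading the rest of the file, the queueing fast path of osq_lock() (summarized from the kernel's osq_lock.c, with the unqueue/backoff handling omitted and helper macro names assumed to match the kernel's) looks roughly like this:

/* Simplified sketch of the fast path only; not the full function in this file. */
static bool example_osq_lock_fastpath(struct optimistic_spin_queue *lock,
                                      unsigned long cpu_number)
{
    struct optimistic_spin_node *node = global_osq_nodepool_ptr + cpu_number;
    struct optimistic_spin_node *prev;
    int old;

    node->locked = 0;
    node->next = NULL;
    node->cpu = cpu_number + 1;                  /* 0 is reserved for "no cpu" */

    old = atomic_xchg(&lock->tail, node->cpu);   /* publish ourselves as the new tail */
    if (old == OSQ_UNLOCKED_VAL)
        return true;                             /* queue was empty: lock acquired */

    prev = global_osq_nodepool_ptr + (old - 1);  /* decode the previous tail's node */
    node->prev = prev;
    WRITE_ONCE(prev->next, node);                /* link in behind it */

    while (!READ_ONCE(node->locked))             /* spin only on our own cache line */
        cpu_relax();
    return true;
}
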
@@ -309,7 +394,7 @@ bool osq_lock(uint64_t *osq, unsigned long cpu_number)
          * lock holder.
          */
         //if (need_resched() || vcpu_is_preempted(node_to_cpu(node->prev)))
-        if (++back_off > unqueue_retry) /* default UNQUEUE_RETRY 1 billion */
+        if (++back_off > unqueue_retry) /* DEFAULT_UNQUEUE_RETRY 2 billion */
             goto unqueue;
 
         cpu_relax();
@@ -373,7 +458,7 @@ bool osq_lock(uint64_t *osq, unsigned long cpu_number)
 }
 
 /* uint64_t *osq is ignored because we use &global_osq instead */
-void osq_unlock(uint64_t *osq, unsigned long cpu_number)
+static void osq_unlock(uint64_t *osq, unsigned long cpu_number)
 {
     /* optimistic_spin_queue stores the current osq tail globally */
     struct optimistic_spin_queue *lock = &global_osq;
struct optimistic_spin_queue *lock = &global_osq;
@@ -408,7 +493,8 @@ void osq_unlock(uint64_t *osq, unsigned long cpu_number)
408493

409494

410495
/* standard lockhammer lock_acquire and lock_release interfaces */
411-
unsigned long __attribute__((noinline)) lock_acquire (uint64_t *lock, unsigned long threadnum)
496+
static unsigned long __attribute__((noinline))
497+
lock_acquire (uint64_t *lock, unsigned long threadnum)
412498
{
413499
/*
414500
* Note: The linux kernel implements additional mutex slow path in mutex.c
