Skip to content

Commit 3845a54

Browse files
committed
lockhammer.c: separate file, improve reproducibility and quality-of-life
Move the measurement routines out of the test harness for separate compilation - The test harness is compiled using a generic compiler target, while the lock routine (inlined in measure.c) uses a specific build target selected by the Makefile's USE_* variables. Enhancements to improve run-to-run reproducibility - duration-based instead of acquires-based measurement - multiple measurements per run, permuting iterations, critical/parallel duration, num_threads/pinorders - lock variable allocated from persistent hugetlb page to reuse the same physical address run-to-run - any-cpu arbitrary pinorder assignment - add a runtime flag --disable-outline-atomics-lse to use exclusives on a system that provides LSE support, when compiled with the outline atomics library (aarch64, USE_BUILTIN=1) Quality-of-life enhancements - run results summary delimited for easy copy/paste to spreadsheet - output measurements to JSON Change-Id: I1fa7918827d98ebcf3c17059c14c683fa2bf8079
1 parent d81cea4 commit 3845a54

File tree

2 files changed

+1075
-402
lines changed

2 files changed

+1075
-402
lines changed
Lines changed: 178 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2017, The Linux Foundation. All rights reserved.
2+
* Copyright (c) 2017-2025, The Linux Foundation. All rights reserved.
33
*
44
* SPDX-License-Identifier: BSD-3-Clause
55
*
@@ -33,49 +33,187 @@
3333
#define __LOCKHAMMER_H__
3434

3535

36-
#ifndef initialize_lock
37-
#define initialize_lock(lock, thread)
36+
// PROGRESS_TICK_PROFILE - prints each thread's timer value at lock_acquires milestones to show thread concurrency
37+
#define PROGRESS_TICK_PROFILE
38+
39+
// units - unit in which a duration is expressed.
//   NS      - nanoseconds
//   INSTS   - iterations (see duration_t: "either in nanoseconds or iterations")
//   NOT_SET - NOTE(review): presumably marks a duration whose unit has not been chosen yet; confirm against the argument parser
enum units { NS,
40+
INSTS, NOT_SET };
41+
typedef enum units Units;
42+
43+
// stringify(x) - macro-expands x first, then turns the result into a string literal.
// _stringify(x) is the inner helper that applies the # operator; calling it directly
// would stringify the argument's spelling without expanding it.
#define _stringify(x) #x
44+
#define stringify(x) _stringify(x)
45+
46+
// per_thread_results_t - each thread returns its results in this struct (embedded in thread_args_t as the "results" member)
47+
typedef struct {
48+
unsigned long cpu_affined; // which CPU this thread was pinned on
49+
50+
unsigned long lock_acquires;// number of locks acquired-and-released by this thread
51+
unsigned long cputime_ns; // this thread's CPU time in nanoseconds
52+
unsigned long walltime_ns; // this thread's wall clock time in nanoseconds
53+
unsigned long hmrdepth; // depth = lock-specific notion of contention
54+
55+
unsigned long hwtimer_start; // timer value at start of measurement loop
56+
unsigned long hwtimer_end; // timer value at end of measurement loop
57+
58+
unsigned long hwtimer_10p; // timer value at 10% of work completion
59+
unsigned long hwtimer_25p; // timer value at 25% of work completion
60+
unsigned long hwtimer_50p; // timer value at 50% of work completion
61+
unsigned long hwtimer_75p; // timer value at 75% of work completion
62+
unsigned long hwtimer_90p; // timer value at 90% of work completion
63+
64+
// hold/post durations, in nanoseconds, from calibrate_timer()
65+
double hold_ns, post_ns;
66+
67+
// metrics only for osq_lock
// NOTE(review): the exact event each counter records is defined by the osq_lock implementation, not here
68+
unsigned long osq_lock_wait_next_spins;
69+
unsigned long osq_unlock_wait_next_spins;
70+
unsigned long osq_lock_locked_spins;
71+
unsigned long osq_lock_unqueue_spins;
72+
unsigned long osq_lock_acquire_backoffs;
73+
74+
} per_thread_results_t;
75+
76+
77+
// thread_args_t -- per-thread input parameters plus the output "results" struct; a pointer to an instance of this is passed to each thread
78+
typedef struct {
79+
unsigned long thread_num; // thread number, ordinal 0 (i.e. zero-based)
80+
unsigned long num_threads; // number of worker threads in total for experiment
81+
unsigned long num_acquires; // -a flag, aka nacqrs, aka number of acquires per thread to do
82+
unsigned long *lock; // pointer to the lock variable
83+
84+
unsigned long *p_start_ns; // marshal thread's monotonic start time, in ns, for computing wall_elapsed_ns; only the marshal thread sets this
85+
unsigned long hold, post; // ncrit, nparallel
86+
Units hold_unit, post_unit; // NS or INSTS, hold_unit = ncrit_units, post_unit = nparallel_units
87+
unsigned long hold_count; // NOTE(review): presumably the calibrated loop count corresponding to hold -- confirm in measure.c
88+
unsigned long post_count; // NOTE(review): presumably the calibrated loop count corresponding to post -- confirm in measure.c
89+
90+
double tickspns; // number of timer ticks per nanosecond
91+
92+
unsigned long run_on_this_cpu; // logical CPU on which a worker thread is to run
93+
94+
unsigned long run_limit_ticks; // if non-zero, the number of timer ticks to run for when using --run-limit-ticks or --run-limit-seconds
95+
unsigned long run_limit_inner_loop_iters; // the number of lock acquire/release sequences to run before checking the hwtimer when using --run-limit-ticks or --run-limit-seconds
96+
unsigned long hwtimer_frequency; // NOTE(review): units (presumably Hz) are not stated here -- confirm
97+
98+
int verbose;
99+
unsigned long blackhole_numtries; // NOTE(review): presumably the NUMTRIES argument for calibrate_blackhole() -- confirm
100+
101+
per_thread_results_t results; // output data structure, filled in by the thread
102+
103+
} thread_args_t;
104+
105+
// pinorder_t - describes a set of CPUs on which to run worker threads
106+
typedef struct {
107+
int * cpu_list; // pointer to an array of int, indexed by thread number; each element is the logical CPU on which that thread is to run
108+
size_t num_threads; // number of threads defined for this pinorder (i.e. the number of valid entries in cpu_list)
109+
} pinorder_t;
110+
111+
112+
// duration_t - a duration value together with the unit it is expressed in
typedef struct {
113+
unsigned long t; // duration time, either in nanoseconds or iterations
114+
Units unit; // duration unit, either NS or INSTS
115+
} duration_t;
116+
117+
// test_args_t - mostly command line parameters
118+
typedef struct {
119+
unsigned long num_acquires; // -a number of acquires (not documented?)
120+
duration_t * crits; // -c, --cn=, --ci= critical duration
121+
duration_t * pars; // -p, --pn=, --pi= parallel duration
122+
size_t num_crits; // NOTE(review): presumably the number of entries in crits -- confirm
123+
size_t num_pars; // NOTE(review): presumably the number of entries in pars -- confirm
124+
unsigned long ileave; // -i interleave value for SMT pinning
125+
int scheduling_policy; // -S use explicit scheduling policy
126+
size_t num_pinorders; // NOTE(review): presumably the number of entries in pinorders -- confirm
127+
pinorder_t * pinorders; // -o CPU pinning order
128+
unsigned long timeout_usec; // -A timeout_usec
129+
130+
int hugepagesz;
131+
int use_mmap;
132+
int mmap_hugepage_offset_exists;
133+
int print_hugepage_physaddr;
134+
size_t mmap_hugepage_offset;
135+
size_t mmap_hugepage_physaddr;
136+
unsigned long hwtimer_frequency;
137+
unsigned long probed_hwtimer_frequency;
138+
long estimate_hwtimer_freq_cpu;
139+
140+
double run_limit_seconds; // see --run-limit-seconds
141+
unsigned long run_limit_ticks; // see --run-limit-ticks
142+
unsigned long run_limit_inner_loop_iters; // lock acquire/release sequences to run between hwtimer checks when a run limit is in use (see thread_args_t)
143+
int ignore_unknown_scaling_governor;
144+
int suppress_cpu_frequency_warnings;
145+
const char * cpuorder_filename;
146+
#ifdef JSON_OUTPUT
147+
const char * json_output_filename;
38148
#endif
39-
#ifndef parse_test_args
40-
#define parse_test_args(args, argc, argv)
149+
#ifdef __aarch64__
150+
char disable_outline_atomics_lse; // set by the --disable-outline-atomics-lse runtime flag
41151
#endif
42-
#ifndef thread_local_init
43-
#define thread_local_init(smtid)
152+
int verbose;
153+
size_t iterations;
154+
size_t blackhole_numtries;
155+
} test_args_t;
156+
157+
// system_info_t - system configuration data
158+
typedef struct {
159+
unsigned long num_cores; // number of processors configured by the operating system
160+
size_t page_size_bytes; // page size in bytes
161+
size_t erg_bytes; // number of bytes per exclusive reservation granule (e.g. cache line/block)
162+
163+
cpu_set_t avail_cores; // cores that the CPU affinity mask allows us to run on
164+
size_t num_avail_cores; // number of cores that the CPU affinity mask allows us to run on
165+
size_t num_online_cores; // the number of cores that getconf _NPROCESSORS_ONLN returns
166+
167+
// Relationship between the counts:
// num_online_cores can be less than num_cores because some may be offline or not permitted by affinity mask
168+
// num_avail_cores may be less than num_online_cores because some online cores may be isolated
169+
} system_info_t;
170+
171+
// locks_t -- pointers to the actual locks to be used
172+
typedef struct {
173+
unsigned long * p_test_lock; // address of main lock
174+
unsigned long * p_ready_lock; // lock to synchronize all threads' entry into hmr()
175+
unsigned long * p_sync_lock; // lock to synchronize before blackhole calibration
176+
unsigned long * p_calibrate_lock; // lock to synchronize after blackhole calibration
177+
} locks_t;
178+
179+
// calibrate_blackhole -- (used in osq_lock)
180+
unsigned long calibrate_blackhole(unsigned long target, unsigned long tokens_low, unsigned long tokens_high, unsigned long core_id, unsigned long NUMTRIES);
181+
182+
// evaluate_blackhole -- returns average duration of NUMTRIES
183+
int64_t evaluate_blackhole( const unsigned long tokens_mid, const unsigned long NUMTRIES);
184+
185+
// blackhole() -- runs a small loop to consume time (also used in osq_lock)
186+
void blackhole(unsigned long iters);
187+
188+
// measure_setup_initialize_lock() -- calls lock-specific setup routine if it exists
189+
void measure_setup_initialize_lock(locks_t * p_locks, pinorder_t * pinorder);
190+
191+
// measure_setup_parse_test_args() -- calls lock-specific parsing routine if it exists
192+
void measure_setup_parse_test_args(test_args_t * p_test_args, int argc, char ** argv);
193+
194+
// convert the struct timespec to only nanoseconds
195+
unsigned long timespec_to_ns (struct timespec * ts);
196+
197+
// selectively disable LSE instructions in outline atomics/libgcc; in measure.c
198+
void handle_disable_outline_atomics_lse(void);
199+
200+
// NOINLINE - function attribute that prevents the compiler from inlining the function.
// __GNUC__ expands to the compiler's major version number, so comparing it
// with 1 is never true on any current GCC and left NOINLINE empty there.
// Test that the macro is defined instead. Clang defines __GNUC__ as well, but
// is listed explicitly for clarity.
#if defined(__GNUC__) || defined(__clang__)
#define NOINLINE __attribute__((noinline))
#else
#define NOINLINE
#endif
45207

46-
enum units { NS,
47-
INSTS };
48-
typedef enum units Units;
208+
// NO_UNROLL_LOOP - place immediately before a loop to ask the compiler not to unroll it.
// __GNUC__ expands to the compiler's major version number, so comparing it
// with 1 is never true on any current GCC and left the macro empty there.
// Clang is tested first because it also defines __GNUC__ but has its own
// loop-hint pragma. "#pragma GCC unroll" is only available from GCC 8 onwards.
#if defined(__clang__)
#define NO_UNROLL_LOOP _Pragma("clang loop unroll(disable)")
#elif defined(__GNUC__) && (__GNUC__ >= 8)
#define NO_UNROLL_LOOP _Pragma("GCC unroll 0")
#else
#define NO_UNROLL_LOOP
#endif
49215

50-
struct thread_args {
51-
unsigned long ncores;
52-
unsigned long nthrds;
53-
unsigned long ileave;
54-
unsigned long iter;
55-
unsigned long *lock;
56-
unsigned long *rst;
57-
unsigned long *nsec;
58-
unsigned long *real_nsec;
59-
unsigned long *depth;
60-
unsigned long *nstart;
61-
unsigned long hold, post;
62-
Units hold_unit, post_unit;
63-
double tickspns;
64-
int *pinorder;
65-
};
66-
typedef struct thread_args thread_args;
67-
68-
struct test_args {
69-
unsigned long nthrds;
70-
unsigned long nacqrs;
71-
unsigned long ncrit;
72-
Units ncrit_units;
73-
unsigned long nparallel;
74-
Units nparallel_units;
75-
unsigned long ileave;
76-
unsigned char safemode;
77-
int *pinorder;
78-
};
79-
typedef struct test_args test_args;
80216

81217
#endif
218+
219+
/* vim: set tabstop=4 shiftwidth=4 softtabstop=4 expandtab: */

0 commit comments

Comments
 (0)