Skip to content

Commit 53599a0

Browse files
Yangzheng BaiYangzheng Bai
authored and committed
Adding new -o pinning order option for lockhammer
Because some platforms may have drastically different core-thread topology, especially in multi-socket NUMA scenarios, we add another -o option to specify the preferred pinning "order" for sweep tests. This parameter consists of individual core numbers from 0 to the maximum core number, separated by commas without spaces. With this option, lockhammer can pin its threads according to this arbitrarily provided order without any complex calculation. We can then easily change the order in test_lockhammer.py without modifying the lockhammer.c source code. Currently, we deduce the order from this command by extracting P#: lstopo --no-io --no-caches |grep PU For example, if we want to test 5 threads on a 32-core single-socket EPYC server, we may use the following pinning order: ./build/lh_osq_lock -t 5 -a 10000000 -o 0,32,4,36,8,40,12,44,16,48, 20,52,24,56,28,60,1,33,5,37,9,41,13,45,17,49,21,53,25,57,29,61,2,34, 6,38,10,42,14,46,18,50,22,54,26,58,30,62,3,35,7,39,11,43,15,47,19, 51,23,55,27,59,31,63 Because there are only 5 threads, lockhammer will pin the threads to processors #0, #32, #4, #36, and #8 individually. Processors #0 and #32 are in the same physical core. Processors #4 and #36 are together in another physical core. Processor #8 is in a different core from the first 4 logical processors. Therefore we removed the necessity of introducing yet another interleaving mode to lockhammer.
1 parent 9f00fd4 commit 53599a0

File tree

2 files changed

+71
-41
lines changed

2 files changed

+71
-41
lines changed

benchmarks/lockhammer/include/lockhammer.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ struct thread_args {
5050
unsigned long hold, post;
5151
Units hold_unit, post_unit;
5252
double tickspns;
53+
int *pinorder;
5354
};
5455
typedef struct thread_args thread_args;
5556

@@ -62,6 +63,7 @@ struct test_args {
6263
Units nparallel_units;
6364
unsigned long ileave;
6465
unsigned char safemode;
66+
int *pinorder;
6567
};
6668
typedef struct test_args test_args;
6769

benchmarks/lockhammer/src/lockhammer.c

Lines changed: 69 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -62,14 +62,19 @@ void print_usage (char *invoc) {
6262
"[-p <#>[ns | in] parallelizable iterations measured in ns or (in)structions, "
6363
"if no suffix, assumes (in)structions]\n\t"
6464
"[-s safe-mode operation for running as non-root\n\t"
65+
"[-i interleave value for thread pinning order, for example, 1 means "
66+
"sequential increasing, 2 means pinning hyperthread of the same core first "
67+
"before the next core.]\n\t"
68+
"[-o arbitrary core pinning order separated by comma without space, command "
69+
"lstopo can be used to deduce the correct order.]\n\t"
6570
"[-- <test specific arguments>]\n", invoc);
6671
}
6772

6873
int main(int argc, char** argv)
6974
{
7075
struct sched_param sparam;
7176

72-
unsigned long i;
77+
unsigned long opt;
7378
unsigned long num_cores;
7479
unsigned long result;
7580
unsigned long sched_elapsed = 0, real_elapsed = 0, realcpu_elapsed = 0;
@@ -84,16 +89,18 @@ int main(int argc, char** argv)
8489
.ncrit = 0,
8590
.nparallel = 0,
8691
.ileave = 1,
87-
.safemode = 0 };
92+
.safemode = 0,
93+
.pinorder = NULL };
8894

8995
opterr = 0;
9096

91-
while ((i = getopt(argc, argv, "t:a:c:p:i:s")) != -1)
97+
while ((opt = getopt(argc, argv, "t:a:c:p:i:o:s")) != -1)
9298
{
9399
long optval = 0;
94100
int len = 0;
95101
char buf[128];
96-
switch (i) {
102+
char *csv = NULL;
103+
switch (opt) {
97104
case 't':
98105
optval = strtol(optarg, (char **) NULL, 10);
99106
/* Do not allow number of threads to exceed online cores
@@ -158,12 +165,23 @@ int main(int argc, char** argv)
158165
case 'i':
159166
optval = strtol(optarg, (char **) NULL, 10);
160167
if (optval < 0) {
161-
fprintf(stderr, "ERROR: Core interleave must be positive.\n");
168+
fprintf(stderr, "ERROR: core interleave must be positive.\n");
162169
return 1;
163170
}
164171
else {
165172
args.ileave = optval;
166173
}
174+
break;
175+
case 'o':
176+
args.pinorder = calloc(num_cores, sizeof(int));
177+
if (args.pinorder == NULL) {
178+
fprintf(stderr, "ERROR: Cannot allocate enough memory for pinorder structure.\n");
179+
return 1;
180+
}
181+
csv = strtok(optarg, ",");
182+
for (int i = 0; i < num_cores && csv != NULL; csv = strtok(NULL, ","), ++i)
183+
*(args.pinorder + i) = strtol(csv, (char **) NULL, 10);
184+
break;
167185
case 's':
168186
args.safemode = 1;
169187
break;
@@ -213,7 +231,7 @@ int main(int argc, char** argv)
213231
tickspns = (double)timer_get_cnt_freq() / 1000000000.0;
214232

215233
thread_args t_args[args.nthrds];
216-
for (i = 0; i < args.nthrds; ++i) {
234+
for (int i = 0; i < args.nthrds; ++i) {
217235
hmrs[i] = 0;
218236
t_args[i].ncores = num_cores;
219237
t_args[i].nthrds = args.nthrds;
@@ -230,11 +248,12 @@ int main(int argc, char** argv)
230248
t_args[i].post = args.nparallel;
231249
t_args[i].post_unit = args.nparallel_units;
232250
t_args[i].tickspns = tickspns;
251+
t_args[i].pinorder = args.pinorder;
233252

234253
pthread_create(&hmr_threads[i], &hmr_attr, hmr, (void*)(&t_args[i]));
235254
}
236255

237-
for (i = 0; i < args.nthrds; ++i) {
256+
for (int i = 0; i < args.nthrds; ++i) {
238257
result = pthread_join(hmr_threads[i], NULL);
239258
}
240259
/* "Marshal" thread will collect start time once all threads have
@@ -245,7 +264,7 @@ int main(int argc, char** argv)
245264
pthread_attr_destroy(&hmr_attr);
246265

247266
result = 0;
248-
for (i = 0; i < args.nthrds; ++i) {
267+
for (int i = 0; i < args.nthrds; ++i) {
249268
result += hmrs[i];
250269
sched_elapsed += hmrtime[i];
251270
realcpu_elapsed += hmrrealtime[i];
@@ -320,6 +339,7 @@ void* hmr(void *ptr)
320339
unsigned long hold_count = x->hold;
321340
unsigned long post_count = x->post;
322341
double tickspns = x->tickspns;
342+
int *pinorder = x->pinorder;
323343

324344
unsigned long mycore = 0;
325345

@@ -356,40 +376,48 @@ void* hmr(void *ptr)
356376
/* Wait for all threads to arrive from calibrating. */
357377
synchronize_threads(&calibrate_lock, nthrds);
358378
clock_gettime(CLOCK_MONOTONIC, &tv_monot_start);
359-
}
360-
else {
361-
/* Calculate affinity mask for my core and set affinity */
362-
/* The concept of "interleave" is used here to allow for specifying
363-
* whether increasing cores counts first populate physical cores or
364-
* hardware threads within the same physical core. This assumes the
365-
* following relationship between logical core numbers (N), hardware
366-
* threads per core (K), and physical cores (N/K):
367-
*
368-
* physical core |___core_0__|___core_1__|_core_N/K-1|
369-
* thread |0|1|...|K-1|0|1|...|K-1|0|1|...|K-1|
370-
* --------------|-|-|---|---|-|-|---|---|-|-|---|---|
371-
* logical core | | | | | | | | | | | | |
372-
* 0 |*| | | | | | | | | | | |
373-
* 1 | | | | |*| | | | | | | |
374-
* ... |...................................|
375-
* N/K-1 | | | | | | | | |*| | | |
376-
* N/K | |*| | | | | | | | | | |
377-
* N/K+1 | | | | | |*| | | | | | |
378-
* ... |...................................|
379-
* N-K | | | | * | | | | | | | | |
380-
* N-K+1 | | | | | | | | * | | | | |
381-
* ... |...................................|
382-
* N-1 | | | | | | | | | | | | * |
383-
*
384-
* Thus by setting the interleave value to 1 physical cores are filled
385-
* first with subsequent cores past N/K adding subsequent threads
386-
* on already populated physical cores. On the other hand, setting
387-
* interleave to K causes the algorithm to populate 0, N/K, 2N/K and
388-
* so on filling all hardware threads in the first physical core prior
389-
* to populating any threads on the second physical core.
379+
} else {
380+
/*
381+
* Non-zero core value indicates next core to pin, zero value means
382+
* fallback to default interleave mode.
390383
*/
391-
CPU_SET(((mycore * ncores / ileave) % ncores + (mycore / ileave)), &affin_mask);
392-
sched_setaffinity(0, sizeof(cpu_set_t), &affin_mask);
384+
if (pinorder && *(pinorder + mycore)) {
385+
CPU_SET(*(pinorder + mycore), &affin_mask);
386+
sched_setaffinity(0, sizeof(cpu_set_t), &affin_mask);
387+
} else {
388+
/* Calculate affinity mask for my core and set affinity */
389+
/* The concept of "interleave" is used here to allow for specifying
390+
* whether increasing cores counts first populate physical cores or
391+
* hardware threads within the same physical core. This assumes the
392+
* following relationship between logical core numbers (N), hardware
393+
* threads per core (K), and physical cores (N/K):
394+
*
395+
* physical core |___core_0__|___core_1__|_core_N/K-1|
396+
* thread |0|1|...|K-1|0|1|...|K-1|0|1|...|K-1|
397+
* --------------|-|-|---|---|-|-|---|---|-|-|---|---|
398+
* logical core | | | | | | | | | | | | |
399+
* 0 |*| | | | | | | | | | | |
400+
* 1 | | | | |*| | | | | | | |
401+
* ... |...................................|
402+
* N/K-1 | | | | | | | | |*| | | |
403+
* N/K | |*| | | | | | | | | | |
404+
* N/K+1 | | | | | |*| | | | | | |
405+
* ... |...................................|
406+
* N-K | | | | * | | | | | | | | |
407+
* N-K+1 | | | | | | | | * | | | | |
408+
* ... |...................................|
409+
* N-1 | | | | | | | | | | | | * |
410+
*
411+
* Thus by setting the interleave value to 1 physical cores are filled
412+
* first with subsequent cores past N/K adding subsequent threads
413+
* on already populated physical cores. On the other hand, setting
414+
* interleave to K causes the algorithm to populate 0, N/K, 2N/K and
415+
* so on filling all hardware threads in the first physical core prior
416+
* to populating any threads on the second physical core.
417+
*/
418+
CPU_SET(((mycore * ncores / ileave) % ncores + (mycore / ileave)), &affin_mask);
419+
sched_setaffinity(0, sizeof(cpu_set_t), &affin_mask);
420+
}
393421

394422
fetchadd64_release(&ready_lock, 1);
395423

0 commit comments

Comments
 (0)