Skip to content

Commit 93667d4

Browse files
authored
Merge pull request #35 from ARM-software/integration
Add new thread pinning order option -o and lstopo parser to sweep test
2 parents 7de543b + c690213 commit 93667d4

File tree

4 files changed

+125
-43
lines changed

4 files changed

+125
-43
lines changed

benchmarks/lockhammer/include/lockhammer.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ struct thread_args {
5050
unsigned long hold, post;
5151
Units hold_unit, post_unit;
5252
double tickspns;
53+
int *pinorder;
5354
};
5455
typedef struct thread_args thread_args;
5556

@@ -62,6 +63,7 @@ struct test_args {
6263
Units nparallel_units;
6364
unsigned long ileave;
6465
unsigned char safemode;
66+
int *pinorder;
6567
};
6668
typedef struct test_args test_args;
6769

benchmarks/lockhammer/scripts/lh_test_cfg.yaml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,15 +76,19 @@ sweeptest:
7676
- a: 5000
7777
c: 0ns
7878
p: 0ns
79+
o: lstopo
7980
- a: 5000
8081
c: 1000ns
8182
p: 0ns
83+
o: lstopo
8284
- a: 5000
8385
c: 200ns
8486
p: 1000ns
87+
o: lstopo
8588
- a: 5000
8689
c: 1000ns
8790
p: 5000ns
91+
o: lstopo
8892

8993
## Unittest Settings
9094
#
@@ -123,6 +127,7 @@ unittest:
123127
a: 100
124128
c: 50ns
125129
p: 0ns
130+
o: lstopo
126131
extra:
127132
u: 10
128133
s: 2
@@ -132,6 +137,8 @@ unittest:
132137
a: 100
133138
c: 50ns
134139
p: 0ns
140+
i: 1
141+
o: '0,1,2,3'
135142
extra:
136143
r: 4
137144
m: 1

benchmarks/lockhammer/scripts/test_lockhammer.py

Lines changed: 35 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@
3636
import sys
3737
import os
3838
import sh
39+
import re
3940
import errno
4041
import platform
4142
import unittest
@@ -50,7 +51,7 @@
5051
# config file should be in the same directory of this script
5152
LH_CFG = "lh_test_cfg.yaml"
5253
# lockhammer.c has these parameters
53-
LH_ARGU_LIST = ['t', 'a', 'c', 'p']
54+
LH_ARGU_LIST = ['t', 'a', 'c', 'p', 'i', 'o']
5455

5556

5657
# python unittest framework container class
@@ -130,6 +131,30 @@ def full_func_name(cmdName, paramList, fillZero):
130131
fullName += str(random.random())
131132
return fullName
132133

134+
# Convert lstopo output into a comma-separated CPU core order string, as consumed by the sweeptest -o option.
135+
def parse_lstopo():
    """Build a comma-separated core order string from lstopo output.

    Runs ``lstopo --no-io --no-cache``, keeps the lines containing "PU", and
    collects the number following each ``P#`` in order of appearance.

    Returns:
        The collected numbers joined with ',' (no trailing comma), or an
        empty string when no line carries a ``P#<n>`` token.

    Exits the process with status 2 when the lstopo command cannot be found.
    """
    try:
        lst = sh.Command("lstopo")
        out = str(sh.grep(lst("--no-io", "--no-cache"), "PU"))
    except sh.CommandNotFound:
        print("Error, cannot find lstopo, need to install hwloc package first.")
        # No cleanup clause follows, so this SystemExit propagates untouched
        # instead of being replaced by an IndexError on an empty result.
        sys.exit(2)

    cores = []
    for line in out.splitlines():
        # Raw string keeps the \d escape valid on newer Python versions.
        match = re.search(r"P#(\d+)", line.strip())
        if match:
            cores.append(match.group(1))

    # sample output for single-socket EPYC 7601 server:
    #0,32,8,40,16,48,24,56,4,36,12,44,20,52,28,60,1,33,9,41,17,49,25,57,5,37,\
    #13,45,21,53,29,61,2,34,10,42,18,50,26,58,6,38,14,46,22,54,30,62,3,35,11,\
    #43,19,51,27,59,7,39,15,47,23,55,31,63
    # join() never leaves a trailing comma and is safe on an empty list.
    return ','.join(cores)
157+
133158
# convert parameter from {key:value} to string list
134159
def expand_param(ctrl, valueList):
135160
outParam = []
@@ -146,6 +171,11 @@ def expand_param(ctrl, valueList):
146171
outParam.append(['-t', multiprocessing.cpu_count()])
147172
else:
148173
outParam.append(['-' + ctrl, valueList])
174+
elif isinstance(valueList, str):
175+
if ctrl == 'o' and valueList == 'lstopo':
176+
outParam.append(['-o', parse_lstopo()])
177+
else:
178+
outParam.append(['-' + ctrl, valueList])
149179
else:
150180
outParam.append(['-' + ctrl, valueList])
151181
return outParam
@@ -189,7 +219,10 @@ def prepare_param(arguList):
189219
for elem in arguList:
190220
paramList = []
191221
for key in elem:
192-
paramList.extend(['-'+key, elem[key]])
222+
if key == 'o' and str(elem[key]) == 'lstopo':
223+
paramList.extend(['-o', parse_lstopo()])
224+
else:
225+
paramList.extend(['-'+key, elem[key]])
193226
arguLL.append(paramList)
194227
return arguLL
195228

benchmarks/lockhammer/src/lockhammer.c

Lines changed: 81 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -62,14 +62,19 @@ void print_usage (char *invoc) {
6262
"[-p <#>[ns | in] parallelizable iterations measured in ns or (in)structions, "
6363
"if no suffix, assumes (in)structions]\n\t"
6464
"[-s safe-mode operation for running as non-root\n\t"
65+
"[-i interleave value for thread pinning order, for example, 1 means "
66+
"sequential increasing, 2 means pinning hyperthread of the same core first "
67+
"before the next core.]\n\t"
68+
"[-o arbitrary core pinning order separated by comma without space, command "
69+
"lstopo can be used to deduce the correct order.]\n\t"
6570
"[-- <test specific arguments>]\n", invoc);
6671
}
6772

6873
int main(int argc, char** argv)
6974
{
7075
struct sched_param sparam;
7176

72-
unsigned long i;
77+
unsigned long opt;
7378
unsigned long num_cores;
7479
unsigned long result;
7580
unsigned long sched_elapsed = 0, real_elapsed = 0, realcpu_elapsed = 0;
@@ -84,16 +89,18 @@ int main(int argc, char** argv)
8489
.ncrit = 0,
8590
.nparallel = 0,
8691
.ileave = 1,
87-
.safemode = 0 };
92+
.safemode = 0,
93+
.pinorder = NULL };
8894

8995
opterr = 0;
9096

91-
while ((i = getopt(argc, argv, "t:a:c:p:i:s")) != -1)
97+
while ((opt = getopt(argc, argv, "t:a:c:p:i:o:s")) != -1)
9298
{
9399
long optval = 0;
94100
int len = 0;
95101
char buf[128];
96-
switch (i) {
102+
char *csv = NULL;
103+
switch (opt) {
97104
case 't':
98105
optval = strtol(optarg, (char **) NULL, 10);
99106
/* Do not allow number of threads to exceed online cores
@@ -158,12 +165,31 @@ int main(int argc, char** argv)
158165
case 'i':
159166
optval = strtol(optarg, (char **) NULL, 10);
160167
if (optval < 0) {
161-
fprintf(stderr, "ERROR: Core interleave must be positive.\n");
168+
fprintf(stderr, "ERROR: core interleave must be positive.\n");
162169
return 1;
163170
}
164171
else {
165172
args.ileave = optval;
166173
}
174+
break;
175+
case 'o':
176+
args.pinorder = calloc(num_cores, sizeof(int));
177+
if (args.pinorder == NULL) {
178+
fprintf(stderr, "ERROR: Cannot allocate enough memory for pinorder structure.\n");
179+
return 1;
180+
}
181+
csv = strtok(optarg, ",");
182+
for (int i = 0; i < num_cores && csv != NULL; ++i)
183+
{
184+
optval = strtol(csv, (char **) NULL, 10);
185+
if (optval >= 0 && optval < num_cores) {
186+
args.pinorder[i] = optval;
187+
} else {
188+
fprintf(stderr, "WARNING: core number %ld is out of range.\n", optval);
189+
}
190+
csv = strtok(NULL, ",");
191+
}
192+
break;
167193
case 's':
168194
args.safemode = 1;
169195
break;
@@ -213,7 +239,7 @@ int main(int argc, char** argv)
213239
tickspns = (double)timer_get_cnt_freq() / 1000000000.0;
214240

215241
thread_args t_args[args.nthrds];
216-
for (i = 0; i < args.nthrds; ++i) {
242+
for (int i = 0; i < args.nthrds; ++i) {
217243
hmrs[i] = 0;
218244
t_args[i].ncores = num_cores;
219245
t_args[i].nthrds = args.nthrds;
@@ -230,11 +256,12 @@ int main(int argc, char** argv)
230256
t_args[i].post = args.nparallel;
231257
t_args[i].post_unit = args.nparallel_units;
232258
t_args[i].tickspns = tickspns;
259+
t_args[i].pinorder = args.pinorder;
233260

234261
pthread_create(&hmr_threads[i], &hmr_attr, hmr, (void*)(&t_args[i]));
235262
}
236263

237-
for (i = 0; i < args.nthrds; ++i) {
264+
for (int i = 0; i < args.nthrds; ++i) {
238265
result = pthread_join(hmr_threads[i], NULL);
239266
}
240267
/* "Marshal" thread will collect start time once all threads have
@@ -245,7 +272,7 @@ int main(int argc, char** argv)
245272
pthread_attr_destroy(&hmr_attr);
246273

247274
result = 0;
248-
for (i = 0; i < args.nthrds; ++i) {
275+
for (int i = 0; i < args.nthrds; ++i) {
249276
result += hmrs[i];
250277
sched_elapsed += hmrtime[i];
251278
realcpu_elapsed += hmrrealtime[i];
@@ -320,6 +347,7 @@ void* hmr(void *ptr)
320347
unsigned long hold_count = x->hold;
321348
unsigned long post_count = x->post;
322349
double tickspns = x->tickspns;
350+
int *pinorder = x->pinorder;
323351

324352
unsigned long mycore = 0;
325353

@@ -356,40 +384,52 @@ void* hmr(void *ptr)
356384
/* Wait for all threads to arrive from calibrating. */
357385
synchronize_threads(&calibrate_lock, nthrds);
358386
clock_gettime(CLOCK_MONOTONIC, &tv_monot_start);
359-
}
360-
else {
361-
/* Calculate affinity mask for my core and set affinity */
362-
/* The concept of "interleave" is used here to allow for specifying
363-
* whether increasing cores counts first populate physical cores or
364-
* hardware threads within the same physical core. This assumes the
365-
* following relationship between logical core numbers (N), hardware
366-
* threads per core (K), and physical cores (N/K):
367-
*
368-
* physical core |___core_0__|___core_1__|_core_N/K-1|
369-
* thread |0|1|...|K-1|0|1|...|K-1|0|1|...|K-1|
370-
* --------------|-|-|---|---|-|-|---|---|-|-|---|---|
371-
* logical core | | | | | | | | | | | | |
372-
* 0 |*| | | | | | | | | | | |
373-
* 1 | | | | |*| | | | | | | |
374-
* ... |...................................|
375-
* N/K-1 | | | | | | | | |*| | | |
376-
* N/K | |*| | | | | | | | | | |
377-
* N/K+1 | | | | | |*| | | | | | |
378-
* ... |...................................|
379-
* N-K | | | | * | | | | | | | | |
380-
* N-K+1 | | | | | | | | * | | | | |
381-
* ... |...................................|
382-
* N-1 | | | | | | | | | | | | * |
383-
*
384-
* Thus by setting the interleave value to 1 physical cores are filled
385-
* first with subsequent cores past N/K adding subsequent threads
386-
* on already populated physical cores. On the other hand, setting
387-
* interleave to K causes the algorithm to populate 0, N/K, 2N/K and
388-
* so on filling all hardware threads in the first physical core prior
389-
* to populating any threads on the second physical core.
387+
} else {
388+
/*
389+
* Non-zero core value indicates next core to pin, zero value means
390+
* fallback to default interleave mode. Note: -o and -i may have
391+
* conflicting pinning order that causes two or more threads to pin
392+
* on the same core. This feature interaction is intended by design
393+
* which allows 0 to serve as don't care mask and only changing the
394+
* pinning order we want to change for specific -i interleave mode.
390395
*/
391-
CPU_SET(((mycore * ncores / ileave) % ncores + (mycore / ileave)), &affin_mask);
392-
sched_setaffinity(0, sizeof(cpu_set_t), &affin_mask);
396+
if (pinorder && pinorder[mycore]) {
397+
CPU_SET(pinorder[mycore], &affin_mask);
398+
sched_setaffinity(0, sizeof(cpu_set_t), &affin_mask);
399+
} else { /* Calculate affinity mask for my core and set affinity */
400+
/*
401+
* The concept of "interleave" is used here to allow for specifying
402+
* whether increasing cores counts first populate physical cores or
403+
* hardware threads within the same physical core. This assumes the
404+
* following relationship between logical core numbers (N), hardware
405+
* threads per core (K), and physical cores (N/K):
406+
*
407+
* physical core |___core_0__|___core_1__|_core_N/K-1|
408+
* thread |0|1|...|K-1|0|1|...|K-1|0|1|...|K-1|
409+
* --------------|-|-|---|---|-|-|---|---|-|-|---|---|
410+
* logical core | | | | | | | | | | | | |
411+
* 0 |*| | | | | | | | | | | |
412+
* 1 | | | | |*| | | | | | | |
413+
* ... |...................................|
414+
* N/K-1 | | | | | | | | |*| | | |
415+
* N/K | |*| | | | | | | | | | |
416+
* N/K+1 | | | | | |*| | | | | | |
417+
* ... |...................................|
418+
* N-K | | | | * | | | | | | | | |
419+
* N-K+1 | | | | | | | | * | | | | |
420+
* ... |...................................|
421+
* N-1 | | | | | | | | | | | | * |
422+
*
423+
* Thus by setting the interleave value to 1 physical cores are filled
424+
* first with subsequent cores past N/K adding subsequent threads
425+
* on already populated physical cores. On the other hand, setting
426+
* interleave to K causes the algorithm to populate 0, N/K, 2N/K and
427+
* so on filling all hardware threads in the first physical core prior
428+
* to populating any threads on the second physical core.
429+
*/
430+
CPU_SET(((mycore * ncores / ileave) % ncores + (mycore / ileave)), &affin_mask);
431+
sched_setaffinity(0, sizeof(cpu_set_t), &affin_mask);
432+
}
393433

394434
fetchadd64_release(&ready_lock, 1);
395435

0 commit comments

Comments
 (0)