Skip to content

Commit 7c1e80b

Browse files
authored
Merge pull request #33 from zoybai/lh_lstopo
Lockhammer new -o option with arbitrary pinning order
2 parents 8cc53eb + 5dd3949 commit 7c1e80b

File tree

2 files changed

+74
-41
lines changed

2 files changed

+74
-41
lines changed

benchmarks/lockhammer/include/lockhammer.h

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -50,6 +50,7 @@ struct thread_args {
5050
unsigned long hold, post;
5151
Units hold_unit, post_unit;
5252
double tickspns;
53+
int *pinorder;
5354
};
5455
typedef struct thread_args thread_args;
5556

@@ -62,6 +63,7 @@ struct test_args {
6263
Units nparallel_units;
6364
unsigned long ileave;
6465
unsigned char safemode;
66+
int *pinorder;
6567
};
6668
typedef struct test_args test_args;
6769

benchmarks/lockhammer/src/lockhammer.c

Lines changed: 72 additions & 41 deletions
Original file line number | Diff line number | Diff line change
@@ -62,14 +62,19 @@ void print_usage (char *invoc) {
6262
"[-p <#>[ns | in] parallelizable iterations measured in ns or (in)structions, "
6363
"if no suffix, assumes (in)structions]\n\t"
6464
"[-s safe-mode operation for running as non-root\n\t"
65+
"[-i interleave value for thread pinning order, for example, 1 means "
66+
"sequential increasing, 2 means pinning hyperthread of the same core first "
67+
"before the next core.]\n\t"
68+
"[-o arbitrary core pinning order separated by comma without space, command "
69+
"lstopo can be used to deduce the correct order.]\n\t"
6570
"[-- <test specific arguments>]\n", invoc);
6671
}
6772

6873
int main(int argc, char** argv)
6974
{
7075
struct sched_param sparam;
7176

72-
unsigned long i;
77+
unsigned long opt;
7378
unsigned long num_cores;
7479
unsigned long result;
7580
unsigned long sched_elapsed = 0, real_elapsed = 0, realcpu_elapsed = 0;
@@ -84,16 +89,18 @@ int main(int argc, char** argv)
8489
.ncrit = 0,
8590
.nparallel = 0,
8691
.ileave = 1,
87-
.safemode = 0 };
92+
.safemode = 0,
93+
.pinorder = NULL };
8894

8995
opterr = 0;
9096

91-
while ((i = getopt(argc, argv, "t:a:c:p:i:s")) != -1)
97+
while ((opt = getopt(argc, argv, "t:a:c:p:i:o:s")) != -1)
9298
{
9399
long optval = 0;
94100
int len = 0;
95101
char buf[128];
96-
switch (i) {
102+
char *csv = NULL;
103+
switch (opt) {
97104
case 't':
98105
optval = strtol(optarg, (char **) NULL, 10);
99106
/* Do not allow number of threads to exceed online cores
@@ -158,12 +165,26 @@ int main(int argc, char** argv)
158165
case 'i':
159166
optval = strtol(optarg, (char **) NULL, 10);
160167
if (optval < 0) {
161-
fprintf(stderr, "ERROR: Core interleave must be positive.\n");
168+
fprintf(stderr, "ERROR: core interleave must be positive.\n");
162169
return 1;
163170
}
164171
else {
165172
args.ileave = optval;
166173
}
174+
break;
175+
case 'o':
176+
args.pinorder = calloc(num_cores, sizeof(int));
177+
if (args.pinorder == NULL) {
178+
fprintf(stderr, "ERROR: Cannot allocate enough memory for pinorder structure.\n");
179+
return 1;
180+
}
181+
csv = strtok(optarg, ",");
182+
for (int i = 0; i < num_cores && csv != NULL; csv = strtok(NULL, ","), ++i) {
183+
optval = strtol(csv, (char **) NULL, 10);
184+
if (optval >= 0 && optval < num_cores) *(args.pinorder + i) = optval;
185+
else fprintf(stderr, "WARNING: core number %ld is out of range.\n", optval);
186+
}
187+
break;
167188
case 's':
168189
args.safemode = 1;
169190
break;
@@ -213,7 +234,7 @@ int main(int argc, char** argv)
213234
tickspns = (double)timer_get_cnt_freq() / 1000000000.0;
214235

215236
thread_args t_args[args.nthrds];
216-
for (i = 0; i < args.nthrds; ++i) {
237+
for (int i = 0; i < args.nthrds; ++i) {
217238
hmrs[i] = 0;
218239
t_args[i].ncores = num_cores;
219240
t_args[i].nthrds = args.nthrds;
@@ -230,11 +251,12 @@ int main(int argc, char** argv)
230251
t_args[i].post = args.nparallel;
231252
t_args[i].post_unit = args.nparallel_units;
232253
t_args[i].tickspns = tickspns;
254+
t_args[i].pinorder = args.pinorder;
233255

234256
pthread_create(&hmr_threads[i], &hmr_attr, hmr, (void*)(&t_args[i]));
235257
}
236258

237-
for (i = 0; i < args.nthrds; ++i) {
259+
for (int i = 0; i < args.nthrds; ++i) {
238260
result = pthread_join(hmr_threads[i], NULL);
239261
}
240262
/* "Marshal" thread will collect start time once all threads have
@@ -245,7 +267,7 @@ int main(int argc, char** argv)
245267
pthread_attr_destroy(&hmr_attr);
246268

247269
result = 0;
248-
for (i = 0; i < args.nthrds; ++i) {
270+
for (int i = 0; i < args.nthrds; ++i) {
249271
result += hmrs[i];
250272
sched_elapsed += hmrtime[i];
251273
realcpu_elapsed += hmrrealtime[i];
@@ -320,6 +342,7 @@ void* hmr(void *ptr)
320342
unsigned long hold_count = x->hold;
321343
unsigned long post_count = x->post;
322344
double tickspns = x->tickspns;
345+
int *pinorder = x->pinorder;
323346

324347
unsigned long mycore = 0;
325348

@@ -356,40 +379,48 @@ void* hmr(void *ptr)
356379
/* Wait for all threads to arrive from calibrating. */
357380
synchronize_threads(&calibrate_lock, nthrds);
358381
clock_gettime(CLOCK_MONOTONIC, &tv_monot_start);
359-
}
360-
else {
361-
/* Calculate affinity mask for my core and set affinity */
362-
/* The concept of "interleave" is used here to allow for specifying
363-
* whether increasing cores counts first populate physical cores or
364-
* hardware threads within the same physical core. This assumes the
365-
* following relationship between logical core numbers (N), hardware
366-
* threads per core (K), and physical cores (N/K):
367-
*
368-
* physical core |___core_0__|___core_1__|_core_N/K-1|
369-
* thread |0|1|...|K-1|0|1|...|K-1|0|1|...|K-1|
370-
* --------------|-|-|---|---|-|-|---|---|-|-|---|---|
371-
* logical core | | | | | | | | | | | | |
372-
* 0 |*| | | | | | | | | | | |
373-
* 1 | | | | |*| | | | | | | |
374-
* ... |...................................|
375-
* N/K-1 | | | | | | | | |*| | | |
376-
* N/K | |*| | | | | | | | | | |
377-
* N/K+1 | | | | | |*| | | | | | |
378-
* ... |...................................|
379-
* N-K | | | | * | | | | | | | | |
380-
* N-K+1 | | | | | | | | * | | | | |
381-
* ... |...................................|
382-
* N-1 | | | | | | | | | | | | * |
383-
*
384-
* Thus by setting the interleave value to 1 physical cores are filled
385-
* first with subsequent cores past N/K adding subsequent threads
386-
* on already populated physical cores. On the other hand, setting
387-
* interleave to K causes the algorithm to populate 0, N/K, 2N/K and
388-
* so on filling all hardware threads in the first physical core prior
389-
* to populating any threads on the second physical core.
382+
} else {
383+
/*
384+
* Non-zero core value indicates next core to pin, zero value means
385+
* fallback to default interleave mode.
390386
*/
391-
CPU_SET(((mycore * ncores / ileave) % ncores + (mycore / ileave)), &affin_mask);
392-
sched_setaffinity(0, sizeof(cpu_set_t), &affin_mask);
387+
if (pinorder && *(pinorder + mycore)) {
388+
CPU_SET(*(pinorder + mycore), &affin_mask);
389+
sched_setaffinity(0, sizeof(cpu_set_t), &affin_mask);
390+
} else {
391+
/* Calculate affinity mask for my core and set affinity */
392+
/* The concept of "interleave" is used here to allow for specifying
393+
* whether increasing cores counts first populate physical cores or
394+
* hardware threads within the same physical core. This assumes the
395+
* following relationship between logical core numbers (N), hardware
396+
* threads per core (K), and physical cores (N/K):
397+
*
398+
* physical core |___core_0__|___core_1__|_core_N/K-1|
399+
* thread |0|1|...|K-1|0|1|...|K-1|0|1|...|K-1|
400+
* --------------|-|-|---|---|-|-|---|---|-|-|---|---|
401+
* logical core | | | | | | | | | | | | |
402+
* 0 |*| | | | | | | | | | | |
403+
* 1 | | | | |*| | | | | | | |
404+
* ... |...................................|
405+
* N/K-1 | | | | | | | | |*| | | |
406+
* N/K | |*| | | | | | | | | | |
407+
* N/K+1 | | | | | |*| | | | | | |
408+
* ... |...................................|
409+
* N-K | | | | * | | | | | | | | |
410+
* N-K+1 | | | | | | | | * | | | | |
411+
* ... |...................................|
412+
* N-1 | | | | | | | | | | | | * |
413+
*
414+
* Thus by setting the interleave value to 1 physical cores are filled
415+
* first with subsequent cores past N/K adding subsequent threads
416+
* on already populated physical cores. On the other hand, setting
417+
* interleave to K causes the algorithm to populate 0, N/K, 2N/K and
418+
* so on filling all hardware threads in the first physical core prior
419+
* to populating any threads on the second physical core.
420+
*/
421+
CPU_SET(((mycore * ncores / ileave) % ncores + (mycore / ileave)), &affin_mask);
422+
sched_setaffinity(0, sizeof(cpu_set_t), &affin_mask);
423+
}
393424

394425
fetchadd64_release(&ready_lock, 1);
395426

0 commit comments

Comments (0)