@@ -62,14 +62,19 @@ void print_usage (char *invoc) {
6262 "[-p <#>[ns | in] parallelizable iterations measured in ns or (in)structions, "
6363 "if no suffix, assumes (in)structions]\n\t"
6464 "[-s safe-mode operation for running as non-root\n\t"
65+ "[-i interleave value for thread pinning order, for example, 1 means "
66+ "sequential increasing, 2 means pinning hyperthread of the same core first "
67+ "before the next core.]\n\t"
68+ "[-o arbitrary core pinning order separated by comma without space, command "
69+ "lstopo can be used to deduce the correct order.]\n\t"
6570 "[-- <test specific arguments>]\n" , invoc );
6671}
6772
6873int main (int argc , char * * argv )
6974{
7075 struct sched_param sparam ;
7176
72- unsigned long i ;
77+ unsigned long opt ;
7378 unsigned long num_cores ;
7479 unsigned long result ;
7580 unsigned long sched_elapsed = 0 , real_elapsed = 0 , realcpu_elapsed = 0 ;
@@ -84,16 +89,18 @@ int main(int argc, char** argv)
8489 .ncrit = 0 ,
8590 .nparallel = 0 ,
8691 .ileave = 1 ,
87- .safemode = 0 };
92+ .safemode = 0 ,
93+ .pinorder = NULL };
8894
8995 opterr = 0 ;
9096
91- while ((i = getopt (argc , argv , "t:a:c:p:i:s" )) != -1 )
97+ while ((opt = getopt (argc , argv , "t:a:c:p:i:o :s" )) != -1 )
9298 {
9399 long optval = 0 ;
94100 int len = 0 ;
95101 char buf [128 ];
96- switch (i ) {
102+ char * csv = NULL ;
103+ switch (opt ) {
97104 case 't' :
98105 optval = strtol (optarg , (char * * ) NULL , 10 );
99106 /* Do not allow number of threads to exceed online cores
@@ -158,12 +165,26 @@ int main(int argc, char** argv)
158165 case 'i' :
159166 optval = strtol (optarg , (char * * ) NULL , 10 );
160167 if (optval < 0 ) {
161- fprintf (stderr , "ERROR: Core interleave must be positive.\n" );
168+ fprintf (stderr , "ERROR: core interleave must be positive.\n" );
162169 return 1 ;
163170 }
164171 else {
165172 args .ileave = optval ;
166173 }
174+ break ;
175+ case 'o' :
176+ args .pinorder = calloc (num_cores , sizeof (int ));
177+ if (args .pinorder == NULL ) {
178+ fprintf (stderr , "ERROR: Cannot allocate enough memory for pinorder structure.\n" );
179+ return 1 ;
180+ }
181+ csv = strtok (optarg , "," );
182+ for (int i = 0 ; i < num_cores && csv != NULL ; csv = strtok (NULL , "," ), ++ i ) {
183+ optval = strtol (csv , (char * * ) NULL , 10 );
184+ if (optval >= 0 && optval < num_cores ) * (args .pinorder + i ) = optval ;
185+ else fprintf (stderr , "WARNING: core number %ld is out of range.\n" , optval );
186+ }
187+ break ;
167188 case 's' :
168189 args .safemode = 1 ;
169190 break ;
@@ -213,7 +234,7 @@ int main(int argc, char** argv)
213234 tickspns = (double )timer_get_cnt_freq () / 1000000000.0 ;
214235
215236 thread_args t_args [args .nthrds ];
216- for (i = 0 ; i < args .nthrds ; ++ i ) {
237+ for (int i = 0 ; i < args .nthrds ; ++ i ) {
217238 hmrs [i ] = 0 ;
218239 t_args [i ].ncores = num_cores ;
219240 t_args [i ].nthrds = args .nthrds ;
@@ -230,11 +251,12 @@ int main(int argc, char** argv)
230251 t_args [i ].post = args .nparallel ;
231252 t_args [i ].post_unit = args .nparallel_units ;
232253 t_args [i ].tickspns = tickspns ;
254+ t_args [i ].pinorder = args .pinorder ;
233255
234256 pthread_create (& hmr_threads [i ], & hmr_attr , hmr , (void * )(& t_args [i ]));
235257 }
236258
237- for (i = 0 ; i < args .nthrds ; ++ i ) {
259+ for (int i = 0 ; i < args .nthrds ; ++ i ) {
238260 result = pthread_join (hmr_threads [i ], NULL );
239261 }
240262 /* "Marshal" thread will collect start time once all threads have
@@ -245,7 +267,7 @@ int main(int argc, char** argv)
245267 pthread_attr_destroy (& hmr_attr );
246268
247269 result = 0 ;
248- for (i = 0 ; i < args .nthrds ; ++ i ) {
270+ for (int i = 0 ; i < args .nthrds ; ++ i ) {
249271 result += hmrs [i ];
250272 sched_elapsed += hmrtime [i ];
251273 realcpu_elapsed += hmrrealtime [i ];
@@ -320,6 +342,7 @@ void* hmr(void *ptr)
320342 unsigned long hold_count = x -> hold ;
321343 unsigned long post_count = x -> post ;
322344 double tickspns = x -> tickspns ;
345+ int * pinorder = x -> pinorder ;
323346
324347 unsigned long mycore = 0 ;
325348
@@ -356,40 +379,48 @@ void* hmr(void *ptr)
356379 /* Wait for all threads to arrive from calibrating. */
357380 synchronize_threads (& calibrate_lock , nthrds );
358381 clock_gettime (CLOCK_MONOTONIC , & tv_monot_start );
359- }
360- else {
361- /* Calculate affinity mask for my core and set affinity */
362- /* The concept of "interleave" is used here to allow for specifying
363- * whether increasing cores counts first populate physical cores or
364- * hardware threads within the same physical core. This assumes the
365- * following relationship between logical core numbers (N), hardware
366- * threads per core (K), and physical cores (N/K):
367- *
368- * physical core |___core_0__|___core_1__|_core_N/K-1|
369- * thread |0|1|...|K-1|0|1|...|K-1|0|1|...|K-1|
370- * --------------|-|-|---|---|-|-|---|---|-|-|---|---|
371- * logical core | | | | | | | | | | | | |
372- * 0 |*| | | | | | | | | | | |
373- * 1 | | | | |*| | | | | | | |
374- * ... |...................................|
375- * N/K-1 | | | | | | | | |*| | | |
376- * N/K | |*| | | | | | | | | | |
377- * N/K+1 | | | | | |*| | | | | | |
378- * ... |...................................|
379- * N-K | | | | * | | | | | | | | |
380- * N-K+1 | | | | | | | | * | | | | |
381- * ... |...................................|
382- * N-1 | | | | | | | | | | | | * |
383- *
384- * Thus by setting the interleave value to 1 physical cores are filled
385- * first with subsequent cores past N/K adding subsequent threads
386- * on already populated physical cores. On the other hand, setting
387- * interleave to K causes the algorithm to populate 0, N/K, 2N/K and
388- * so on filling all hardware threads in the first physical core prior
389- * to populating any threads on the second physical core.
382+ } else {
383+ /*
384+ * A non-zero pinorder entry is the core this thread is pinned to; a zero
385+ * entry (or a NULL pinorder) falls back to the default interleave mode.
390386 */
391- CPU_SET (((mycore * ncores / ileave ) % ncores + (mycore / ileave )), & affin_mask );
392- sched_setaffinity (0 , sizeof (cpu_set_t ), & affin_mask );
387+ if (pinorder && * (pinorder + mycore )) {
388+ CPU_SET (* (pinorder + mycore ), & affin_mask );
389+ sched_setaffinity (0 , sizeof (cpu_set_t ), & affin_mask );
390+ } else {
391+ /* Calculate affinity mask for my core and set affinity */
392+ /* The concept of "interleave" is used here to allow for specifying
393+ * whether increasing cores counts first populate physical cores or
394+ * hardware threads within the same physical core. This assumes the
395+ * following relationship between logical core numbers (N), hardware
396+ * threads per core (K), and physical cores (N/K):
397+ *
398+ * physical core |___core_0__|___core_1__|_core_N/K-1|
399+ * thread |0|1|...|K-1|0|1|...|K-1|0|1|...|K-1|
400+ * --------------|-|-|---|---|-|-|---|---|-|-|---|---|
401+ * logical core | | | | | | | | | | | | |
402+ * 0 |*| | | | | | | | | | | |
403+ * 1 | | | | |*| | | | | | | |
404+ * ... |...................................|
405+ * N/K-1 | | | | | | | | |*| | | |
406+ * N/K | |*| | | | | | | | | | |
407+ * N/K+1 | | | | | |*| | | | | | |
408+ * ... |...................................|
409+ * N-K | | | | * | | | | | | | | |
410+ * N-K+1 | | | | | | | | * | | | | |
411+ * ... |...................................|
412+ * N-1 | | | | | | | | | | | | * |
413+ *
414+ * Thus by setting the interleave value to 1 physical cores are filled
415+ * first with subsequent cores past N/K adding subsequent threads
416+ * on already populated physical cores. On the other hand, setting
417+ * interleave to K causes the algorithm to populate 0, N/K, 2N/K and
418+ * so on filling all hardware threads in the first physical core prior
419+ * to populating any threads on the second physical core.
420+ */
421+ CPU_SET (((mycore * ncores / ileave ) % ncores + (mycore / ileave )), & affin_mask );
422+ sched_setaffinity (0 , sizeof (cpu_set_t ), & affin_mask );
423+ }
393424
394425 fetchadd64_release (& ready_lock , 1 );
395426
0 commit comments