@@ -62,14 +62,19 @@ void print_usage (char *invoc) {
6262 "[-p <#>[ns | in] parallelizable iterations measured in ns or (in)structions, "
6363 "if no suffix, assumes (in)structions]\n\t"
6464 "[-s safe-mode operation for running as non-root\n\t"
65+ "[-i interleave value for thread pinning order, for example, 1 means "
66+ "sequential increasing, 2 means pinning hyperthread of the same core first "
67+ "before the next core.]\n\t"
68+ "[-o arbitrary core pinning order separated by comma without space, command "
69+ "lstopo can be used to deduce the correct order.]\n\t"
6570 "[-- <test specific arguments>]\n" , invoc );
6671}
6772
6873int main (int argc , char * * argv )
6974{
7075 struct sched_param sparam ;
7176
72- unsigned long i ;
77+ unsigned long opt ;
7378 unsigned long num_cores ;
7479 unsigned long result ;
7580 unsigned long sched_elapsed = 0 , real_elapsed = 0 , realcpu_elapsed = 0 ;
@@ -84,16 +89,18 @@ int main(int argc, char** argv)
8489 .ncrit = 0 ,
8590 .nparallel = 0 ,
8691 .ileave = 1 ,
87- .safemode = 0 };
92+ .safemode = 0 ,
93+ .pinorder = NULL };
8894
8995 opterr = 0 ;
9096
91- while ((i = getopt (argc , argv , "t:a:c:p:i:s" )) != -1 )
97+ while ((opt = getopt (argc , argv , "t:a:c:p:i:o :s" )) != -1 )
9298 {
9399 long optval = 0 ;
94100 int len = 0 ;
95101 char buf [128 ];
96- switch (i ) {
102+ char * csv = NULL ;
103+ switch (opt ) {
97104 case 't' :
98105 optval = strtol (optarg , (char * * ) NULL , 10 );
99106 /* Do not allow number of threads to exceed online cores
@@ -158,12 +165,23 @@ int main(int argc, char** argv)
158165 case 'i' :
159166 optval = strtol (optarg , (char * * ) NULL , 10 );
160167 if (optval < 0 ) {
161- fprintf (stderr , "ERROR: Core interleave must be positive.\n" );
168+ fprintf (stderr , "ERROR: core interleave must be positive.\n" );
162169 return 1 ;
163170 }
164171 else {
165172 args .ileave = optval ;
166173 }
174+ break ;
175+ case 'o' :
176+ args .pinorder = calloc (num_cores , sizeof (int ));
177+ if (args .pinorder == NULL ) {
178+ fprintf (stderr , "ERROR: Cannot allocate enough memory for pinorder structure.\n" );
179+ return 1 ;
180+ }
181+ csv = strtok (optarg , "," );
182+ for (int i = 0 ; i < num_cores && csv != NULL ; csv = strtok (NULL , "," ), ++ i )
183+ * (args .pinorder + i ) = strtol (csv , (char * * ) NULL , 10 );
184+ break ;
167185 case 's' :
168186 args .safemode = 1 ;
169187 break ;
@@ -213,7 +231,7 @@ int main(int argc, char** argv)
213231 tickspns = (double )timer_get_cnt_freq () / 1000000000.0 ;
214232
215233 thread_args t_args [args .nthrds ];
216- for (i = 0 ; i < args .nthrds ; ++ i ) {
234+ for (int i = 0 ; i < args .nthrds ; ++ i ) {
217235 hmrs [i ] = 0 ;
218236 t_args [i ].ncores = num_cores ;
219237 t_args [i ].nthrds = args .nthrds ;
@@ -230,11 +248,12 @@ int main(int argc, char** argv)
230248 t_args [i ].post = args .nparallel ;
231249 t_args [i ].post_unit = args .nparallel_units ;
232250 t_args [i ].tickspns = tickspns ;
251+ t_args [i ].pinorder = args .pinorder ;
233252
234253 pthread_create (& hmr_threads [i ], & hmr_attr , hmr , (void * )(& t_args [i ]));
235254 }
236255
237- for (i = 0 ; i < args .nthrds ; ++ i ) {
256+ for (int i = 0 ; i < args .nthrds ; ++ i ) {
238257 result = pthread_join (hmr_threads [i ], NULL );
239258 }
240259 /* "Marshal" thread will collect start time once all threads have
@@ -245,7 +264,7 @@ int main(int argc, char** argv)
245264 pthread_attr_destroy (& hmr_attr );
246265
247266 result = 0 ;
248- for (i = 0 ; i < args .nthrds ; ++ i ) {
267+ for (int i = 0 ; i < args .nthrds ; ++ i ) {
249268 result += hmrs [i ];
250269 sched_elapsed += hmrtime [i ];
251270 realcpu_elapsed += hmrrealtime [i ];
@@ -320,6 +339,7 @@ void* hmr(void *ptr)
320339 unsigned long hold_count = x -> hold ;
321340 unsigned long post_count = x -> post ;
322341 double tickspns = x -> tickspns ;
342+ int * pinorder = x -> pinorder ;
323343
324344 unsigned long mycore = 0 ;
325345
@@ -356,40 +376,48 @@ void* hmr(void *ptr)
356376 /* Wait for all threads to arrive from calibrating. */
357377 synchronize_threads (& calibrate_lock , nthrds );
358378 clock_gettime (CLOCK_MONOTONIC , & tv_monot_start );
359- }
360- else {
361- /* Calculate affinity mask for my core and set affinity */
362- /* The concept of "interleave" is used here to allow for specifying
363- * whether increasing cores counts first populate physical cores or
364- * hardware threads within the same physical core. This assumes the
365- * following relationship between logical core numbers (N), hardware
366- * threads per core (K), and physical cores (N/K):
367- *
368- * physical core |___core_0__|___core_1__|_core_N/K-1|
369- * thread |0|1|...|K-1|0|1|...|K-1|0|1|...|K-1|
370- * --------------|-|-|---|---|-|-|---|---|-|-|---|---|
371- * logical core | | | | | | | | | | | | |
372- * 0 |*| | | | | | | | | | | |
373- * 1 | | | | |*| | | | | | | |
374- * ... |...................................|
375- * N/K-1 | | | | | | | | |*| | | |
376- * N/K | |*| | | | | | | | | | |
377- * N/K+1 | | | | | |*| | | | | | |
378- * ... |...................................|
379- * N-K | | | | * | | | | | | | | |
380- * N-K+1 | | | | | | | | * | | | | |
381- * ... |...................................|
382- * N-1 | | | | | | | | | | | | * |
383- *
384- * Thus by setting the interleave value to 1 physical cores are filled
385- * first with subsequent cores past N/K adding subsequent threads
386- * on already populated physical cores. On the other hand, setting
387- * interleave to K causes the algorithm to populate 0, N/K, 2N/K and
388- * so on filling all hardware threads in the first physical core prior
389- * to populating any threads on the second physical core.
379+ } else {
380+ /*
381+ * Non-zero core value indicates next core to pin, zero value means
382+ * fallback to default interleave mode.
390383 */
391- CPU_SET (((mycore * ncores / ileave ) % ncores + (mycore / ileave )), & affin_mask );
392- sched_setaffinity (0 , sizeof (cpu_set_t ), & affin_mask );
384+ if (pinorder && * (pinorder + mycore )) {
385+ CPU_SET (* (pinorder + mycore ), & affin_mask );
386+ sched_setaffinity (0 , sizeof (cpu_set_t ), & affin_mask );
387+ } else {
388+ /* Calculate affinity mask for my core and set affinity */
389+ /* The concept of "interleave" is used here to allow for specifying
390+ * whether increasing cores counts first populate physical cores or
391+ * hardware threads within the same physical core. This assumes the
392+ * following relationship between logical core numbers (N), hardware
393+ * threads per core (K), and physical cores (N/K):
394+ *
395+ * physical core |___core_0__|___core_1__|_core_N/K-1|
396+ * thread |0|1|...|K-1|0|1|...|K-1|0|1|...|K-1|
397+ * --------------|-|-|---|---|-|-|---|---|-|-|---|---|
398+ * logical core | | | | | | | | | | | | |
399+ * 0 |*| | | | | | | | | | | |
400+ * 1 | | | | |*| | | | | | | |
401+ * ... |...................................|
402+ * N/K-1 | | | | | | | | |*| | | |
403+ * N/K | |*| | | | | | | | | | |
404+ * N/K+1 | | | | | |*| | | | | | |
405+ * ... |...................................|
406+ * N-K | | | | * | | | | | | | | |
407+ * N-K+1 | | | | | | | | * | | | | |
408+ * ... |...................................|
409+ * N-1 | | | | | | | | | | | | * |
410+ *
411+ * Thus by setting the interleave value to 1 physical cores are filled
412+ * first with subsequent cores past N/K adding subsequent threads
413+ * on already populated physical cores. On the other hand, setting
414+ * interleave to K causes the algorithm to populate 0, N/K, 2N/K and
415+ * so on filling all hardware threads in the first physical core prior
416+ * to populating any threads on the second physical core.
417+ */
418+ CPU_SET (((mycore * ncores / ileave ) % ncores + (mycore / ileave )), & affin_mask );
419+ sched_setaffinity (0 , sizeof (cpu_set_t ), & affin_mask );
420+ }
393421
394422 fetchadd64_release (& ready_lock , 1 );
395423
0 commit comments