3535
3636/*
3737 * perf_timer.h
38- * Functions to read hardware timers, query timer frequency, and a
39- * blackhole function that wastes cpu time (useful for nanosecond waits)
38+ * Functions to read hardware timers and query timer frequency.
4039 * Supports x86 and AArch64 platforms
4140 *
4241 * Define DEBUG in makefile or here if you desire debug output,
@@ -67,7 +66,11 @@ extern __thread uint64_t prev_tsc;
6766 operating frequency. It may, for example,
6867 count cycles at the maximum frequency of the
6968 device, even if the CPU core is running at a
70- lower frequency.
69+ lower frequency, or it may count at a frequency
70+ unrelated to the operating frequency. Use
71+ the --estimate-hwtimer-frequency flag to measure
72+ the frequency and the --hwtimer-frequency flag to
73+ override the value detected by the code below.
7174 */
7275#ifdef __x86_64__
7376static inline uint64_t __attribute__((always_inline ))
@@ -184,7 +187,7 @@ get_raw_counter(void) {
184187static inline uint64_t __attribute__((always_inline ))
185188get_cntvct_el0 (void ) {
186189 uint64_t t ;
187- asm volatile ("mrs %0, cntvct_el0" : "=r" (t ));
190+ asm volatile ("ISB; mrs %0, cntvct_el0" : "=r" (t ));
188191 return t ;
189192}
190193
@@ -211,29 +214,29 @@ timer_reset_counter()
211214static inline uint64_t __attribute__((always_inline ))
212215timer_get_counter ()
213216{
214- /* this returns the cycle counter from a constant-rate timer */
217+ /* this returns the counter value from a constant-rate timer */
215218#ifdef __aarch64__
216- uint64_t timer ;
217- __asm__ __volatile__ ("isb; mrs %0, cntvct_el0" : "=r" (timer ));
219+ uint64_t counter_value ;
220+ __asm__ __volatile__ ("isb; mrs %0, cntvct_el0" : "=r" (counter_value ));
218221#elif __x86_64__
219- uint64_t timer = rdtscp (); // assume constant_tsc
222+ uint64_t counter_value = rdtscp (); // assume constant_tsc
220223#endif
221- return timer ;
224+ return counter_value ;
222225}
223226
224227/* Timer read for use at the start of a timing block
225228 */
226229static inline uint64_t __attribute__((always_inline ))
227230timer_get_counter_start ()
228231{
229- /* this returns the cycle counter from a constant-rate timer */
232+ /* this returns the counter value from a constant-rate timer */
230233#ifdef __aarch64__
231- uint64_t timer ;
232- __asm__ __volatile__ ("dsb ish; isb; mrs %0, cntvct_el0" : "=r" (timer ));
234+ uint64_t counter_value ;
235+ __asm__ __volatile__ ("dsb ish; isb; mrs %0, cntvct_el0" : "=r" (counter_value ));
233236#elif __x86_64__
234- uint64_t timer = rdtscp_start (); // assume constant_tsc
237+ uint64_t counter_value = rdtscp_start (); // assume constant_tsc
235238#endif
236- return timer ;
239+ return counter_value ;
237240}
238241
239242
@@ -242,14 +245,14 @@ timer_get_counter_start()
242245static inline uint64_t __attribute__((always_inline ))
243246timer_get_counter_end ()
244247{
245- /* this returns the cycle counter from a constant-rate timer */
248+ /* this returns the counter value from a constant-rate timer */
246249#ifdef __aarch64__
247- uint64_t timer ;
248- __asm__ __volatile__ ("isb; mrs %0, cntvct_el0; isb" : "=r" (timer ));
250+ uint64_t counter_value ;
251+ __asm__ __volatile__ ("isb; mrs %0, cntvct_el0; isb" : "=r" (counter_value ));
249252#elif __x86_64__
250- uint64_t timer = rdtscp_end (); // assume constant_tsc
253+ uint64_t counter_value = rdtscp_end (); // assume constant_tsc
251254#endif
252- return timer ;
255+ return counter_value ;
253256}
254257
255258static inline void __attribute__((always_inline ))
@@ -262,13 +265,30 @@ static inline void __attribute__((always_inline))
262265timer_init () {
263266}
264267
265- static inline uint32_t __attribute__((always_inline ))
266- timer_get_cnt_freq (void )
268+ static inline uint64_t __attribute__((always_inline ))
269+ timer_get_timer_freq (void )
267270{
268- uint32_t cnt_freq ;
271+ extern unsigned long hwtimer_frequency ;
272+ if (hwtimer_frequency ) { return hwtimer_frequency ; }
273+
274+ uint64_t cnt_freq ;
269275#ifdef __aarch64__
270276 __asm__ __volatile__ ("isb; mrs %0, cntfrq_el0" : "=r" (cnt_freq ));
271277#elif __x86_64__
278+ // This code attempts to get the TSC frequency. The assumption made
279+ // is TSC frequency equals the CPUFreq cpuinfo_max_freq attribute
280+ // value, which is the maximum operating frequency of the processor.
281+ // However, this equality is not always true, and less so in newer CPUs.
282+ // Also, the actual TSC frequency may not exactly match any nominal
283+ // frequency attribute value provided by CPUFreq, so the chances of
284+ // this returning the correct frequency have diminished.
285+
286+ // If the CPUFreq cpuinfo_max_freq attribute is not available, this code
287+ // then tries to quickly measure the TSC frequency instead.
288+
289+ // Use --hwtimer-frequency flag to override the frequency value.
290+ // Use --estimate-hwtimer-frequency to explicitly measure it.
291+
272292 char buf [100 ];
273293 FILE * f = fopen ("/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq" , "r" );
274294 if (f == NULL ) {
@@ -285,7 +305,7 @@ timer_get_cnt_freq(void)
285305
286306 // round down cycles
287307 uint64_t tmp = (time /iterations );
288- unsigned int len = log10 (tmp );
308+ unsigned long len = log10 (tmp );
289309 double div = pow (10 , len - 2 );
290310 return floor (tmp /div )* div ;
291311 }
@@ -308,182 +328,12 @@ timer_get_cnt_freq(void)
308328#endif
309329 return cnt_freq ;
310330}
311- #endif
312331
313332#define TOKENS_MAX_HIGH 1000000 /* good for ~41500 cntvct cycles */
314333#define THRESHOLD 1.05 // if the ratio of cycles to do the total eval loop to the sum of the individual
315334 // calls (e.g. due to context switch), rerun
316335
317- void __attribute__((noinline , optimize ("no-unroll-loops" ))) blackhole (unsigned long iters ) {
318- if (! iters ) { return ; }
319- #ifdef __aarch64__
320- asm volatile (".p2align 4; 1: add %0, %0, -1; cbnz %0, 1b" : "+r" (iters ) : "0" (iters ));
321- #elif __x86_64__
322- asm volatile (".p2align 4; 1: add $-1, %0; jne 1b" : "+r" (iters ) );
323- #endif
324- }
325-
326-
327- int64_t __attribute__((noinline , optimize ("no-unroll-loops" ))) evaluate_loop_overhead (const unsigned long NUMTRIES )
328- {
329- uint64_t LOOP_TEST_OVERHEAD = 0 ;
330- int64_t outer_cycles_start , outer_cycles_end ;
331- unsigned long i , j ;
332- int64_t outer_elapsed_total = 0 ;
333-
334- for (j = 0 ; j < 1000 ; j ++ ) {
335- int64_t elapsed_total = 0 ;
336- outer_cycles_start = timer_get_counter_start ();
337- for (i = 0 ; i < NUMTRIES ; i ++ ) {
338-
339- uint64_t cycles_start , cycles_end ;
340- cycles_start = timer_get_counter_start ();
341- cycles_end = timer_get_counter_end ();
342-
343- int64_t elapsed = MAX ((int64_t )(cycles_end - cycles_start ), 0 );
344- elapsed_total += elapsed ;
345- }
346- outer_cycles_end = timer_get_counter_end ();
347- outer_elapsed_total = outer_cycles_end - outer_cycles_start ;
348- LOOP_TEST_OVERHEAD += (outer_elapsed_total - elapsed_total );
349- }
350- LOOP_TEST_OVERHEAD = LOOP_TEST_OVERHEAD /j ;
351- return LOOP_TEST_OVERHEAD ;
352- }
353-
354-
355- int64_t evaluate_timer_overhead (void )
356- {
357- uint64_t TIMER_OVERHEAD = 0 ;
358- int64_t outer_cycles_start , outer_cycles_end ;
359- outer_cycles_start = timer_get_counter_start ();
360- outer_cycles_end = timer_get_counter_end ();
361- // Force measurement to 0 if it somehow goes negative
362- int64_t elapsed = MAX (outer_cycles_end - outer_cycles_start , 0 );
363- TIMER_OVERHEAD = elapsed ;
364- return TIMER_OVERHEAD ;
365- }
366-
367-
368- int64_t __attribute__((noinline , optimize ("no-unroll-loops" ))) evaluate_blackhole (
369- const unsigned long tokens_mid , const unsigned long NUMTRIES )
370- {
371- unsigned long i , j ;
372- int64_t outer_cycles_start , outer_cycles_end ;
373- int64_t sum_elapsed_total = 0 ;
374- int64_t avg_elapsed_total = 0 ;
375- int64_t outer_elapsed_total ;
376- int64_t outer_inner_diff ;
377- int64_t elapsed_total_diff ;
378- double percent ;
379-
380- int64_t LOOP_TEST_OVERHEAD = evaluate_loop_overhead (NUMTRIES );
381- int64_t TIMER_OVERHEAD = evaluate_timer_overhead ();
382-
383- for (j = 0 ; j < NUMTRIES ; j ++ ) {
384-
385- int64_t elapsed_total = 0 ;
386-
387- outer_cycles_start = timer_get_counter_start ();
388- for (i = 0 ; i < NUMTRIES ; i ++ ) {
389336
390- uint64_t cycles_start , cycles_end ;
391- cycles_start = timer_get_counter_start ();
392- blackhole (tokens_mid );
393- cycles_end = timer_get_counter_end ();
394-
395- uint64_t elapsed = cycles_end - cycles_start ;
396- // printf("elapsed = %lu\n", elapsed);
397-
398- elapsed_total += elapsed ;
399- }
400- outer_cycles_end = timer_get_counter_end ();
401-
402- outer_elapsed_total = outer_cycles_end - outer_cycles_start ;
403- outer_inner_diff = abs (outer_elapsed_total - elapsed_total );
404-
405- // Force measurements to zero if overhead swamps loop run time, in this
406- // case we can't measure this low of a requested time accurately.
407- sum_elapsed_total += MAX ((int64_t )(elapsed_total - TIMER_OVERHEAD * NUMTRIES ), 0 );
408- avg_elapsed_total = sum_elapsed_total / (j + 1 );
409- elapsed_total_diff = abs (avg_elapsed_total - elapsed_total );
410-
411- #ifdef DDEBUG
412- if (outer_inner_diff > LOOP_TEST_OVERHEAD ) {
413- percent = outer_inner_diff / (double ) LOOP_TEST_OVERHEAD ;
414- } else {
415- percent = LOOP_TEST_OVERHEAD / (double ) outer_inner_diff ;
416- }
417-
418- printf ("outer_elapsed_total = %lu "
419- "elapsed_total = %lu "
420- "outer_inner_diff = %lu percent_oh = %f percent_loop = %f\n" ,
421- outer_elapsed_total , elapsed_total , outer_inner_diff , percent ,
422- (double ) elapsed_total_diff / avg_elapsed_total );
423- #endif
424- }
425-
426- // returns average duration of NUMTRIES calls to blackhole with tokens_mid
427- long result = avg_elapsed_total ;
428- return result ;
429- }
430-
431- unsigned long calibrate_blackhole (unsigned long target , unsigned long tokens_low , unsigned long tokens_high ,
432- unsigned long core_id )
433- {
434- unsigned long tokens_diff = tokens_high - tokens_low ;
435- unsigned long tokens_mid = (tokens_diff / 2 ) + tokens_low ;
436- unsigned long NUMTRIES = 15 ;
437- unsigned long target_elapsed_total = NUMTRIES * target ;
438-
439- #ifdef DDEBUG
440- printf ("target = %lu, target_elapsed_total = %lu, tokens_low = %lu, tokens_high = %lu, "
441- "tokens_diff = %lu, tokens_mid = %lu\n" ,
442- target , target_elapsed_total , tokens_low , tokens_high , tokens_diff , tokens_mid );
443337#endif
444338
445- if (tokens_diff == 1 ) {
446- // the answer is either tokens_low or tokens_high
447-
448- unsigned long ret_low = evaluate_blackhole (tokens_low , NUMTRIES );
449- unsigned long ret_high = evaluate_blackhole (tokens_high , NUMTRIES );
450-
451- #ifdef DEBUG
452- printf ("t(%lu) = %lu, tokens_mid = %lu target_elapsed_total = %lu\n" ,
453- core_id , ret_low , tokens_low , target_elapsed_total );
454- printf ("t(%lu) = %lu, tokens_mid = %lu target_elapsed_total = %lu\n" ,
455- core_id , ret_high , tokens_high , target_elapsed_total );
456- #endif
457- long low_diff = abs (ret_low - target_elapsed_total );
458- long high_diff = abs (ret_high - target_elapsed_total );
459-
460- if (low_diff < high_diff ) {
461- if (tokens_low >= (TOKENS_MAX_HIGH - 1 )) {
462- printf ("tokens is TOKENS_MAX_HIGH or TOKENS_MAX_HIGH -1. requested delay is too long or too short.\n" );
463- }
464-
465- return tokens_low ;
466- }
467-
468- if (tokens_high >= (TOKENS_MAX_HIGH - 1 )) {
469- printf ("tokens is TOKENS_MAX_HIGH or TOKENS_MAX_HIGH -1. requested delay is too long or too short.\n" );
470- }
471-
472- return tokens_high ;
473- }
474-
475- // Measure if this # of tokens is the proper #.
476- unsigned long t = evaluate_blackhole (tokens_mid , NUMTRIES );
477-
478- #ifdef DEBUG
479- printf ("t(%lu) = %lu, tokens_mid = %lu target_elapsed_total = %lu\n" , core_id , t , tokens_mid , target_elapsed_total );
480- #endif
481-
482- if (t > target_elapsed_total ) {
483- tokens_mid = calibrate_blackhole (target , tokens_low , tokens_mid , core_id );
484- } else if (t < target_elapsed_total ) {
485- tokens_mid = calibrate_blackhole (target , tokens_mid , tokens_high , core_id );
486- }
487-
488- return tokens_mid ;
489- }
339+ /* vim: set tabstop=4 shiftwidth=4 softtabstop=4 expandtab: */
0 commit comments