
Commit d81cea4

benchmarks/lockhammer/include/perf_timer.h
- inline ISB ahead of reading cntvct_el0 so as to be more similar to executing rdtscp
- refer to the "hardware counter" instead of "cycle counter" because the register does not necessarily count cycles
- provide a way to override the value returned by timer_get_timer_freq() for x86 because the TSC frequency may not match the CPU frequency
- blackhole code moved to measure.c

Change-Id: I96a088fe868e929b228c6410684c5080f337d17c
1 parent 02fcfe5 commit d81cea4
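
Editor's note: the first bullet refers to making the AArch64 counter read behave more like rdtscp, which does not begin executing until prior instructions have completed. A minimal sketch of the two serialized reads side by side, assuming GCC/Clang inline asm; the function name read_hw_counter is illustrative and not part of the commit:

    #include <stdint.h>

    /* Serialized hardware-counter read. On AArch64 the ISB keeps the
     * cntvct_el0 read from being hoisted above earlier instructions,
     * roughly the ordering RDTSCP gives on x86. */
    static inline uint64_t read_hw_counter(void) {
    #if defined(__aarch64__)
        uint64_t t;
        __asm__ __volatile__ ("isb; mrs %0, cntvct_el0" : "=r" (t));
        return t;
    #elif defined(__x86_64__)
        uint32_t lo, hi, aux;
        __asm__ __volatile__ ("rdtscp" : "=a" (lo), "=d" (hi), "=c" (aux));
        return ((uint64_t)hi << 32) | lo;
    #else
        return 0;
    #endif
    }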

File tree: 1 file changed (+44, −194 lines)

benchmarks/lockhammer/include/perf_timer.h

Lines changed: 44 additions & 194 deletions
@@ -35,8 +35,7 @@

 /*
  * perf_timer.h
- * Functions to read hardware timers, query timer frequency, and a
- * blackhole function that wastes cpu time (useful for nanosecond waits)
+ * Functions to read hardware timers and query timer frequency.
  * Supports x86 and AArch64 platforms
  *
  * Define DEBUG in makefile or here if you desire debug output,
@@ -67,7 +66,11 @@ extern __thread uint64_t prev_tsc;
    operating frequency. It may, for example,
    count cycles at the maximum frequency of the
    device, even if the CPU core is running at a
-   lower frequency.
+   lower frequency, or it may count at a frequency
+   unrelated to the operating frequency. Use
+   the --estimate-hwtimer-frequency flag to measure
+   the frequency and the --hwtimer-frequency flag to
+   override the value detected by the code below.
 */
 #ifdef __x86_64__
 static inline uint64_t __attribute__((always_inline))
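
Editor's note: the comment above is why the counter's own frequency, not the CPU clock, must be queried (or overridden) before tick deltas mean anything. A minimal conversion sketch; ticks_to_ns is a hypothetical helper and the 25 MHz rate is only an example value:

    #include <stdint.h>

    /* Convert a tick delta to nanoseconds using the hardware timer's own
     * frequency. E.g. on a 25 MHz AArch64 generic timer, 250 ticks -> 10000 ns. */
    static inline uint64_t ticks_to_ns(uint64_t ticks, uint64_t timer_freq_hz) {
        return (ticks * 1000000000ULL) / timer_freq_hz;
    }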
@@ -184,7 +187,7 @@ get_raw_counter(void) {
 static inline uint64_t __attribute__((always_inline))
 get_cntvct_el0(void) {
     uint64_t t;
-    asm volatile ("mrs %0, cntvct_el0" : "=r" (t));
+    asm volatile ("ISB; mrs %0, cntvct_el0" : "=r" (t));
     return t;
 }

@@ -211,29 +214,29 @@ timer_reset_counter()
 static inline uint64_t __attribute__((always_inline))
 timer_get_counter()
 {
-    /* this returns the cycle counter from a constant-rate timer */
+    /* this returns the counter value from a constant-rate timer */
 #ifdef __aarch64__
-    uint64_t timer;
-    __asm__ __volatile__ ("isb; mrs %0, cntvct_el0" : "=r" (timer));
+    uint64_t counter_value;
+    __asm__ __volatile__ ("isb; mrs %0, cntvct_el0" : "=r" (counter_value));
 #elif __x86_64__
-    uint64_t timer = rdtscp(); // assume constant_tsc
+    uint64_t counter_value = rdtscp(); // assume constant_tsc
 #endif
-    return timer;
+    return counter_value;
 }

 /* Timer read for when at start of timing block
 */
 static inline uint64_t __attribute__((always_inline))
 timer_get_counter_start()
 {
-    /* this returns the cycle counter from a constant-rate timer */
+    /* this returns the counter value from a constant-rate timer */
 #ifdef __aarch64__
-    uint64_t timer;
-    __asm__ __volatile__ ("dsb ish; isb; mrs %0, cntvct_el0" : "=r" (timer));
+    uint64_t counter_value;
+    __asm__ __volatile__ ("dsb ish; isb; mrs %0, cntvct_el0" : "=r" (counter_value));
 #elif __x86_64__
-    uint64_t timer = rdtscp_start(); // assume constant_tsc
+    uint64_t counter_value = rdtscp_start(); // assume constant_tsc
 #endif
-    return timer;
+    return counter_value;
 }


@@ -242,14 +245,14 @@ timer_get_counter_start()
 static inline uint64_t __attribute__((always_inline))
 timer_get_counter_end()
 {
-    /* this returns the cycle counter from a constant-rate timer */
+    /* this returns the counter value from a constant-rate timer */
 #ifdef __aarch64__
-    uint64_t timer;
-    __asm__ __volatile__ ("isb; mrs %0, cntvct_el0; isb" : "=r" (timer));
+    uint64_t counter_value;
+    __asm__ __volatile__ ("isb; mrs %0, cntvct_el0; isb" : "=r" (counter_value));
 #elif __x86_64__
-    uint64_t timer = rdtscp_end(); // assume constant_tsc
+    uint64_t counter_value = rdtscp_end(); // assume constant_tsc
 #endif
-    return timer;
+    return counter_value;
 }

 static inline void __attribute__((always_inline))
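
Editor's note: a minimal usage sketch of the start/end pair renamed in the hunks above. It assumes perf_timer.h is on the include path; the placeholder comment stands in for whatever region is being measured:

    #include <stdint.h>
    #include <stdio.h>
    #include "perf_timer.h"   /* assumed include path for the header in this commit */

    int main(void) {
        uint64_t t0 = timer_get_counter_start();   /* barriers keep earlier work out of the window */
        /* ... region under measurement ... */
        uint64_t t1 = timer_get_counter_end();     /* trailing barrier keeps later work out */
        printf("elapsed ticks: %llu\n", (unsigned long long)(t1 - t0));
        return 0;
    }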
@@ -262,13 +265,30 @@ static inline void __attribute__((always_inline))
 timer_init() {
 }

-static inline uint32_t __attribute__((always_inline))
-timer_get_cnt_freq(void)
+static inline uint64_t __attribute__((always_inline))
+timer_get_timer_freq(void)
 {
-    uint32_t cnt_freq;
+    extern unsigned long hwtimer_frequency;
+    if (hwtimer_frequency) { return hwtimer_frequency; }
+
+    uint64_t cnt_freq;
 #ifdef __aarch64__
     __asm__ __volatile__ ("isb; mrs %0, cntfrq_el0" : "=r" (cnt_freq));
 #elif __x86_64__
+    // This code attempts to get the TSC frequency. The assumption made
+    // is TSC frequency equals the CPUFreq cpuinfo_max_freq attribute
+    // value, which is the maximum operating frequency of the processor.
+    // However, this equality is not always true, and less so in newer CPUs.
+    // Also, the actual TSC frequency may not exactly match any nominal
+    // frequency attribute value provided by CPUFreq, so the chances of
+    // this returning the correct frequency have diminished.
+
+    // If the CPUFreq cpuinfo_max_freq attribute is not available, this code
+    // then tries to quickly measure it.
+
+    // Use --timer-frequency flag to override the frequency value.
+    // Use --estimate-timer-frequency to explicitly measure it.
+
    char buf[100];
    FILE * f = fopen("/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq", "r");
    if (f == NULL) {
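
Editor's note: the early return added above depends on a hwtimer_frequency global defined elsewhere in lockhammer. A sketch of how such an override might be populated from the command line; the handler name and parsing shown here are assumptions for illustration, not the benchmark's actual option handling:

    #include <stdlib.h>

    /* Definition lives outside perf_timer.h; 0 means "autodetect". */
    unsigned long hwtimer_frequency = 0;

    /* Hypothetical handler for a flag like --hwtimer-frequency=HZ. Once set,
     * timer_get_timer_freq() returns this value and skips both the
     * cntfrq_el0 read and the cpuinfo_max_freq heuristic. */
    static void set_hwtimer_frequency(const char *arg) {
        hwtimer_frequency = strtoul(arg, NULL, 0);
    }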
@@ -285,7 +305,7 @@ timer_get_cnt_freq(void)

     // round down cycles
     uint64_t tmp = (time/iterations);
-    unsigned int len = log10(tmp);
+    unsigned long len = log10(tmp);
     double div = pow(10, len-2);
     return floor(tmp/div)*div;
 }
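
Editor's note: as a worked example of the rounding above, which keeps roughly three significant digits of the measured rate, here is a small standalone program; the 2,397,123,456 counts-per-second input is an invented sample value:

    #include <math.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
        uint64_t tmp = 2397123456ULL;             /* invented measured counts per second */
        unsigned long len = log10(tmp);           /* 9 for a ~2.4 GHz rate */
        double div = pow(10, len - 2);            /* 1e7 */
        printf("%.0f\n", floor(tmp / div) * div); /* prints 2390000000 */
        return 0;
    }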
@@ -308,182 +328,12 @@ timer_get_cnt_freq(void)
 #endif
     return cnt_freq;
 }
-#endif

 #define TOKENS_MAX_HIGH 1000000 /* good for ~41500 cntvct cycles */
 #define THRESHOLD 1.05 // if the ratio of cycles to do the total eval loop to the sum of the individual
                        // calls (e.g. due to context switch), rerun

-void __attribute__((noinline, optimize("no-unroll-loops"))) blackhole(unsigned long iters) {
-    if (! iters) { return; }
-#ifdef __aarch64__
-    asm volatile (".p2align 4; 1: add %0, %0, -1; cbnz %0, 1b" : "+r" (iters) : "0" (iters));
-#elif __x86_64__
-    asm volatile (".p2align 4; 1: add $-1, %0; jne 1b" : "+r" (iters) );
-#endif
-}
-
-
-int64_t __attribute__((noinline, optimize("no-unroll-loops"))) evaluate_loop_overhead(const unsigned long NUMTRIES)
-{
-    uint64_t LOOP_TEST_OVERHEAD = 0;
-    int64_t outer_cycles_start, outer_cycles_end;
-    unsigned long i, j;
-    int64_t outer_elapsed_total = 0;
-
-    for (j = 0; j < 1000; j++) {
-        int64_t elapsed_total = 0;
-        outer_cycles_start = timer_get_counter_start();
-        for (i = 0; i < NUMTRIES; i++) {
-
-            uint64_t cycles_start, cycles_end;
-            cycles_start = timer_get_counter_start();
-            cycles_end = timer_get_counter_end();
-
-            int64_t elapsed = MAX((int64_t)(cycles_end - cycles_start), 0);
-            elapsed_total += elapsed;
-        }
-        outer_cycles_end = timer_get_counter_end();
-        outer_elapsed_total = outer_cycles_end - outer_cycles_start;
-        LOOP_TEST_OVERHEAD += (outer_elapsed_total - elapsed_total);
-    }
-    LOOP_TEST_OVERHEAD = LOOP_TEST_OVERHEAD/j;
-    return LOOP_TEST_OVERHEAD;
-}
-
-
-int64_t evaluate_timer_overhead(void)
-{
-    uint64_t TIMER_OVERHEAD = 0;
-    int64_t outer_cycles_start, outer_cycles_end;
-    outer_cycles_start = timer_get_counter_start();
-    outer_cycles_end = timer_get_counter_end();
-    // Force measurement to 0 if it somehow goes negative
-    int64_t elapsed = MAX(outer_cycles_end - outer_cycles_start, 0);
-    TIMER_OVERHEAD = elapsed;
-    return TIMER_OVERHEAD;
-}
-
-
-int64_t __attribute__((noinline, optimize("no-unroll-loops"))) evaluate_blackhole(
-    const unsigned long tokens_mid, const unsigned long NUMTRIES)
-{
-    unsigned long i, j;
-    int64_t outer_cycles_start, outer_cycles_end;
-    int64_t sum_elapsed_total = 0;
-    int64_t avg_elapsed_total = 0;
-    int64_t outer_elapsed_total;
-    int64_t outer_inner_diff;
-    int64_t elapsed_total_diff;
-    double percent;
-
-    int64_t LOOP_TEST_OVERHEAD = evaluate_loop_overhead(NUMTRIES);
-    int64_t TIMER_OVERHEAD = evaluate_timer_overhead();
-
-    for (j = 0; j < NUMTRIES; j++) {
-
-        int64_t elapsed_total = 0;
-
-        outer_cycles_start = timer_get_counter_start();
-        for (i = 0; i < NUMTRIES; i++) {

-            uint64_t cycles_start, cycles_end;
-            cycles_start = timer_get_counter_start();
-            blackhole(tokens_mid);
-            cycles_end = timer_get_counter_end();
-
-            uint64_t elapsed = cycles_end - cycles_start;
-            // printf("elapsed = %lu\n", elapsed);
-
-            elapsed_total += elapsed;
-        }
-        outer_cycles_end = timer_get_counter_end();
-
-        outer_elapsed_total = outer_cycles_end - outer_cycles_start;
-        outer_inner_diff = abs(outer_elapsed_total - elapsed_total);
-
-        // Force measurements to zero if overhead swamps loop run time, in this
-        // case we can't measure this low of a requested time accurately.
-        sum_elapsed_total += MAX((int64_t)(elapsed_total - TIMER_OVERHEAD*NUMTRIES), 0);
-        avg_elapsed_total = sum_elapsed_total / (j + 1);
-        elapsed_total_diff = abs(avg_elapsed_total - elapsed_total);
-
-#ifdef DDEBUG
-        if (outer_inner_diff > LOOP_TEST_OVERHEAD) {
-            percent = outer_inner_diff / (double) LOOP_TEST_OVERHEAD;
-        } else {
-            percent = LOOP_TEST_OVERHEAD/ (double) outer_inner_diff;
-        }
-
-        printf("outer_elapsed_total = %lu "
-               "elapsed_total = %lu "
-               "outer_inner_diff = %lu percent_oh = %f percent_loop = %f\n",
-               outer_elapsed_total, elapsed_total, outer_inner_diff, percent,
-               (double) elapsed_total_diff / avg_elapsed_total);
-#endif
-    }
-
-    // returns average duration of NUMTRIES calls to blackhole with tokens_mid
-    long result = avg_elapsed_total;
-    return result;
-}
-
-unsigned long calibrate_blackhole(unsigned long target, unsigned long tokens_low, unsigned long tokens_high,
-                                  unsigned long core_id)
-{
-    unsigned long tokens_diff = tokens_high - tokens_low;
-    unsigned long tokens_mid = (tokens_diff / 2) + tokens_low;
-    unsigned long NUMTRIES = 15;
-    unsigned long target_elapsed_total = NUMTRIES * target;
-
-#ifdef DDEBUG
-    printf("target = %lu, target_elapsed_total = %lu, tokens_low = %lu, tokens_high = %lu, "
-           "tokens_diff = %lu, tokens_mid = %lu\n",
-           target, target_elapsed_total, tokens_low, tokens_high, tokens_diff, tokens_mid);
 #endif

-    if (tokens_diff == 1) {
-        // the answer is either tokens_low or tokens_high
-
-        unsigned long ret_low = evaluate_blackhole(tokens_low, NUMTRIES);
-        unsigned long ret_high = evaluate_blackhole(tokens_high, NUMTRIES);
-
-#ifdef DEBUG
-        printf("t(%lu) = %lu, tokens_mid = %lu target_elapsed_total = %lu\n",
-               core_id, ret_low, tokens_low, target_elapsed_total);
-        printf("t(%lu) = %lu, tokens_mid = %lu target_elapsed_total = %lu\n",
-               core_id, ret_high, tokens_high, target_elapsed_total);
-#endif
-        long low_diff = abs(ret_low - target_elapsed_total);
-        long high_diff = abs(ret_high - target_elapsed_total);
-
-        if (low_diff < high_diff) {
-            if (tokens_low >= (TOKENS_MAX_HIGH-1)) {
-                printf("tokens is TOKENS_MAX_HIGH or TOKENS_MAX_HIGH -1. requested delay is too long or too short.\n");
-            }
-
-            return tokens_low;
-        }
-
-        if (tokens_high >= (TOKENS_MAX_HIGH-1)) {
-            printf("tokens is TOKENS_MAX_HIGH or TOKENS_MAX_HIGH -1. requested delay is too long or too short.\n");
-        }
-
-        return tokens_high;
-    }
-
-    // Measure if this # of tokens is the proper #.
-    unsigned long t = evaluate_blackhole(tokens_mid, NUMTRIES);
-
-#ifdef DEBUG
-    printf("t(%lu) = %lu, tokens_mid = %lu target_elapsed_total = %lu\n", core_id, t, tokens_mid, target_elapsed_total);
-#endif
-
-    if (t > target_elapsed_total) {
-        tokens_mid = calibrate_blackhole(target, tokens_low, tokens_mid, core_id);
-    } else if (t < target_elapsed_total) {
-        tokens_mid = calibrate_blackhole(target, tokens_mid, tokens_high, core_id);
-    }
-
-    return tokens_mid;
-}
+/* vim: set tabstop=4 shiftwidth=4 softtabstop=4 expandtab: */
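
Editor's note: per the commit message, blackhole() and calibrate_blackhole() now live in measure.c. A usage sketch of the pattern the removed code supported; the extern prototypes mirror the removed signatures, and the 500 ns target is only an example:

    #include <stdint.h>
    #include <stdio.h>
    #include "perf_timer.h"   /* for timer_get_timer_freq() and TOKENS_MAX_HIGH */

    /* Assumed to be provided by measure.c with the same signatures as the removed code. */
    extern void blackhole(unsigned long iters);
    extern unsigned long calibrate_blackhole(unsigned long target, unsigned long tokens_low,
                                             unsigned long tokens_high, unsigned long core_id);

    int main(void) {
        uint64_t freq = timer_get_timer_freq();                        /* ticks per second */
        unsigned long target_ticks = (500ULL * freq) / 1000000000ULL;  /* ~500 ns in ticks */
        unsigned long tokens = calibrate_blackhole(target_ticks, 1, TOKENS_MAX_HIGH, 0);
        printf("tokens for a ~500 ns wait: %lu\n", tokens);
        blackhole(tokens);                                             /* calibrated busy wait */
        return 0;
    }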
