@@ -391,12 +391,8 @@ def benchmark_continuous(self, func, gpu_args, threads, grid, result, duration):
391391 for obs in self .continuous_observers :
392392 result .update (obs .get_results ())
393393
394- def benchmark (self , func , gpu_args , instance , verbose , objective ):
395- """benchmark the kernel instance"""
396- logging .debug ("benchmark " + instance .name )
397- logging .debug ("thread block dimensions x,y,z=%d,%d,%d" , * instance .threads )
398- logging .debug ("grid dimensions x,y,z=%d,%d,%d" , * instance .grid )
399-
394+ def set_nvml_parameters (self , instance ):
395+ """Set the NVML parameters separately, so the time spent setting them does not leak into the benchmark time."""
400396 if self .use_nvml :
401397 if "nvml_pwr_limit" in instance .params :
402398 new_limit = int (
@@ -409,6 +405,15 @@ def benchmark(self, func, gpu_args, instance, verbose, objective):
409405 if "nvml_mem_clock" in instance .params :
410406 self .nvml .mem_clock = instance .params ["nvml_mem_clock" ]
411407
408+ def benchmark (self , func , gpu_args , instance , verbose , objective , skip_nvml_setting = False ):
409+ """Benchmark the kernel instance."""
410+ logging .debug ("benchmark " + instance .name )
411+ logging .debug ("thread block dimensions x,y,z=%d,%d,%d" , * instance .threads )
412+ logging .debug ("grid dimensions x,y,z=%d,%d,%d" , * instance .grid )
413+
414+ if self .use_nvml and not skip_nvml_setting :
415+ self .set_nvml_parameters (instance )
416+
412417 # Call the observers to register the configuration to be benchmarked
413418 for obs in self .dev .observers :
414419 obs .register_configuration (instance .params )
@@ -577,11 +582,15 @@ def compile_and_benchmark(self, kernel_source, gpu_args, params, kernel_options,
577582
578583 # benchmark
579584 if func :
585+ # Set the NVML parameters here so that the time spent applying them does not leak into the benchmark time; it is counted as framework time instead.
586+ if self .use_nvml :
587+ self .set_nvml_parameters (instance )
580588 start_benchmark = time .perf_counter ()
581589 result .update (
582- self .benchmark (func , gpu_args , instance , verbose , to .objective )
590+ self .benchmark (func , gpu_args , instance , verbose , to .objective , skip_nvml_setting = False )
583591 )
584592 last_benchmark_time = 1000 * (time .perf_counter () - start_benchmark )
593+ print (f"Benchmark time: { last_benchmark_time } " )
585594
586595 except Exception as e :
587596 # dump kernel sources to temp file
0 commit comments