@@ -108,132 +108,133 @@ struct Benchmark : public ::testing::Test {
108108 const std::vector<at::Tensor>& outputs) {
109109 return true ;
110110 }) {
111+ // 1. Compile, run and check
111112 auto pExecutor =
112113 tc::aten::compile<tc::CudaBackend>(tc, name, inputs, mappingOptions);
113114 std::vector<at::Tensor> outputs =
114115 tc::aten::prepareOutputs (tc, name, inputs);
115116 tc::aten::run (*pExecutor, inputs, outputs);
116117 EXPECT_TRUE (check_fun (inputs, outputs));
118+ // 2. Run and report compiled kernel runtime
119+ std::vector<at::Tensor> outputs2 =
120+ tc::aten::prepareOutputs (tc, name, inputs);
121+ RunAndReport (
122+ [&pExecutor, &inputs, &outputs2]() {
123+ tc::aten::run (*pExecutor, inputs, outputs2);
124+ },
125+ [&pExecutor, &inputs, &outputs2]() {
126+ TC_CUDA_RUNTIMEAPI_ENFORCE (cudaDeviceSynchronize ());
127+ auto timings = tc::aten::profile (*pExecutor, inputs, outputs2);
128+ TC_CUDA_RUNTIMEAPI_ENFORCE (cudaDeviceSynchronize ());
129+ return timings.kernelRuntime ;
130+ },
131+ " COMPILED KERNEL" );
132+ // 3. Run and report total compiled time (kernel runtime + CPU overhead)
133+ RunAndReport (
134+ [&pExecutor, &inputs, &outputs2]() {
135+ tc::aten::run (*pExecutor, inputs, outputs2);
136+ },
137+ [&pExecutor, &inputs, &outputs2]() {
138+ TC_CUDA_RUNTIMEAPI_ENFORCE (cudaDeviceSynchronize ());
139+ auto start (std::chrono::system_clock::now ());
140+ tc::aten::uncheckedRun (*pExecutor, inputs, outputs2);
141+ TC_CUDA_RUNTIMEAPI_ENFORCE (cudaDeviceSynchronize ());
142+ return tc::Duration::since (start);
143+ },
144+ " COMPILED KERNEL + CPU" );
145+ return outputs;
146+ }
147+
148+ template <typename InitFunction, typename InplaceFunction>  // Benchmarks a reference implementation: init() builds the state once, compute(state) is the operation being timed.
149+ void Reference (InitFunction init, InplaceFunction compute) {
150+ // 1. Initialize the state that compute() operates on
151+ auto res = init ();
152+ // 2. Run and report reference runtime (RunAndReport collects the samples and prints the statistics)
153+ RunAndReport (
154+ [&res, compute]() { compute (res); },  // passed as RunAndReport's warmupFn
155+ [&res, compute]() {  // passed as RunAndReport's runFn: produces one timing sample per call
156+ TC_CUDA_RUNTIMEAPI_ENFORCE (cudaDeviceSynchronize ());  // presumably drains previously queued device work so it is not billed to this sample
157+ auto start (std::chrono::system_clock::now ());
158+ compute (res);
159+ TC_CUDA_RUNTIMEAPI_ENFORCE (cudaDeviceSynchronize ());  // wait for the device before reading the clock, so the sample covers any device work compute() launched
160+ return tc::Duration::since (start);  // presumably the time elapsed since `start` -- tc::Duration is defined outside this view
161+ },
162+ " REFERENCE IMPL." );  // label that RunAndReport inserts into the "... STATS" banner
163+ }
164+
165+ std::vector<tc::CudaMappingOptions> autotune (  // When FLAGS_autotune is set: tunes kernelName with the genetic autotuner, passes the first result to Check, and returns it as a one-element vector; otherwise returns an empty vector.
166+ std::string cacheFilename,  // forwarded to tune()
167+ std::string resultsFilename,  // NOTE(review): never read in this body; the autotune version removed by this change wrote the options and kernel percentiles to this file
168+ std::string tc,  // constructor argument of the autotuner, also forwarded to Check
169+ std::string kernelName,
170+ std::vector<at::Tensor> inputs,
171+ tc::CudaMappingOptions baseMapping,  // forwarded to tune()
172+ CheckFunction check_fun =
173+ [](const std::vector<at::Tensor>&, const std::vector<at::Tensor>&) {
174+ return true ;  // default checker accepts any outputs
175+ },
176+ const tc::autotune::TuningParameterFixer& fixedParams = {}) {
177+ if (!FLAGS_autotune) {  // tuning disabled: nothing is tuned, checked or reported
178+ return {};
179+ }
180+ tc::aten::ATenAutotuner<tc::CudaBackend, tc::autotune::GeneticSearch>
181+ geneticAutotuneATen (tc);
182+ auto bestOptions = [&]() {  // immediately-invoked lambda: keeps the full candidate list local
183+ auto options = geneticAutotuneATen.tune (
184+ kernelName, inputs, baseMapping, cacheFilename, fixedParams);
185+ CHECK_GE (options.size (), 1u ) << " Benchmark mode: at least one "  // require a candidate before indexing options[0]
186+ << " options expected" ;
187+ return options[0 ];  // only the first candidate is kept -- presumably the best one; TODO confirm the ordering tune() guarantees
188+ }();
189+ Check (tc, kernelName, bestOptions, inputs, check_fun);  // presumably compiles, validates and benchmarks with the tuned options (Check's header is outside this view); its result is not used here
190+ return {bestOptions};
191+ }
192+
193+ private:
194+ void RunAndReport (  // Collects FLAGS_benchmark_iterations samples from runFn, sorts them and prints Min/p50/p90/p99/Max in microseconds under a banner containing reportName.
195+ std::function<void (void )> warmupFn,  // intended warm-up callback -- NOTE(review): never invoked in this body, see the loop below
196+ std::function<tc::Duration(void )> runFn,  // returns one timing sample per call
197+ const std::string& reportName) {  // text inserted before " STATS" in the banner
117198 for (size_t i = 1 ; i < tc::FLAGS_benchmark_warmup; ++i) {  // NOTE(review): this loop body is empty after the change (the old run call was removed and nothing replaced it), so no warm-up happens; a warmupFn() call is probably missing
118- tc::aten::run (*pExecutor, inputs, outputs);
119199 }
120- std::vector<tc::Duration> kernelTimes;
121- kernelTimes.reserve (tc::FLAGS_benchmark_iterations);
122- std::vector<tc::Duration> totalTimes;
123- totalTimes.reserve (tc::FLAGS_benchmark_iterations);
200+ std::vector<tc::Duration> durations;
124201 for (size_t i = 0 ; i < tc::FLAGS_benchmark_iterations; ++i) {
125- auto timings = tc::aten::profile (*pExecutor, inputs, outputs);
126- kernelTimes.push_back (timings.kernelRuntime );
127- TC_CUDA_RUNTIMEAPI_ENFORCE (cudaDeviceSynchronize ());
128- auto start (std::chrono::system_clock::now ());
129- tc::aten::uncheckedRun (*pExecutor, inputs, outputs);
130- TC_CUDA_RUNTIMEAPI_ENFORCE (cudaDeviceSynchronize ());
131- totalTimes.push_back (tc::Duration::since (start));
202+ durations.push_back (runFn ());  // one sample per iteration
132203 }
133204
134- auto p50idx = static_cast <int >(std::ceil (0.5 * kernelTimes .size ()));
135- auto p90idx = static_cast <int >(std::ceil (0.9 * kernelTimes .size ()));
136- auto p99idx = static_cast <int >(std::ceil (0.99 * kernelTimes .size ()));
205+ auto p50idx = static_cast <int >(std::ceil (0.5 * durations .size ()));  // percentile positions depend only on the sample count; they are clamped to size() - 1 where they are used
206+ auto p90idx = static_cast <int >(std::ceil (0.9 * durations .size ()));
207+ auto p99idx = static_cast <int >(std::ceil (0.99 * durations .size ()));
137208
138- std::sort (kernelTimes .begin (), kernelTimes .end ());
209+ std::sort (durations .begin (), durations .end ());  // ascending order: front() is the minimum, back() the maximum
139210#define GET_US (X ) ((X)).toMicroSeconds()
140211
141212 std::cout << " \n ---------------------------------------------------------" ;
142- std::cout << " \n ------------------ COMPILED KERNEL STATS -- --------------" ;
213+ std::cout << " \n ---------------- " << reportName << " STATS --------------" ;
143214 std::cout << " \n ------------------ " << tc::FLAGS_benchmark_iterations
144215 << " ITERATIONS ----------------" ;
145216 std::cout << " \n ---------------------------------------------------------" ;
146217 std::cout << " \n " ;
147218 std::cout
148- << " Min: " << GET_US (kernelTimes .front ()) << " us, "
219+ << " Min: " << GET_US (durations .front ()) << " us, "  // NOTE(review): front()/back() on an empty vector is undefined behaviour, which happens when FLAGS_benchmark_iterations is 0
149220 << " p50: "
150- << GET_US (kernelTimes .at (std::min (p50idx, (int )kernelTimes .size () - 1 )))
221+ << GET_US (durations .at (std::min (p50idx, (int )durations .size () - 1 )))
151222 << " us, "
152223 << " p90: "
153- << GET_US (kernelTimes .at (std::min (p90idx, (int )kernelTimes .size () - 1 )))
224+ << GET_US (durations .at (std::min (p90idx, (int )durations .size () - 1 )))
154225 << " us, "
155226 << " p99: "
156- << GET_US (kernelTimes .at (std::min (p99idx, (int )kernelTimes .size () - 1 )))
227+ << GET_US (durations .at (std::min (p99idx, (int )durations .size () - 1 )))
157228 << " us, "
158- << " Max: " << GET_US (kernelTimes.back ()) << " us" ;
159- std::cout << " \n ---------------------------------------------------------" ;
160- std::cout << " \n\n " ;
161-
162- #undef GET_US
163-
164- std::sort (totalTimes.begin (), totalTimes.end ());
165- #define GET_US (X ) ((X)).toMicroSeconds()
166-
167- std::cout << " \n ---------------------------------------------------------" ;
168- std::cout << " \n ------------------ COMPILED TOTAL STATS ----------------" ;
169- std::cout << " \n ------------------ " << tc::FLAGS_benchmark_iterations
170- << " ITERATIONS ----------------" ;
171- std::cout << " \n ---------------------------------------------------------" ;
172- std::cout << " \n " ;
173- std::cout
174- << " Min: " << GET_US (totalTimes.front ()) << " us, "
175- << " p50: "
176- << GET_US (totalTimes.at (std::min (p50idx, (int )totalTimes.size () - 1 )))
177- << " us, "
178- << " p90: "
179- << GET_US (totalTimes.at (std::min (p90idx, (int )totalTimes.size () - 1 )))
180- << " us, "
181- << " p99: "
182- << GET_US (totalTimes.at (std::min (p99idx, (int )totalTimes.size () - 1 )))
183- << " us, "
184- << " Max: " << GET_US (totalTimes.back ()) << " us" ;
185- std::cout << " \n ---------------------------------------------------------" ;
186- std::cout << " \n\n " ;
187-
188- #undef GET_US
189-
190- return outputs;
191- }
192-
193- template <typename InitFunction, typename InplaceFunction>
194- void Reference (InitFunction init, InplaceFunction compute) {
195- auto res = init ();
196- for (size_t i = 1 ; i < tc::FLAGS_benchmark_warmup; ++i) {
197- compute (res);
198- }
199- std::vector<tc::Duration> times;
200- times.reserve (tc::FLAGS_benchmark_iterations);
201- for (size_t i = 0 ; i < tc::FLAGS_benchmark_iterations; ++i) {
202- auto start (std::chrono::system_clock::now ());
203- compute (res);
204- TC_CUDA_RUNTIMEAPI_ENFORCE (cudaDeviceSynchronize ());
205- times.push_back (tc::Duration::since (start));
206- }
207- std::sort (times.begin (), times.end ());
208- auto p50idx = static_cast <int >(std::ceil (0.5 * times.size ()));
209- auto p90idx = static_cast <int >(std::ceil (0.9 * times.size ()));
210- auto p99idx = static_cast <int >(std::ceil (0.99 * times.size ()));
211-
212- #define GET_US (X ) ((X)).toMicroSeconds()
213-
214- std::cout << " \n ---------------------------------------------------------" ;
215- std::cout << " \n ------------------ REFERENCE IMPL. STATS ----------------" ;
216- std::cout << " \n ------------------ " << tc::FLAGS_benchmark_iterations
217- << " ITERATIONS ----------------" ;
218- std::cout << " \n ---------------------------------------------------------" ;
219- std::cout << " \n " ;
220- std::cout << " Min: " << GET_US (times.front ()) << " us, "
221- << " p50: "
222- << GET_US (times.at (std::min (p50idx, (int )times.size () - 1 )))
223- << " us, "
224- << " p90: "
225- << GET_US (times.at (std::min (p90idx, (int )times.size () - 1 )))
226- << " us, "
227- << " p99: "
228- << GET_US (times.at (std::min (p99idx, (int )times.size () - 1 )))
229- << " us, "
230- << " Max: " << GET_US (times.back ()) << " us" ;
229+ << " Max: " << GET_US (durations.back ()) << " us" ;
231230 std::cout << " \n ---------------------------------------------------------" ;
232231 std::cout << " \n\n " ;
233232
234233#undef GET_US
235234 }
236235
236+ // Will disappear soon
237+ public:
237238 void validateProto (
238239 std::string cacheFilename,
239240 const std::string& tc,
@@ -344,110 +345,4 @@ struct Benchmark : public ::testing::Test {
344345
345346#undef GET_US
346347 }
347-
348- std::vector<tc::CudaMappingOptions> autotune (  // Pre-change implementation (every row here is a deletion): tuned, compiled, checked and timed the kernel inline, wrote the kernel percentiles to resultsFilename and printed them.
349- std::string cacheFilename,
350- std::string resultsFilename,  // opened with std::ofstream below to store the options and kernel percentiles
351- std::string tc,
352- std::string kernelName,
353- std::vector<at::Tensor> inputs,
354- tc::CudaMappingOptions baseMapping,
355- CheckFunction check_fun =
356- [](const std::vector<at::Tensor>&, const std::vector<at::Tensor>&) {
357- return true ;
358- },
359- const tc::autotune::TuningParameterFixer& fixedParams = {}) {
360- if (FLAGS_autotune) {
361- tc::aten::ATenAutotuner<tc::CudaBackend, tc::autotune::GeneticSearch>
362- geneticAutotuneATen (tc);
363- auto bestOptions = [&]() {
364- auto options = geneticAutotuneATen.tune (
365- kernelName, inputs, baseMapping, cacheFilename, fixedParams);
366- CHECK_GE (options.size (), 1u ) << " Benchmark mode: at least one "
367- << " options expected" ;
368- return options[0 ];
369- }();
370-
371- auto pExecutor = tc::aten::compile<tc::CudaBackend>(
372- tc, kernelName, inputs, bestOptions);
373- auto outputs = tc::aten::prepareOutputs (tc, kernelName, inputs);
374- tc::aten::run (*pExecutor, inputs, outputs);
375- EXPECT_TRUE (check_fun (inputs, outputs));
376- for (size_t i = 1 ; i < tc::FLAGS_benchmark_warmup; ++i) {  // warm-up: starts at 1 because one run already happened above
377- tc::aten::run (*pExecutor, inputs, outputs);
378- }
379- std::vector<tc::Duration> kernelTimes;
380- kernelTimes.reserve (tc::FLAGS_benchmark_iterations);
381- std::vector<tc::Duration> totalTimes;
382- totalTimes.reserve (tc::FLAGS_benchmark_iterations);
383- for (size_t i = 0 ; i < tc::FLAGS_benchmark_iterations; ++i) {
384- auto timings = tc::aten::profile (*pExecutor, inputs, outputs);
385- kernelTimes.push_back (timings.kernelRuntime );
386- TC_CUDA_RUNTIMEAPI_ENFORCE (cudaDeviceSynchronize ());
387- auto start (std::chrono::system_clock::now ());
388- tc::aten::uncheckedRun (*pExecutor, inputs, outputs);
389- TC_CUDA_RUNTIMEAPI_ENFORCE (cudaDeviceSynchronize ());
390- totalTimes.push_back (tc::Duration::since (start));  // collected, but only kernelTimes is sorted and reported below
391- }
392-
393- auto p50idx = static_cast <int >(std::ceil (0.5 * kernelTimes.size ()));
394- auto p90idx = static_cast <int >(std::ceil (0.9 * kernelTimes.size ()));
395- auto p99idx = static_cast <int >(std::ceil (0.99 * kernelTimes.size ()));
396- std::sort (kernelTimes.begin (), kernelTimes.end ());
397-
398- #define GET_US (X ) ((X)).toMicroSeconds()
399-
400- {
401- std::ofstream out (resultsFilename);  // scoped so the stream is closed at the end of this block
402- out << " tc version: " << tc::git_version << " \n "
403- << bestOptions << " \n "
404- << " Min: " << GET_US (kernelTimes.front ()) << " us, "
405- << " p50: "
406- << GET_US (kernelTimes.at (
407- std::min (p50idx, (int )kernelTimes.size () - 1 )))
408- << " us, "
409- << " p90: "
410- << GET_US (kernelTimes.at (
411- std::min (p90idx, (int )kernelTimes.size () - 1 )))
412- << " us, "
413- << " p99: "
414- << GET_US (kernelTimes.at (
415- std::min (p99idx, (int )kernelTimes.size () - 1 )))
416- << " us, "
417- << " Max: " << GET_US (kernelTimes.back ()) << " us\n " ;
418- }
419-
420- std::cout
421- << " \n ---------------------------------------------------------" ;
422- std::cout
423- << " \n ------------------ AUTOTUNED KERNEL STATS ---------------" ;
424- std::cout << " \n ------------------ " << tc::FLAGS_benchmark_iterations
425- << " ITERATIONS ----------------" ;
426- std::cout
427- << " \n ---------------------------------------------------------" ;
428- std::cout << " \n " ;
429- std::cout << " Min: " << GET_US (kernelTimes.front ()) << " us, "
430- << " p50: "
431- << GET_US (kernelTimes.at (
432- std::min (p50idx, (int )kernelTimes.size () - 1 )))
433- << " us, "
434- << " p90: "
435- << GET_US (kernelTimes.at (
436- std::min (p90idx, (int )kernelTimes.size () - 1 )))
437- << " us, "
438- << " p99: "
439- << GET_US (kernelTimes.at (
440- std::min (p99idx, (int )kernelTimes.size () - 1 )))
441- << " us, "
442- << " Max: " << GET_US (kernelTimes.back ()) << " us" ;
443- std::cout
444- << " \n ---------------------------------------------------------" ;
445- std::cout << " \n\n " ;
446- #undef GET_US
447-
448- return {bestOptions};
449- }
450-
451- return {};  // FLAGS_autotune not set: nothing tuned
452- }
453348};
0 commit comments