Skip to content
This repository was archived by the owner on Apr 28, 2023. It is now read-only.

Commit f798d3e

Browse files
Kill copy-pasta in benchmark_fixture.h
1 parent c55e338 commit f798d3e

File tree

1 file changed

+95
-200
lines changed

1 file changed

+95
-200
lines changed

tc/benchmarks/benchmark_fixture.h

Lines changed: 95 additions & 200 deletions
Original file line number | Diff line number | Diff line change
@@ -108,132 +108,133 @@ struct Benchmark : public ::testing::Test {
108108
const std::vector<at::Tensor>& outputs) {
109109
return true;
110110
}) {
111+
// 1. Compile, run and check
111112
auto pExecutor =
112113
tc::aten::compile<tc::CudaBackend>(tc, name, inputs, mappingOptions);
113114
std::vector<at::Tensor> outputs =
114115
tc::aten::prepareOutputs(tc, name, inputs);
115116
tc::aten::run(*pExecutor, inputs, outputs);
116117
EXPECT_TRUE(check_fun(inputs, outputs));
118+
// 2. Run and report compiled kernel runtime
119+
std::vector<at::Tensor> outputs2 =
120+
tc::aten::prepareOutputs(tc, name, inputs);
121+
RunAndReport(
122+
[&pExecutor, &inputs, &outputs2]() {
123+
tc::aten::run(*pExecutor, inputs, outputs2);
124+
},
125+
[&pExecutor, &inputs, &outputs2]() {
126+
TC_CUDA_RUNTIMEAPI_ENFORCE(cudaDeviceSynchronize());
127+
auto timings = tc::aten::profile(*pExecutor, inputs, outputs2);
128+
TC_CUDA_RUNTIMEAPI_ENFORCE(cudaDeviceSynchronize());
129+
return timings.kernelRuntime;
130+
},
131+
"COMPILED KERNEL");
132+
// 3. Run and report total compiled time (kernel runtime + CPU overhead)
133+
RunAndReport(
134+
[&pExecutor, &inputs, &outputs2]() {
135+
tc::aten::run(*pExecutor, inputs, outputs2);
136+
},
137+
[&pExecutor, &inputs, &outputs2]() {
138+
TC_CUDA_RUNTIMEAPI_ENFORCE(cudaDeviceSynchronize());
139+
auto start(std::chrono::system_clock::now());
140+
tc::aten::uncheckedRun(*pExecutor, inputs, outputs2);
141+
TC_CUDA_RUNTIMEAPI_ENFORCE(cudaDeviceSynchronize());
142+
return tc::Duration::since(start);
143+
},
144+
"COMPILED KERNEL + CPU");
145+
return outputs;
146+
}
147+
148+
template <typename InitFunction, typename InplaceFunction>
149+
void Reference(InitFunction init, InplaceFunction compute) {
150+
// 1. Initialize1
151+
auto res = init();
152+
// 2. Run and report reference runtime
153+
RunAndReport(
154+
[&res, compute]() { compute(res); },
155+
[&res, compute]() {
156+
TC_CUDA_RUNTIMEAPI_ENFORCE(cudaDeviceSynchronize());
157+
auto start(std::chrono::system_clock::now());
158+
compute(res);
159+
TC_CUDA_RUNTIMEAPI_ENFORCE(cudaDeviceSynchronize());
160+
return tc::Duration::since(start);
161+
},
162+
"REFERENCE IMPL.");
163+
}
164+
165+
std::vector<tc::CudaMappingOptions> autotune(
166+
std::string cacheFilename,
167+
std::string resultsFilename,
168+
std::string tc,
169+
std::string kernelName,
170+
std::vector<at::Tensor> inputs,
171+
tc::CudaMappingOptions baseMapping,
172+
CheckFunction check_fun =
173+
[](const std::vector<at::Tensor>&, const std::vector<at::Tensor>&) {
174+
return true;
175+
},
176+
const tc::autotune::TuningParameterFixer& fixedParams = {}) {
177+
if (!FLAGS_autotune) {
178+
return {};
179+
}
180+
tc::aten::ATenAutotuner<tc::CudaBackend, tc::autotune::GeneticSearch>
181+
geneticAutotuneATen(tc);
182+
auto bestOptions = [&]() {
183+
auto options = geneticAutotuneATen.tune(
184+
kernelName, inputs, baseMapping, cacheFilename, fixedParams);
185+
CHECK_GE(options.size(), 1u) << "Benchmark mode: at least one "
186+
<< "options expected";
187+
return options[0];
188+
}();
189+
Check(tc, kernelName, bestOptions, inputs, check_fun);
190+
return {bestOptions};
191+
}
192+
193+
private:
194+
void RunAndReport(
195+
std::function<void(void)> warmupFn,
196+
std::function<tc::Duration(void)> runFn,
197+
const std::string& reportName) {
117198
for (size_t i = 1; i < tc::FLAGS_benchmark_warmup; ++i) {
118-
tc::aten::run(*pExecutor, inputs, outputs);
119199
}
120-
std::vector<tc::Duration> kernelTimes;
121-
kernelTimes.reserve(tc::FLAGS_benchmark_iterations);
122-
std::vector<tc::Duration> totalTimes;
123-
totalTimes.reserve(tc::FLAGS_benchmark_iterations);
200+
std::vector<tc::Duration> durations;
124201
for (size_t i = 0; i < tc::FLAGS_benchmark_iterations; ++i) {
125-
auto timings = tc::aten::profile(*pExecutor, inputs, outputs);
126-
kernelTimes.push_back(timings.kernelRuntime);
127-
TC_CUDA_RUNTIMEAPI_ENFORCE(cudaDeviceSynchronize());
128-
auto start(std::chrono::system_clock::now());
129-
tc::aten::uncheckedRun(*pExecutor, inputs, outputs);
130-
TC_CUDA_RUNTIMEAPI_ENFORCE(cudaDeviceSynchronize());
131-
totalTimes.push_back(tc::Duration::since(start));
202+
durations.push_back(runFn());
132203
}
133204

134-
auto p50idx = static_cast<int>(std::ceil(0.5 * kernelTimes.size()));
135-
auto p90idx = static_cast<int>(std::ceil(0.9 * kernelTimes.size()));
136-
auto p99idx = static_cast<int>(std::ceil(0.99 * kernelTimes.size()));
205+
auto p50idx = static_cast<int>(std::ceil(0.5 * durations.size()));
206+
auto p90idx = static_cast<int>(std::ceil(0.9 * durations.size()));
207+
auto p99idx = static_cast<int>(std::ceil(0.99 * durations.size()));
137208

138-
std::sort(kernelTimes.begin(), kernelTimes.end());
209+
std::sort(durations.begin(), durations.end());
139210
#define GET_US(X) ((X)).toMicroSeconds()
140211

141212
std::cout << "\n---------------------------------------------------------";
142-
std::cout << "\n------------------ COMPILED KERNEL STATS ----------------";
213+
std::cout << "\n---------------- " << reportName << " STATS --------------";
143214
std::cout << "\n------------------ " << tc::FLAGS_benchmark_iterations
144215
<< " ITERATIONS ----------------";
145216
std::cout << "\n---------------------------------------------------------";
146217
std::cout << "\n";
147218
std::cout
148-
<< "Min: " << GET_US(kernelTimes.front()) << "us, "
219+
<< "Min: " << GET_US(durations.front()) << "us, "
149220
<< "p50: "
150-
<< GET_US(kernelTimes.at(std::min(p50idx, (int)kernelTimes.size() - 1)))
221+
<< GET_US(durations.at(std::min(p50idx, (int)durations.size() - 1)))
151222
<< "us, "
152223
<< "p90: "
153-
<< GET_US(kernelTimes.at(std::min(p90idx, (int)kernelTimes.size() - 1)))
224+
<< GET_US(durations.at(std::min(p90idx, (int)durations.size() - 1)))
154225
<< "us, "
155226
<< "p99: "
156-
<< GET_US(kernelTimes.at(std::min(p99idx, (int)kernelTimes.size() - 1)))
227+
<< GET_US(durations.at(std::min(p99idx, (int)durations.size() - 1)))
157228
<< "us, "
158-
<< "Max: " << GET_US(kernelTimes.back()) << "us";
159-
std::cout << "\n---------------------------------------------------------";
160-
std::cout << "\n\n";
161-
162-
#undef GET_US
163-
164-
std::sort(totalTimes.begin(), totalTimes.end());
165-
#define GET_US(X) ((X)).toMicroSeconds()
166-
167-
std::cout << "\n---------------------------------------------------------";
168-
std::cout << "\n------------------ COMPILED TOTAL STATS ----------------";
169-
std::cout << "\n------------------ " << tc::FLAGS_benchmark_iterations
170-
<< " ITERATIONS ----------------";
171-
std::cout << "\n---------------------------------------------------------";
172-
std::cout << "\n";
173-
std::cout
174-
<< "Min: " << GET_US(totalTimes.front()) << "us, "
175-
<< "p50: "
176-
<< GET_US(totalTimes.at(std::min(p50idx, (int)totalTimes.size() - 1)))
177-
<< "us, "
178-
<< "p90: "
179-
<< GET_US(totalTimes.at(std::min(p90idx, (int)totalTimes.size() - 1)))
180-
<< "us, "
181-
<< "p99: "
182-
<< GET_US(totalTimes.at(std::min(p99idx, (int)totalTimes.size() - 1)))
183-
<< "us, "
184-
<< "Max: " << GET_US(totalTimes.back()) << "us";
185-
std::cout << "\n---------------------------------------------------------";
186-
std::cout << "\n\n";
187-
188-
#undef GET_US
189-
190-
return outputs;
191-
}
192-
193-
template <typename InitFunction, typename InplaceFunction>
194-
void Reference(InitFunction init, InplaceFunction compute) {
195-
auto res = init();
196-
for (size_t i = 1; i < tc::FLAGS_benchmark_warmup; ++i) {
197-
compute(res);
198-
}
199-
std::vector<tc::Duration> times;
200-
times.reserve(tc::FLAGS_benchmark_iterations);
201-
for (size_t i = 0; i < tc::FLAGS_benchmark_iterations; ++i) {
202-
auto start(std::chrono::system_clock::now());
203-
compute(res);
204-
TC_CUDA_RUNTIMEAPI_ENFORCE(cudaDeviceSynchronize());
205-
times.push_back(tc::Duration::since(start));
206-
}
207-
std::sort(times.begin(), times.end());
208-
auto p50idx = static_cast<int>(std::ceil(0.5 * times.size()));
209-
auto p90idx = static_cast<int>(std::ceil(0.9 * times.size()));
210-
auto p99idx = static_cast<int>(std::ceil(0.99 * times.size()));
211-
212-
#define GET_US(X) ((X)).toMicroSeconds()
213-
214-
std::cout << "\n---------------------------------------------------------";
215-
std::cout << "\n------------------ REFERENCE IMPL. STATS ----------------";
216-
std::cout << "\n------------------ " << tc::FLAGS_benchmark_iterations
217-
<< " ITERATIONS ----------------";
218-
std::cout << "\n---------------------------------------------------------";
219-
std::cout << "\n";
220-
std::cout << "Min: " << GET_US(times.front()) << "us, "
221-
<< "p50: "
222-
<< GET_US(times.at(std::min(p50idx, (int)times.size() - 1)))
223-
<< "us, "
224-
<< "p90: "
225-
<< GET_US(times.at(std::min(p90idx, (int)times.size() - 1)))
226-
<< "us, "
227-
<< "p99: "
228-
<< GET_US(times.at(std::min(p99idx, (int)times.size() - 1)))
229-
<< "us, "
230-
<< "Max: " << GET_US(times.back()) << "us";
229+
<< "Max: " << GET_US(durations.back()) << "us";
231230
std::cout << "\n---------------------------------------------------------";
232231
std::cout << "\n\n";
233232

234233
#undef GET_US
235234
}
236235

236+
// Will disappear soon
237+
public:
237238
void validateProto(
238239
std::string cacheFilename,
239240
const std::string& tc,
@@ -344,110 +345,4 @@ struct Benchmark : public ::testing::Test {
344345

345346
#undef GET_US
346347
}
347-
348-
std::vector<tc::CudaMappingOptions> autotune(
349-
std::string cacheFilename,
350-
std::string resultsFilename,
351-
std::string tc,
352-
std::string kernelName,
353-
std::vector<at::Tensor> inputs,
354-
tc::CudaMappingOptions baseMapping,
355-
CheckFunction check_fun =
356-
[](const std::vector<at::Tensor>&, const std::vector<at::Tensor>&) {
357-
return true;
358-
},
359-
const tc::autotune::TuningParameterFixer& fixedParams = {}) {
360-
if (FLAGS_autotune) {
361-
tc::aten::ATenAutotuner<tc::CudaBackend, tc::autotune::GeneticSearch>
362-
geneticAutotuneATen(tc);
363-
auto bestOptions = [&]() {
364-
auto options = geneticAutotuneATen.tune(
365-
kernelName, inputs, baseMapping, cacheFilename, fixedParams);
366-
CHECK_GE(options.size(), 1u) << "Benchmark mode: at least one "
367-
<< "options expected";
368-
return options[0];
369-
}();
370-
371-
auto pExecutor = tc::aten::compile<tc::CudaBackend>(
372-
tc, kernelName, inputs, bestOptions);
373-
auto outputs = tc::aten::prepareOutputs(tc, kernelName, inputs);
374-
tc::aten::run(*pExecutor, inputs, outputs);
375-
EXPECT_TRUE(check_fun(inputs, outputs));
376-
for (size_t i = 1; i < tc::FLAGS_benchmark_warmup; ++i) {
377-
tc::aten::run(*pExecutor, inputs, outputs);
378-
}
379-
std::vector<tc::Duration> kernelTimes;
380-
kernelTimes.reserve(tc::FLAGS_benchmark_iterations);
381-
std::vector<tc::Duration> totalTimes;
382-
totalTimes.reserve(tc::FLAGS_benchmark_iterations);
383-
for (size_t i = 0; i < tc::FLAGS_benchmark_iterations; ++i) {
384-
auto timings = tc::aten::profile(*pExecutor, inputs, outputs);
385-
kernelTimes.push_back(timings.kernelRuntime);
386-
TC_CUDA_RUNTIMEAPI_ENFORCE(cudaDeviceSynchronize());
387-
auto start(std::chrono::system_clock::now());
388-
tc::aten::uncheckedRun(*pExecutor, inputs, outputs);
389-
TC_CUDA_RUNTIMEAPI_ENFORCE(cudaDeviceSynchronize());
390-
totalTimes.push_back(tc::Duration::since(start));
391-
}
392-
393-
auto p50idx = static_cast<int>(std::ceil(0.5 * kernelTimes.size()));
394-
auto p90idx = static_cast<int>(std::ceil(0.9 * kernelTimes.size()));
395-
auto p99idx = static_cast<int>(std::ceil(0.99 * kernelTimes.size()));
396-
std::sort(kernelTimes.begin(), kernelTimes.end());
397-
398-
#define GET_US(X) ((X)).toMicroSeconds()
399-
400-
{
401-
std::ofstream out(resultsFilename);
402-
out << "tc version: " << tc::git_version << "\n"
403-
<< bestOptions << "\n"
404-
<< "Min: " << GET_US(kernelTimes.front()) << "us, "
405-
<< "p50: "
406-
<< GET_US(kernelTimes.at(
407-
std::min(p50idx, (int)kernelTimes.size() - 1)))
408-
<< "us, "
409-
<< "p90: "
410-
<< GET_US(kernelTimes.at(
411-
std::min(p90idx, (int)kernelTimes.size() - 1)))
412-
<< "us, "
413-
<< "p99: "
414-
<< GET_US(kernelTimes.at(
415-
std::min(p99idx, (int)kernelTimes.size() - 1)))
416-
<< "us, "
417-
<< "Max: " << GET_US(kernelTimes.back()) << "us\n";
418-
}
419-
420-
std::cout
421-
<< "\n---------------------------------------------------------";
422-
std::cout
423-
<< "\n------------------ AUTOTUNED KERNEL STATS ---------------";
424-
std::cout << "\n------------------ " << tc::FLAGS_benchmark_iterations
425-
<< " ITERATIONS ----------------";
426-
std::cout
427-
<< "\n---------------------------------------------------------";
428-
std::cout << "\n";
429-
std::cout << "Min: " << GET_US(kernelTimes.front()) << "us, "
430-
<< "p50: "
431-
<< GET_US(kernelTimes.at(
432-
std::min(p50idx, (int)kernelTimes.size() - 1)))
433-
<< "us, "
434-
<< "p90: "
435-
<< GET_US(kernelTimes.at(
436-
std::min(p90idx, (int)kernelTimes.size() - 1)))
437-
<< "us, "
438-
<< "p99: "
439-
<< GET_US(kernelTimes.at(
440-
std::min(p99idx, (int)kernelTimes.size() - 1)))
441-
<< "us, "
442-
<< "Max: " << GET_US(kernelTimes.back()) << "us";
443-
std::cout
444-
<< "\n---------------------------------------------------------";
445-
std::cout << "\n\n";
446-
#undef GET_US
447-
448-
return {bestOptions};
449-
}
450-
451-
return {};
452-
}
453348
};

0 commit comments

Comments
 (0)