@@ -345,6 +345,39 @@ void GeneticTunerHarness::doCompile(
345345 }
346346}
347347
348+ namespace {
349+ std::vector<const DLTensor*> toConstDlpackTensors (
350+ const std::vector<DLTensor*>& v) {
351+ std::vector<const DLTensor*> out (v.begin (), v.end ());
352+ return out;
353+ }
354+ } // namespace
355+
356+ template <typename ExecutorType>
357+ std::vector<Duration> retrieveCachedRuntimes (
358+ ExecutorType& engine,
359+ const std::string& id,
360+ const std::vector<const DLTensor*>& inputs,
361+ const std::vector<DLTensor*>& outputs,
362+ const MappingOptions& options) {
363+ if (not OptionsCache::cacheEnabled ()) {
364+ return {};
365+ }
366+ auto cache = OptionsCache::getCache ();
367+ auto allResults = cache->retrieveOptionsAndRuntimes (
368+ id, inputs, toConstDlpackTensors (outputs));
369+ auto wantedResult = std::find_if (
370+ allResults.begin (),
371+ allResults.end (),
372+ [&options](const OptionsCache::RetrievalResult& r) {
373+ return r.options == options;
374+ });
375+ if (wantedResult == allResults.end ()) {
376+ return {};
377+ }
378+ return wantedResult->recordedRuntimes ;
379+ }
380+
348381template <typename ExecutorType, typename Population>
349382void GeneticTunerHarness::doGpuWork (
350383 size_t gpu,
@@ -398,51 +431,56 @@ void GeneticTunerHarness::doGpuWork(
398431 LOG_LINE_BY_LINE (INFO, ssInfo);
399432 }
400433
401- std::vector<Duration> runtimes;
402- try {
403- size_t bestTimeSoFar;
404- {
405- std::lock_guard<std::mutex> lock (bestTimeMtx_);
406- bestTimeSoFar = bestTime_;
407- }
408- auto prune =
409- warmupOrPrune (engine, outputs, inputs, handle, bestTimeSoFar);
410- if (prune) {
434+ auto runtimes =
435+ retrieveCachedRuntimes (engine, kKernelName_ , inputs, outputs, options);
436+ if (runtimes.empty ()) {
437+ try {
438+ size_t bestTimeSoFar;
439+ {
440+ std::lock_guard<std::mutex> lock (bestTimeMtx_);
441+ bestTimeSoFar = bestTime_;
442+ }
443+ auto prune =
444+ warmupOrPrune (engine, outputs, inputs, handle, bestTimeSoFar);
445+ if (prune) {
446+ pConf->invalid = true ;
447+ continue ;
448+ } else {
449+ runtimes.reserve (kReducedBenchmarkIterations );
450+ for (size_t i = 0 ; i < kReducedBenchmarkIterations ; ++i) {
451+ runtimes.push_back (engine.run (handle, inputs, outputs, true ));
452+ }
453+ engine.clear (handle);
454+ }
455+ } catch (std::exception& e) {
456+ if (FLAGS_debug_tuner) {
457+ LOG (WARNING) << " Runtime error gpu " << gpu << " : " << e.what ();
458+ std::stringstream ssWarning;
459+ MappingOptionsCppPrinter warningPrinter (ssWarning);
460+ warningPrinter << options;
461+ LOG (WARNING) << " Aborted execution on gpu " << gpu;
462+ LOG_LINE_BY_LINE (WARNING, ssWarning);
463+ }
464+ while (cudaGetLastError () != cudaSuccess) {
465+ // In case of errors in the generated, we cannot rely on deviceReset
466+ // to set the GPU in a clean state. So instead we just pop and discard
467+ // all the errors accumulated on the GPU until we get to a clean slate
468+ // (i.e. cudaSuccess).
469+ ;
470+ }
471+ try {
472+ // Some errors, such as illegal memory access, cannot be recovered
473+ // from without a cudaDeviceReset (i.e. because user protection) In
474+ // those cases we have no choice than to fail hard.
475+ TC_CUDA_RUNTIMEAPI_ENFORCE (cudaDeviceSynchronize ());
476+ } catch (const std::exception& e) {
477+ LOG (FATAL) << " [CUDA][FATAL] cuda error on gpu " << gpu << " : "
478+ << e.what () << " \n "
479+ << MappingOptionsAsCpp (options);
480+ }
411481 pConf->invalid = true ;
412482 continue ;
413- } else {
414- runtimes.reserve (kReducedBenchmarkIterations );
415- for (size_t i = 0 ; i < kReducedBenchmarkIterations ; ++i) {
416- runtimes.push_back (engine.run (handle, inputs, outputs, true ));
417- }
418- engine.clear (handle);
419483 }
420- } catch (std::exception& e) {
421- LOG (WARNING) << " Runtime error gpu " << gpu << " : " << e.what ();
422- std::stringstream ssWarning;
423- MappingOptionsCppPrinter warningPrinter (ssWarning);
424- warningPrinter << options;
425- LOG (WARNING) << " Aborted execution on gpu " << gpu;
426- LOG_LINE_BY_LINE (WARNING, ssWarning);
427- while (cudaGetLastError () != cudaSuccess) {
428- // In case of errors in the generated, we cannot rely on deviceReset to
429- // set the GPU in a clean state. So instead we just pop and discard all
430- // the errors accumulated on the GPU until we get to a clean slate
431- // (i.e. cudaSuccess).
432- ;
433- }
434- try {
435- // Some errors, such as illegal memory access, cannot be recovered from
436- // without a cudaDeviceReset (i.e. because user protection)
437- // In those cases we have no choice than to fail hard.
438- TC_CUDA_RUNTIMEAPI_ENFORCE (cudaDeviceSynchronize ());
439- } catch (const std::exception& e) {
440- LOG (FATAL) << " [CUDA][FATAL] cuda error on gpu " << gpu << " : "
441- << e.what () << " \n "
442- << MappingOptionsAsCpp (options);
443- }
444- pConf->invalid = true ;
445- continue ;
446484 }
447485
448486 auto prof = median (runtimes);
0 commit comments