@@ -347,6 +347,39 @@ void GeneticTunerHarness::doCompile(
347347 }
348348}
349349
350+ namespace {
351+ std::vector<const DLTensor*> toConstDlpackTensors (
352+ const std::vector<DLTensor*>& v) {
353+ std::vector<const DLTensor*> out (v.begin (), v.end ());
354+ return out;
355+ }
356+ } // namespace
357+
358+ template <typename ExecutorType>
359+ std::vector<Duration> retrieveCachedRuntimes (
360+ ExecutorType& engine,
361+ const std::string& id,
362+ const std::vector<const DLTensor*>& inputs,
363+ const std::vector<DLTensor*>& outputs,
364+ const MappingOptions& options) {
365+ if (not OptionsCache::cacheEnabled ()) {
366+ return {};
367+ }
368+ auto cache = OptionsCache::getCache ();
369+ auto allResults = cache->retrieveOptionsAndRuntimes (
370+ id, inputs, toConstDlpackTensors (outputs));
371+ auto wantedResult = std::find_if (
372+ allResults.begin (),
373+ allResults.end (),
374+ [&options](const OptionsCache::RetrievalResult& r) {
375+ return r.options == options;
376+ });
377+ if (wantedResult == allResults.end ()) {
378+ return {};
379+ }
380+ return wantedResult->recordedRuntimes ;
381+ }
382+
350383template <typename ExecutorType, typename Population>
351384void GeneticTunerHarness::doGpuWork (
352385 size_t gpu,
@@ -400,53 +433,56 @@ void GeneticTunerHarness::doGpuWork(
400433 LOG_LINE_BY_LINE (INFO, ssInfo);
401434 }
402435
403- std::vector<Duration> runtimes;
404- try {
405- size_t bestTimeSoFar;
406- {
407- std::lock_guard<std::mutex> lock (bestTimeMtx_);
408- bestTimeSoFar = bestTime_;
409- }
410- auto prune =
411- warmupOrPrune (engine, outputs, inputs, handle, bestTimeSoFar);
412- if (prune) {
436+ auto runtimes =
437+ retrieveCachedRuntimes (engine, kKernelName_ , inputs, outputs, options);
438+ if (runtimes.empty ()) {
439+ try {
440+ size_t bestTimeSoFar;
441+ {
442+ std::lock_guard<std::mutex> lock (bestTimeMtx_);
443+ bestTimeSoFar = bestTime_;
444+ }
445+ auto prune =
446+ warmupOrPrune (engine, outputs, inputs, handle, bestTimeSoFar);
447+ if (prune) {
448+ pConf->invalid = true ;
449+ continue ;
450+ } else {
451+ runtimes.reserve (kReducedBenchmarkIterations );
452+ for (size_t i = 0 ; i < kReducedBenchmarkIterations ; ++i) {
453+ runtimes.push_back (engine.run (handle, inputs, outputs, true ));
454+ }
455+ engine.clear (handle);
456+ }
457+ } catch (std::exception& e) {
458+ if (FLAGS_debug_tuner) {
459+ LOG (WARNING) << " Runtime error gpu " << gpu << " : " << e.what ();
460+ std::stringstream ssWarning;
461+ MappingOptionsCppPrinter warningPrinter (ssWarning);
462+ warningPrinter << options;
463+ LOG (WARNING) << " Aborted execution on gpu " << gpu;
464+ LOG_LINE_BY_LINE (WARNING, ssWarning);
465+ }
466+ while (cudaGetLastError () != cudaSuccess) {
467+ // In case of errors in the generated, we cannot rely on deviceReset
468+ // to set the GPU in a clean state. So instead we just pop and discard
469+ // all the errors accumulated on the GPU until we get to a clean slate
470+ // (i.e. cudaSuccess).
471+ ;
472+ }
473+ try {
474+ // Some errors, such as illegal memory access, cannot be recovered
475+ // from without a cudaDeviceReset (i.e. because user protection) In
476+ // those cases we have no choice than to fail hard.
477+ TC_CUDA_RUNTIMEAPI_ENFORCE (cudaDeviceSynchronize ());
478+ } catch (const std::exception& e) {
479+ LOG (FATAL) << " [CUDA][FATAL] cuda error on gpu " << gpu << " : "
480+ << e.what () << " \n "
481+ << MappingOptionsAsCpp (options);
482+ }
413483 pConf->invalid = true ;
414484 continue ;
415- } else {
416- runtimes.reserve (kReducedBenchmarkIterations );
417- for (size_t i = 0 ; i < kReducedBenchmarkIterations ; ++i) {
418- runtimes.push_back (engine.run (handle, inputs, outputs, true ));
419- }
420- engine.clear (handle);
421- }
422- } catch (std::exception& e) {
423- if (FLAGS_debug_tuner) {
424- LOG (WARNING) << " Runtime error gpu " << gpu << " : " << e.what ();
425- std::stringstream ssWarning;
426- MappingOptionsCppPrinter warningPrinter (ssWarning);
427- warningPrinter << options;
428- LOG (WARNING) << " Aborted execution on gpu " << gpu;
429- LOG_LINE_BY_LINE (WARNING, ssWarning);
430- }
431- while (cudaGetLastError () != cudaSuccess) {
432- // In case of errors in the generated, we cannot rely on deviceReset to
433- // set the GPU in a clean state. So instead we just pop and discard all
434- // the errors accumulated on the GPU until we get to a clean slate
435- // (i.e. cudaSuccess).
436- ;
437485 }
438- try {
439- // Some errors, such as illegal memory access, cannot be recovered from
440- // without a cudaDeviceReset (i.e. because user protection)
441- // In those cases we have no choice than to fail hard.
442- TC_CUDA_RUNTIMEAPI_ENFORCE (cudaDeviceSynchronize ());
443- } catch (const std::exception& e) {
444- LOG (FATAL) << " [CUDA][FATAL] cuda error on gpu " << gpu << " : "
445- << e.what () << " \n "
446- << MappingOptionsAsCpp (options);
447- }
448- pConf->invalid = true ;
449- continue ;
450486 }
451487
452488 auto prof = median (runtimes);
0 commit comments