@@ -346,6 +346,39 @@ void GeneticTunerHarness::doCompile(
   }
 }
 
+namespace {
+std::vector<const DLTensor*> toConstDlpackTensors(
+    const std::vector<DLTensor*>& v) {
+  std::vector<const DLTensor*> out(v.begin(), v.end());
+  return out;
+}
+} // namespace
+
+template <typename ExecutorType>
+std::vector<Duration> retrieveCachedRuntimes(
+    ExecutorType& engine,
+    const std::string& id,
+    const std::vector<const DLTensor*>& inputs,
+    const std::vector<DLTensor*>& outputs,
+    const CudaMappingOptions& options) {
+  if (not OptionsCache::cacheEnabled()) {
+    return {};
+  }
+  auto cache = OptionsCache::getCache();
+  auto allResults = cache->retrieveOptionsAndRuntimes(
+      id, inputs, toConstDlpackTensors(outputs));
+  auto wantedResult = std::find_if(
+      allResults.begin(),
+      allResults.end(),
+      [&options](const OptionsCache::RetrievalResult& r) {
+        return r.options == options;
+      });
+  if (wantedResult == allResults.end()) {
+    return {};
+  }
+  return wantedResult->recordedRuntimes;
+}
+
 template <typename ExecutorType, typename Population>
 void GeneticTunerHarness::doGpuWork(
     size_t gpu,
@@ -399,51 +432,56 @@ void GeneticTunerHarness::doGpuWork(
       LOG_LINE_BY_LINE(INFO, ssInfo);
     }
 
-    std::vector<Duration> runtimes;
-    try {
-      size_t bestTimeSoFar;
-      {
-        std::lock_guard<std::mutex> lock(bestTimeMtx_);
-        bestTimeSoFar = bestTime_;
-      }
-      auto prune =
-          warmupOrPrune(engine, outputs, inputs, handle, bestTimeSoFar);
-      if (prune) {
+    auto runtimes =
+        retrieveCachedRuntimes(engine, kKernelName_, inputs, outputs, options);
+    if (runtimes.empty()) {
+      try {
+        size_t bestTimeSoFar;
+        {
+          std::lock_guard<std::mutex> lock(bestTimeMtx_);
+          bestTimeSoFar = bestTime_;
+        }
+        auto prune =
+            warmupOrPrune(engine, outputs, inputs, handle, bestTimeSoFar);
+        if (prune) {
+          pConf->invalid = true;
+          continue;
+        } else {
+          runtimes.reserve(kReducedBenchmarkIterations);
+          for (size_t i = 0; i < kReducedBenchmarkIterations; ++i) {
+            runtimes.push_back(engine.run(handle, inputs, outputs, true));
+          }
+          engine.clear(handle);
+        }
+      } catch (std::exception& e) {
+        if (FLAGS_debug_tuner) {
+          LOG(WARNING) << "Runtime error gpu " << gpu << ": " << e.what();
+          std::stringstream ssWarning;
+          CudaMappingOptionsCppPrinter warningPrinter(ssWarning);
+          warningPrinter << options;
+          LOG(WARNING) << "Aborted execution on gpu " << gpu;
+          LOG_LINE_BY_LINE(WARNING, ssWarning);
+        }
+        while (cudaGetLastError() != cudaSuccess) {
+          // In case of errors in the generated, we cannot rely on deviceReset
+          // to set the GPU in a clean state. So instead we just pop and discard
+          // all the errors accumulated on the GPU until we get to a clean slate
+          // (i.e. cudaSuccess).
+          ;
+        }
+        try {
+          // Some errors, such as illegal memory access, cannot be recovered
+          // from without a cudaDeviceReset (i.e. because user protection) In
+          // those cases we have no choice than to fail hard.
+          TC_CUDA_RUNTIMEAPI_ENFORCE(cudaDeviceSynchronize());
+        } catch (const std::exception& e) {
+          LOG(FATAL) << "[CUDA][FATAL] cuda error on gpu " << gpu << ": "
+                     << e.what() << "\n"
+                     << CudaMappingOptionsAsCpp(options);
+        }
         pConf->invalid = true;
         continue;
-      } else {
-        runtimes.reserve(kReducedBenchmarkIterations);
-        for (size_t i = 0; i < kReducedBenchmarkIterations; ++i) {
-          runtimes.push_back(engine.run(handle, inputs, outputs, true));
-        }
-        engine.clear(handle);
       }
-    } catch (std::exception& e) {
-      LOG(WARNING) << "Runtime error gpu " << gpu << ": " << e.what();
-      std::stringstream ssWarning;
-      CudaMappingOptionsCppPrinter warningPrinter(ssWarning);
-      warningPrinter << options;
-      LOG(WARNING) << "Aborted execution on gpu " << gpu;
-      LOG_LINE_BY_LINE(WARNING, ssWarning);
-      while (cudaGetLastError() != cudaSuccess) {
-        // In case of errors in the generated, we cannot rely on deviceReset to
-        // set the GPU in a clean state. So instead we just pop and discard all
-        // the errors accumulated on the GPU until we get to a clean slate
-        // (i.e. cudaSuccess).
-        ;
-      }
-      try {
-        // Some errors, such as illegal memory access, cannot be recovered from
-        // without a cudaDeviceReset (i.e. because user protection)
-        // In those cases we have no choice than to fail hard.
-        TC_CUDA_RUNTIMEAPI_ENFORCE(cudaDeviceSynchronize());
-      } catch (const std::exception& e) {
-        LOG(FATAL) << "[CUDA][FATAL] cuda error on gpu " << gpu << ": "
-                   << e.what() << "\n"
-                   << CudaMappingOptionsAsCpp(options);
-      }
-      pConf->invalid = true;
-      continue;
     }
 
     auto prof = median(runtimes);
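
Read as a whole, the change makes each tuning candidate consult the options cache first: retrieveCachedRuntimes looks up recorded runtimes for this kernel id and this exact CudaMappingOptions value, and the reduced benchmark loop (plus the CUDA error recovery in the catch block) only runs when that lookup comes back empty. Below is a minimal, self-contained C++ sketch of that cache-then-benchmark pattern, not code from this repository: ToyOptionsCache, runtimesFor, and the serialized optionsKey are hypothetical stand-ins, and the record() step is an assumption, since the diff above only shows retrieval.

// Standalone sketch, not part of the harness above. A toy cache maps
// (kernel id, serialized options) to previously recorded runtimes, and
// runtimesFor() benchmarks only on a cache miss.
#include <chrono>
#include <cstddef>
#include <map>
#include <string>
#include <utility>
#include <vector>

using Duration = std::chrono::microseconds;

struct ToyOptionsCache {
  // Hypothetical key: kernel id + a serialized form of the mapping options.
  std::map<std::pair<std::string, std::string>, std::vector<Duration>> entries;

  std::vector<Duration> cachedRuntimes(
      const std::string& id,
      const std::string& optionsKey) const {
    auto it = entries.find({id, optionsKey});
    return it == entries.end() ? std::vector<Duration>{} : it->second;
  }

  void record(
      const std::string& id,
      const std::string& optionsKey,
      std::vector<Duration> runtimes) {
    entries[{id, optionsKey}] = std::move(runtimes);
  }
};

// Mirrors the control flow introduced by the diff: return cached runtimes
// when present, otherwise run the benchmark a fixed number of iterations.
// The record() call is an assumption about where results would be stored.
template <typename BenchmarkFn>
std::vector<Duration> runtimesFor(
    ToyOptionsCache& cache,
    const std::string& id,
    const std::string& optionsKey,
    std::size_t iterations,
    BenchmarkFn&& benchmarkOnce) {
  auto runtimes = cache.cachedRuntimes(id, optionsKey);
  if (runtimes.empty()) {
    runtimes.reserve(iterations);
    for (std::size_t i = 0; i < iterations; ++i) {
      runtimes.push_back(benchmarkOnce());
    }
    cache.record(id, optionsKey, runtimes);
  }
  return runtimes;
}

int main() {
  ToyOptionsCache cache;
  auto bench = [] { return Duration(120); }; // stand-in for one timed run
  // The first call benchmarks; the second hits the cache for the same key.
  auto first = runtimesFor(cache, "matmul", "tile=32,unroll=4", 3, bench);
  auto again = runtimesFor(cache, "matmul", "tile=32,unroll=4", 3, bench);
  return (first.size() == 3 && again.size() == 3) ? 0 : 1;
}

Keying the lookup on the exact options value mirrors the std::find_if comparison in retrieveCachedRuntimes above: a candidate only counts as cached if its options compare equal, not merely similar.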