@@ -314,6 +314,7 @@ void GeneticTunerHarness::doCompile(
314314 if (current >= population.size ()) {
315315 break ;
316316 }
317+
317318 auto & pConf = population.at (current);
318319 auto options = makeOptions (*pConf);
319320 try {
@@ -432,7 +433,7 @@ void GeneticTunerHarness::doGpuWork(
432433 LOG_LINE_BY_LINE (INFO, ssInfo);
433434 }
434435
435- auto runtimes =
436+ std::vector<Duration> runtimes =
436437 retrieveCachedRuntimes (engine, kKernelName_ , inputs, outputs, options);
437438 if (runtimes.empty ()) {
438439 try {
@@ -526,59 +527,77 @@ void GeneticTunerHarness::runOneGeneration(size_t generation) {
526527
527528 auto setUpJobsAndRun = [&](GeneticSearch::Population& population,
528529 const std::string& printerText) {
529- // Initialize for this round
530- currentCompilationJob_.store (0 );
531- numEvaluations_.store (0 );
532- readyToEvaluate_.resize (0 );
533- for (size_t i = 0 ; i < population.size (); ++i) {
534- readyToEvaluate_.emplace_back ();
535- readyToEvaluate_[i].store (false );
536- }
537- Printer printer (
538- printerText,
539- readyToEvaluate_.size (),
540- currentCompilationJob_,
541- numEvaluations_);
542- auto logGenerations = FLAGS_tuner_gen_log_generations;
543- ScopeGuard sgPrinter ([logGenerations, &printer]() {
544- printer.stop ();
545- if (logGenerations) {
546- printer.printAll ();
547- }
548- });
549-
550- // Just spawn and join new threads for each generation
551- std::vector<std::thread> cpuCompilationThreads;
552- cpuCompilationThreads.reserve (FLAGS_tuner_threads);
553- ScopeGuard sgCompilationThreads ([&cpuCompilationThreads]() {
554- for (auto & cpuCompilationThread : cpuCompilationThreads) {
555- cpuCompilationThread.join ();
530+ // Most candidates should have been evaluated during the previous
531+ // generation's selection phase.
532+ // There are two exceptions:
533+ // 1) the 1st generation
534+ // 2) too many invalid configurations were previously encountered and the
535+ // valid ones were not enough to form a new generation.
536+ auto firstNew = std::partition (
537+ population.begin (),
538+ population.end (),
539+ [](const std::unique_ptr<CandidateConfiguration>& c) {
540+ return c->runtime != Duration::zero ();
541+ });
542+ GeneticSearch::Population newCandidates (
543+ std::distance (firstNew, population.end ()));
544+ std::move (firstNew, population.end (), newCandidates.begin ());
545+ {
546+ // Initialize for this round
547+ currentCompilationJob_.store (0 );
548+ numEvaluations_.store (0 );
549+ readyToEvaluate_.resize (0 );
550+ for (size_t i = 0 ; i < newCandidates.size (); ++i) {
551+ readyToEvaluate_.emplace_back ();
552+ readyToEvaluate_[i].store (false );
556553 }
557- });
558- for (int i = 0 ; i < FLAGS_tuner_threads; ++i) {
559- cpuCompilationThreads.emplace_back ([this , &engine, &population]() {
560- this ->doCompile (engine, population);
554+ Printer printer (
555+ printerText,
556+ readyToEvaluate_.size (),
557+ currentCompilationJob_,
558+ numEvaluations_);
559+ auto logGenerations = FLAGS_tuner_gen_log_generations;
560+ ScopeGuard sgPrinter ([logGenerations, &printer]() {
561+ printer.stop ();
562+ if (logGenerations) {
563+ printer.printAll ();
564+ }
561565 });
562- }
563566
564- // Just spawn and join new threads for each generation
565- std::vector<std::thread> gpuWorkerThreads;
566- gpuWorkerThreads.reserve (gpus.size ());
567- ScopeGuard sgGpuWorkerThreads ([&gpuWorkerThreads]() {
568- for (auto & gpuWorkerThread : gpuWorkerThreads) {
569- gpuWorkerThread.join ();
567+ // Just spawn and join new threads for each generation
568+ std::vector<std::thread> cpuCompilationThreads;
569+ cpuCompilationThreads.reserve (FLAGS_tuner_threads);
570+ ScopeGuard sgCompilationThreads ([&cpuCompilationThreads]() {
571+ for (auto & cpuCompilationThread : cpuCompilationThreads) {
572+ cpuCompilationThread.join ();
573+ }
574+ });
575+ for (int i = 0 ; i < FLAGS_tuner_threads; ++i) {
576+ cpuCompilationThreads.emplace_back ([this , &engine, &newCandidates]() {
577+ this ->doCompile (engine, newCandidates);
578+ });
579+ }
580+
581+ // Just spawn and join new threads for each generation
582+ std::vector<std::thread> gpuWorkerThreads;
583+ gpuWorkerThreads.reserve (gpus.size ());
584+ ScopeGuard sgGpuWorkerThreads ([&gpuWorkerThreads]() {
585+ for (auto & gpuWorkerThread : gpuWorkerThreads) {
586+ gpuWorkerThread.join ();
587+ }
588+ });
589+ for (auto gpu : gpus) {
590+ gpuWorkerThreads.emplace_back (
591+ [this , gpu, &engine, &newCandidates, &printer]() {
592+ this ->doGpuWork (gpu, engine, newCandidates, printer);
593+ });
570594 }
571- });
572- for (auto gpu : gpus) {
573- gpuWorkerThreads.emplace_back (
574- [this , gpu, &engine, &population, &printer]() {
575- this ->doGpuWork (gpu, engine, population, printer);
576- });
577595 }
578596 // At this point everything is synchronized: the scope guards have gone out of scope and joined all worker threads
597+ std::move (newCandidates.begin (), newCandidates.end (), firstNew);
579598 };
580- std::cout << " Generation " << generation << std::endl;
581- setUpJobsAndRun (tuner_->population , " Population " );
599+ std::cout << " Generation " << generation << ' : ' << std::endl;
600+ setUpJobsAndRun (tuner_->population , " New Candidates " );
582601 tuner_->generateSelectionPool ();
583602 setUpJobsAndRun (tuner_->selectionPool , " Selection Pool" );
584603 tuner_->selectSurvivors ();
0 commit comments