@@ -313,6 +313,7 @@ void GeneticTunerHarness::doCompile(
313313 if (current >= population.size ()) {
314314 break ;
315315 }
316+
316317 auto & pConf = population.at (current);
317318 auto options = makeOptions (*pConf);
318319 try {
@@ -433,7 +434,7 @@ void GeneticTunerHarness::doGpuWork(
433434 LOG_LINE_BY_LINE (INFO, ssInfo);
434435 }
435436
436- auto runtimes =
437+ std::vector<Duration> runtimes =
437438 retrieveCachedRuntimes (engine, kKernelName_ , inputs, outputs, options);
438439 if (runtimes.empty ()) {
439440 try {
@@ -527,59 +528,77 @@ void GeneticTunerHarness::runOneGeneration(size_t generation) {
527528
528529 auto setUpJobsAndRun = [&](GeneticSearch::Population& population,
529530 const std::string& printerText) {
530- // Initialize for this round
531- currentCompilationJob_.store (0 );
532- numEvaluations_.store (0 );
533- readyToEvaluate_.resize (0 );
534- for (size_t i = 0 ; i < population.size (); ++i) {
535- readyToEvaluate_.emplace_back ();
536- readyToEvaluate_[i].store (false );
537- }
538- Printer printer (
539- printerText,
540- readyToEvaluate_.size (),
541- currentCompilationJob_,
542- numEvaluations_);
543- auto logGenerations = FLAGS_tuner_gen_log_generations;
544- ScopeGuard sgPrinter ([logGenerations, &printer]() {
545- printer.stop ();
546- if (logGenerations) {
547- printer.printAll ();
548- }
549- });
550-
551- // Just spawn and join new threads for each generation
552- std::vector<std::thread> cpuCompilationThreads;
553- cpuCompilationThreads.reserve (FLAGS_tuner_threads);
554- ScopeGuard sgCompilationThreads ([&cpuCompilationThreads]() {
555- for (auto & cpuCompilationThread : cpuCompilationThreads) {
556- cpuCompilationThread.join ();
531+ // Most candidates should have been evaluated during the previous
532+ // generation's selection phase.
533+ // There are two exceptions:
534+ // 1) the 1st generation
535+ // 2) too many invalid configurations were previously encountered and the
536+ // valid ones were not enough to form a new generation.
537+ auto firstNew = std::partition (
538+ population.begin (),
539+ population.end (),
540+ [](const std::unique_ptr<CandidateConfiguration>& c) {
541+ return c->runtime != Duration::zero ();
542+ });
543+ GeneticSearch::Population newCandidates (
544+ std::distance (firstNew, population.end ()));
545+ std::move (firstNew, population.end (), newCandidates.begin ());
546+ {
547+ // Initialize for this round
548+ currentCompilationJob_.store (0 );
549+ numEvaluations_.store (0 );
550+ readyToEvaluate_.resize (0 );
551+ for (size_t i = 0 ; i < newCandidates.size (); ++i) {
552+ readyToEvaluate_.emplace_back ();
553+ readyToEvaluate_[i].store (false );
557554 }
558- });
559- for (int i = 0 ; i < FLAGS_tuner_threads; ++i) {
560- cpuCompilationThreads.emplace_back ([this , &engine, &population]() {
561- this ->doCompile (engine, population);
555+ Printer printer (
556+ printerText,
557+ readyToEvaluate_.size (),
558+ currentCompilationJob_,
559+ numEvaluations_);
560+ auto logGenerations = FLAGS_tuner_gen_log_generations;
561+ ScopeGuard sgPrinter ([logGenerations, &printer]() {
562+ printer.stop ();
563+ if (logGenerations) {
564+ printer.printAll ();
565+ }
562566 });
563- }
564567
565- // Just spawn and join new threads for each generation
566- std::vector<std::thread> gpuWorkerThreads;
567- gpuWorkerThreads.reserve (gpus.size ());
568- ScopeGuard sgGpuWorkerThreads ([&gpuWorkerThreads]() {
569- for (auto & gpuWorkerThread : gpuWorkerThreads) {
570- gpuWorkerThread.join ();
568+ // Just spawn and join new threads for each generation
569+ std::vector<std::thread> cpuCompilationThreads;
570+ cpuCompilationThreads.reserve (FLAGS_tuner_threads);
571+ ScopeGuard sgCompilationThreads ([&cpuCompilationThreads]() {
572+ for (auto & cpuCompilationThread : cpuCompilationThreads) {
573+ cpuCompilationThread.join ();
574+ }
575+ });
576+ for (int i = 0 ; i < FLAGS_tuner_threads; ++i) {
577+ cpuCompilationThreads.emplace_back ([this , &engine, &newCandidates]() {
578+ this ->doCompile (engine, newCandidates);
579+ });
580+ }
581+
582+ // Just spawn and join new threads for each generation
583+ std::vector<std::thread> gpuWorkerThreads;
584+ gpuWorkerThreads.reserve (gpus.size ());
585+ ScopeGuard sgGpuWorkerThreads ([&gpuWorkerThreads]() {
586+ for (auto & gpuWorkerThread : gpuWorkerThreads) {
587+ gpuWorkerThread.join ();
588+ }
589+ });
590+ for (auto gpu : gpus) {
591+ gpuWorkerThreads.emplace_back (
592+ [this , gpu, &engine, &newCandidates, &printer]() {
593+ this ->doGpuWork (gpu, engine, newCandidates, printer);
594+ });
571595 }
572- });
573- for (auto gpu : gpus) {
574- gpuWorkerThreads.emplace_back (
575- [this , gpu, &engine, &population, &printer]() {
576- this ->doGpuWork (gpu, engine, population, printer);
577- });
578596 }
579597 // At this point everything is synchronized because out of scope, done
598+ std::move (newCandidates.begin (), newCandidates.end (), firstNew);
580599 };
581- std::cout << " Generation " << generation << std::endl;
582- setUpJobsAndRun (tuner_->population , " Population " );
600+ std::cout << " Generation " << generation << ' : ' << std::endl;
601+ setUpJobsAndRun (tuner_->population , " New Candidates " );
583602 tuner_->generateSelectionPool ();
584603 setUpJobsAndRun (tuner_->selectionPool , " Selection Pool" );
585604 tuner_->selectSurvivors ();
0 commit comments