@@ -313,6 +313,7 @@ void GeneticTunerHarness::doCompile(
313313 if (current >= population.size ()) {
314314 break ;
315315 }
316+
316317 auto & pConf = population.at (current);
317318 auto options = makeOptions (*pConf);
318319 try {
@@ -431,7 +432,7 @@ void GeneticTunerHarness::doGpuWork(
431432 LOG_LINE_BY_LINE (INFO, ssInfo);
432433 }
433434
434- auto runtimes =
435+ std::vector<Duration> runtimes =
435436 retrieveCachedRuntimes (engine, kKernelName_ , inputs, outputs, options);
436437 if (runtimes.empty ()) {
437438 try {
@@ -525,59 +526,77 @@ void GeneticTunerHarness::runOneGeneration(size_t generation) {
525526
526527 auto setUpJobsAndRun = [&](GeneticSearch::Population& population,
527528 const std::string& printerText) {
528- // Initialize for this round
529- currentCompilationJob_.store (0 );
530- numEvaluations_.store (0 );
531- readyToEvaluate_.resize (0 );
532- for (size_t i = 0 ; i < population.size (); ++i) {
533- readyToEvaluate_.emplace_back ();
534- readyToEvaluate_[i].store (false );
535- }
536- Printer printer (
537- printerText,
538- readyToEvaluate_.size (),
539- currentCompilationJob_,
540- numEvaluations_);
541- auto logGenerations = FLAGS_tuner_gen_log_generations;
542- ScopeGuard sgPrinter ([logGenerations, &printer]() {
543- printer.stop ();
544- if (logGenerations) {
545- printer.printAll ();
546- }
547- });
548-
549- // Just spawn and join new threads for each generation
550- std::vector<std::thread> cpuCompilationThreads;
551- cpuCompilationThreads.reserve (FLAGS_tuner_threads);
552- ScopeGuard sgCompilationThreads ([&cpuCompilationThreads]() {
553- for (auto & cpuCompilationThread : cpuCompilationThreads) {
554- cpuCompilationThread.join ();
529+ // Most candidates should have been evaluated during the previous
530+ // generation's selection phase.
531+ // There are two exceptions:
532+ // 1) the 1st generation
533+ // 2) too many invalid configurations were previously encountered and the
534+ // valid ones were not enough to form a new generation.
535+ auto firstNew = std::partition (
536+ population.begin (),
537+ population.end (),
538+ [](const std::unique_ptr<CandidateConfiguration>& c) {
539+ return c->runtime != Duration::zero ();
540+ });
541+ GeneticSearch::Population newCandidates (
542+ std::distance (firstNew, population.end ()));
543+ std::move (firstNew, population.end (), newCandidates.begin ());
544+ {
545+ // Initialize for this round
546+ currentCompilationJob_.store (0 );
547+ numEvaluations_.store (0 );
548+ readyToEvaluate_.resize (0 );
549+ for (size_t i = 0 ; i < newCandidates.size (); ++i) {
550+ readyToEvaluate_.emplace_back ();
551+ readyToEvaluate_[i].store (false );
555552 }
556- });
557- for (int i = 0 ; i < FLAGS_tuner_threads; ++i) {
558- cpuCompilationThreads.emplace_back ([this , &engine, &population]() {
559- this ->doCompile (engine, population);
553+ Printer printer (
554+ printerText,
555+ readyToEvaluate_.size (),
556+ currentCompilationJob_,
557+ numEvaluations_);
558+ auto logGenerations = FLAGS_tuner_gen_log_generations;
559+ ScopeGuard sgPrinter ([logGenerations, &printer]() {
560+ printer.stop ();
561+ if (logGenerations) {
562+ printer.printAll ();
563+ }
560564 });
561- }
562565
563- // Just spawn and join new threads for each generation
564- std::vector<std::thread> gpuWorkerThreads;
565- gpuWorkerThreads.reserve (gpus.size ());
566- ScopeGuard sgGpuWorkerThreads ([&gpuWorkerThreads]() {
567- for (auto & gpuWorkerThread : gpuWorkerThreads) {
568- gpuWorkerThread.join ();
566+ // Just spawn and join new threads for each generation
567+ std::vector<std::thread> cpuCompilationThreads;
568+ cpuCompilationThreads.reserve (FLAGS_tuner_threads);
569+ ScopeGuard sgCompilationThreads ([&cpuCompilationThreads]() {
570+ for (auto & cpuCompilationThread : cpuCompilationThreads) {
571+ cpuCompilationThread.join ();
572+ }
573+ });
574+ for (int i = 0 ; i < FLAGS_tuner_threads; ++i) {
575+ cpuCompilationThreads.emplace_back ([this , &engine, &newCandidates]() {
576+ this ->doCompile (engine, newCandidates);
577+ });
578+ }
579+
580+ // Just spawn and join new threads for each generation
581+ std::vector<std::thread> gpuWorkerThreads;
582+ gpuWorkerThreads.reserve (gpus.size ());
583+ ScopeGuard sgGpuWorkerThreads ([&gpuWorkerThreads]() {
584+ for (auto & gpuWorkerThread : gpuWorkerThreads) {
585+ gpuWorkerThread.join ();
586+ }
587+ });
588+ for (auto gpu : gpus) {
589+ gpuWorkerThreads.emplace_back (
590+ [this , gpu, &engine, &newCandidates, &printer]() {
591+ this ->doGpuWork (gpu, engine, newCandidates, printer);
592+ });
569593 }
570- });
571- for (auto gpu : gpus) {
572- gpuWorkerThreads.emplace_back (
573- [this , gpu, &engine, &population, &printer]() {
574- this ->doGpuWork (gpu, engine, population, printer);
575- });
576594 }
577595 // At this point everything is synchronized: the scope guards above have gone out of scope, joining all worker threads and stopping the printer
596+ std::move (newCandidates.begin (), newCandidates.end (), firstNew);
578597 };
579- std::cout << " Generation " << generation << std::endl;
580- setUpJobsAndRun (tuner_->population , " Population " );
598+ std::cout << " Generation " << generation << ' : ' << std::endl;
599+ setUpJobsAndRun (tuner_->population , " New Candidates " );
581600 tuner_->generateSelectionPool ();
582601 setUpJobsAndRun (tuner_->selectionPool , " Selection Pool" );
583602 tuner_->selectSurvivors ();
0 commit comments