From e827e75f997d76077f83cbaf787fdc98ec71d1f0 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Mon, 22 Jan 2024 15:06:30 +0100 Subject: [PATCH 001/106] created remoteRunner class --- .vscode/settings.json | 4 +- kernel_tuner/runners/remote.py | 98 ++++++++++++++++++++++++++++++++++ 2 files changed, 100 insertions(+), 2 deletions(-) create mode 100644 kernel_tuner/runners/remote.py diff --git a/.vscode/settings.json b/.vscode/settings.json index 3a4d473dd..3089f374a 100755 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -10,8 +10,8 @@ "editor.formatOnType": true, "editor.formatOnSave": false, "editor.codeActionsOnSave": { - "source.fixAll": true, - "source.organizeImports": true, + "source.fixAll": "explicit", + "source.organizeImports": "explicit" } }, "black-formatter.args": [ diff --git a/kernel_tuner/runners/remote.py b/kernel_tuner/runners/remote.py new file mode 100644 index 000000000..cb4df9ed5 --- /dev/null +++ b/kernel_tuner/runners/remote.py @@ -0,0 +1,98 @@ +import logging +from datetime import datetime, timezone +from time import perf_counter +import ray + +from kernel_tuner.core import DeviceInterface +from kernel_tuner.runners.runner import Runner +from kernel_tuner.util import ErrorConfig, print_config_output, process_metrics, store_cache + + +class RemotelRunner(Runner): + + def __init__(self, kernel_source, kernel_options, device_options, iterations, observers): + #detect language and create high-level device interface + self.dev = DeviceInterface(kernel_source, iterations=iterations, observers=observers, **device_options) + + self.units = self.dev.units + self.quiet = device_options.quiet + self.kernel_source = kernel_source + self.warmed_up = False + self.simulation_mode = False + self.start_time = perf_counter() + self.last_strategy_start_time = self.start_time + self.last_strategy_time = 0 + self.kernel_options = kernel_options + + ray.init() + # Get cluster resources + cluster_resources = ray.cluster_resources() + self.num_gpus = 
cluster_resources.get("GPU", 0) # Default to 0 if no GPUs are found + + def get_environment(self): + return self.dev.get_environment() + + + def run(self, parameter_space, tuning_options): + + logging.debug('remote runner started for ' + self.kernel_options.kernel_name) + + results = [] + + # iterate over parameter space + for element in parameter_space: + results = [self.remote_run.remote(element, tuning_options) for _ in range(self.num_gpus)] + + return ray.get(results) + + @ray.remote(num_gpus=1) # Requesting 1 GPU for this task + def remote_run(self, element, tuning_options): + #move data to the GPU NOT SURE IF TO PUT HERE, IN SEQUENTIAL IS IN INIT + self.gpu_args = self.dev.ready_argument_list(self.kernel_options.arguments) + + params = dict(zip(tuning_options.tune_params.keys(), element)) + + result = None + warmup_time = 0 + + # check if configuration is in the cache + x_int = ",".join([str(i) for i in element]) + if tuning_options.cache and x_int in tuning_options.cache: + params.update(tuning_options.cache[x_int]) + params['compile_time'] = 0 + params['verification_time'] = 0 + params['benchmark_time'] = 0 + else: + # attempt to warmup the GPU by running the first config in the parameter space and ignoring the result + if not self.warmed_up: + warmup_time = perf_counter() + self.dev.compile_and_benchmark(self.kernel_source, self.gpu_args, params, self.kernel_options, tuning_options) + self.warmed_up = True + warmup_time = 1e3 * (perf_counter() - warmup_time) + + result = self.dev.compile_and_benchmark(self.kernel_source, self.gpu_args, params, self.kernel_options, tuning_options) + + params.update(result) + + if tuning_options.objective in result and isinstance(result[tuning_options.objective], ErrorConfig): + logging.debug('kernel configuration was skipped silently due to compile or runtime failure') + + # only compute metrics on configs that have not errored + if tuning_options.metrics and not isinstance(params.get(tuning_options.objective), 
ErrorConfig): + params = process_metrics(params, tuning_options.metrics) + + # get the framework time by estimating based on other times + total_time = 1000 * (perf_counter() - self.start_time) - warmup_time + params['strategy_time'] = self.last_strategy_time + params['framework_time'] = max(total_time - (params['compile_time'] + params['verification_time'] + params['benchmark_time'] + params['strategy_time']), 0) + params['timestamp'] = str(datetime.now(timezone.utc)) + self.start_time = perf_counter() + + if result: + # print configuration to the console + print_config_output(tuning_options.tune_params, params, self.quiet, tuning_options.metrics, self.units) + + # add configuration to cache + store_cache(x_int, params, tuning_options) + + return params \ No newline at end of file From deab579334c05aa168c50bda226ca9e6562d5a81 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Tue, 23 Jan 2024 23:15:53 +0100 Subject: [PATCH 002/106] added remote actor class --- kernel_tuner/runners/remote_actor.py | 79 ++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) create mode 100644 kernel_tuner/runners/remote_actor.py diff --git a/kernel_tuner/runners/remote_actor.py b/kernel_tuner/runners/remote_actor.py new file mode 100644 index 000000000..e6ced0892 --- /dev/null +++ b/kernel_tuner/runners/remote_actor.py @@ -0,0 +1,79 @@ +import logging +from datetime import datetime, timezone +from time import perf_counter +import ray +from kernel_tuner.util import ErrorConfig, print_config_output, process_metrics, store_cache +from kernel_tuner.core import DeviceInterface + +@ray.remote(num_gpus=1) +class RemoteActor: + def __init__(self, + units, + quiet, + kernel_source, + kernel_options, + device_options, + iterations, + observers): + + self.dev = DeviceInterface(kernel_source, iterations=iterations, observers=observers, **device_options) + self.units = units + self.quiet = quiet + self.kernel_source = kernel_source + self.warmed_up = False + self.simulation_mode = False + 
self.start_time = perf_counter() + self.last_strategy_start_time = self.start_time + self.last_strategy_time = 0 + self.kernel_options = kernel_options + + def execute(self, element, tuning_options): + #move data to the GPU NOT SURE IF TO PUT HERE, IN SEQUENTIAL IS IN INIT + self.gpu_args = self.dev.ready_argument_list(self.kernel_options.arguments) + + params = dict(zip(tuning_options.tune_params.keys(), element)) + + result = None + warmup_time = 0 + + # check if configuration is in the cache + x_int = ",".join([str(i) for i in element]) + if tuning_options.cache and x_int in tuning_options.cache: + params.update(tuning_options.cache[x_int]) + params['compile_time'] = 0 + params['verification_time'] = 0 + params['benchmark_time'] = 0 + else: + # attempt to warmup the GPU by running the first config in the parameter space and ignoring the result + if not self.warmed_up: + warmup_time = perf_counter() + self.dev.compile_and_benchmark(self.kernel_source, self.gpu_args, params, self.kernel_options, tuning_options) + self.warmed_up = True + warmup_time = 1e3 * (perf_counter() - warmup_time) + + result = self.dev.compile_and_benchmark(self.kernel_source, self.gpu_args, params, self.kernel_options, tuning_options) + + params.update(result) + + if tuning_options.objective in result and isinstance(result[tuning_options.objective], ErrorConfig): + logging.debug('kernel configuration was skipped silently due to compile or runtime failure') + + # only compute metrics on configs that have not errored + if tuning_options.metrics and not isinstance(params.get(tuning_options.objective), ErrorConfig): + params = process_metrics(params, tuning_options.metrics) + + # get the framework time by estimating based on other times + total_time = 1000 * (perf_counter() - self.start_time) - warmup_time + params['strategy_time'] = self.last_strategy_time + params['framework_time'] = max(total_time - (params['compile_time'] + params['verification_time'] + params['benchmark_time'] + 
params['strategy_time']), 0) + params['timestamp'] = str(datetime.now(timezone.utc)) + self.start_time = perf_counter() + + if result: + # print configuration to the console + print_config_output(tuning_options.tune_params, params, self.quiet, tuning_options.metrics, self.units) + + # add configuration to cache + store_cache(x_int, params, tuning_options) + + return params \ No newline at end of file From 52287b9d7b6236b74ff8b5cb6fda7d57fa38dd69 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Tue, 23 Jan 2024 23:26:13 +0100 Subject: [PATCH 003/106] update remote runner --- kernel_tuner/runners/remote.py | 88 +++++++++------------------------- 1 file changed, 23 insertions(+), 65 deletions(-) diff --git a/kernel_tuner/runners/remote.py b/kernel_tuner/runners/remote.py index cb4df9ed5..4d81ce877 100644 --- a/kernel_tuner/runners/remote.py +++ b/kernel_tuner/runners/remote.py @@ -5,15 +5,15 @@ from kernel_tuner.core import DeviceInterface from kernel_tuner.runners.runner import Runner +from kernel_tuner.runners.remote_actor import RemoteActor from kernel_tuner.util import ErrorConfig, print_config_output, process_metrics, store_cache -class RemotelRunner(Runner): +class RemoteRunner(Runner): def __init__(self, kernel_source, kernel_options, device_options, iterations, observers): #detect language and create high-level device interface self.dev = DeviceInterface(kernel_source, iterations=iterations, observers=observers, **device_options) - self.units = self.dev.units self.quiet = device_options.quiet self.kernel_source = kernel_source @@ -24,75 +24,33 @@ def __init__(self, kernel_source, kernel_options, device_options, iterations, ob self.last_strategy_time = 0 self.kernel_options = kernel_options - ray.init() + # Initialize Ray + ray.init(ignore_reinit_error=True) + # Get cluster resources cluster_resources = ray.cluster_resources() - self.num_gpus = cluster_resources.get("GPU", 0) # Default to 0 if no GPUs are found - - def get_environment(self): + self.num_gpus = 
int(cluster_resources.get("GPU", 0)) # Default to 0 if no GPUs are found + + # Create RemoteActor instances + self.actors = [RemoteActor.remote(self.dev.units, + device_options.quiet, + kernel_source, + kernel_options, + device_options, + iterations, + observers) for _ in range(self.num_gpus)] + + def get_environment(self, tuning_options): return self.dev.get_environment() def run(self, parameter_space, tuning_options): + future_results = [] - logging.debug('remote runner started for ' + self.kernel_options.kernel_name) - - results = [] - - # iterate over parameter space + # Iterate over parameter space and distribute work to actors for element in parameter_space: - results = [self.remote_run.remote(element, tuning_options) for _ in range(self.num_gpus)] - - return ray.get(results) - - @ray.remote(num_gpus=1) # Requesting 1 GPU for this task - def remote_run(self, element, tuning_options): - #move data to the GPU NOT SURE IF TO PUT HERE, IN SEQUENTIAL IS IN INIT - self.gpu_args = self.dev.ready_argument_list(self.kernel_options.arguments) - - params = dict(zip(tuning_options.tune_params.keys(), element)) - - result = None - warmup_time = 0 - - # check if configuration is in the cache - x_int = ",".join([str(i) for i in element]) - if tuning_options.cache and x_int in tuning_options.cache: - params.update(tuning_options.cache[x_int]) - params['compile_time'] = 0 - params['verification_time'] = 0 - params['benchmark_time'] = 0 - else: - # attempt to warmup the GPU by running the first config in the parameter space and ignoring the result - if not self.warmed_up: - warmup_time = perf_counter() - self.dev.compile_and_benchmark(self.kernel_source, self.gpu_args, params, self.kernel_options, tuning_options) - self.warmed_up = True - warmup_time = 1e3 * (perf_counter() - warmup_time) - - result = self.dev.compile_and_benchmark(self.kernel_source, self.gpu_args, params, self.kernel_options, tuning_options) - - params.update(result) - - if tuning_options.objective in result 
and isinstance(result[tuning_options.objective], ErrorConfig): - logging.debug('kernel configuration was skipped silently due to compile or runtime failure') - - # only compute metrics on configs that have not errored - if tuning_options.metrics and not isinstance(params.get(tuning_options.objective), ErrorConfig): - params = process_metrics(params, tuning_options.metrics) - - # get the framework time by estimating based on other times - total_time = 1000 * (perf_counter() - self.start_time) - warmup_time - params['strategy_time'] = self.last_strategy_time - params['framework_time'] = max(total_time - (params['compile_time'] + params['verification_time'] + params['benchmark_time'] + params['strategy_time']), 0) - params['timestamp'] = str(datetime.now(timezone.utc)) - self.start_time = perf_counter() - - if result: - # print configuration to the console - print_config_output(tuning_options.tune_params, params, self.quiet, tuning_options.metrics, self.units) - - # add configuration to cache - store_cache(x_int, params, tuning_options) + future = [actor.execute.remote(element, tuning_options) for actor in self.actors] + future_results.extend(future) - return params \ No newline at end of file + return ray.get(future_results) + \ No newline at end of file From 40cc888896ef5a7a6755f71416139824754c58da Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Tue, 23 Jan 2024 23:27:23 +0100 Subject: [PATCH 004/106] added remote_mode function argument to tune_kernel and related remote runner selection logic --- kernel_tuner/interface.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel_tuner/interface.py b/kernel_tuner/interface.py index 1d206307b..ec32bafb9 100644 --- a/kernel_tuner/interface.py +++ b/kernel_tuner/interface.py @@ -34,6 +34,7 @@ from kernel_tuner.integration import get_objective_defaults from kernel_tuner.runners.sequential import SequentialRunner from kernel_tuner.runners.simulation import SimulationRunner +from 
kernel_tuner.runners.remote import RemoteRunner # ADDED HERE from kernel_tuner.searchspace import Searchspace try: @@ -574,6 +575,7 @@ def tune_kernel( cache=None, metrics=None, simulation_mode=False, + remote_mode=False, # ADDED HERE observers=None, objective=None, objective_higher_is_better=None, @@ -650,7 +652,7 @@ def tune_kernel( strategy = brute_force # select the runner for this job based on input - selected_runner = SimulationRunner if simulation_mode else SequentialRunner + selected_runner = SimulationRunner if simulation_mode else (RemoteRunner if remote_mode else SequentialRunner) # ADDED HERE tuning_options.simulated_time = 0 runner = selected_runner(kernelsource, kernel_options, device_options, iterations, observers) From b14aaf01b9f1ab8c931479a21596233d4196ea9a Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Tue, 23 Jan 2024 23:28:01 +0100 Subject: [PATCH 005/106] added parallel tuning test --- test/test_parallel_tuning.py | 40 ++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 test/test_parallel_tuning.py diff --git a/test/test_parallel_tuning.py b/test/test_parallel_tuning.py new file mode 100644 index 000000000..f0da92dc5 --- /dev/null +++ b/test/test_parallel_tuning.py @@ -0,0 +1,40 @@ +import numpy as np +import pytest + +from kernel_tuner import tune_kernel +from kernel_tuner.backends import nvcuda +from kernel_tuner.core import KernelInstance, KernelSource +from .context import skip_if_no_pycuda + +try: + import pycuda.driver +except Exception: + pass + +@pytest.fixture +def env(): + kernel_string = """ + extern "C" __global__ void vector_add(float *c, float *a, float *b, int n) { + int i = blockIdx.x * block_size_x + threadIdx.x; + if (i 0 \ No newline at end of file From 1a55a5c8e8ba6577bbf545ccc052d9adc221deab Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Wed, 24 Jan 2024 00:04:22 +0100 Subject: [PATCH 006/106] added pool of actors --- kernel_tuner/runners/remote.py | 13 ++++++------- 1 file changed, 
6 insertions(+), 7 deletions(-) diff --git a/kernel_tuner/runners/remote.py b/kernel_tuner/runners/remote.py index 4d81ce877..60db5a36c 100644 --- a/kernel_tuner/runners/remote.py +++ b/kernel_tuner/runners/remote.py @@ -2,6 +2,7 @@ from datetime import datetime, timezone from time import perf_counter import ray +from ray.util.actor_pool import ActorPool from kernel_tuner.core import DeviceInterface from kernel_tuner.runners.runner import Runner @@ -39,18 +40,16 @@ def __init__(self, kernel_source, kernel_options, device_options, iterations, ob device_options, iterations, observers) for _ in range(self.num_gpus)] + # Create a pool of RemoteActor actors + self.actor_pool = ActorPool(self.actors) def get_environment(self, tuning_options): return self.dev.get_environment() def run(self, parameter_space, tuning_options): - future_results = [] + + results = list(self.actor_pool.map_unordered(lambda a, v: a.execute.remote(v, tuning_options), parameter_space)) - # Iterate over parameter space and distribute work to actors - for element in parameter_space: - future = [actor.execute.remote(element, tuning_options) for actor in self.actors] - future_results.extend(future) - - return ray.get(future_results) + return results \ No newline at end of file From 4fef594d60f5f44e0a6da8315980e0658481528a Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Wed, 24 Jan 2024 11:42:58 +0100 Subject: [PATCH 007/106] clean up remote runner and actor --- kernel_tuner/runners/remote.py | 18 ------------------ kernel_tuner/runners/remote_actor.py | 6 +++--- 2 files changed, 3 insertions(+), 21 deletions(-) diff --git a/kernel_tuner/runners/remote.py b/kernel_tuner/runners/remote.py index 60db5a36c..7c858af99 100644 --- a/kernel_tuner/runners/remote.py +++ b/kernel_tuner/runners/remote.py @@ -1,37 +1,21 @@ import logging -from datetime import datetime, timezone -from time import perf_counter import ray from ray.util.actor_pool import ActorPool from kernel_tuner.core import DeviceInterface from 
kernel_tuner.runners.runner import Runner from kernel_tuner.runners.remote_actor import RemoteActor -from kernel_tuner.util import ErrorConfig, print_config_output, process_metrics, store_cache - class RemoteRunner(Runner): def __init__(self, kernel_source, kernel_options, device_options, iterations, observers): #detect language and create high-level device interface self.dev = DeviceInterface(kernel_source, iterations=iterations, observers=observers, **device_options) - self.units = self.dev.units - self.quiet = device_options.quiet - self.kernel_source = kernel_source - self.warmed_up = False - self.simulation_mode = False - self.start_time = perf_counter() - self.last_strategy_start_time = self.start_time - self.last_strategy_time = 0 - self.kernel_options = kernel_options - # Initialize Ray ray.init(ignore_reinit_error=True) - # Get cluster resources cluster_resources = ray.cluster_resources() self.num_gpus = int(cluster_resources.get("GPU", 0)) # Default to 0 if no GPUs are found - # Create RemoteActor instances self.actors = [RemoteActor.remote(self.dev.units, device_options.quiet, @@ -48,8 +32,6 @@ def get_environment(self, tuning_options): def run(self, parameter_space, tuning_options): - results = list(self.actor_pool.map_unordered(lambda a, v: a.execute.remote(v, tuning_options), parameter_space)) - return results \ No newline at end of file diff --git a/kernel_tuner/runners/remote_actor.py b/kernel_tuner/runners/remote_actor.py index e6ced0892..ce6a06a7e 100644 --- a/kernel_tuner/runners/remote_actor.py +++ b/kernel_tuner/runners/remote_actor.py @@ -26,11 +26,11 @@ def __init__(self, self.last_strategy_start_time = self.start_time self.last_strategy_time = 0 self.kernel_options = kernel_options - - def execute(self, element, tuning_options): - #move data to the GPU NOT SURE IF TO PUT HERE, IN SEQUENTIAL IS IN INIT + #move data to the GPU self.gpu_args = self.dev.ready_argument_list(self.kernel_options.arguments) + def execute(self, element, 
tuning_options): + params = dict(zip(tuning_options.tune_params.keys(), element)) result = None From 3f3b9e60135e375963a1902bc877183b25c67960 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Tue, 30 Jan 2024 11:02:22 +0100 Subject: [PATCH 008/106] updates on remote code --- kernel_tuner/runners/remote.py | 76 +++++++++++++++++++++++----- kernel_tuner/runners/remote_actor.py | 11 ++-- test/test_parallel_tuning.py | 1 + 3 files changed, 71 insertions(+), 17 deletions(-) diff --git a/kernel_tuner/runners/remote.py b/kernel_tuner/runners/remote.py index 7c858af99..40d72c8c9 100644 --- a/kernel_tuner/runners/remote.py +++ b/kernel_tuner/runners/remote.py @@ -1,6 +1,9 @@ import logging import ray +import sys +import os from ray.util.actor_pool import ActorPool +from time import perf_counter from kernel_tuner.core import DeviceInterface from kernel_tuner.runners.runner import Runner @@ -9,21 +12,32 @@ class RemoteRunner(Runner): def __init__(self, kernel_source, kernel_options, device_options, iterations, observers): - #detect language and create high-level device interface self.dev = DeviceInterface(kernel_source, iterations=iterations, observers=observers, **device_options) + self.units = self.dev.units + self.quiet = device_options.quiet + self.kernel_source = kernel_source + self.warmed_up = False + self.simulation_mode = False + self.start_time = perf_counter() + self.last_strategy_start_time = self.start_time + self.last_strategy_time = 0 + self.kernel_options = kernel_options + self.observers = observers + self.iterations = iterations + self.device_options = device_options + + # Define cluster resources + self.num_gpus = get_num_devices(kernel_source.lang) + print(f"Number of GPUs in use: {self.num_gpus}", file=sys. 
stderr) + resources = {} + for id in range(self.num_gpus): + gpu_resource_name = f"gpu_{id}" + resources[gpu_resource_name] = 1 # Initialize Ray - ray.init(ignore_reinit_error=True) - # Get cluster resources - cluster_resources = ray.cluster_resources() - self.num_gpus = int(cluster_resources.get("GPU", 0)) # Default to 0 if no GPUs are found + os.environ["RAY_DEDUP_LOGS"] = "0" + ray.init(resources=resources, include_dashboard=True) # Create RemoteActor instances - self.actors = [RemoteActor.remote(self.dev.units, - device_options.quiet, - kernel_source, - kernel_options, - device_options, - iterations, - observers) for _ in range(self.num_gpus)] + self.actors = [self.create_actor_on_gpu(id) for id in range(self.num_gpus)] # Create a pool of RemoteActor actors self.actor_pool = ActorPool(self.actors) @@ -32,6 +46,42 @@ def get_environment(self, tuning_options): def run(self, parameter_space, tuning_options): + print(f"Size parameter_space: {len(parameter_space)}", file=sys. stderr) results = list(self.actor_pool.map_unordered(lambda a, v: a.execute.remote(v, tuning_options), parameter_space)) return results - \ No newline at end of file + + def create_actor_on_gpu(self, gpu_id): + gpu_resource_name = f"gpu_{gpu_id}" + return RemoteActor.options(resources={gpu_resource_name: 1}).remote(self.quiet, + self.kernel_source, + self.kernel_options, + self.device_options, + self.iterations, + self.observers, + gpu_id) + +# DONT KNOW WHERE TO PUT IT YET +def get_num_devices(lang): + num_devices = 0 + if lang.upper() == "CUDA": + import pycuda.driver as cuda + cuda.init() + num_devices = cuda.Device.count() + elif lang.upper() == "CUPY": + import cupy + num_devices = cupy.cuda.runtime.getDeviceCount() + elif lang.upper() == "NVCUDA": + # NVCUDA usually refers to NVIDIA's CUDA, so you can use pycuda or a similar approach + import pycuda.driver as cuda + cuda.init() + num_devices = cuda.Device.count() + elif lang.upper() == "OPENCL": + import pyopencl as cl + num_devices = 
sum(len(platform.get_devices()) for platform in cl.get_platforms()) + elif lang.upper() == "HIP": + from pyhip import hip + num_devices = hip.hipGetDeviceCount() + else: + raise ValueError(f"Unsupported language: {lang}") + + return num_devices \ No newline at end of file diff --git a/kernel_tuner/runners/remote_actor.py b/kernel_tuner/runners/remote_actor.py index ce6a06a7e..57624fe6f 100644 --- a/kernel_tuner/runners/remote_actor.py +++ b/kernel_tuner/runners/remote_actor.py @@ -2,22 +2,25 @@ from datetime import datetime, timezone from time import perf_counter import ray +import sys + from kernel_tuner.util import ErrorConfig, print_config_output, process_metrics, store_cache from kernel_tuner.core import DeviceInterface @ray.remote(num_gpus=1) class RemoteActor: def __init__(self, - units, quiet, kernel_source, kernel_options, device_options, iterations, - observers): + observers, + gpu_id): + self.gpu_id = gpu_id self.dev = DeviceInterface(kernel_source, iterations=iterations, observers=observers, **device_options) - self.units = units + self.units = self.dev.units self.quiet = quiet self.kernel_source = kernel_source self.warmed_up = False @@ -30,7 +33,7 @@ def __init__(self, self.gpu_args = self.dev.ready_argument_list(self.kernel_options.arguments) def execute(self, element, tuning_options): - + #print(f"GPU {self.gpu_id} started execution", file=sys. 
stderr) params = dict(zip(tuning_options.tune_params.keys(), element)) result = None diff --git a/test/test_parallel_tuning.py b/test/test_parallel_tuning.py index f0da92dc5..9419d11a1 100644 --- a/test/test_parallel_tuning.py +++ b/test/test_parallel_tuning.py @@ -1,5 +1,6 @@ import numpy as np import pytest +import logging from kernel_tuner import tune_kernel from kernel_tuner.backends import nvcuda From fe5da39289ad18d0552339570a3d248b070db7c5 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Tue, 2 Apr 2024 16:07:31 +0200 Subject: [PATCH 009/106] changed naming from remote to parallel --- kernel_tuner/interface.py | 9 ++++++--- kernel_tuner/runners/{remote.py => parallel.py} | 4 ++-- test/test_parallel_tuning.py | 2 +- 3 files changed, 9 insertions(+), 6 deletions(-) rename kernel_tuner/runners/{remote.py => parallel.py} (94%) diff --git a/kernel_tuner/interface.py b/kernel_tuner/interface.py index ec32bafb9..6b53e599b 100644 --- a/kernel_tuner/interface.py +++ b/kernel_tuner/interface.py @@ -34,7 +34,7 @@ from kernel_tuner.integration import get_objective_defaults from kernel_tuner.runners.sequential import SequentialRunner from kernel_tuner.runners.simulation import SimulationRunner -from kernel_tuner.runners.remote import RemoteRunner # ADDED HERE +from kernel_tuner.runners.parallel import ParallelRunner # ADDED HERE from kernel_tuner.searchspace import Searchspace try: @@ -58,6 +58,7 @@ pso, random_sample, simulated_annealing, + ensemble ) strategy_map = { @@ -76,6 +77,7 @@ "simulated_annealing": simulated_annealing, "firefly_algorithm": firefly_algorithm, "bayes_opt": bayes_opt, + "ensemble": ensemble } @@ -385,6 +387,7 @@ def __deepcopy__(self, _): * "pso" particle swarm optimization * "random_sample" takes a random sample of the search space * "simulated_annealing" simulated annealing strategy + * "ensemble" Ensemble Strategy Strategy-specific parameters and options are explained under strategy_options. 
@@ -575,7 +578,7 @@ def tune_kernel( cache=None, metrics=None, simulation_mode=False, - remote_mode=False, # ADDED HERE + parallel_mode=False, # ADDED HERE observers=None, objective=None, objective_higher_is_better=None, @@ -652,7 +655,7 @@ def tune_kernel( strategy = brute_force # select the runner for this job based on input - selected_runner = SimulationRunner if simulation_mode else (RemoteRunner if remote_mode else SequentialRunner) # ADDED HERE + selected_runner = SimulationRunner if simulation_mode else (ParallelRunner if parallel_mode else SequentialRunner) # ADDED HERE tuning_options.simulated_time = 0 runner = selected_runner(kernelsource, kernel_options, device_options, iterations, observers) diff --git a/kernel_tuner/runners/remote.py b/kernel_tuner/runners/parallel.py similarity index 94% rename from kernel_tuner/runners/remote.py rename to kernel_tuner/runners/parallel.py index 40d72c8c9..550fdb057 100644 --- a/kernel_tuner/runners/remote.py +++ b/kernel_tuner/runners/parallel.py @@ -9,7 +9,7 @@ from kernel_tuner.runners.runner import Runner from kernel_tuner.runners.remote_actor import RemoteActor -class RemoteRunner(Runner): +class ParallelRunner(Runner): def __init__(self, kernel_source, kernel_options, device_options, iterations, observers): self.dev = DeviceInterface(kernel_source, iterations=iterations, observers=observers, **device_options) @@ -47,6 +47,7 @@ def get_environment(self, tuning_options): def run(self, parameter_space, tuning_options): print(f"Size parameter_space: {len(parameter_space)}", file=sys. stderr) + # Distribute execution of the `execute` method across the actor pool with varying parameters and tuning options, collecting the results asynchronously. 
results = list(self.actor_pool.map_unordered(lambda a, v: a.execute.remote(v, tuning_options), parameter_space)) return results @@ -71,7 +72,6 @@ def get_num_devices(lang): import cupy num_devices = cupy.cuda.runtime.getDeviceCount() elif lang.upper() == "NVCUDA": - # NVCUDA usually refers to NVIDIA's CUDA, so you can use pycuda or a similar approach import pycuda.driver as cuda cuda.init() num_devices = cuda.Device.count() diff --git a/test/test_parallel_tuning.py b/test/test_parallel_tuning.py index 9419d11a1..9a2e6a362 100644 --- a/test/test_parallel_tuning.py +++ b/test/test_parallel_tuning.py @@ -37,5 +37,5 @@ def env(): @skip_if_no_pycuda def test_parallel_tune_kernel(env): - result, _ = tune_kernel(*env, lang="CUDA", verbose=True, remote_mode=True) + result, _ = tune_kernel(*env, lang="CUDA", verbose=True, parallel_mode=True) assert len(result) > 0 \ No newline at end of file From a43dc8491202685953227591e0c142bd89fce1b1 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Thu, 4 Apr 2024 11:07:24 +0200 Subject: [PATCH 010/106] added get_num_devices function --- kernel_tuner/util.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/kernel_tuner/util.py b/kernel_tuner/util.py index 19b29c0f1..90acb77bb 100644 --- a/kernel_tuner/util.py +++ b/kernel_tuner/util.py @@ -1253,3 +1253,27 @@ def cuda_error_check(error): if error != nvrtc.nvrtcResult.NVRTC_SUCCESS: _, desc = nvrtc.nvrtcGetErrorString(error) raise RuntimeError(f"NVRTC error: {desc.decode()}") + +def get_num_devices(lang): + num_devices = 0 + if lang.upper() == "CUDA": + import pycuda.driver as cuda + cuda.init() + num_devices = cuda.Device.count() + elif lang.upper() == "CUPY": + import cupy + num_devices = cupy.cuda.runtime.getDeviceCount() + elif lang.upper() == "NVCUDA": + import pycuda.driver as cuda + cuda.init() + num_devices = cuda.Device.count() + elif lang.upper() == "OPENCL": + import pyopencl as cl + num_devices = sum(len(platform.get_devices()) for platform in 
cl.get_platforms()) + elif lang.upper() == "HIP": + from pyhip import hip + num_devices = hip.hipGetDeviceCount() + else: + raise ValueError(f"Unsupported language: {lang}") + + return num_devices \ No newline at end of file From ab3aa243bc33c929827fc46c96528176838b7910 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Thu, 4 Apr 2024 11:08:57 +0200 Subject: [PATCH 011/106] added ensemble and parallel runner related stuff --- kernel_tuner/interface.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/kernel_tuner/interface.py b/kernel_tuner/interface.py index 6b53e599b..d7a97fbe8 100644 --- a/kernel_tuner/interface.py +++ b/kernel_tuner/interface.py @@ -34,7 +34,7 @@ from kernel_tuner.integration import get_objective_defaults from kernel_tuner.runners.sequential import SequentialRunner from kernel_tuner.runners.simulation import SimulationRunner -from kernel_tuner.runners.parallel import ParallelRunner # ADDED HERE +from kernel_tuner.runners.parallel import ParallelRunner from kernel_tuner.searchspace import Searchspace try: @@ -578,7 +578,7 @@ def tune_kernel( cache=None, metrics=None, simulation_mode=False, - parallel_mode=False, # ADDED HERE + parallel_mode=False, observers=None, objective=None, objective_higher_is_better=None, @@ -616,6 +616,8 @@ def tune_kernel( tuning_options["max_fevals"] = strategy_options["max_fevals"] if strategy_options and "time_limit" in strategy_options: tuning_options["time_limit"] = strategy_options["time_limit"] + if strategy_options and "ensemble" in strategy_options: + tuning_options["ensemble"] = strategy_options["ensemble"] logging.debug("tune_kernel called") logging.debug("kernel_options: %s", util.get_config_string(kernel_options)) @@ -655,7 +657,7 @@ def tune_kernel( strategy = brute_force # select the runner for this job based on input - selected_runner = SimulationRunner if simulation_mode else (ParallelRunner if parallel_mode else SequentialRunner) # ADDED HERE + selected_runner = SimulationRunner 
if simulation_mode else (ParallelRunner if parallel_mode else SequentialRunner) tuning_options.simulated_time = 0 runner = selected_runner(kernelsource, kernel_options, device_options, iterations, observers) From e8a7228fe412bac5697e376ec8df5905dfa593d0 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Thu, 4 Apr 2024 11:11:06 +0200 Subject: [PATCH 012/106] switched to new naming of parallel remote and some clean up --- kernel_tuner/runners/parallel.py | 32 ++++---------------------------- 1 file changed, 4 insertions(+), 28 deletions(-) diff --git a/kernel_tuner/runners/parallel.py b/kernel_tuner/runners/parallel.py index 550fdb057..65a32cf37 100644 --- a/kernel_tuner/runners/parallel.py +++ b/kernel_tuner/runners/parallel.py @@ -7,7 +7,8 @@ from kernel_tuner.core import DeviceInterface from kernel_tuner.runners.runner import Runner -from kernel_tuner.runners.remote_actor import RemoteActor +from kernel_tuner.runners.parallel_remote_actor import ParallelRemoteActor +from kernel_tuner.util import get_num_devices class ParallelRunner(Runner): @@ -46,42 +47,17 @@ def get_environment(self, tuning_options): def run(self, parameter_space, tuning_options): - print(f"Size parameter_space: {len(parameter_space)}", file=sys. stderr) + #print(f"Size parameter_space: {len(parameter_space)}", file=sys. stderr) # Distribute execution of the `execute` method across the actor pool with varying parameters and tuning options, collecting the results asynchronously. 
results = list(self.actor_pool.map_unordered(lambda a, v: a.execute.remote(v, tuning_options), parameter_space)) return results def create_actor_on_gpu(self, gpu_id): gpu_resource_name = f"gpu_{gpu_id}" - return RemoteActor.options(resources={gpu_resource_name: 1}).remote(self.quiet, + return ParallelRemoteActor.options(resources={gpu_resource_name: 1}).remote(self.quiet, self.kernel_source, self.kernel_options, self.device_options, self.iterations, self.observers, gpu_id) - -# DONT KNOW WHERE TO PUT IT YET -def get_num_devices(lang): - num_devices = 0 - if lang.upper() == "CUDA": - import pycuda.driver as cuda - cuda.init() - num_devices = cuda.Device.count() - elif lang.upper() == "CUPY": - import cupy - num_devices = cupy.cuda.runtime.getDeviceCount() - elif lang.upper() == "NVCUDA": - import pycuda.driver as cuda - cuda.init() - num_devices = cuda.Device.count() - elif lang.upper() == "OPENCL": - import pyopencl as cl - num_devices = sum(len(platform.get_devices()) for platform in cl.get_platforms()) - elif lang.upper() == "HIP": - from pyhip import hip - num_devices = hip.hipGetDeviceCount() - else: - raise ValueError(f"Unsupported language: {lang}") - - return num_devices \ No newline at end of file From 3dd748c0a9b4bfa7f07b4a9b0bf1c0cb8770d81f Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Thu, 4 Apr 2024 11:14:29 +0200 Subject: [PATCH 013/106] added class instances needed down the line in the execution of the ensemble strategy --- kernel_tuner/runners/sequential.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel_tuner/runners/sequential.py b/kernel_tuner/runners/sequential.py index c493a0089..bf4cd6303 100644 --- a/kernel_tuner/runners/sequential.py +++ b/kernel_tuner/runners/sequential.py @@ -40,6 +40,9 @@ def __init__(self, kernel_source, kernel_options, device_options, iterations, ob self.last_strategy_start_time = self.start_time self.last_strategy_time = 0 self.kernel_options = kernel_options + self.device_options = device_options # 
needed for the ensemble strategy down the line + self.iterations = iterations # needed for the ensemble strategy down the line + self.observers = observers # needed for the ensemble strategy down the line #move data to the GPU self.gpu_args = self.dev.ready_argument_list(kernel_options.arguments) From e743bec8e18dd85dce672cdddd266709a23311ac Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Thu, 4 Apr 2024 11:16:13 +0200 Subject: [PATCH 014/106] changed naming due to ensemble implementation, this was the original remote_actor --- kernel_tuner/runners/parallel_remote_actor.py | 82 +++++++++++++++++++ 1 file changed, 82 insertions(+) create mode 100644 kernel_tuner/runners/parallel_remote_actor.py diff --git a/kernel_tuner/runners/parallel_remote_actor.py b/kernel_tuner/runners/parallel_remote_actor.py new file mode 100644 index 000000000..e913974a7 --- /dev/null +++ b/kernel_tuner/runners/parallel_remote_actor.py @@ -0,0 +1,82 @@ +import logging +from datetime import datetime, timezone +from time import perf_counter +import ray +import sys + +from kernel_tuner.util import ErrorConfig, print_config_output, process_metrics, store_cache +from kernel_tuner.core import DeviceInterface + +@ray.remote(num_gpus=1) +class ParallelRemoteActor(): + def __init__(self, + quiet, + kernel_source, + kernel_options, + device_options, + iterations, + observers, + gpu_id): + + self.gpu_id = gpu_id + self.dev = DeviceInterface(kernel_source, iterations=iterations, observers=observers, **device_options) + self.units = self.dev.units + self.quiet = quiet + self.kernel_source = kernel_source + self.warmed_up = False + self.simulation_mode = False + self.start_time = perf_counter() + self.last_strategy_start_time = self.start_time + self.last_strategy_time = 0 + self.kernel_options = kernel_options + #move data to the GPU + self.gpu_args = self.dev.ready_argument_list(self.kernel_options.arguments) + + def execute(self, element, tuning_options): + #print(f"GPU {self.gpu_id} started 
execution", file=sys. stderr) + params = dict(zip(tuning_options.tune_params.keys(), element)) + + result = None + warmup_time = 0 + + # check if configuration is in the cache + x_int = ",".join([str(i) for i in element]) + if tuning_options.cache and x_int in tuning_options.cache: + params.update(tuning_options.cache[x_int]) + params['compile_time'] = 0 + params['verification_time'] = 0 + params['benchmark_time'] = 0 + else: + # attempt to warmup the GPU by running the first config in the parameter space and ignoring the result + if not self.warmed_up: + warmup_time = perf_counter() + self.dev.compile_and_benchmark(self.kernel_source, self.gpu_args, params, self.kernel_options, tuning_options) + self.warmed_up = True + warmup_time = 1e3 * (perf_counter() - warmup_time) + + result = self.dev.compile_and_benchmark(self.kernel_source, self.gpu_args, params, self.kernel_options, tuning_options) + + params.update(result) + + if tuning_options.objective in result and isinstance(result[tuning_options.objective], ErrorConfig): + logging.debug('kernel configuration was skipped silently due to compile or runtime failure') + + # only compute metrics on configs that have not errored + if tuning_options.metrics and not isinstance(params.get(tuning_options.objective), ErrorConfig): + params = process_metrics(params, tuning_options.metrics) + + # get the framework time by estimating based on other times + total_time = 1000 * (perf_counter() - self.start_time) - warmup_time + params['strategy_time'] = self.last_strategy_time + params['framework_time'] = max(total_time - (params['compile_time'] + params['verification_time'] + params['benchmark_time'] + params['strategy_time']), 0) + params['timestamp'] = str(datetime.now(timezone.utc)) + self.start_time = perf_counter() + + if result: + # print configuration to the console + print_config_output(tuning_options.tune_params, params, self.quiet, tuning_options.metrics, self.units) + + # add configuration to cache + store_cache(x_int, 
params, tuning_options) + + return params \ No newline at end of file From df949d09c1832f38f8791647688ce09c9b73fccb Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Thu, 4 Apr 2024 11:17:23 +0200 Subject: [PATCH 015/106] started ensemble implementation, very basic functionality works --- kernel_tuner/runners/remote_actor.py | 61 +++------------ kernel_tuner/strategies/ensemble.py | 106 +++++++++++++++++++++++++++ 2 files changed, 118 insertions(+), 49 deletions(-) create mode 100644 kernel_tuner/strategies/ensemble.py diff --git a/kernel_tuner/runners/remote_actor.py b/kernel_tuner/runners/remote_actor.py index 57624fe6f..6332972fd 100644 --- a/kernel_tuner/runners/remote_actor.py +++ b/kernel_tuner/runners/remote_actor.py @@ -6,9 +6,11 @@ from kernel_tuner.util import ErrorConfig, print_config_output, process_metrics, store_cache from kernel_tuner.core import DeviceInterface +from kernel_tuner.runners.sequential import SequentialRunner +from kernel_tuner.runners.simulation import SimulationRunner @ray.remote(num_gpus=1) -class RemoteActor: +class RemoteActor(): def __init__(self, quiet, kernel_source, @@ -29,54 +31,15 @@ def __init__(self, self.last_strategy_start_time = self.start_time self.last_strategy_time = 0 self.kernel_options = kernel_options + self.device_options = device_options + self.iterations = iterations + self.observers = observers #move data to the GPU self.gpu_args = self.dev.ready_argument_list(self.kernel_options.arguments) - def execute(self, element, tuning_options): - #print(f"GPU {self.gpu_id} started execution", file=sys. 
stderr) - params = dict(zip(tuning_options.tune_params.keys(), element)) - - result = None - warmup_time = 0 - - # check if configuration is in the cache - x_int = ",".join([str(i) for i in element]) - if tuning_options.cache and x_int in tuning_options.cache: - params.update(tuning_options.cache[x_int]) - params['compile_time'] = 0 - params['verification_time'] = 0 - params['benchmark_time'] = 0 - else: - # attempt to warmup the GPU by running the first config in the parameter space and ignoring the result - if not self.warmed_up: - warmup_time = perf_counter() - self.dev.compile_and_benchmark(self.kernel_source, self.gpu_args, params, self.kernel_options, tuning_options) - self.warmed_up = True - warmup_time = 1e3 * (perf_counter() - warmup_time) - - result = self.dev.compile_and_benchmark(self.kernel_source, self.gpu_args, params, self.kernel_options, tuning_options) - - params.update(result) - - if tuning_options.objective in result and isinstance(result[tuning_options.objective], ErrorConfig): - logging.debug('kernel configuration was skipped silently due to compile or runtime failure') - - # only compute metrics on configs that have not errored - if tuning_options.metrics and not isinstance(params.get(tuning_options.objective), ErrorConfig): - params = process_metrics(params, tuning_options.metrics) - - # get the framework time by estimating based on other times - total_time = 1000 * (perf_counter() - self.start_time) - warmup_time - params['strategy_time'] = self.last_strategy_time - params['framework_time'] = max(total_time - (params['compile_time'] + params['verification_time'] + params['benchmark_time'] + params['strategy_time']), 0) - params['timestamp'] = str(datetime.now(timezone.utc)) - self.start_time = perf_counter() - - if result: - # print configuration to the console - print_config_output(tuning_options.tune_params, params, self.quiet, tuning_options.metrics, self.units) - - # add configuration to cache - store_cache(x_int, params, 
tuning_options) - - return params \ No newline at end of file + def execute(self, strategy, searchspace, tuning_options, simulation_mode=False): + selected_runner = SimulationRunner if simulation_mode else SequentialRunner + runner = selected_runner(self.kernel_source, self.kernel_options, self.device_options, + self.iterations, self.observers) + results = strategy.tune(searchspace, runner, tuning_options) + return results \ No newline at end of file diff --git a/kernel_tuner/strategies/ensemble.py b/kernel_tuner/strategies/ensemble.py new file mode 100644 index 000000000..43e83348b --- /dev/null +++ b/kernel_tuner/strategies/ensemble.py @@ -0,0 +1,106 @@ +import random +import sys +import os +import ray +from ray.util.actor_pool import ActorPool + +import numpy as np + +from kernel_tuner import util +from kernel_tuner.searchspace import Searchspace +from kernel_tuner.strategies import common +from kernel_tuner.strategies.common import CostFunc, scale_from_params +from kernel_tuner.runners.simulation import SimulationRunner +from kernel_tuner.runners.remote_actor import RemoteActor +from kernel_tuner.util import get_num_devices + +from kernel_tuner.strategies import ( + basinhopping, + bayes_opt, + brute_force, + diff_evo, + dual_annealing, + firefly_algorithm, + genetic_algorithm, + greedy_ils, + greedy_mls, + minimize, + mls, + ordered_greedy_mls, + pso, + random_sample, + simulated_annealing, +) + +strategy_map = { + "brute_force": brute_force, + "random_sample": random_sample, + "minimize": minimize, + "basinhopping": basinhopping, + "diff_evo": diff_evo, + "genetic_algorithm": genetic_algorithm, + "greedy_mls": greedy_mls, + "ordered_greedy_mls": ordered_greedy_mls, + "greedy_ils": greedy_ils, + "dual_annealing": dual_annealing, + "mls": mls, + "pso": pso, + "simulated_annealing": simulated_annealing, + "firefly_algorithm": firefly_algorithm, + "bayes_opt": bayes_opt, +} + +def tune(searchspace: Searchspace, runner, tuning_options): + # Define cluster 
resources + num_gpus = get_num_devices(runner.kernel_source.lang) + print(f"Number of GPUs in use: {num_gpus}", file=sys. stderr) + resources = {} + for id in range(num_gpus): + gpu_resource_name = f"gpu_{id}" + resources[gpu_resource_name] = 1 + # Initialize Ray + os.environ["RAY_DEDUP_LOGS"] = "0" + ray.init(resources=resources, include_dashboard=True) + # Create RemoteActor instances + actors = [create_actor_on_gpu(id, runner) for id in range(num_gpus)] + # Create a pool of RemoteActor actors + #actor_pool = ActorPool(actors) + + if "ensemble" in tuning_options: + ensemble = tuning_options["ensemble"] + else: + ensemble = ["random_sample", "random_sample", "random_sample"] # For now its just a random ensemble not based on any logic + + ensemble = [strategy_map[strategy] for strategy in ensemble] + tasks = [] + simulation_mode = True if isinstance(runner, SimulationRunner) else False + for i in range(len(ensemble)): + strategy = ensemble[i] + actor = actors[i] + task = actor.execute.remote(strategy, searchspace, tuning_options, simulation_mode) + tasks.append(task) + all_results = ray.get(tasks) + + unique_configs = set() + final_results = [] + + for strategy_results in all_results: + for new_result in strategy_results: + config_signature = tuple(new_result[param] for param in searchspace.tune_params) + + if config_signature not in unique_configs: + final_results.append(new_result) + unique_configs.add(config_signature) + + return final_results + +# ITS REPEATING CODE, SAME IN parallel.py +def create_actor_on_gpu(gpu_id, runner): + gpu_resource_name = f"gpu_{gpu_id}" + return RemoteActor.options(resources={gpu_resource_name: 1}).remote(runner.quiet, + runner.kernel_source, + runner.kernel_options, + runner.device_options, + runner.iterations, + runner.observers, + gpu_id) \ No newline at end of file From 45a1747f1929f2f1d608bff0e99c5113d790e0fd Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Thu, 4 Apr 2024 11:17:46 +0200 Subject: [PATCH 016/106] updated tests 
--- test/test_ensemble_tuning.py | 42 ++++++++++++++++++++++++++++++++++++ test/test_parallel_tuning.py | 1 + 2 files changed, 43 insertions(+) create mode 100644 test/test_ensemble_tuning.py diff --git a/test/test_ensemble_tuning.py b/test/test_ensemble_tuning.py new file mode 100644 index 000000000..e5c807d43 --- /dev/null +++ b/test/test_ensemble_tuning.py @@ -0,0 +1,42 @@ +import numpy as np +import pytest +import logging +import sys + +from kernel_tuner import tune_kernel +from kernel_tuner.backends import nvcuda +from kernel_tuner.core import KernelInstance, KernelSource +from .context import skip_if_no_pycuda + +try: + import pycuda.driver +except Exception: + pass + +@pytest.fixture +def env(): + kernel_string = """ + extern "C" __global__ void vector_add(float *c, float *a, float *b, int n) { + int i = blockIdx.x * block_size_x + threadIdx.x; + if (i 0 \ No newline at end of file diff --git a/test/test_parallel_tuning.py b/test/test_parallel_tuning.py index 9a2e6a362..bbe4d96b7 100644 --- a/test/test_parallel_tuning.py +++ b/test/test_parallel_tuning.py @@ -1,6 +1,7 @@ import numpy as np import pytest import logging +import sys from kernel_tuner import tune_kernel from kernel_tuner.backends import nvcuda From 5fb592785d232e59700aed8a66b45e6b58481d7c Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Fri, 5 Apr 2024 20:25:02 +0200 Subject: [PATCH 017/106] clean up in parallel runner --- kernel_tuner/runners/parallel.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel_tuner/runners/parallel.py b/kernel_tuner/runners/parallel.py index 65a32cf37..0f7477652 100644 --- a/kernel_tuner/runners/parallel.py +++ b/kernel_tuner/runners/parallel.py @@ -7,7 +7,7 @@ from kernel_tuner.core import DeviceInterface from kernel_tuner.runners.runner import Runner -from kernel_tuner.runners.parallel_remote_actor import ParallelRemoteActor +from kernel_tuner.runners.ray.parallel_remote_actor import ParallelRemoteActor from kernel_tuner.util import 
get_num_devices class ParallelRunner(Runner): @@ -47,7 +47,6 @@ def get_environment(self, tuning_options): def run(self, parameter_space, tuning_options): - #print(f"Size parameter_space: {len(parameter_space)}", file=sys. stderr) # Distribute execution of the `execute` method across the actor pool with varying parameters and tuning options, collecting the results asynchronously. results = list(self.actor_pool.map_unordered(lambda a, v: a.execute.remote(v, tuning_options), parameter_space)) return results From a96ef433c58bcebdb28c9f0750d9c55042dc843c Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Fri, 5 Apr 2024 20:26:28 +0200 Subject: [PATCH 018/106] moved to sub directory ray --- kernel_tuner/runners/parallel_remote_actor.py | 82 ------------------- kernel_tuner/runners/remote_actor.py | 45 ---------- 2 files changed, 127 deletions(-) delete mode 100644 kernel_tuner/runners/parallel_remote_actor.py delete mode 100644 kernel_tuner/runners/remote_actor.py diff --git a/kernel_tuner/runners/parallel_remote_actor.py b/kernel_tuner/runners/parallel_remote_actor.py deleted file mode 100644 index e913974a7..000000000 --- a/kernel_tuner/runners/parallel_remote_actor.py +++ /dev/null @@ -1,82 +0,0 @@ -import logging -from datetime import datetime, timezone -from time import perf_counter -import ray -import sys - -from kernel_tuner.util import ErrorConfig, print_config_output, process_metrics, store_cache -from kernel_tuner.core import DeviceInterface - -@ray.remote(num_gpus=1) -class ParallelRemoteActor(): - def __init__(self, - quiet, - kernel_source, - kernel_options, - device_options, - iterations, - observers, - gpu_id): - - self.gpu_id = gpu_id - self.dev = DeviceInterface(kernel_source, iterations=iterations, observers=observers, **device_options) - self.units = self.dev.units - self.quiet = quiet - self.kernel_source = kernel_source - self.warmed_up = False - self.simulation_mode = False - self.start_time = perf_counter() - self.last_strategy_start_time = 
self.start_time - self.last_strategy_time = 0 - self.kernel_options = kernel_options - #move data to the GPU - self.gpu_args = self.dev.ready_argument_list(self.kernel_options.arguments) - - def execute(self, element, tuning_options): - #print(f"GPU {self.gpu_id} started execution", file=sys. stderr) - params = dict(zip(tuning_options.tune_params.keys(), element)) - - result = None - warmup_time = 0 - - # check if configuration is in the cache - x_int = ",".join([str(i) for i in element]) - if tuning_options.cache and x_int in tuning_options.cache: - params.update(tuning_options.cache[x_int]) - params['compile_time'] = 0 - params['verification_time'] = 0 - params['benchmark_time'] = 0 - else: - # attempt to warmup the GPU by running the first config in the parameter space and ignoring the result - if not self.warmed_up: - warmup_time = perf_counter() - self.dev.compile_and_benchmark(self.kernel_source, self.gpu_args, params, self.kernel_options, tuning_options) - self.warmed_up = True - warmup_time = 1e3 * (perf_counter() - warmup_time) - - result = self.dev.compile_and_benchmark(self.kernel_source, self.gpu_args, params, self.kernel_options, tuning_options) - - params.update(result) - - if tuning_options.objective in result and isinstance(result[tuning_options.objective], ErrorConfig): - logging.debug('kernel configuration was skipped silently due to compile or runtime failure') - - # only compute metrics on configs that have not errored - if tuning_options.metrics and not isinstance(params.get(tuning_options.objective), ErrorConfig): - params = process_metrics(params, tuning_options.metrics) - - # get the framework time by estimating based on other times - total_time = 1000 * (perf_counter() - self.start_time) - warmup_time - params['strategy_time'] = self.last_strategy_time - params['framework_time'] = max(total_time - (params['compile_time'] + params['verification_time'] + params['benchmark_time'] + params['strategy_time']), 0) - params['timestamp'] = 
str(datetime.now(timezone.utc)) - self.start_time = perf_counter() - - if result: - # print configuration to the console - print_config_output(tuning_options.tune_params, params, self.quiet, tuning_options.metrics, self.units) - - # add configuration to cache - store_cache(x_int, params, tuning_options) - - return params \ No newline at end of file diff --git a/kernel_tuner/runners/remote_actor.py b/kernel_tuner/runners/remote_actor.py deleted file mode 100644 index 6332972fd..000000000 --- a/kernel_tuner/runners/remote_actor.py +++ /dev/null @@ -1,45 +0,0 @@ -import logging -from datetime import datetime, timezone -from time import perf_counter -import ray -import sys - -from kernel_tuner.util import ErrorConfig, print_config_output, process_metrics, store_cache -from kernel_tuner.core import DeviceInterface -from kernel_tuner.runners.sequential import SequentialRunner -from kernel_tuner.runners.simulation import SimulationRunner - -@ray.remote(num_gpus=1) -class RemoteActor(): - def __init__(self, - quiet, - kernel_source, - kernel_options, - device_options, - iterations, - observers, - gpu_id): - - self.gpu_id = gpu_id - self.dev = DeviceInterface(kernel_source, iterations=iterations, observers=observers, **device_options) - self.units = self.dev.units - self.quiet = quiet - self.kernel_source = kernel_source - self.warmed_up = False - self.simulation_mode = False - self.start_time = perf_counter() - self.last_strategy_start_time = self.start_time - self.last_strategy_time = 0 - self.kernel_options = kernel_options - self.device_options = device_options - self.iterations = iterations - self.observers = observers - #move data to the GPU - self.gpu_args = self.dev.ready_argument_list(self.kernel_options.arguments) - - def execute(self, strategy, searchspace, tuning_options, simulation_mode=False): - selected_runner = SimulationRunner if simulation_mode else SequentialRunner - runner = selected_runner(self.kernel_source, self.kernel_options, self.device_options, - 
self.iterations, self.observers) - results = strategy.tune(searchspace, runner, tuning_options) - return results \ No newline at end of file From c831f5f7563483333cdcd74330e0bc0c4207edb6 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Fri, 5 Apr 2024 20:27:29 +0200 Subject: [PATCH 019/106] added subdirectory ray with all 3 actor classes --- kernel_tuner/runners/ray/cache_manager.py | 23 ++++++ .../runners/ray/parallel_remote_actor.py | 82 +++++++++++++++++++ kernel_tuner/runners/ray/remote_actor.py | 39 +++++++++ 3 files changed, 144 insertions(+) create mode 100644 kernel_tuner/runners/ray/cache_manager.py create mode 100644 kernel_tuner/runners/ray/parallel_remote_actor.py create mode 100644 kernel_tuner/runners/ray/remote_actor.py diff --git a/kernel_tuner/runners/ray/cache_manager.py b/kernel_tuner/runners/ray/cache_manager.py new file mode 100644 index 000000000..437499352 --- /dev/null +++ b/kernel_tuner/runners/ray/cache_manager.py @@ -0,0 +1,23 @@ +import ray +import json + +from kernel_tuner.util import store_cache + +@ray.remote +class CacheManager: + def __init__(self, tuning_options): + self.tuning_options = tuning_options + + def store(self, key, params): + store_cache(key, params, self.tuning_options) + + def check_and_retrieve(self, key): + """Checks if a result exists for the given key and returns it if found.""" + if self.tuning_options.cache: + return self.tuning_options.cache.get(key, None) + else: + return None + + def get_tuning_options(self): + """Returns the current tuning options.""" + return self.tuning_options diff --git a/kernel_tuner/runners/ray/parallel_remote_actor.py b/kernel_tuner/runners/ray/parallel_remote_actor.py new file mode 100644 index 000000000..e913974a7 --- /dev/null +++ b/kernel_tuner/runners/ray/parallel_remote_actor.py @@ -0,0 +1,82 @@ +import logging +from datetime import datetime, timezone +from time import perf_counter +import ray +import sys + +from kernel_tuner.util import ErrorConfig, print_config_output, 
process_metrics, store_cache +from kernel_tuner.core import DeviceInterface + +@ray.remote(num_gpus=1) +class ParallelRemoteActor(): + def __init__(self, + quiet, + kernel_source, + kernel_options, + device_options, + iterations, + observers, + gpu_id): + + self.gpu_id = gpu_id + self.dev = DeviceInterface(kernel_source, iterations=iterations, observers=observers, **device_options) + self.units = self.dev.units + self.quiet = quiet + self.kernel_source = kernel_source + self.warmed_up = False + self.simulation_mode = False + self.start_time = perf_counter() + self.last_strategy_start_time = self.start_time + self.last_strategy_time = 0 + self.kernel_options = kernel_options + #move data to the GPU + self.gpu_args = self.dev.ready_argument_list(self.kernel_options.arguments) + + def execute(self, element, tuning_options): + #print(f"GPU {self.gpu_id} started execution", file=sys. stderr) + params = dict(zip(tuning_options.tune_params.keys(), element)) + + result = None + warmup_time = 0 + + # check if configuration is in the cache + x_int = ",".join([str(i) for i in element]) + if tuning_options.cache and x_int in tuning_options.cache: + params.update(tuning_options.cache[x_int]) + params['compile_time'] = 0 + params['verification_time'] = 0 + params['benchmark_time'] = 0 + else: + # attempt to warmup the GPU by running the first config in the parameter space and ignoring the result + if not self.warmed_up: + warmup_time = perf_counter() + self.dev.compile_and_benchmark(self.kernel_source, self.gpu_args, params, self.kernel_options, tuning_options) + self.warmed_up = True + warmup_time = 1e3 * (perf_counter() - warmup_time) + + result = self.dev.compile_and_benchmark(self.kernel_source, self.gpu_args, params, self.kernel_options, tuning_options) + + params.update(result) + + if tuning_options.objective in result and isinstance(result[tuning_options.objective], ErrorConfig): + logging.debug('kernel configuration was skipped silently due to compile or runtime 
failure') + + # only compute metrics on configs that have not errored + if tuning_options.metrics and not isinstance(params.get(tuning_options.objective), ErrorConfig): + params = process_metrics(params, tuning_options.metrics) + + # get the framework time by estimating based on other times + total_time = 1000 * (perf_counter() - self.start_time) - warmup_time + params['strategy_time'] = self.last_strategy_time + params['framework_time'] = max(total_time - (params['compile_time'] + params['verification_time'] + params['benchmark_time'] + params['strategy_time']), 0) + params['timestamp'] = str(datetime.now(timezone.utc)) + self.start_time = perf_counter() + + if result: + # print configuration to the console + print_config_output(tuning_options.tune_params, params, self.quiet, tuning_options.metrics, self.units) + + # add configuration to cache + store_cache(x_int, params, tuning_options) + + return params \ No newline at end of file diff --git a/kernel_tuner/runners/ray/remote_actor.py b/kernel_tuner/runners/ray/remote_actor.py new file mode 100644 index 000000000..c092d78e7 --- /dev/null +++ b/kernel_tuner/runners/ray/remote_actor.py @@ -0,0 +1,39 @@ +import logging +from datetime import datetime, timezone +from time import perf_counter +import ray +import sys + +from kernel_tuner.util import ErrorConfig, print_config_output, process_metrics, store_cache +from kernel_tuner.core import DeviceInterface +from kernel_tuner.runners.sequential import SequentialRunner +from kernel_tuner.runners.simulation import SimulationRunner + +@ray.remote(num_gpus=1) +class RemoteActor(): + def __init__(self, + kernel_source, + kernel_options, + device_options, + iterations, + observers, + cache_manager): + + self.kernel_source = kernel_source + self.simulation_mode = False + self.kernel_options = kernel_options + self.device_options = device_options + self.iterations = iterations + self.observers = observers + self.cache_manager = cache_manager + + def execute(self, strategy, 
searchspace, tuning_options, simulation_mode=False): + if simulation_mode: + runner = SimulationRunner(self.kernel_source, self.kernel_options, self.device_options, + self.iterations, self.observers) + else: + runner = SequentialRunner(self.kernel_source, self.kernel_options, self.device_options, + self.iterations, self.observers, cache_manager=self.cache_manager) + results = strategy.tune(searchspace, runner, tuning_options) + return results + \ No newline at end of file From 0cc2a6e273cccc6a602ae3ea43f87424955f0b90 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Fri, 5 Apr 2024 20:28:24 +0200 Subject: [PATCH 020/106] itegrated calls to cache manager functions when running in ensemble --- kernel_tuner/runners/sequential.py | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/kernel_tuner/runners/sequential.py b/kernel_tuner/runners/sequential.py index bf4cd6303..0e6855ece 100644 --- a/kernel_tuner/runners/sequential.py +++ b/kernel_tuner/runners/sequential.py @@ -2,6 +2,7 @@ import logging from datetime import datetime, timezone from time import perf_counter +import ray from kernel_tuner.core import DeviceInterface from kernel_tuner.runners.runner import Runner @@ -11,7 +12,7 @@ class SequentialRunner(Runner): """SequentialRunner is used for tuning with a single process/thread.""" - def __init__(self, kernel_source, kernel_options, device_options, iterations, observers): + def __init__(self, kernel_source, kernel_options, device_options, iterations, observers, cache_manager=None): """Instantiate the SequentialRunner. 
:param kernel_source: The kernel source @@ -43,6 +44,7 @@ def __init__(self, kernel_source, kernel_options, device_options, iterations, ob self.device_options = device_options # needed for the ensemble strategy down the line self.iterations = iterations # needed for the ensemble strategy down the line self.observers = observers # needed for the ensemble strategy down the line + self.cache_manager = cache_manager #move data to the GPU self.gpu_args = self.dev.ready_argument_list(kernel_options.arguments) @@ -78,8 +80,9 @@ def run(self, parameter_space, tuning_options): # check if configuration is in the cache x_int = ",".join([str(i) for i in element]) - if tuning_options.cache and x_int in tuning_options.cache: - params.update(tuning_options.cache[x_int]) + cache_result = self.config_in_cache(x_int, tuning_options) + if cache_result: + params.update(cache_result) params['compile_time'] = 0 params['verification_time'] = 0 params['benchmark_time'] = 0 @@ -114,9 +117,23 @@ def run(self, parameter_space, tuning_options): print_config_output(tuning_options.tune_params, params, self.quiet, tuning_options.metrics, self.units) # add configuration to cache - store_cache(x_int, params, tuning_options) + self.store_in_cache(x_int, params, tuning_options) # all visited configurations are added to results to provide a trace for optimization strategies results.append(params) return results + + def config_in_cache(self, x_int, tuning_options): + if self.cache_manager: + return ray.get(self.cache_manager.check_and_retrieve.remote(x_int)) + elif tuning_options.cache and x_int in tuning_options.cache: + return tuning_options.cache[x_int] + else: + return None + + def store_in_cache(self, x_int, params, tuning_options): + if self.cache_manager: + ray.get(self.cache_manager.store.remote(x_int, params)) + else: + store_cache(x_int, params, tuning_options) \ No newline at end of file From b816f3d2cd9ffcf321f1a0ba131140bf29b16d6a Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Fri, 5 
Apr 2024 20:28:59 +0200 Subject: [PATCH 021/106] added cache manager logic --- kernel_tuner/strategies/ensemble.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/kernel_tuner/strategies/ensemble.py b/kernel_tuner/strategies/ensemble.py index 43e83348b..203785ad8 100644 --- a/kernel_tuner/strategies/ensemble.py +++ b/kernel_tuner/strategies/ensemble.py @@ -11,8 +11,9 @@ from kernel_tuner.strategies import common from kernel_tuner.strategies.common import CostFunc, scale_from_params from kernel_tuner.runners.simulation import SimulationRunner -from kernel_tuner.runners.remote_actor import RemoteActor +from kernel_tuner.runners.ray.remote_actor import RemoteActor from kernel_tuner.util import get_num_devices +from kernel_tuner.runners.ray.cache_manager import CacheManager from kernel_tuner.strategies import ( basinhopping, @@ -61,15 +62,14 @@ def tune(searchspace: Searchspace, runner, tuning_options): # Initialize Ray os.environ["RAY_DEDUP_LOGS"] = "0" ray.init(resources=resources, include_dashboard=True) + cache_manager = CacheManager.remote(tuning_options) # Create RemoteActor instances - actors = [create_actor_on_gpu(id, runner) for id in range(num_gpus)] - # Create a pool of RemoteActor actors - #actor_pool = ActorPool(actors) + actors = [create_actor_on_gpu(id, runner, cache_manager) for id in range(num_gpus)] if "ensemble" in tuning_options: ensemble = tuning_options["ensemble"] else: - ensemble = ["random_sample", "random_sample", "random_sample"] # For now its just a random ensemble not based on any logic + ensemble = ["random_sample", "random_sample", "random_sample"] ensemble = [strategy_map[strategy] for strategy in ensemble] tasks = [] @@ -80,6 +80,7 @@ def tune(searchspace: Searchspace, runner, tuning_options): task = actor.execute.remote(strategy, searchspace, tuning_options, simulation_mode) tasks.append(task) all_results = ray.get(tasks) + tuning_options = ray.get(cache_manager.get_tuning_options.remote()) 
unique_configs = set() final_results = [] @@ -94,13 +95,11 @@ def tune(searchspace: Searchspace, runner, tuning_options): return final_results -# ITS REPEATING CODE, SAME IN parallel.py -def create_actor_on_gpu(gpu_id, runner): +def create_actor_on_gpu(gpu_id, runner, cache_manager): gpu_resource_name = f"gpu_{gpu_id}" - return RemoteActor.options(resources={gpu_resource_name: 1}).remote(runner.quiet, - runner.kernel_source, + return RemoteActor.options(resources={gpu_resource_name: 1}).remote(runner.kernel_source, runner.kernel_options, runner.device_options, runner.iterations, runner.observers, - gpu_id) \ No newline at end of file + cache_manager) \ No newline at end of file From 781839ab2e83d782903f05a19aeaafbf41bcd821 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Mon, 8 Apr 2024 12:09:00 +0200 Subject: [PATCH 022/106] added instances needed for the ensemble down the line of execution --- kernel_tuner/runners/simulation.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel_tuner/runners/simulation.py b/kernel_tuner/runners/simulation.py index 27eadf073..c0d9c2d00 100644 --- a/kernel_tuner/runners/simulation.py +++ b/kernel_tuner/runners/simulation.py @@ -58,6 +58,10 @@ def __init__(self, kernel_source, kernel_options, device_options, iterations, ob self.last_strategy_time = 0 self.units = {} + self.device_options = device_options # needed for the ensemble strategy down the line + self.iterations = iterations # needed for the ensemble strategy down the line + self.observers = observers # needed for the ensemble strategy down the line + def get_environment(self, tuning_options): env = self.dev.get_environment() env["simulation"] = True From 9f8d212ec0590b3602b9de4d92630c5cb7f91979 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Mon, 8 Apr 2024 12:09:50 +0200 Subject: [PATCH 023/106] added strategy option to get_options function --- kernel_tuner/strategies/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/kernel_tuner/strategies/common.py b/kernel_tuner/strategies/common.py index 2185cb2f7..a45df191c 100644 --- a/kernel_tuner/strategies/common.py +++ b/kernel_tuner/strategies/common.py @@ -44,7 +44,7 @@ def make_strategy_options_doc(strategy_options): def get_options(strategy_options, options): """Get the strategy-specific options or their defaults from user-supplied strategy_options.""" - accepted = list(options.keys()) + ["max_fevals", "time_limit"] + accepted = list(options.keys()) + ["max_fevals", "time_limit", "ensemble"] for key in strategy_options: if key not in accepted: raise ValueError(f"Unrecognized option {key} in strategy_options") From d08b5d4b6d72d6d9634270a443cec748850a2d3b Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Wed, 10 Apr 2024 14:29:04 +0200 Subject: [PATCH 024/106] added ignore_reinit_error to ray init --- kernel_tuner/runners/parallel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel_tuner/runners/parallel.py b/kernel_tuner/runners/parallel.py index 0f7477652..34641298d 100644 --- a/kernel_tuner/runners/parallel.py +++ b/kernel_tuner/runners/parallel.py @@ -36,7 +36,7 @@ def __init__(self, kernel_source, kernel_options, device_options, iterations, ob resources[gpu_resource_name] = 1 # Initialize Ray os.environ["RAY_DEDUP_LOGS"] = "0" - ray.init(resources=resources, include_dashboard=True) + ray.init(resources=resources, include_dashboard=True, ignore_reinit_error=True) # Create RemoteActor instances self.actors = [self.create_actor_on_gpu(id) for id in range(self.num_gpus)] # Create a pool of RemoteActor actors From 903c981bf3e5d13a09f15c3d49fb2ed83e1cc73c Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Wed, 10 Apr 2024 14:29:35 +0200 Subject: [PATCH 025/106] added ignore_reinit_error to ray init --- kernel_tuner/strategies/ensemble.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel_tuner/strategies/ensemble.py b/kernel_tuner/strategies/ensemble.py index 203785ad8..e31ac60ff 100644 
--- a/kernel_tuner/strategies/ensemble.py +++ b/kernel_tuner/strategies/ensemble.py @@ -61,7 +61,7 @@ def tune(searchspace: Searchspace, runner, tuning_options): resources[gpu_resource_name] = 1 # Initialize Ray os.environ["RAY_DEDUP_LOGS"] = "0" - ray.init(resources=resources, include_dashboard=True) + ray.init(resources=resources, include_dashboard=True, ignore_reinit_error=True) cache_manager = CacheManager.remote(tuning_options) # Create RemoteActor instances actors = [create_actor_on_gpu(id, runner, cache_manager) for id in range(num_gpus)] From 1a2219a38285c62cc049ae4be2694c0550f4aa77 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Wed, 10 Apr 2024 16:32:35 +0200 Subject: [PATCH 026/106] added cache manager to parallel tuning --- kernel_tuner/runners/parallel.py | 8 ++- .../runners/ray/parallel_remote_actor.py | 58 ++++--------------- kernel_tuner/strategies/brute_force.py | 9 ++- 3 files changed, 25 insertions(+), 50 deletions(-) diff --git a/kernel_tuner/runners/parallel.py b/kernel_tuner/runners/parallel.py index 34641298d..2a25c2104 100644 --- a/kernel_tuner/runners/parallel.py +++ b/kernel_tuner/runners/parallel.py @@ -26,6 +26,7 @@ def __init__(self, kernel_source, kernel_options, device_options, iterations, ob self.observers = observers self.iterations = iterations self.device_options = device_options + self.cache_manager = None # Define cluster resources self.num_gpus = get_num_devices(kernel_source.lang) @@ -46,7 +47,12 @@ def get_environment(self, tuning_options): return self.dev.get_environment() - def run(self, parameter_space, tuning_options): + def run(self, parameter_space, tuning_options, cache_manager): + self.cache_manager = cache_manager + # Distribute the cache manager to all actors and initialize runners of actors + for actor in self.actors: + actor.set_cache_manager.remote(cache_manager) + actor.init_runner.remote() # Distribute execution of the `execute` method across the actor pool with varying parameters and tuning options, 
collecting the results asynchronously. results = list(self.actor_pool.map_unordered(lambda a, v: a.execute.remote(v, tuning_options), parameter_space)) return results diff --git a/kernel_tuner/runners/ray/parallel_remote_actor.py b/kernel_tuner/runners/ray/parallel_remote_actor.py index e913974a7..71b763326 100644 --- a/kernel_tuner/runners/ray/parallel_remote_actor.py +++ b/kernel_tuner/runners/ray/parallel_remote_actor.py @@ -6,6 +6,7 @@ from kernel_tuner.util import ErrorConfig, print_config_output, process_metrics, store_cache from kernel_tuner.core import DeviceInterface +from kernel_tuner.runners.sequential import SequentialRunner @ray.remote(num_gpus=1) class ParallelRemoteActor(): @@ -29,54 +30,17 @@ def __init__(self, self.last_strategy_start_time = self.start_time self.last_strategy_time = 0 self.kernel_options = kernel_options - #move data to the GPU - self.gpu_args = self.dev.ready_argument_list(self.kernel_options.arguments) + self.cache_manager = None + self.runner = None def execute(self, element, tuning_options): - #print(f"GPU {self.gpu_id} started execution", file=sys. 
stderr) - params = dict(zip(tuning_options.tune_params.keys(), element)) + self.runner.run(element, tuning_options, self.cache_manager) - result = None - warmup_time = 0 + def set_cache_manager(self, cache_manager): + self.cache_manager = cache_manager - # check if configuration is in the cache - x_int = ",".join([str(i) for i in element]) - if tuning_options.cache and x_int in tuning_options.cache: - params.update(tuning_options.cache[x_int]) - params['compile_time'] = 0 - params['verification_time'] = 0 - params['benchmark_time'] = 0 - else: - # attempt to warmup the GPU by running the first config in the parameter space and ignoring the result - if not self.warmed_up: - warmup_time = perf_counter() - self.dev.compile_and_benchmark(self.kernel_source, self.gpu_args, params, self.kernel_options, tuning_options) - self.warmed_up = True - warmup_time = 1e3 * (perf_counter() - warmup_time) - - result = self.dev.compile_and_benchmark(self.kernel_source, self.gpu_args, params, self.kernel_options, tuning_options) - - params.update(result) - - if tuning_options.objective in result and isinstance(result[tuning_options.objective], ErrorConfig): - logging.debug('kernel configuration was skipped silently due to compile or runtime failure') - - # only compute metrics on configs that have not errored - if tuning_options.metrics and not isinstance(params.get(tuning_options.objective), ErrorConfig): - params = process_metrics(params, tuning_options.metrics) - - # get the framework time by estimating based on other times - total_time = 1000 * (perf_counter() - self.start_time) - warmup_time - params['strategy_time'] = self.last_strategy_time - params['framework_time'] = max(total_time - (params['compile_time'] + params['verification_time'] + params['benchmark_time'] + params['strategy_time']), 0) - params['timestamp'] = str(datetime.now(timezone.utc)) - self.start_time = perf_counter() - - if result: - # print configuration to the console - 
print_config_output(tuning_options.tune_params, params, self.quiet, tuning_options.metrics, self.units) - - # add configuration to cache - store_cache(x_int, params, tuning_options) - - return params \ No newline at end of file + def init_runner(self): + if self.cache_manager is None: + raise ValueError("Cache manager is not set.") + self.runner = SequentialRunner(self.kernel_source, self.kernel_options, self.device_options, + self.iterations, self.observers, cache_manager=self.cache_manager) \ No newline at end of file diff --git a/kernel_tuner/strategies/brute_force.py b/kernel_tuner/strategies/brute_force.py index a0e3f8ebe..ba3d834ad 100644 --- a/kernel_tuner/strategies/brute_force.py +++ b/kernel_tuner/strategies/brute_force.py @@ -1,13 +1,18 @@ """ The default strategy that iterates through the whole parameter space """ from kernel_tuner.searchspace import Searchspace from kernel_tuner.strategies import common +from kernel_tuner.runners.parallel import ParallelRunner +from kernel_tuner.runners.ray.cache_manager import CacheManager _options = {} def tune(searchspace: Searchspace, runner, tuning_options): - # call the runner - return runner.run(searchspace.sorted_list(), tuning_options) + if isinstance(runner, ParallelRunner): + cache_manager = CacheManager.remote(tuning_options) + return runner.run(searchspace.sorted_list(), tuning_options, cache_manager) + else: + return runner.run(searchspace.sorted_list(), tuning_options) tune.__doc__ = common.get_strategy_docstring("Brute Force", _options) From a476585a8f4a261372fe9610e8e0d0093de368c3 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Thu, 11 Apr 2024 15:10:40 +0200 Subject: [PATCH 027/106] re-assign tuning options to final version from the cache manager at the end of the parallel runner execution --- kernel_tuner/runners/parallel.py | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel_tuner/runners/parallel.py b/kernel_tuner/runners/parallel.py index 2a25c2104..07e3933ab 100644 --- 
a/kernel_tuner/runners/parallel.py +++ b/kernel_tuner/runners/parallel.py @@ -55,6 +55,7 @@ def run(self, parameter_space, tuning_options, cache_manager): actor.init_runner.remote() # Distribute execution of the `execute` method across the actor pool with varying parameters and tuning options, collecting the results asynchronously. results = list(self.actor_pool.map_unordered(lambda a, v: a.execute.remote(v, tuning_options), parameter_space)) + tuning_options = ray.get(cache_manager.get_tuning_options.remote()) return results def create_actor_on_gpu(self, gpu_id): From 6233e09af7f2ba92de4653e9064728d0410c081f Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Thu, 11 Apr 2024 15:13:08 +0200 Subject: [PATCH 028/106] small bug fix in execute --- kernel_tuner/runners/ray/parallel_remote_actor.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/kernel_tuner/runners/ray/parallel_remote_actor.py b/kernel_tuner/runners/ray/parallel_remote_actor.py index 71b763326..81629781b 100644 --- a/kernel_tuner/runners/ray/parallel_remote_actor.py +++ b/kernel_tuner/runners/ray/parallel_remote_actor.py @@ -30,11 +30,14 @@ def __init__(self, self.last_strategy_start_time = self.start_time self.last_strategy_time = 0 self.kernel_options = kernel_options + self.device_options = device_options + self.iterations = iterations + self.observers = observers self.cache_manager = None self.runner = None def execute(self, element, tuning_options): - self.runner.run(element, tuning_options, self.cache_manager) + return self.runner.run([element], tuning_options)[0] def set_cache_manager(self, cache_manager): self.cache_manager = cache_manager From 14e5f0b3ec0e606a8f1f10bc9b3ae4f2d0017607 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Tue, 16 Apr 2024 11:14:44 +0200 Subject: [PATCH 029/106] updates to run ensemble in simulation mode on CPUs --- kernel_tuner/runners/ray/remote_actor.py | 2 +- kernel_tuner/strategies/ensemble.py | 48 +++++++++++++++--------- kernel_tuner/util.py 
| 6 ++- 3 files changed, 36 insertions(+), 20 deletions(-) diff --git a/kernel_tuner/runners/ray/remote_actor.py b/kernel_tuner/runners/ray/remote_actor.py index c092d78e7..a68b63ace 100644 --- a/kernel_tuner/runners/ray/remote_actor.py +++ b/kernel_tuner/runners/ray/remote_actor.py @@ -9,7 +9,7 @@ from kernel_tuner.runners.sequential import SequentialRunner from kernel_tuner.runners.simulation import SimulationRunner -@ray.remote(num_gpus=1) +@ray.remote class RemoteActor(): def __init__(self, kernel_source, diff --git a/kernel_tuner/strategies/ensemble.py b/kernel_tuner/strategies/ensemble.py index e31ac60ff..902402e2c 100644 --- a/kernel_tuner/strategies/ensemble.py +++ b/kernel_tuner/strategies/ensemble.py @@ -52,28 +52,34 @@ } def tune(searchspace: Searchspace, runner, tuning_options): + simulation_mode = True if isinstance(runner, SimulationRunner) else False + if "ensemble" in tuning_options: + ensemble = tuning_options["ensemble"] + else: + ensemble = ["random_sample", "random_sample"] + # Define cluster resources - num_gpus = get_num_devices(runner.kernel_source.lang) - print(f"Number of GPUs in use: {num_gpus}", file=sys. stderr) + num_devices = get_num_devices(runner.kernel_source.lang, simulation_mode=simulation_mode) + print(f"Number of devices available: {num_devices}", file=sys. 
stderr) + if num_devices < len(ensemble): + raise ValueError(f"Number of devices ({num_devices}) is less than the number of strategies in the ensemble ({len(ensemble)})") + resources = {} - for id in range(num_gpus): - gpu_resource_name = f"gpu_{id}" - resources[gpu_resource_name] = 1 + for id in range(len(ensemble)): + device_resource_name = f"device_{id}" + resources[device_resource_name] = 1 # Initialize Ray os.environ["RAY_DEDUP_LOGS"] = "0" - ray.init(resources=resources, include_dashboard=True, ignore_reinit_error=True) - cache_manager = CacheManager.remote(tuning_options) - # Create RemoteActor instances - actors = [create_actor_on_gpu(id, runner, cache_manager) for id in range(num_gpus)] - - if "ensemble" in tuning_options: - ensemble = tuning_options["ensemble"] + if simulation_mode: + ray.init(num_cpus=len(ensemble) + 1, include_dashboard=True, ignore_reinit_error=True) else: - ensemble = ["random_sample", "random_sample", "random_sample"] + ray.init(num_gpus=len(ensemble), num_cpus=1, include_dashboard=True, ignore_reinit_error=True) + # Create cache manager and actors + cache_manager = CacheManager.remote(tuning_options) + actors = [create_actor_on_device(id, runner, cache_manager, simulation_mode) for id in range(len(ensemble))] ensemble = [strategy_map[strategy] for strategy in ensemble] tasks = [] - simulation_mode = True if isinstance(runner, SimulationRunner) else False for i in range(len(ensemble)): strategy = ensemble[i] actor = actors[i] @@ -93,11 +99,19 @@ def tune(searchspace: Searchspace, runner, tuning_options): final_results.append(new_result) unique_configs.add(config_signature) + #kill all actors and chache manager + for actor in actors: + ray.kill(actor) + ray.kill(cache_manager) + return final_results -def create_actor_on_gpu(gpu_id, runner, cache_manager): - gpu_resource_name = f"gpu_{gpu_id}" - return RemoteActor.options(resources={gpu_resource_name: 1}).remote(runner.kernel_source, +def create_actor_on_device(device_id, runner, 
cache_manager, simulation_mode): + if simulation_mode: + resource_options= {"num_cpus": 1} + else: + resource_options= {"num_gpus": 1} + return RemoteActor.options(**resource_options).remote(runner.kernel_source, runner.kernel_options, runner.device_options, runner.iterations, diff --git a/kernel_tuner/util.py b/kernel_tuner/util.py index 3392dead2..b9ecf9b3a 100644 --- a/kernel_tuner/util.py +++ b/kernel_tuner/util.py @@ -1279,9 +1279,11 @@ def cuda_error_check(error): _, desc = nvrtc.nvrtcGetErrorString(error) raise RuntimeError(f"NVRTC error: {desc.decode()}") -def get_num_devices(lang): +def get_num_devices(lang, simulation_mode=False): num_devices = 0 - if lang.upper() == "CUDA": + if simulation_mode: + num_devices = os.cpu_count() + elif lang.upper() == "CUDA": import pycuda.driver as cuda cuda.init() num_devices = cuda.Device.count() From a963dac09eb3d3b48713e61a54c161bcbd22bffb Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Tue, 23 Apr 2024 12:44:11 +0200 Subject: [PATCH 030/106] fixed problem with ray resources and stalling actors --- kernel_tuner/runners/ray/cache_manager.py | 2 +- kernel_tuner/strategies/ensemble.py | 25 +++++++++++------------ 2 files changed, 13 insertions(+), 14 deletions(-) diff --git a/kernel_tuner/runners/ray/cache_manager.py b/kernel_tuner/runners/ray/cache_manager.py index 437499352..882207f02 100644 --- a/kernel_tuner/runners/ray/cache_manager.py +++ b/kernel_tuner/runners/ray/cache_manager.py @@ -3,7 +3,7 @@ from kernel_tuner.util import store_cache -@ray.remote +@ray.remote(num_cpus=1) class CacheManager: def __init__(self, tuning_options): self.tuning_options = tuning_options diff --git a/kernel_tuner/strategies/ensemble.py b/kernel_tuner/strategies/ensemble.py index 902402e2c..725414048 100644 --- a/kernel_tuner/strategies/ensemble.py +++ b/kernel_tuner/strategies/ensemble.py @@ -66,16 +66,14 @@ def tune(searchspace: Searchspace, runner, tuning_options): resources = {} for id in range(len(ensemble)): - device_resource_name 
= f"device_{id}" + device_resource_name = f"gpu_{id}" resources[device_resource_name] = 1 + resources["cache_manager_cpu"] = 1 # Initialize Ray os.environ["RAY_DEDUP_LOGS"] = "0" - if simulation_mode: - ray.init(num_cpus=len(ensemble) + 1, include_dashboard=True, ignore_reinit_error=True) - else: - ray.init(num_gpus=len(ensemble), num_cpus=1, include_dashboard=True, ignore_reinit_error=True) + ray.init(resources=resources, include_dashboard=True, ignore_reinit_error=True) # Create cache manager and actors - cache_manager = CacheManager.remote(tuning_options) + cache_manager = CacheManager.options(resources={"cache_manager_cpu": 1}).remote(tuning_options) actors = [create_actor_on_device(id, runner, cache_manager, simulation_mode) for id in range(len(ensemble))] ensemble = [strategy_map[strategy] for strategy in ensemble] @@ -106,14 +104,15 @@ def tune(searchspace: Searchspace, runner, tuning_options): return final_results -def create_actor_on_device(device_id, runner, cache_manager, simulation_mode): +def create_actor_on_device(gpu_id, runner, cache_manager, simulation_mode): + gpu_resource_name = f"gpu_{gpu_id}" if simulation_mode: resource_options= {"num_cpus": 1} else: resource_options= {"num_gpus": 1} - return RemoteActor.options(**resource_options).remote(runner.kernel_source, - runner.kernel_options, - runner.device_options, - runner.iterations, - runner.observers, - cache_manager) \ No newline at end of file + return RemoteActor.options(**resource_options, resources={gpu_resource_name: 1}).remote(runner.kernel_source, + runner.kernel_options, + runner.device_options, + runner.iterations, + runner.observers, + cache_manager) \ No newline at end of file From c55b8704e6570e4441564b24b236cad433b7cb4e Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Thu, 25 Apr 2024 13:17:34 +0200 Subject: [PATCH 031/106] added setup_resources and new impl of costfunc (not yet tested and still have to deal with stop criterion) --- kernel_tuner/strategies/common.py | 90 
++++++++++++++++++++----------- 1 file changed, 58 insertions(+), 32 deletions(-) diff --git a/kernel_tuner/strategies/common.py b/kernel_tuner/strategies/common.py index e9cdfeab4..5ff3cacaf 100644 --- a/kernel_tuner/strategies/common.py +++ b/kernel_tuner/strategies/common.py @@ -6,6 +6,7 @@ from kernel_tuner import util from kernel_tuner.searchspace import Searchspace +from kernel_tuner.util import get_num_devices _docstring_template = """ Find the best performing kernel configuration in the parameter space @@ -44,7 +45,7 @@ def make_strategy_options_doc(strategy_options): def get_options(strategy_options, options): """Get the strategy-specific options or their defaults from user-supplied strategy_options.""" - accepted = list(options.keys()) + ["max_fevals", "time_limit", "ensemble"] + accepted = list(options.keys()) + ["max_fevals", "time_limit", "ensemble", "candidates", "candidate", "population", "maxiter"] for key in strategy_options: if key not in accepted: raise ValueError(f"Unrecognized option {key} in strategy_options") @@ -73,6 +74,35 @@ def __call__(self, x, check_restrictions=True): util.check_stop_criterion(self.tuning_options) # snap values in x to nearest actual value for each parameter, unscale x if needed + configs = [self._prepare_config(cfg) for cfg in configs] + + legal_configs, illegal_results = self._get_legal_configs(configs) + results = self.runner.run(legal_configs, self.tuning_options) + self.results.extend(results) + + for result in results: + config = {key: result[key] for key in self.tuning_options.tune_params if key in result} + x_int = ",".join([str(i) for i in config]) + # append to tuning results + if x_int not in self.tuning_options.unique_results: + self.tuning_options.unique_results[x_int] = result + + # upon returning from this function control will be given back to the strategy, so reset the start time + self.runner.last_strategy_start_time = perf_counter() + + # get numerical return values, taking optimization direction 
into account + all_results = results + illegal_results + return_values = [] + for result in all_results: + return_value = result[self.tuning_options.objective] or sys.float_info.max + return_values.append(return_value if not self.tuning_options.objective_higher_is_better else -return_value) + + if len(return_values) == 1: + return return_values[0] + return return_values + + def _prepare_config(self, x): + """Prepare a single configuration by snapping to nearest values and/or scaling.""" if self.snap: if self.scaling: params = unscale_and_snap_to_nearest(x, self.searchspace.tune_params, self.tuning_options.eps) @@ -81,38 +111,21 @@ def __call__(self, x, check_restrictions=True): else: params = x logging.debug('params ' + str(params)) + return params + + def _get_legal_configs(self, configs) -> list: + results = [] + legal_configs = [] + for config in configs: + params_dict = dict(zip(self.searchspace.tune_params.keys(), config)) + legal = util.check_restrictions(self.searchspace.restrictions, params_dict, self.tuning_options.verbose) + if not legal: + params_dict[self.tuning_options.objective] = util.InvalidConfig() + results.append(params_dict) + else: + legal_configs.append(config) + return legal_configs, results - legal = True - result = {} - x_int = ",".join([str(i) for i in params]) - - # else check if this is a legal (non-restricted) configuration - if check_restrictions and self.searchspace.restrictions: - params_dict = dict(zip(self.searchspace.tune_params.keys(), params)) - legal = util.check_restrictions(self.searchspace.restrictions, params_dict, self.tuning_options.verbose) - if not legal: - result = params_dict - result[self.tuning_options.objective] = util.InvalidConfig() - - if legal: - # compile and benchmark this instance - res = self.runner.run([params], self.tuning_options) - result = res[0] - - # append to tuning results - if x_int not in self.tuning_options.unique_results: - self.tuning_options.unique_results[x_int] = result - - 
self.results.append(result) - - # upon returning from this function control will be given back to the strategy, so reset the start time - self.runner.last_strategy_start_time = perf_counter() - - # get numerical return value, taking optimization direction into account - return_value = result[self.tuning_options.objective] or sys.float_info.max - return_value = return_value if not self.tuning_options.objective_higher_is_better else -return_value - - return return_value def get_bounds_x0_eps(self): """Compute bounds, x0 (the initial guess), and eps.""" @@ -243,3 +256,16 @@ def scale_from_params(params, tune_params, eps): for i, v in enumerate(tune_params.values()): x[i] = 0.5 * eps + v.index(params[i])*eps return x + +def setup_resources(ensemble_size: int, simulation_mode: bool, runner): + num_devices = get_num_devices(runner.kernel_source.lang, simulation_mode=simulation_mode) + print(f"Number of devices available: {num_devices}", file=sys.stderr) + if num_devices < ensemble_size: + raise ValueError(f"Number of devices ({num_devices}) is less than the number of strategies in the ensemble ({ensemble_size})") + + resources = {} + for id in range(ensemble_size): + device_resource_name = f"gpu_{id}" + resources[device_resource_name] = 1 + resources["cache_manager_cpu"] = 1 + return resources \ No newline at end of file From d8541a07ef38ac4881ba3e4af5d9e17a58a6569e Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Thu, 25 Apr 2024 13:20:51 +0200 Subject: [PATCH 032/106] added ensemble and memetic to strategy map and import --- kernel_tuner/interface.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel_tuner/interface.py b/kernel_tuner/interface.py index 1267915c5..81ae7de48 100644 --- a/kernel_tuner/interface.py +++ b/kernel_tuner/interface.py @@ -58,7 +58,8 @@ pso, random_sample, simulated_annealing, - ensemble + ensemble, + memetic ) strategy_map = { @@ -77,7 +78,8 @@ "simulated_annealing": simulated_annealing, "firefly_algorithm": 
firefly_algorithm, "bayes_opt": bayes_opt, - "ensemble": ensemble + "ensemble": ensemble, + "memetic": memetic, } From c755254a1406d07cee57f3c3acfc5dc2ba85d73a Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Thu, 25 Apr 2024 13:22:27 +0200 Subject: [PATCH 033/106] rearranged how parallel runner deals with cache manager and actor's lifecycle --- kernel_tuner/runners/parallel.py | 48 +++++++++++-------- .../runners/ray/parallel_remote_actor.py | 10 ++-- 2 files changed, 32 insertions(+), 26 deletions(-) diff --git a/kernel_tuner/runners/parallel.py b/kernel_tuner/runners/parallel.py index 07e3933ab..1a2e894c6 100644 --- a/kernel_tuner/runners/parallel.py +++ b/kernel_tuner/runners/parallel.py @@ -12,7 +12,7 @@ class ParallelRunner(Runner): - def __init__(self, kernel_source, kernel_options, device_options, iterations, observers): + def __init__(self, kernel_source, kernel_options, device_options, iterations, observers, cache_manager=None, resources=None): self.dev = DeviceInterface(kernel_source, iterations=iterations, observers=observers, **device_options) self.units = self.dev.units self.quiet = device_options.quiet @@ -26,39 +26,44 @@ def __init__(self, kernel_source, kernel_options, device_options, iterations, ob self.observers = observers self.iterations = iterations self.device_options = device_options - self.cache_manager = None + self.cache_manager = cache_manager # Define cluster resources self.num_gpus = get_num_devices(kernel_source.lang) print(f"Number of GPUs in use: {self.num_gpus}", file=sys. 
stderr) - resources = {} - for id in range(self.num_gpus): - gpu_resource_name = f"gpu_{id}" - resources[gpu_resource_name] = 1 + if resources is None: + for id in range(self.num_gpus): + gpu_resource_name = f"gpu_{id}" + resources[gpu_resource_name] = 1 # Initialize Ray - os.environ["RAY_DEDUP_LOGS"] = "0" - ray.init(resources=resources, include_dashboard=True, ignore_reinit_error=True) - # Create RemoteActor instances - self.actors = [self.create_actor_on_gpu(id) for id in range(self.num_gpus)] - # Create a pool of RemoteActor actors - self.actor_pool = ActorPool(self.actors) + if not ray.is_initialized(): + os.environ["RAY_DEDUP_LOGS"] = "0" + ray.init(resources=resources, include_dashboard=True, ignore_reinit_error=True) def get_environment(self, tuning_options): return self.dev.get_environment() - def run(self, parameter_space, tuning_options, cache_manager): - self.cache_manager = cache_manager - # Distribute the cache manager to all actors and initialize runners of actors - for actor in self.actors: - actor.set_cache_manager.remote(cache_manager) - actor.init_runner.remote() + def run(self, parameter_space, tuning_options, cache_manager=None): + if self.cache_manager is None: + if cache_manager is None: + raise ValueError("A cache manager is required for parallel execution") + self.cache_manager = cache_manager + # Create RemoteActor instances + self.actors = [self.create_actor_on_gpu(id, self.cache_manager) for id in range(self.num_gpus)] + # Create a pool of RemoteActor actors + self.actor_pool = ActorPool(self.actors) # Distribute execution of the `execute` method across the actor pool with varying parameters and tuning options, collecting the results asynchronously. 
results = list(self.actor_pool.map_unordered(lambda a, v: a.execute.remote(v, tuning_options), parameter_space)) - tuning_options = ray.get(cache_manager.get_tuning_options.remote()) + new_tuning_options = ray.get(cache_manager.get_tuning_options.remote()) + tuning_options.update(new_tuning_options) + + for actor in self.actors: + ray.kill(actor) + return results - def create_actor_on_gpu(self, gpu_id): + def create_actor_on_gpu(self, gpu_id, cache_manager): gpu_resource_name = f"gpu_{gpu_id}" return ParallelRemoteActor.options(resources={gpu_resource_name: 1}).remote(self.quiet, self.kernel_source, @@ -66,4 +71,5 @@ def create_actor_on_gpu(self, gpu_id): self.device_options, self.iterations, self.observers, - gpu_id) + gpu_id, + cache_manager) diff --git a/kernel_tuner/runners/ray/parallel_remote_actor.py b/kernel_tuner/runners/ray/parallel_remote_actor.py index 81629781b..051c8689c 100644 --- a/kernel_tuner/runners/ray/parallel_remote_actor.py +++ b/kernel_tuner/runners/ray/parallel_remote_actor.py @@ -17,7 +17,8 @@ def __init__(self, device_options, iterations, observers, - gpu_id): + gpu_id, + cache_manager): self.gpu_id = gpu_id self.dev = DeviceInterface(kernel_source, iterations=iterations, observers=observers, **device_options) @@ -33,15 +34,14 @@ def __init__(self, self.device_options = device_options self.iterations = iterations self.observers = observers - self.cache_manager = None + self.cache_manager = cache_manager self.runner = None def execute(self, element, tuning_options): + if self.runner is None: + self.init_runner() return self.runner.run([element], tuning_options)[0] - def set_cache_manager(self, cache_manager): - self.cache_manager = cache_manager - def init_runner(self): if self.cache_manager is None: raise ValueError("Cache manager is not set.") From a23ef94ca297656798772e27e8d7164c8e4ce42d Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Thu, 25 Apr 2024 13:25:50 +0200 Subject: [PATCH 034/106] initial adaptions for memetic and cleaned up 
logic of ensemble --- kernel_tuner/strategies/ensemble.py | 83 +++++++++++++++++------------ 1 file changed, 48 insertions(+), 35 deletions(-) diff --git a/kernel_tuner/strategies/ensemble.py b/kernel_tuner/strategies/ensemble.py index 725414048..d5933750f 100644 --- a/kernel_tuner/strategies/ensemble.py +++ b/kernel_tuner/strategies/ensemble.py @@ -2,6 +2,8 @@ import sys import os import ray +import copy +import logging from ray.util.actor_pool import ActorPool import numpy as np @@ -9,7 +11,7 @@ from kernel_tuner import util from kernel_tuner.searchspace import Searchspace from kernel_tuner.strategies import common -from kernel_tuner.strategies.common import CostFunc, scale_from_params +from kernel_tuner.strategies.common import CostFunc, scale_from_params, setup_resources from kernel_tuner.runners.simulation import SimulationRunner from kernel_tuner.runners.ray.remote_actor import RemoteActor from kernel_tuner.util import get_num_devices @@ -51,29 +53,20 @@ "bayes_opt": bayes_opt, } -def tune(searchspace: Searchspace, runner, tuning_options): +def tune(searchspace: Searchspace, runner, tuning_options, cache_manager=None): simulation_mode = True if isinstance(runner, SimulationRunner) else False if "ensemble" in tuning_options: ensemble = tuning_options["ensemble"] else: ensemble = ["random_sample", "random_sample"] - - # Define cluster resources - num_devices = get_num_devices(runner.kernel_source.lang, simulation_mode=simulation_mode) - print(f"Number of devices available: {num_devices}", file=sys. 
stderr) - if num_devices < len(ensemble): - raise ValueError(f"Number of devices ({num_devices}) is less than the number of strategies in the ensemble ({len(ensemble)})") - - resources = {} - for id in range(len(ensemble)): - device_resource_name = f"gpu_{id}" - resources[device_resource_name] = 1 - resources["cache_manager_cpu"] = 1 + resources = setup_resources(len(ensemble), simulation_mode, runner) # Initialize Ray - os.environ["RAY_DEDUP_LOGS"] = "0" - ray.init(resources=resources, include_dashboard=True, ignore_reinit_error=True) + if not ray.is_initialized(): + os.environ["RAY_DEDUP_LOGS"] = "0" + ray.init(resources=resources, include_dashboard=True, ignore_reinit_error=True) # Create cache manager and actors - cache_manager = CacheManager.options(resources={"cache_manager_cpu": 1}).remote(tuning_options) + if cache_manager is None: + cache_manager = CacheManager.options(resources={"cache_manager_cpu": 1}).remote(tuning_options) actors = [create_actor_on_device(id, runner, cache_manager, simulation_mode) for id in range(len(ensemble))] ensemble = [strategy_map[strategy] for strategy in ensemble] @@ -81,27 +74,19 @@ def tune(searchspace: Searchspace, runner, tuning_options): for i in range(len(ensemble)): strategy = ensemble[i] actor = actors[i] - task = actor.execute.remote(strategy, searchspace, tuning_options, simulation_mode) + remote_tuning_options = setup_tuning_options(tuning_options) + task = actor.execute.remote(strategy, searchspace, remote_tuning_options, simulation_mode) tasks.append(task) all_results = ray.get(tasks) - tuning_options = ray.get(cache_manager.get_tuning_options.remote()) - - unique_configs = set() - final_results = [] - - for strategy_results in all_results: - for new_result in strategy_results: - config_signature = tuple(new_result[param] for param in searchspace.tune_params) + new_tuning_options = ray.get(cache_manager.get_tuning_options.remote()) + tuning_options.update(new_tuning_options) + final_results, population = 
process_results(all_results, searchspace) - if config_signature not in unique_configs: - final_results.append(new_result) - unique_configs.add(config_signature) - - #kill all actors and chache manager - for actor in actors: - ray.kill(actor) - ray.kill(cache_manager) + if population: # for memetic strategy + tuning_options.strategy_options["population"] = population + logging.debug(f"tuning_options.strategy_options[population]: {tuning_options.strategy_options['population']}") + clean_up(actors, cache_manager) return final_results def create_actor_on_device(gpu_id, runner, cache_manager, simulation_mode): @@ -115,4 +100,32 @@ def create_actor_on_device(gpu_id, runner, cache_manager, simulation_mode): runner.device_options, runner.iterations, runner.observers, - cache_manager) \ No newline at end of file + cache_manager) + +def setup_tuning_options(tuning_options): + new_tuning_options = copy.deepcopy(tuning_options) + if "candidates" in tuning_options.strategy_options: + #new_tuning_options.strategy_options.pop("candidates") + if len(tuning_options.strategy_options["candidates"]) > 0: + new_tuning_options.strategy_options["candidate"] = tuning_options.strategy_options["candidates"].pop(0) + return new_tuning_options + +def process_results(all_results, searchspace): + unique_configs = set() + final_results = [] + population = [] # for memetic strategy + + for (strategy_results, tuning_options) in all_results: + if "candidate" in tuning_options.strategy_options: + population.append(tuning_options.strategy_options["candidate"]) + for new_result in strategy_results: + config_signature = tuple(new_result[param] for param in searchspace.tune_params) + if config_signature not in unique_configs: + final_results.append(new_result) + unique_configs.add(config_signature) + return final_results, population + +def clean_up(actors, cache_manager): + for actor in actors: + ray.kill(actor) + ray.kill(cache_manager) From 697ead0805c44fc2a623bfdc51785677963bd154 Mon Sep 17 00:00:00 
2001 From: Milo Lurati Date: Thu, 25 Apr 2024 13:26:24 +0200 Subject: [PATCH 035/106] returning tuning_options for memetic logic --- kernel_tuner/runners/ray/remote_actor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel_tuner/runners/ray/remote_actor.py b/kernel_tuner/runners/ray/remote_actor.py index a68b63ace..61127d5be 100644 --- a/kernel_tuner/runners/ray/remote_actor.py +++ b/kernel_tuner/runners/ray/remote_actor.py @@ -35,5 +35,5 @@ def execute(self, strategy, searchspace, tuning_options, simulation_mode=False): runner = SequentialRunner(self.kernel_source, self.kernel_options, self.device_options, self.iterations, self.observers, cache_manager=self.cache_manager) results = strategy.tune(searchspace, runner, tuning_options) - return results + return results, tuning_options \ No newline at end of file From b247ed0d8900ebe692850c9334c2730a1ae58682 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Thu, 25 Apr 2024 13:26:52 +0200 Subject: [PATCH 036/106] init impl of memetic strategy --- kernel_tuner/strategies/memetic.py | 118 +++++++++++++++++++++++++++++ 1 file changed, 118 insertions(+) create mode 100644 kernel_tuner/strategies/memetic.py diff --git a/kernel_tuner/strategies/memetic.py b/kernel_tuner/strategies/memetic.py new file mode 100644 index 000000000..5708099be --- /dev/null +++ b/kernel_tuner/strategies/memetic.py @@ -0,0 +1,118 @@ +import logging +import ray +import os + +from kernel_tuner.searchspace import Searchspace +from kernel_tuner.runners.parallel import ParallelRunner +from kernel_tuner.runners.simulation import SimulationRunner +from kernel_tuner.runners.ray.cache_manager import CacheManager +from kernel_tuner.strategies.common import setup_resources + +from kernel_tuner.strategies import ( + basinhopping, + bayes_opt, + brute_force, + diff_evo, + dual_annealing, + firefly_algorithm, + genetic_algorithm, + greedy_ils, + greedy_mls, + minimize, + mls, + ordered_greedy_mls, + pso, + random_sample, + 
simulated_annealing, + ensemble, + memetic +) + +strategy_map = { + "brute_force": brute_force, + "random_sample": random_sample, + "minimize": minimize, + "basinhopping": basinhopping, + "diff_evo": diff_evo, + "genetic_algorithm": genetic_algorithm, + "greedy_mls": greedy_mls, + "ordered_greedy_mls": ordered_greedy_mls, + "greedy_ils": greedy_ils, + "dual_annealing": dual_annealing, + "mls": mls, + "pso": pso, + "simulated_annealing": simulated_annealing, + "firefly_algorithm": firefly_algorithm, + "bayes_opt": bayes_opt, +} + +# Pseudo code from "Memetic algorithms and memetic computing optimization: A literature review" by Ferrante Neri and Carlos Cotta +# function BasicMA (in P: Problem, in par: Parameters): +# Solution; +# begin +# pop ← Initialize(par, P); +# repeat +# newpop1 ← Cooperate(pop, par, P); +# newpop2 ← Improve(newpop1, par, P); +# pop ← Compete (pop, newpop2); +# if Converged(pop) then +# pop ← Restart(pop, par); +# end +# until TerminationCriterion(par); +# return GetNthBest(pop, 1); +# end + +ls_strategies_list = { + "greedy_mls", + "ordered_greedy_mls", + "greedy_ils", + "mls", + "hill_climbing" +} + +pop_based_strategies_list = { + "genetic_algorithm", + "differential_evolution", + "pso" +} + + +def tune(searchspace: Searchspace, runner, tuning_options): + simulation_mode = True if isinstance(runner, SimulationRunner) else False + ls_strategies = ["greedy_ils", "greedy_ils", "greedy_ils", "greedy_ils"] + pop_based_strategy = "genetic_algorithm" + iterations = 10 + + if set(ls_strategies) <= ls_strategies_list: + tuning_options["ensemble"] = ls_strategies + else: + raise ValueError("Provided local search ensemble are not all local search strategies") + + if pop_based_strategy in pop_based_strategies_list: + pop_based_strategy = strategy_map[pop_based_strategy] + else: + raise ValueError("Provided population based strategy is not a population based strategy") + + tuning_options.strategy_options["candidates"] = 
searchspace.get_random_sample(len(ls_strategies)) + tuning_options.strategy_options["max_fevals"] = 10 + tuning_options.strategy_options["maxiter"] = 10 + + resources = setup_resources(len(ls_strategies), simulation_mode, runner) + # Initialize Ray + if not ray.is_initialized(): + os.environ["RAY_DEDUP_LOGS"] = "0" + ray.init(resources=resources, include_dashboard=True, ignore_reinit_error=True) + # Create cache manager and actors + cache_manager = CacheManager.options(resources={"cache_manager_cpu": 1}).remote(tuning_options) + pop_runner = ParallelRunner(runner.kernel_source, runner.kernel_options, runner.device_options, + runner.iterations, runner.observers, cache_manager=cache_manager, + resources=resources) + + for i in range(iterations): + print(f"Memetic algorithm iteration {i}") + print(f"start local search ensemble with candidates = {tuning_options.strategy_options['candidates']}") + ensemble.tune(searchspace, runner, tuning_options, cache_manager=cache_manager) + print(f"start pop base algo with population = {tuning_options.strategy_options['population']}") + results = pop_based_strategy.tune(searchspace, pop_runner, tuning_options) + + return results \ No newline at end of file From 948ab7fc0a3b4004afdd029b626bda8e2ffec854 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Thu, 25 Apr 2024 13:27:35 +0200 Subject: [PATCH 037/106] initial adapion for memetic strategy --- kernel_tuner/strategies/genetic_algorithm.py | 54 +++++++++++++++----- kernel_tuner/strategies/greedy_ils.py | 9 ++-- 2 files changed, 46 insertions(+), 17 deletions(-) diff --git a/kernel_tuner/strategies/genetic_algorithm.py b/kernel_tuner/strategies/genetic_algorithm.py index c29c150b5..d202a86c8 100644 --- a/kernel_tuner/strategies/genetic_algorithm.py +++ b/kernel_tuner/strategies/genetic_algorithm.py @@ -7,39 +7,40 @@ from kernel_tuner.searchspace import Searchspace from kernel_tuner.strategies import common from kernel_tuner.strategies.common import CostFunc +from 
kernel_tuner.runners.parallel import ParallelRunner _options = dict( popsize=("population size", 20), maxiter=("maximum number of generations", 100), method=("crossover method to use, choose any from single_point, two_point, uniform, disruptive_uniform", "uniform"), mutation_chance=("chance to mutate is 1 in mutation_chance", 10), + population=("initial population", None), ) def tune(searchspace: Searchspace, runner, tuning_options): options = tuning_options.strategy_options - pop_size, generations, method, mutation_chance = common.get_options(options, _options) + pop_size, generations, method, mutation_chance, population = common.get_options(options, _options) crossover = supported_methods[method] best_score = 1e20 cost_func = CostFunc(searchspace, tuning_options, runner) - population = list(list(p) for p in searchspace.get_random_sample(pop_size)) + if not population: + population = list(list(p) for p in searchspace.get_random_sample(pop_size)) + else: + pop_size = len(population) for generation in range(generations): - # determine fitness of population members - weighted_population = [] - for dna in population: - try: - time = cost_func(dna, check_restrictions=False) - except util.StopCriterionReached as e: - if tuning_options.verbose: - print(e) - return cost_func.results - - weighted_population.append((dna, time)) + # Evaluate the entire population + try: + weighted_population = evaluate_population(runner, cost_func, population) + except util.StopCriterionReached as e: + if tuning_options.verbose: + print(e) + return cost_func.results # population is sorted such that better configs have higher chance of reproducing weighted_population.sort(key=lambda x: x[1]) @@ -69,7 +70,7 @@ def tune(searchspace: Searchspace, runner, tuning_options): break # could combine old + new generation here and do a selection - + tuning_options.strategy_options["candidates"] = population # for memetic strategy return cost_func.results @@ -177,3 +178,28 @@ def 
disruptive_uniform_crossover(dna1, dna2): "uniform": uniform_crossover, "disruptive_uniform": disruptive_uniform_crossover, } + +def evaluate_population(runner, cost_func, population): + """ + Evaluate the population based on the type of runner. + + Parameters: + - runner: The runner (ParallelRunner or SequentialRunner) determining how to process evaluations. + - cost_func: A function capable of evaluating the population. + - population: List of individuals to be evaluated. + + Returns: + - List of tuples (dna, fitness_score) representing the population and their evaluation results. + """ + print(f"population: {population}") + if isinstance(runner, ParallelRunner): + # Process the whole population at once if using a ParallelRunner + results = cost_func(population, check_restrictions=False) + return list(zip(population, results)) + else: + # Process each individual sequentially for SequentialRunner + weighted_population = [] + for dna in population: + time = cost_func(dna, check_restrictions=False) # Cost function called with a single-element list + weighted_population.append((dna, time)) + return weighted_population \ No newline at end of file diff --git a/kernel_tuner/strategies/greedy_ils.py b/kernel_tuner/strategies/greedy_ils.py index a4c521746..1aa00ec51 100644 --- a/kernel_tuner/strategies/greedy_ils.py +++ b/kernel_tuner/strategies/greedy_ils.py @@ -9,7 +9,8 @@ _options = dict(neighbor=("Method for selecting neighboring nodes, choose from Hamming or adjacent", "Hamming"), restart=("controls greedyness, i.e. whether to restart from a position as soon as an improvement is found", True), no_improvement=("number of evaluations to exceed without improvement before restarting", 50), - random_walk=("controls greedyness, i.e. whether to restart from a position as soon as an improvement is found", 0.3)) + random_walk=("controls greedyness, i.e. 
whether to restart from a position as soon as an improvement is found", 0.3), + candidate=("initial candidate for the search", None)) def tune(searchspace: Searchspace, runner, tuning_options): @@ -17,7 +18,7 @@ def tune(searchspace: Searchspace, runner, tuning_options): options = tuning_options.strategy_options - neighbor, restart, no_improvement, randomwalk = common.get_options(options, _options) + neighbor, restart, no_improvement, randomwalk, candidate = common.get_options(options, _options) perm_size = int(randomwalk * dna_size) if perm_size == 0: @@ -31,7 +32,8 @@ def tune(searchspace: Searchspace, runner, tuning_options): cost_func = CostFunc(searchspace, tuning_options, runner) #while searching - candidate = searchspace.get_random_sample(1)[0] + if not candidate: + candidate = searchspace.get_random_sample(1)[0] best_score = cost_func(candidate, check_restrictions=False) last_improvement = 0 @@ -53,6 +55,7 @@ def tune(searchspace: Searchspace, runner, tuning_options): # Instead of full restart, permute the starting candidate candidate = random_walk(candidate, perm_size, no_improvement, last_improvement, searchspace) + tuning_options.strategy_options["candidate"] = candidate # for memetic strategy return cost_func.results From 9e40d4ec9d8efe620a8163355fe5ad1996b961f8 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Thu, 25 Apr 2024 14:39:01 +0200 Subject: [PATCH 038/106] removed brute_force from strategy map and import --- kernel_tuner/strategies/ensemble.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel_tuner/strategies/ensemble.py b/kernel_tuner/strategies/ensemble.py index d5933750f..164599040 100644 --- a/kernel_tuner/strategies/ensemble.py +++ b/kernel_tuner/strategies/ensemble.py @@ -20,7 +20,6 @@ from kernel_tuner.strategies import ( basinhopping, bayes_opt, - brute_force, diff_evo, dual_annealing, firefly_algorithm, @@ -36,7 +35,6 @@ ) strategy_map = { - "brute_force": brute_force, "random_sample": random_sample, "minimize": minimize, 
"basinhopping": basinhopping, From 3cb428db56904a136089ab9e8980b5d40aa9896a Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Thu, 25 Apr 2024 14:40:02 +0200 Subject: [PATCH 039/106] fixes of new costfunc and stop criterion is checked retrospectively --- kernel_tuner/strategies/common.py | 43 ++++++++++++++++++++----------- 1 file changed, 28 insertions(+), 15 deletions(-) diff --git a/kernel_tuner/strategies/common.py b/kernel_tuner/strategies/common.py index 5ff3cacaf..5ff17fea6 100644 --- a/kernel_tuner/strategies/common.py +++ b/kernel_tuner/strategies/common.py @@ -75,23 +75,16 @@ def __call__(self, x, check_restrictions=True): # snap values in x to nearest actual value for each parameter, unscale x if needed configs = [self._prepare_config(cfg) for cfg in configs] - - legal_configs, illegal_results = self._get_legal_configs(configs) - results = self.runner.run(legal_configs, self.tuning_options) - self.results.extend(results) - - for result in results: - config = {key: result[key] for key in self.tuning_options.tune_params if key in result} - x_int = ",".join([str(i) for i in config]) - # append to tuning results - if x_int not in self.tuning_options.unique_results: - self.tuning_options.unique_results[x_int] = result - - # upon returning from this function control will be given back to the strategy, so reset the start time - self.runner.last_strategy_start_time = perf_counter() + + legal_configs = configs + illegal_results = [] + if check_restrictions and self.searchspace.restrictions: + legal_configs, illegal_results = self._get_legal_configs(configs) + + final_results = self._evaluate_configs(legal_configs) if len(legal_configs) > 0 else [] # get numerical return values, taking optimization direction into account - all_results = results + illegal_results + all_results = final_results + illegal_results return_values = [] for result in all_results: return_value = result[self.tuning_options.objective] or sys.float_info.max @@ -125,7 +118,27 @@ def 
_get_legal_configs(self, configs) -> list: else: legal_configs.append(config) return legal_configs, results + + def _evaluate_configs(self, configs): + results = self.runner.run(configs, self.tuning_options) + self.results.extend(results) + + final_results = [] + for result in results: + config = {key: result[key] for key in self.tuning_options.tune_params if key in result} + x_int = ",".join([str(i) for i in config]) + # append to tuning results + if x_int not in self.tuning_options.unique_results: + self.tuning_options.unique_results[x_int] = result + # check if max_fevals is reached or time limit is exceeded within the the results + util.check_stop_criterion(self.tuning_options) + final_results.append(result) + + self.results.append(final_results) + # upon returning from this function control will be given back to the strategy, so reset the start time + self.runner.last_strategy_start_time = perf_counter() + return final_results def get_bounds_x0_eps(self): """Compute bounds, x0 (the initial guess), and eps.""" From 2d13fc36db0e6b78468cd1f54dad4e636a194796 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Mon, 29 Apr 2024 13:42:06 +0200 Subject: [PATCH 040/106] fixed bug with tuning options cache manager --- kernel_tuner/runners/parallel.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel_tuner/runners/parallel.py b/kernel_tuner/runners/parallel.py index 1a2e894c6..98e6a4c63 100644 --- a/kernel_tuner/runners/parallel.py +++ b/kernel_tuner/runners/parallel.py @@ -30,7 +30,6 @@ def __init__(self, kernel_source, kernel_options, device_options, iterations, ob # Define cluster resources self.num_gpus = get_num_devices(kernel_source.lang) - print(f"Number of GPUs in use: {self.num_gpus}", file=sys. 
stderr) if resources is None: for id in range(self.num_gpus): gpu_resource_name = f"gpu_{id}" @@ -55,7 +54,7 @@ def run(self, parameter_space, tuning_options, cache_manager=None): self.actor_pool = ActorPool(self.actors) # Distribute execution of the `execute` method across the actor pool with varying parameters and tuning options, collecting the results asynchronously. results = list(self.actor_pool.map_unordered(lambda a, v: a.execute.remote(v, tuning_options), parameter_space)) - new_tuning_options = ray.get(cache_manager.get_tuning_options.remote()) + new_tuning_options = ray.get(self.cache_manager.get_tuning_options.remote()) tuning_options.update(new_tuning_options) for actor in self.actors: From 1a2ba539fcf38343d2a6ffea42457575aaa325b8 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Mon, 29 Apr 2024 13:45:10 +0200 Subject: [PATCH 041/106] fixed some bugs for memetic algo functioning --- kernel_tuner/strategies/ensemble.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/kernel_tuner/strategies/ensemble.py b/kernel_tuner/strategies/ensemble.py index 164599040..53bedd010 100644 --- a/kernel_tuner/strategies/ensemble.py +++ b/kernel_tuner/strategies/ensemble.py @@ -63,7 +63,9 @@ def tune(searchspace: Searchspace, runner, tuning_options, cache_manager=None): os.environ["RAY_DEDUP_LOGS"] = "0" ray.init(resources=resources, include_dashboard=True, ignore_reinit_error=True) # Create cache manager and actors + kill_cache_manager = False if cache_manager is None: + kill_cache_manager = True cache_manager = CacheManager.options(resources={"cache_manager_cpu": 1}).remote(tuning_options) actors = [create_actor_on_device(id, runner, cache_manager, simulation_mode) for id in range(len(ensemble))] @@ -82,9 +84,8 @@ def tune(searchspace: Searchspace, runner, tuning_options, cache_manager=None): if population: # for memetic strategy tuning_options.strategy_options["population"] = population - 
logging.debug(f"tuning_options.strategy_options[population]: {tuning_options.strategy_options['population']}") - clean_up(actors, cache_manager) + clean_up(actors, cache_manager, kill_cache_manager) return final_results def create_actor_on_device(gpu_id, runner, cache_manager, simulation_mode): @@ -103,7 +104,6 @@ def create_actor_on_device(gpu_id, runner, cache_manager, simulation_mode): def setup_tuning_options(tuning_options): new_tuning_options = copy.deepcopy(tuning_options) if "candidates" in tuning_options.strategy_options: - #new_tuning_options.strategy_options.pop("candidates") if len(tuning_options.strategy_options["candidates"]) > 0: new_tuning_options.strategy_options["candidate"] = tuning_options.strategy_options["candidates"].pop(0) return new_tuning_options @@ -117,13 +117,14 @@ def process_results(all_results, searchspace): if "candidate" in tuning_options.strategy_options: population.append(tuning_options.strategy_options["candidate"]) for new_result in strategy_results: - config_signature = tuple(new_result[param] for param in searchspace.tune_params) + config_signature = tuple(new_result[key] for key in searchspace.tune_params) if config_signature not in unique_configs: final_results.append(new_result) unique_configs.add(config_signature) return final_results, population -def clean_up(actors, cache_manager): +def clean_up(actors, cache_manager, kill_cache_manager): for actor in actors: ray.kill(actor) - ray.kill(cache_manager) + if kill_cache_manager: + ray.kill(cache_manager) From 2aba6f506bdaf44c8590b70f8ce614758c228e98 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Mon, 29 Apr 2024 13:45:58 +0200 Subject: [PATCH 042/106] removed debug prints --- kernel_tuner/strategies/genetic_algorithm.py | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel_tuner/strategies/genetic_algorithm.py b/kernel_tuner/strategies/genetic_algorithm.py index d202a86c8..7142ac6cb 100644 --- a/kernel_tuner/strategies/genetic_algorithm.py +++ 
b/kernel_tuner/strategies/genetic_algorithm.py @@ -191,7 +191,6 @@ def evaluate_population(runner, cost_func, population): Returns: - List of tuples (dna, fitness_score) representing the population and their evaluation results. """ - print(f"population: {population}") if isinstance(runner, ParallelRunner): # Process the whole population at once if using a ParallelRunner results = cost_func(population, check_restrictions=False) From cd3f2121b775a5bc2b6695206fc2f2716b1f431e Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Mon, 29 Apr 2024 13:51:29 +0200 Subject: [PATCH 043/106] fixed problem with single config input and final results data structure --- kernel_tuner/strategies/common.py | 31 +++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/kernel_tuner/strategies/common.py b/kernel_tuner/strategies/common.py index 5ff17fea6..8db9cf103 100644 --- a/kernel_tuner/strategies/common.py +++ b/kernel_tuner/strategies/common.py @@ -73,8 +73,8 @@ def __call__(self, x, check_restrictions=True): # check if max_fevals is reached or time limit is exceeded util.check_stop_criterion(self.tuning_options) - # snap values in x to nearest actual value for each parameter, unscale x if needed - configs = [self._prepare_config(cfg) for cfg in configs] + x_list = [x] if self._is_single_configuration(x) else x + configs = [self._prepare_config(cfg) for cfg in x_list] legal_configs = configs illegal_results = [] @@ -82,18 +82,32 @@ def __call__(self, x, check_restrictions=True): legal_configs, illegal_results = self._get_legal_configs(configs) final_results = self._evaluate_configs(legal_configs) if len(legal_configs) > 0 else [] - # get numerical return values, taking optimization direction into account all_results = final_results + illegal_results return_values = [] for result in all_results: return_value = result[self.tuning_options.objective] or sys.float_info.max return_values.append(return_value if not 
self.tuning_options.objective_higher_is_better else -return_value) - + if len(return_values) == 1: return return_values[0] return return_values + def _is_single_configuration(self, x): + # Check if x is an int or float + if isinstance(x, (int, float)): + return True + + # Check if x is a numpy array with only floats or ints + if isinstance(x, np.ndarray): + return x.dtype.kind in 'if' # Checks for data type being integer ('i') or float ('f') + + # Check if x is a list or tuple and all elements are int or float + if isinstance(x, (list, tuple)): + return all(isinstance(item, (int, float)) for item in x) + + return False + def _prepare_config(self, x): """Prepare a single configuration by snapping to nearest values and/or scaling.""" if self.snap: @@ -103,10 +117,9 @@ def _prepare_config(self, x): params = snap_to_nearest_config(x, self.searchspace.tune_params) else: params = x - logging.debug('params ' + str(params)) return params - def _get_legal_configs(self, configs) -> list: + def _get_legal_configs(self, configs): results = [] legal_configs = [] for config in configs: @@ -121,11 +134,10 @@ def _get_legal_configs(self, configs) -> list: def _evaluate_configs(self, configs): results = self.runner.run(configs, self.tuning_options) - self.results.extend(results) final_results = [] for result in results: - config = {key: result[key] for key in self.tuning_options.tune_params if key in result} + config = tuple(result[key] for key in self.tuning_options.tune_params if key in result) x_int = ",".join([str(i) for i in config]) # append to tuning results if x_int not in self.tuning_options.unique_results: @@ -134,7 +146,7 @@ def _evaluate_configs(self, configs): util.check_stop_criterion(self.tuning_options) final_results.append(result) - self.results.append(final_results) + self.results.extend(final_results) # upon returning from this function control will be given back to the strategy, so reset the start time self.runner.last_strategy_start_time = perf_counter() @@ 
-272,7 +284,6 @@ def scale_from_params(params, tune_params, eps): def setup_resources(ensemble_size: int, simulation_mode: bool, runner): num_devices = get_num_devices(runner.kernel_source.lang, simulation_mode=simulation_mode) - print(f"Number of devices available: {num_devices}", file=sys.stderr) if num_devices < ensemble_size: raise ValueError(f"Number of devices ({num_devices}) is less than the number of strategies in the ensemble ({ensemble_size})") From af9bd5e17eebe2c16f249847850ac1b5a735ebf3 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Mon, 29 Apr 2024 13:52:11 +0200 Subject: [PATCH 044/106] added progress prints of memetic algo and kill statement for cache manager --- kernel_tuner/strategies/memetic.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/kernel_tuner/strategies/memetic.py b/kernel_tuner/strategies/memetic.py index 5708099be..6a679a913 100644 --- a/kernel_tuner/strategies/memetic.py +++ b/kernel_tuner/strategies/memetic.py @@ -1,10 +1,12 @@ import logging import ray import os +import sys from kernel_tuner.searchspace import Searchspace from kernel_tuner.runners.parallel import ParallelRunner from kernel_tuner.runners.simulation import SimulationRunner +from kernel_tuner.runners.sequential import SequentialRunner from kernel_tuner.runners.ray.cache_manager import CacheManager from kernel_tuner.strategies.common import setup_resources @@ -109,10 +111,12 @@ def tune(searchspace: Searchspace, runner, tuning_options): resources=resources) for i in range(iterations): - print(f"Memetic algorithm iteration {i}") - print(f"start local search ensemble with candidates = {tuning_options.strategy_options['candidates']}") + print(f"Memetic iteration: {i}", file=sys.stderr) + print(f"Candidates local search: {tuning_options.strategy_options['candidates']}", file=sys.stderr) ensemble.tune(searchspace, runner, tuning_options, cache_manager=cache_manager) - print(f"start pop base algo with population = 
{tuning_options.strategy_options['population']}") + print(f"Population pop based: {tuning_options.strategy_options['population']}", file=sys.stderr) results = pop_based_strategy.tune(searchspace, pop_runner, tuning_options) + ray.kill(cache_manager) + return results \ No newline at end of file From d382f05be6c7e00a6ec966f67a778176be4af447 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Mon, 29 Apr 2024 14:13:29 +0200 Subject: [PATCH 045/106] sort results for retrospective stop criterion check --- kernel_tuner/strategies/common.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/kernel_tuner/strategies/common.py b/kernel_tuner/strategies/common.py index 8db9cf103..d50863f8f 100644 --- a/kernel_tuner/strategies/common.py +++ b/kernel_tuner/strategies/common.py @@ -134,6 +134,9 @@ def _get_legal_configs(self, configs): def _evaluate_configs(self, configs): results = self.runner.run(configs, self.tuning_options) + # sort based on timestamp, needed because of parallel tuning of populations and restrospective stop criterion check + if "timestamp" in results[0]: + results.sort(key=lambda x: x['timestamp']) final_results = [] for result in results: @@ -142,7 +145,7 @@ def _evaluate_configs(self, configs): # append to tuning results if x_int not in self.tuning_options.unique_results: self.tuning_options.unique_results[x_int] = result - # check if max_fevals is reached or time limit is exceeded within the the results + # check restrospectively if max_fevals is reached or time limit is exceeded within the the results util.check_stop_criterion(self.tuning_options) final_results.append(result) From 218b8acd66f225b6fd90cbcb06b5b31f3f3dec6b Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Mon, 29 Apr 2024 14:27:41 +0200 Subject: [PATCH 046/106] added comments --- kernel_tuner/strategies/common.py | 72 +++++++++++++++++++++++++------ 1 file changed, 59 insertions(+), 13 deletions(-) diff --git a/kernel_tuner/strategies/common.py 
b/kernel_tuner/strategies/common.py index d50863f8f..2f0fe1693 100644 --- a/kernel_tuner/strategies/common.py +++ b/kernel_tuner/strategies/common.py @@ -109,7 +109,16 @@ def _is_single_configuration(self, x): return False def _prepare_config(self, x): - """Prepare a single configuration by snapping to nearest values and/or scaling.""" + """ + Prepare a single configuration by snapping to nearest values and/or scaling. + + Args: + x (list): The input configuration to be prepared. + + Returns: + list: The prepared configuration. + + """ if self.snap: if self.scaling: params = unscale_and_snap_to_nearest(x, self.searchspace.tune_params, self.tuning_options.eps) @@ -120,19 +129,41 @@ def _prepare_config(self, x): return params def _get_legal_configs(self, configs): - results = [] - legal_configs = [] - for config in configs: - params_dict = dict(zip(self.searchspace.tune_params.keys(), config)) - legal = util.check_restrictions(self.searchspace.restrictions, params_dict, self.tuning_options.verbose) - if not legal: - params_dict[self.tuning_options.objective] = util.InvalidConfig() - results.append(params_dict) - else: - legal_configs.append(config) - return legal_configs, results + """ + Filters and categorizes configurations into legal and illegal based on defined restrictions. + Configurations are checked against restrictions; illegal ones are modified to indicate an invalid state and + included in the results. Legal configurations are collected and returned for potential use. + + Parameters: + configs (list of tuple): Configurations to be checked, each represented as a tuple of parameter values. + + Returns: + tuple: A pair containing a list of legal configurations and a list of results with illegal configurations marked. 
+ """ + results = [] + legal_configs = [] + for config in configs: + params_dict = dict(zip(self.searchspace.tune_params.keys(), config)) + legal = util.check_restrictions(self.searchspace.restrictions, params_dict, self.tuning_options.verbose) + if not legal: + params_dict[self.tuning_options.objective] = util.InvalidConfig() + results.append(params_dict) + else: + legal_configs.append(config) + return legal_configs, results def _evaluate_configs(self, configs): + """ + Evaluate and manage configurations based on tuning options. Results are sorted by timestamp to maintain + order during parallel processing. The function ensures no duplicates in results and checks for stop criteria + post-processing. Strategy start time is updated upon completion. + + Parameters: + configs (list): Configurations to be evaluated. + + Returns: + list of dict: Processed results of the evaluations. + """ results = self.runner.run(configs, self.tuning_options) # sort based on timestamp, needed because of parallel tuning of populations and restrospective stop criterion check if "timestamp" in results[0]: @@ -145,7 +176,7 @@ def _evaluate_configs(self, configs): # append to tuning results if x_int not in self.tuning_options.unique_results: self.tuning_options.unique_results[x_int] = result - # check restrospectively if max_fevals is reached or time limit is exceeded within the the results + # check retrospectively if max_fevals is reached or time limit is exceeded within the results util.check_stop_criterion(self.tuning_options) final_results.append(result) @@ -286,6 +317,21 @@ def scale_from_params(params, tune_params, eps): return x def setup_resources(ensemble_size: int, simulation_mode: bool, runner): + """ + Configures resources for an ensemble based on device availability and ensemble size. Checks device + availability against the required number and assigns necessary resources to each GPU and the cache manager. + + Parameters: + ensemble_size (int): Required number of devices. 
+ simulation_mode (bool): Indicates if the simulation mode affects device availability. + runner: Provides access to device information. + + Returns: + dict: Resource allocation for GPUs and other components. + + Raises: + ValueError: If available devices are insufficient for the ensemble size. + """ num_devices = get_num_devices(runner.kernel_source.lang, simulation_mode=simulation_mode) if num_devices < ensemble_size: raise ValueError(f"Number of devices ({num_devices}) is less than the number of strategies in the ensemble ({ensemble_size})") From 79b7a506e9ff068b54fa1b50efc8d2e57ccb6605 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Mon, 29 Apr 2024 14:38:15 +0200 Subject: [PATCH 047/106] updated returning results logic in _evaluate_configs() --- kernel_tuner/strategies/common.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel_tuner/strategies/common.py b/kernel_tuner/strategies/common.py index 2f0fe1693..76990b575 100644 --- a/kernel_tuner/strategies/common.py +++ b/kernel_tuner/strategies/common.py @@ -179,6 +179,8 @@ def _evaluate_configs(self, configs): # check retrospectively if max_fevals is reached or time limit is exceeded within the results util.check_stop_criterion(self.tuning_options) final_results.append(result) + # in case of stop creterion reached, save the results so far + self.results.append(result) self.results.extend(final_results) # upon returning from this function control will be given back to the strategy, so reset the start time From 88f63b4f812123e511f57ca43b786ae69c00a541 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Mon, 29 Apr 2024 14:44:23 +0200 Subject: [PATCH 048/106] added comments --- kernel_tuner/strategies/common.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/kernel_tuner/strategies/common.py b/kernel_tuner/strategies/common.py index 76990b575..3a6e83612 100644 --- a/kernel_tuner/strategies/common.py +++ b/kernel_tuner/strategies/common.py @@ -94,18 +94,23 @@ def 
__call__(self, x, check_restrictions=True): return return_values def _is_single_configuration(self, x): - # Check if x is an int or float + """ + Determines if the input is a single configuration based on its type and composition. + + Parameters: + x: The input to check, which can be an int, float, numpy array, list, or tuple. + + Returns: + bool: True if `x` is a single configuration, which includes being a singular int or float, + a numpy array of ints or floats, or a list or tuple where all elements are ints or floats. + Otherwise, returns False. + """ if isinstance(x, (int, float)): return True - - # Check if x is a numpy array with only floats or ints if isinstance(x, np.ndarray): return x.dtype.kind in 'if' # Checks for data type being integer ('i') or float ('f') - - # Check if x is a list or tuple and all elements are int or float if isinstance(x, (list, tuple)): return all(isinstance(item, (int, float)) for item in x) - return False def _prepare_config(self, x): From a2afd1d451fd19a8d34249be2f2315378dc8545d Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Tue, 30 Apr 2024 15:54:04 +0200 Subject: [PATCH 049/106] updates to run more strategies then devices available --- kernel_tuner/runners/parallel.py | 31 +++++------ .../runners/ray/parallel_remote_actor.py | 2 - kernel_tuner/runners/ray/remote_actor.py | 7 +-- kernel_tuner/strategies/common.py | 29 ++--------- kernel_tuner/strategies/ensemble.py | 52 ++++++++++++------- kernel_tuner/strategies/memetic.py | 19 +++---- 6 files changed, 64 insertions(+), 76 deletions(-) diff --git a/kernel_tuner/runners/parallel.py b/kernel_tuner/runners/parallel.py index 98e6a4c63..fe06a8c2d 100644 --- a/kernel_tuner/runners/parallel.py +++ b/kernel_tuner/runners/parallel.py @@ -12,7 +12,7 @@ class ParallelRunner(Runner): - def __init__(self, kernel_source, kernel_options, device_options, iterations, observers, cache_manager=None, resources=None): + def __init__(self, kernel_source, kernel_options, device_options, 
iterations, observers, num_gpus, cache_manager=None): self.dev = DeviceInterface(kernel_source, iterations=iterations, observers=observers, **device_options) self.units = self.dev.units self.quiet = device_options.quiet @@ -27,17 +27,12 @@ def __init__(self, kernel_source, kernel_options, device_options, iterations, ob self.iterations = iterations self.device_options = device_options self.cache_manager = cache_manager + self.num_gpus = num_gpus - # Define cluster resources - self.num_gpus = get_num_devices(kernel_source.lang) - if resources is None: - for id in range(self.num_gpus): - gpu_resource_name = f"gpu_{id}" - resources[gpu_resource_name] = 1 # Initialize Ray if not ray.is_initialized(): os.environ["RAY_DEDUP_LOGS"] = "0" - ray.init(resources=resources, include_dashboard=True, ignore_reinit_error=True) + ray.init(include_dashboard=True, ignore_reinit_error=True) def get_environment(self, tuning_options): return self.dev.get_environment() @@ -49,7 +44,7 @@ def run(self, parameter_space, tuning_options, cache_manager=None): raise ValueError("A cache manager is required for parallel execution") self.cache_manager = cache_manager # Create RemoteActor instances - self.actors = [self.create_actor_on_gpu(id, self.cache_manager) for id in range(self.num_gpus)] + self.actors = [self.create_actor_on_gpu(self.cache_manager) for _ in range(self.num_gpus)] # Create a pool of RemoteActor actors self.actor_pool = ActorPool(self.actors) # Distribute execution of the `execute` method across the actor pool with varying parameters and tuning options, collecting the results asynchronously. 
@@ -62,13 +57,11 @@ def run(self, parameter_space, tuning_options, cache_manager=None): return results - def create_actor_on_gpu(self, gpu_id, cache_manager): - gpu_resource_name = f"gpu_{gpu_id}" - return ParallelRemoteActor.options(resources={gpu_resource_name: 1}).remote(self.quiet, - self.kernel_source, - self.kernel_options, - self.device_options, - self.iterations, - self.observers, - gpu_id, - cache_manager) + def create_actor_on_gpu(self, cache_manager): + return ParallelRemoteActor.remote(self.quiet, + self.kernel_source, + self.kernel_options, + self.device_options, + self.iterations, + self.observers, + cache_manager) diff --git a/kernel_tuner/runners/ray/parallel_remote_actor.py b/kernel_tuner/runners/ray/parallel_remote_actor.py index 051c8689c..bc0d192e7 100644 --- a/kernel_tuner/runners/ray/parallel_remote_actor.py +++ b/kernel_tuner/runners/ray/parallel_remote_actor.py @@ -17,10 +17,8 @@ def __init__(self, device_options, iterations, observers, - gpu_id, cache_manager): - self.gpu_id = gpu_id self.dev = DeviceInterface(kernel_source, iterations=iterations, observers=observers, **device_options) self.units = self.dev.units self.quiet = quiet diff --git a/kernel_tuner/runners/ray/remote_actor.py b/kernel_tuner/runners/ray/remote_actor.py index 61127d5be..fba5e0069 100644 --- a/kernel_tuner/runners/ray/remote_actor.py +++ b/kernel_tuner/runners/ray/remote_actor.py @@ -26,14 +26,15 @@ def __init__(self, self.iterations = iterations self.observers = observers self.cache_manager = cache_manager + self.runner = None def execute(self, strategy, searchspace, tuning_options, simulation_mode=False): if simulation_mode: - runner = SimulationRunner(self.kernel_source, self.kernel_options, self.device_options, + self.runner = SimulationRunner(self.kernel_source, self.kernel_options, self.device_options, self.iterations, self.observers) else: - runner = SequentialRunner(self.kernel_source, self.kernel_options, self.device_options, + self.runner = 
SequentialRunner(self.kernel_source, self.kernel_options, self.device_options, self.iterations, self.observers, cache_manager=self.cache_manager) - results = strategy.tune(searchspace, runner, tuning_options) + results = strategy.tune(searchspace, self.runner, tuning_options) return results, tuning_options \ No newline at end of file diff --git a/kernel_tuner/strategies/common.py b/kernel_tuner/strategies/common.py index 3a6e83612..65db1831c 100644 --- a/kernel_tuner/strategies/common.py +++ b/kernel_tuner/strategies/common.py @@ -1,6 +1,7 @@ import logging import sys from time import perf_counter +import warnings import numpy as np @@ -323,29 +324,9 @@ def scale_from_params(params, tune_params, eps): x[i] = 0.5 * eps + v.index(params[i])*eps return x -def setup_resources(ensemble_size: int, simulation_mode: bool, runner): - """ - Configures resources for an ensemble based on device availability and ensemble size. Checks device - availability against the required number and assigns necessary resources to each GPU and the cache manager. - - Parameters: - ensemble_size (int): Required number of devices. - simulation_mode (bool): Indicates if the simulation mode affects device availability. - runner: Provides access to device information. - - Returns: - dict: Resource allocation for GPUs and other components. - - Raises: - ValueError: If available devices are insufficient for the ensemble size. 
- """ +def check_num_devices(ensemble_size: int, simulation_mode: bool, runner): + num_devices = get_num_devices(runner.kernel_source.lang, simulation_mode=simulation_mode) if num_devices < ensemble_size: - raise ValueError(f"Number of devices ({num_devices}) is less than the number of strategies in the ensemble ({ensemble_size})") - - resources = {} - for id in range(ensemble_size): - device_resource_name = f"gpu_{id}" - resources[device_resource_name] = 1 - resources["cache_manager_cpu"] = 1 - return resources \ No newline at end of file + warnings.warn("Number of devices is less than the number of strategies in the ensemble. Some strategies will wait until devices are available.", UserWarning) + \ No newline at end of file diff --git a/kernel_tuner/strategies/ensemble.py b/kernel_tuner/strategies/ensemble.py index 53bedd010..eaac68284 100644 --- a/kernel_tuner/strategies/ensemble.py +++ b/kernel_tuner/strategies/ensemble.py @@ -4,14 +4,13 @@ import ray import copy import logging -from ray.util.actor_pool import ActorPool import numpy as np from kernel_tuner import util from kernel_tuner.searchspace import Searchspace from kernel_tuner.strategies import common -from kernel_tuner.strategies.common import CostFunc, scale_from_params, setup_resources +from kernel_tuner.strategies.common import CostFunc, scale_from_params, check_num_devices from kernel_tuner.runners.simulation import SimulationRunner from kernel_tuner.runners.ray.remote_actor import RemoteActor from kernel_tuner.util import get_num_devices @@ -53,31 +52,47 @@ def tune(searchspace: Searchspace, runner, tuning_options, cache_manager=None): simulation_mode = True if isinstance(runner, SimulationRunner) else False + num_devices = get_num_devices(runner.kernel_source.lang, simulation_mode=simulation_mode) + ensemble = [] if "ensemble" in tuning_options: ensemble = tuning_options["ensemble"] else: - ensemble = ["random_sample", "random_sample"] - resources = setup_resources(len(ensemble), simulation_mode, 
runner) + ensemble = ["greedy_ils", "greedy_ils"] + ensemble_size = len(ensemble) + # Initialize Ray if not ray.is_initialized(): + check_num_devices(ensemble_size, simulation_mode, runner) os.environ["RAY_DEDUP_LOGS"] = "0" - ray.init(resources=resources, include_dashboard=True, ignore_reinit_error=True) + ray.init(include_dashboard=True, ignore_reinit_error=True) + # Create cache manager and actors kill_cache_manager = False if cache_manager is None: kill_cache_manager = True - cache_manager = CacheManager.options(resources={"cache_manager_cpu": 1}).remote(tuning_options) - actors = [create_actor_on_device(id, runner, cache_manager, simulation_mode) for id in range(len(ensemble))] + cache_manager = CacheManager.remote(tuning_options) + actors = [create_actor(runner, cache_manager, simulation_mode) for _ in range(ensemble_size)] + # Execute all actor with one strategy each ensemble = [strategy_map[strategy] for strategy in ensemble] - tasks = [] - for i in range(len(ensemble)): + pending_tasks = {} + for i in range(ensemble_size): strategy = ensemble[i] actor = actors[i] remote_tuning_options = setup_tuning_options(tuning_options) task = actor.execute.remote(strategy, searchspace, remote_tuning_options, simulation_mode) - tasks.append(task) - all_results = ray.get(tasks) + pending_tasks[task] = actor + + # As soon as an actor is done we need to kill it to give space to other actors + all_results = [] + while pending_tasks: + done_ids, _ = ray.wait(list(pending_tasks.keys()), num_returns=1) + for done_id in done_ids: + result = ray.get(done_id) + all_results.append(result) + actor = pending_tasks.pop(done_id) + ray.kill(actor) + new_tuning_options = ray.get(cache_manager.get_tuning_options.remote()) tuning_options.update(new_tuning_options) final_results, population = process_results(all_results, searchspace) @@ -88,18 +103,17 @@ def tune(searchspace: Searchspace, runner, tuning_options, cache_manager=None): clean_up(actors, cache_manager, kill_cache_manager) 
return final_results -def create_actor_on_device(gpu_id, runner, cache_manager, simulation_mode): - gpu_resource_name = f"gpu_{gpu_id}" +def create_actor(runner, cache_manager, simulation_mode): if simulation_mode: resource_options= {"num_cpus": 1} else: resource_options= {"num_gpus": 1} - return RemoteActor.options(**resource_options, resources={gpu_resource_name: 1}).remote(runner.kernel_source, - runner.kernel_options, - runner.device_options, - runner.iterations, - runner.observers, - cache_manager) + return RemoteActor.options(**resource_options).remote(runner.kernel_source, + runner.kernel_options, + runner.device_options, + runner.iterations, + runner.observers, + cache_manager) def setup_tuning_options(tuning_options): new_tuning_options = copy.deepcopy(tuning_options) diff --git a/kernel_tuner/strategies/memetic.py b/kernel_tuner/strategies/memetic.py index 6a679a913..f582a6eb7 100644 --- a/kernel_tuner/strategies/memetic.py +++ b/kernel_tuner/strategies/memetic.py @@ -8,7 +8,8 @@ from kernel_tuner.runners.simulation import SimulationRunner from kernel_tuner.runners.sequential import SequentialRunner from kernel_tuner.runners.ray.cache_manager import CacheManager -from kernel_tuner.strategies.common import setup_resources +from kernel_tuner.strategies.common import check_num_devices +from kernel_tuner.util import get_num_devices from kernel_tuner.strategies import ( basinhopping, @@ -81,7 +82,7 @@ def tune(searchspace: Searchspace, runner, tuning_options): simulation_mode = True if isinstance(runner, SimulationRunner) else False - ls_strategies = ["greedy_ils", "greedy_ils", "greedy_ils", "greedy_ils"] + ls_strategies = ['greedy_ils', 'greedy_ils', 'greedy_ils', 'greedy_ils', 'greedy_ils', 'greedy_ils', 'greedy_ils', 'greedy_ils'] pop_based_strategy = "genetic_algorithm" iterations = 10 @@ -96,19 +97,19 @@ def tune(searchspace: Searchspace, runner, tuning_options): raise ValueError("Provided population based strategy is not a population based strategy") 
tuning_options.strategy_options["candidates"] = searchspace.get_random_sample(len(ls_strategies)) - tuning_options.strategy_options["max_fevals"] = 10 - tuning_options.strategy_options["maxiter"] = 10 + tuning_options.strategy_options["max_fevals"] = (100 // iterations) // 2 + tuning_options.strategy_options["maxiter"] = (100 // iterations) // 2 - resources = setup_resources(len(ls_strategies), simulation_mode, runner) # Initialize Ray if not ray.is_initialized(): + check_num_devices(len(ls_strategies), simulation_mode, runner) os.environ["RAY_DEDUP_LOGS"] = "0" - ray.init(resources=resources, include_dashboard=True, ignore_reinit_error=True) + ray.init(include_dashboard=True, ignore_reinit_error=True) + num_gpus = get_num_devices(runner.kernel_source.lang, simulation_mode=simulation_mode) # Create cache manager and actors - cache_manager = CacheManager.options(resources={"cache_manager_cpu": 1}).remote(tuning_options) + cache_manager = CacheManager.remote(tuning_options) pop_runner = ParallelRunner(runner.kernel_source, runner.kernel_options, runner.device_options, - runner.iterations, runner.observers, cache_manager=cache_manager, - resources=resources) + runner.iterations, runner.observers, num_gpus=num_gpus, cache_manager=cache_manager) for i in range(iterations): print(f"Memetic iteration: {i}", file=sys.stderr) From d950b2db42daee5fe893256e0e8aa2d6907d4d5a Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Fri, 3 May 2024 15:07:31 +0200 Subject: [PATCH 050/106] returning last two lists of candidates for memetic algo --- kernel_tuner/strategies/ensemble.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/kernel_tuner/strategies/ensemble.py b/kernel_tuner/strategies/ensemble.py index eaac68284..810c6fb09 100644 --- a/kernel_tuner/strategies/ensemble.py +++ b/kernel_tuner/strategies/ensemble.py @@ -95,10 +95,12 @@ def tune(searchspace: Searchspace, runner, tuning_options, cache_manager=None): new_tuning_options = 
ray.get(cache_manager.get_tuning_options.remote()) tuning_options.update(new_tuning_options) - final_results, population = process_results(all_results, searchspace) + final_results, population, candidates = process_results(all_results, searchspace) if population: # for memetic strategy tuning_options.strategy_options["population"] = population + if candidates: # for memetic strategy + tuning_options.strategy_options["candidates"] = candidates clean_up(actors, cache_manager, kill_cache_manager) return final_results @@ -126,8 +128,11 @@ def process_results(all_results, searchspace): unique_configs = set() final_results = [] population = [] # for memetic strategy + candidates = [] # for memetic strategy for (strategy_results, tuning_options) in all_results: + if "old_candidate" in tuning_options.strategy_options: + candidates.append(tuning_options.strategy_options["old_candidate"]) if "candidate" in tuning_options.strategy_options: population.append(tuning_options.strategy_options["candidate"]) for new_result in strategy_results: @@ -135,7 +140,7 @@ def process_results(all_results, searchspace): if config_signature not in unique_configs: final_results.append(new_result) unique_configs.add(config_signature) - return final_results, population + return final_results, population, candidates def clean_up(actors, cache_manager, kill_cache_manager): for actor in actors: From 980777fb57e33c2d2a1603f72ba590e9d1ab5534 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Fri, 3 May 2024 15:09:21 +0200 Subject: [PATCH 051/106] returning last two candidates for memetic algo --- kernel_tuner/strategies/greedy_ils.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel_tuner/strategies/greedy_ils.py b/kernel_tuner/strategies/greedy_ils.py index 1aa00ec51..575b89bd2 100644 --- a/kernel_tuner/strategies/greedy_ils.py +++ b/kernel_tuner/strategies/greedy_ils.py @@ -34,12 +34,14 @@ def tune(searchspace: Searchspace, runner, tuning_options): #while searching if not candidate: 
candidate = searchspace.get_random_sample(1)[0] + old_candidate = candidate # for memetic strategy best_score = cost_func(candidate, check_restrictions=False) last_improvement = 0 while fevals < max_fevals: try: + old_candidate = candidate # for memetic strategy candidate = base_hillclimb(candidate, neighbor, max_fevals, searchspace, tuning_options, cost_func, restart=restart, randomize=True) new_score = cost_func(candidate, check_restrictions=False) except util.StopCriterionReached as e: @@ -55,6 +57,7 @@ def tune(searchspace: Searchspace, runner, tuning_options): # Instead of full restart, permute the starting candidate candidate = random_walk(candidate, perm_size, no_improvement, last_improvement, searchspace) + tuning_options.strategy_options["old_candidate"] = old_candidate # for memetic strategy tuning_options.strategy_options["candidate"] = candidate # for memetic strategy return cost_func.results From 95a2f0fe427230c6fee05ead46a51a72e2535d11 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Fri, 3 May 2024 15:10:02 +0200 Subject: [PATCH 052/106] returning last two populations for memetic algo --- kernel_tuner/strategies/genetic_algorithm.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel_tuner/strategies/genetic_algorithm.py b/kernel_tuner/strategies/genetic_algorithm.py index 7142ac6cb..310ff820f 100644 --- a/kernel_tuner/strategies/genetic_algorithm.py +++ b/kernel_tuner/strategies/genetic_algorithm.py @@ -31,11 +31,12 @@ def tune(searchspace: Searchspace, runner, tuning_options): population = list(list(p) for p in searchspace.get_random_sample(pop_size)) else: pop_size = len(population) - + for generation in range(generations): # Evaluate the entire population try: + old_population = population weighted_population = evaluate_population(runner, cost_func, population) except util.StopCriterionReached as e: if tuning_options.verbose: @@ -70,6 +71,7 @@ def tune(searchspace: Searchspace, runner, tuning_options): break # could 
combine old + new generation here and do a selection + tuning_options.strategy_options["population"] = old_population # for memetic strategy tuning_options.strategy_options["candidates"] = population # for memetic strategy return cost_func.results From 89c499bbee6df0722de3693371ef54c6d69317f6 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Fri, 3 May 2024 15:11:14 +0200 Subject: [PATCH 053/106] implemented adaptive local search depth logic and fix few issues, works also in simulation --- kernel_tuner/strategies/memetic.py | 135 +++++++++++++++++++++-------- 1 file changed, 99 insertions(+), 36 deletions(-) diff --git a/kernel_tuner/strategies/memetic.py b/kernel_tuner/strategies/memetic.py index f582a6eb7..e49d8ed55 100644 --- a/kernel_tuner/strategies/memetic.py +++ b/kernel_tuner/strategies/memetic.py @@ -2,6 +2,7 @@ import ray import os import sys +import copy from kernel_tuner.searchspace import Searchspace from kernel_tuner.runners.parallel import ParallelRunner @@ -49,22 +50,6 @@ "bayes_opt": bayes_opt, } -# Pseudo code from "Memetic algorithms and memetic computing optimization: A literature review" by Ferrante Neri and Carlos Cotta -# function BasicMA (in P: Problem, in par: Parameters): -# Solution; -# begin -# pop ← Initialize(par, P); -# repeat -# newpop1 ← Cooperate(pop, par, P); -# newpop2 ← Improve(newpop1, par, P); -# pop ← Compete (pop, newpop2); -# if Converged(pop) then -# pop ← Restart(pop, par); -# end -# until TerminationCriterion(par); -# return GetNthBest(pop, 1); -# end - ls_strategies_list = { "greedy_mls", "ordered_greedy_mls", @@ -81,43 +66,121 @@ def tune(searchspace: Searchspace, runner, tuning_options): + options = tuning_options.strategy_options simulation_mode = True if isinstance(runner, SimulationRunner) else False - ls_strategies = ['greedy_ils', 'greedy_ils', 'greedy_ils', 'greedy_ils', 'greedy_ils', 'greedy_ils', 'greedy_ils', 'greedy_ils'] - pop_based_strategy = "genetic_algorithm" - iterations = 10 + local_search = 
options.get('local_search', 'greedy_ils') + global_search = options.get('global_search', "genetic_algorithm") + max_feval = options.get("max_fevals", 100) + alsd = options.get("alsd", 2) # Adaptive Local Search Depth (ALSD) + lsd = options.get("lsd", 25) # Local Search Depth (LSD) + maxiter = options.get("maxiter", 3) + popsize = options.get("popsize", 10) - if set(ls_strategies) <= ls_strategies_list: - tuning_options["ensemble"] = ls_strategies + if local_search in ls_strategies_list: + tuning_options["ensemble"] = [local_search] * popsize else: raise ValueError("Provided local search ensemble are not all local search strategies") - if pop_based_strategy in pop_based_strategies_list: - pop_based_strategy = strategy_map[pop_based_strategy] + if global_search in pop_based_strategies_list: + global_search = strategy_map[global_search] else: raise ValueError("Provided population based strategy is not a population based strategy") - tuning_options.strategy_options["candidates"] = searchspace.get_random_sample(len(ls_strategies)) - tuning_options.strategy_options["max_fevals"] = (100 // iterations) // 2 - tuning_options.strategy_options["maxiter"] = (100 // iterations) // 2 + tuning_options.strategy_options["population"] = searchspace.get_random_sample(popsize) # Initialize Ray if not ray.is_initialized(): - check_num_devices(len(ls_strategies), simulation_mode, runner) + check_num_devices(popsize, simulation_mode, runner) os.environ["RAY_DEDUP_LOGS"] = "0" ray.init(include_dashboard=True, ignore_reinit_error=True) num_gpus = get_num_devices(runner.kernel_source.lang, simulation_mode=simulation_mode) # Create cache manager and actors cache_manager = CacheManager.remote(tuning_options) - pop_runner = ParallelRunner(runner.kernel_source, runner.kernel_options, runner.device_options, - runner.iterations, runner.observers, num_gpus=num_gpus, cache_manager=cache_manager) + if simulation_mode: + pop_runner = runner + else: + pop_runner = ParallelRunner(runner.kernel_source, 
runner.kernel_options, runner.device_options, + runner.iterations, runner.observers, num_gpus=num_gpus, cache_manager=cache_manager, + simulation_mode=simulation_mode) - for i in range(iterations): - print(f"Memetic iteration: {i}", file=sys.stderr) - print(f"Candidates local search: {tuning_options.strategy_options['candidates']}", file=sys.stderr) - ensemble.tune(searchspace, runner, tuning_options, cache_manager=cache_manager) - print(f"Population pop based: {tuning_options.strategy_options['population']}", file=sys.stderr) - results = pop_based_strategy.tune(searchspace, pop_runner, tuning_options) + all_results = [] + all_results_dict = {} + feval = 0 + while feval < max_feval: + print(f"DEBUG: --------------------NEW ITERATION--------feval = {feval}------------", file=sys.stderr) + if feval + lsd + maxiter * popsize > max_feval: + lsd = max_feval - feval - maxiter * popsize + print(f"DEBUG: maxiter * popsize = {maxiter * popsize}, lsd = {lsd}", file=sys.stderr) + # Global Search (GS) + print(f"DEBUG:=================Global Search=================", file=sys.stderr) + tuning_options.strategy_options["maxiter"] = maxiter + pop_start_gs = copy.deepcopy(tuning_options.strategy_options["population"]) + results = global_search.tune(searchspace, pop_runner, tuning_options) + add_to_results(all_results, all_results_dict, results, tuning_options.tune_params) + feval += maxiter * popsize + + pop_start_gs_res = get_pop_results(pop_start_gs, all_results_dict) + pop_end_gs = copy.deepcopy(tuning_options.strategy_options["population"]) + pop_end_gs_res = get_pop_results(pop_end_gs, all_results_dict) + afi_gs = calculate_afi(pop_start_gs_res, pop_end_gs_res, maxiter, all_results_dict) + + # Local Search (LS) + print(f"DEBUG:=================Local Search=================", file=sys.stderr) + tuning_options.strategy_options["max_fevals"] = lsd + pop_start_ls = copy.deepcopy(tuning_options.strategy_options["candidates"]) + results = ensemble.tune(searchspace, runner, 
tuning_options, cache_manager=cache_manager) + add_to_results(all_results, all_results_dict, results, tuning_options.tune_params) + feval += lsd + + pop_start_ls_res = get_pop_results(pop_start_ls, all_results_dict) + pop_end_ls = copy.deepcopy(tuning_options.strategy_options["candidates"]) + pop_end_ls_res = get_pop_results(pop_end_ls, all_results_dict) + afi_ls = calculate_afi(pop_start_ls_res, pop_end_ls_res, lsd, all_results_dict) + + # Adaptive Local Search Depth (ALSD) + if lsd > 3: + if afi_ls > afi_gs: + lsd += alsd + elif afi_ls < afi_gs: + lsd -= alsd + print(f"DEBUG: Adaptive Local Search Depth (ALSD) lsd = {lsd}", file=sys.stderr) ray.kill(cache_manager) - return results \ No newline at end of file + return results + +def calculate_afi(pop_before_rs, pop_after_rs, feval, results): + delta_fitness = fitness_increment(pop_before_rs, pop_after_rs) + afi = delta_fitness / feval + print(f"DEBUG:calculate_afi afi: {afi}", file=sys.stderr) + return afi + +def fitness_increment(pop_before, pop_after): + if len(pop_before) != len(pop_after): + raise ValueError("populations must have the same size.") + + sum_before = sum(t for t in pop_before if isinstance(t, float)) + sum_after = sum(t for t in pop_after if isinstance(t, float)) + difference_sum = sum_before - sum_after + print(f"DEBUG:fitness_increment difference_sum: {difference_sum}", file=sys.stderr) + return difference_sum + +def get_pop_results(pop, results): + print(f"DEBUG:get_pop_results pop = {pop}", file=sys.stderr) + times = [] + for entry in pop: + key = ','.join(map(str, entry)) + if key in results: + time = results[key] + times.append(time) + else: + times.append(None) + + print(f"DEBUG:get_pop_results times = {times}", file=sys.stderr) + return times + +def add_to_results(all_results, all_results_dict, results, tune_params): + for result in results: + key = ",".join(str(result[param]) for param in tune_params) + all_results_dict[key] = result["time"] + all_results.append(result) From 
babba0b95e047b3dd6c1162c6d6f09ca7307d7ce Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Mon, 6 May 2024 15:33:40 +0200 Subject: [PATCH 054/106] modifications related to last iteration of memetic algo --- kernel_tuner/strategies/memetic.py | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/kernel_tuner/strategies/memetic.py b/kernel_tuner/strategies/memetic.py index e49d8ed55..8f18c3076 100644 --- a/kernel_tuner/strategies/memetic.py +++ b/kernel_tuner/strategies/memetic.py @@ -73,8 +73,8 @@ def tune(searchspace: Searchspace, runner, tuning_options): max_feval = options.get("max_fevals", 100) alsd = options.get("alsd", 2) # Adaptive Local Search Depth (ALSD) lsd = options.get("lsd", 25) # Local Search Depth (LSD) - maxiter = options.get("maxiter", 3) - popsize = options.get("popsize", 10) + maxiter = options.get("maxiter", 2) + popsize = options.get("popsize", 20) if local_search in ls_strategies_list: tuning_options["ensemble"] = [local_search] * popsize @@ -108,9 +108,15 @@ def tune(searchspace: Searchspace, runner, tuning_options): feval = 0 while feval < max_feval: print(f"DEBUG: --------------------NEW ITERATION--------feval = {feval}------------", file=sys.stderr) - if feval + lsd + maxiter * popsize > max_feval: - lsd = max_feval - feval - maxiter * popsize + feval_left = max_feval - feval + if feval_left < lsd + maxiter * popsize: + maxiter = feval_left // popsize + if maxiter == 1: # It doesnt make sense to have one generation for global search, so we give all final resources to local search + maxiter = 0 + lsd = feval_left + lsd = feval_left - maxiter * popsize print(f"DEBUG: maxiter * popsize = {maxiter * popsize}, lsd = {lsd}", file=sys.stderr) + # Global Search (GS) print(f"DEBUG:=================Global Search=================", file=sys.stderr) tuning_options.strategy_options["maxiter"] = maxiter @@ -138,20 +144,21 @@ def tune(searchspace: Searchspace, runner, tuning_options): afi_ls = 
calculate_afi(pop_start_ls_res, pop_end_ls_res, lsd, all_results_dict) # Adaptive Local Search Depth (ALSD) - if lsd > 3: + if afi_gs is not None and afi_ls is not None: if afi_ls > afi_gs: lsd += alsd elif afi_ls < afi_gs: - lsd -= alsd - print(f"DEBUG: Adaptive Local Search Depth (ALSD) lsd = {lsd}", file=sys.stderr) + lsd -= alsd if lsd - alsd > 5 else 5 + print(f"DEBUG: Adaptive Local Search Depth (ALSD) lsd = {lsd}", file=sys.stderr) ray.kill(cache_manager) return results def calculate_afi(pop_before_rs, pop_after_rs, feval, results): + # Average Fitness Increment (AFI) delta_fitness = fitness_increment(pop_before_rs, pop_after_rs) - afi = delta_fitness / feval + afi = delta_fitness / feval if feval > 0 else None print(f"DEBUG:calculate_afi afi: {afi}", file=sys.stderr) return afi From e0e1e61b2db8cea9ce86f3ebfe3b1fcd433d0b9b Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Mon, 6 May 2024 15:34:27 +0200 Subject: [PATCH 055/106] updates related to old popuation logic --- kernel_tuner/strategies/genetic_algorithm.py | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel_tuner/strategies/genetic_algorithm.py b/kernel_tuner/strategies/genetic_algorithm.py index 310ff820f..b082ce3c6 100644 --- a/kernel_tuner/strategies/genetic_algorithm.py +++ b/kernel_tuner/strategies/genetic_algorithm.py @@ -32,6 +32,7 @@ def tune(searchspace: Searchspace, runner, tuning_options): else: pop_size = len(population) + old_population = population for generation in range(generations): # Evaluate the entire population From 630578253194d3e5662b06811dea654045f9ed11 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Tue, 7 May 2024 14:51:29 +0200 Subject: [PATCH 056/106] unified two actors into one --- .../runners/ray/parallel_remote_actor.py | 47 ------------------- kernel_tuner/runners/ray/remote_actor.py | 43 ++++++++++------- 2 files changed, 27 insertions(+), 63 deletions(-) delete mode 100644 kernel_tuner/runners/ray/parallel_remote_actor.py diff --git 
a/kernel_tuner/runners/ray/parallel_remote_actor.py b/kernel_tuner/runners/ray/parallel_remote_actor.py deleted file mode 100644 index bc0d192e7..000000000 --- a/kernel_tuner/runners/ray/parallel_remote_actor.py +++ /dev/null @@ -1,47 +0,0 @@ -import logging -from datetime import datetime, timezone -from time import perf_counter -import ray -import sys - -from kernel_tuner.util import ErrorConfig, print_config_output, process_metrics, store_cache -from kernel_tuner.core import DeviceInterface -from kernel_tuner.runners.sequential import SequentialRunner - -@ray.remote(num_gpus=1) -class ParallelRemoteActor(): - def __init__(self, - quiet, - kernel_source, - kernel_options, - device_options, - iterations, - observers, - cache_manager): - - self.dev = DeviceInterface(kernel_source, iterations=iterations, observers=observers, **device_options) - self.units = self.dev.units - self.quiet = quiet - self.kernel_source = kernel_source - self.warmed_up = False - self.simulation_mode = False - self.start_time = perf_counter() - self.last_strategy_start_time = self.start_time - self.last_strategy_time = 0 - self.kernel_options = kernel_options - self.device_options = device_options - self.iterations = iterations - self.observers = observers - self.cache_manager = cache_manager - self.runner = None - - def execute(self, element, tuning_options): - if self.runner is None: - self.init_runner() - return self.runner.run([element], tuning_options)[0] - - def init_runner(self): - if self.cache_manager is None: - raise ValueError("Cache manager is not set.") - self.runner = SequentialRunner(self.kernel_source, self.kernel_options, self.device_options, - self.iterations, self.observers, cache_manager=self.cache_manager) \ No newline at end of file diff --git a/kernel_tuner/runners/ray/remote_actor.py b/kernel_tuner/runners/ray/remote_actor.py index fba5e0069..3eceb4414 100644 --- a/kernel_tuner/runners/ray/remote_actor.py +++ b/kernel_tuner/runners/ray/remote_actor.py @@ -1,11 +1,5 @@ 
-import logging -from datetime import datetime, timezone -from time import perf_counter import ray -import sys -from kernel_tuner.util import ErrorConfig, print_config_output, process_metrics, store_cache -from kernel_tuner.core import DeviceInterface from kernel_tuner.runners.sequential import SequentialRunner from kernel_tuner.runners.simulation import SimulationRunner @@ -17,24 +11,41 @@ def __init__(self, device_options, iterations, observers, - cache_manager): - + cache_manager=None, + simulation_mode=False): self.kernel_source = kernel_source - self.simulation_mode = False self.kernel_options = kernel_options self.device_options = device_options self.iterations = iterations self.observers = observers self.cache_manager = cache_manager + self.simulation_mode = simulation_mode self.runner = None + + def execute(self, tuning_options, strategy=None, searchspace=None, element=None): + if self.runner is None: + self.init_runner() + if strategy and searchspace: + results = strategy.tune(searchspace, self.runner, tuning_options) + return results, tuning_options + elif element: + return self.runner.run([element], tuning_options)[0] + else: + raise ValueError("Invalid arguments for ray actor's execute method.") - def execute(self, strategy, searchspace, tuning_options, simulation_mode=False): - if simulation_mode: + def set_cache_manager(self, cache_manager): + if self.cache_manager is None: + self.cache_manager = cache_manager + + def get_cache_magaer(self): + return self.cache_manager + + def init_runner(self): + if self.cache_manager is None: + raise ValueError("Cache manager is not set.") + if self.simulation_mode: self.runner = SimulationRunner(self.kernel_source, self.kernel_options, self.device_options, - self.iterations, self.observers) + self.iterations, self.observers) else: self.runner = SequentialRunner(self.kernel_source, self.kernel_options, self.device_options, - self.iterations, self.observers, cache_manager=self.cache_manager) - results = 
strategy.tune(searchspace, self.runner, tuning_options) - return results, tuning_options - \ No newline at end of file + self.iterations, self.observers, cache_manager=self.cache_manager) From 0f2b7e4190833b39b65996f8ea23eb818c28836e Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Tue, 7 May 2024 14:52:55 +0200 Subject: [PATCH 057/106] updates related to actors unification and memetic algo development --- kernel_tuner/runners/parallel.py | 45 +++++++++++++++----------------- 1 file changed, 21 insertions(+), 24 deletions(-) diff --git a/kernel_tuner/runners/parallel.py b/kernel_tuner/runners/parallel.py index fe06a8c2d..76f27c619 100644 --- a/kernel_tuner/runners/parallel.py +++ b/kernel_tuner/runners/parallel.py @@ -7,14 +7,16 @@ from kernel_tuner.core import DeviceInterface from kernel_tuner.runners.runner import Runner -from kernel_tuner.runners.ray.parallel_remote_actor import ParallelRemoteActor +from kernel_tuner.runners.ray.remote_actor import RemoteActor from kernel_tuner.util import get_num_devices +from kernel_tuner.runners.ray.cache_manager import CacheManager +from kernel_tuner.strategies.common import create_actor_on_device, initialize_ray class ParallelRunner(Runner): - def __init__(self, kernel_source, kernel_options, device_options, iterations, observers, num_gpus, cache_manager=None): - self.dev = DeviceInterface(kernel_source, iterations=iterations, observers=observers, **device_options) - self.units = self.dev.units + def __init__(self, kernel_source, kernel_options, device_options, iterations, observers, + num_gpus=None, cache_manager=None, actors=None, simulation_mode=False): + self.dev = DeviceInterface(kernel_source, iterations=iterations, observers=observers, **device_options) if not simulation_mode else None self.quiet = device_options.quiet self.kernel_source = kernel_source self.warmed_up = False @@ -28,11 +30,17 @@ def __init__(self, kernel_source, kernel_options, device_options, iterations, ob self.device_options = device_options 
self.cache_manager = cache_manager self.num_gpus = num_gpus + self.actors = actors - # Initialize Ray - if not ray.is_initialized(): - os.environ["RAY_DEDUP_LOGS"] = "0" - ray.init(include_dashboard=True, ignore_reinit_error=True) + if num_gpus is None: + self.num_gpus = get_num_devices(kernel_source.lang, simulation_mode=self.simulation_mode) + + initialize_ray(num_gpus) + + # Create RemoteActor instances + if actors is None: + runner_attributes = [self.kernel_source, self.kernel_options, self.device_options, self.iterations, self.observers] + self.actors = [create_actor_on_device(*runner_attributes, self.cache_manager, simulation_mode, id) for id in range(self.num_gpus)] def get_environment(self, tuning_options): return self.dev.get_environment() @@ -41,27 +49,16 @@ def get_environment(self, tuning_options): def run(self, parameter_space, tuning_options, cache_manager=None): if self.cache_manager is None: if cache_manager is None: - raise ValueError("A cache manager is required for parallel execution") + cache_manager = CacheManager.remote(tuning_options) self.cache_manager = cache_manager - # Create RemoteActor instances - self.actors = [self.create_actor_on_gpu(self.cache_manager) for _ in range(self.num_gpus)] + # set the cache manager for each actor. Can't be done in constructor because we do not have yet the tuning_options + for actor in self.actors: + ray.get(actor.set_cache_manager.remote(self.cache_manager)) # Create a pool of RemoteActor actors self.actor_pool = ActorPool(self.actors) # Distribute execution of the `execute` method across the actor pool with varying parameters and tuning options, collecting the results asynchronously. 
- results = list(self.actor_pool.map_unordered(lambda a, v: a.execute.remote(v, tuning_options), parameter_space)) + results = list(self.actor_pool.map_unordered(lambda a, v: a.execute.remote(element=v, tuning_options=tuning_options), parameter_space)) new_tuning_options = ray.get(self.cache_manager.get_tuning_options.remote()) tuning_options.update(new_tuning_options) - - for actor in self.actors: - ray.kill(actor) return results - - def create_actor_on_gpu(self, cache_manager): - return ParallelRemoteActor.remote(self.quiet, - self.kernel_source, - self.kernel_options, - self.device_options, - self.iterations, - self.observers, - cache_manager) From 63ddedb5ba8b364e60cad24244913a712d33b97b Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Tue, 7 May 2024 14:53:39 +0200 Subject: [PATCH 058/106] added create_actor_on_device and initialize_ray --- kernel_tuner/strategies/common.py | 31 +++++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/kernel_tuner/strategies/common.py b/kernel_tuner/strategies/common.py index 65db1831c..787750825 100644 --- a/kernel_tuner/strategies/common.py +++ b/kernel_tuner/strategies/common.py @@ -2,12 +2,14 @@ import sys from time import perf_counter import warnings +import ray import numpy as np from kernel_tuner import util from kernel_tuner.searchspace import Searchspace from kernel_tuner.util import get_num_devices +from kernel_tuner.runners.ray.remote_actor import RemoteActor _docstring_template = """ Find the best performing kernel configuration in the parameter space @@ -46,7 +48,8 @@ def make_strategy_options_doc(strategy_options): def get_options(strategy_options, options): """Get the strategy-specific options or their defaults from user-supplied strategy_options.""" - accepted = list(options.keys()) + ["max_fevals", "time_limit", "ensemble", "candidates", "candidate", "population", "maxiter"] + accepted = list(options.keys()) + ["max_fevals", "time_limit", "ensemble", "candidates", 
"candidate", "population", + "maxiter", "lsd", "popsize", "alsd", ] for key in strategy_options: if key not in accepted: raise ValueError(f"Unrecognized option {key} in strategy_options") @@ -329,4 +332,28 @@ def check_num_devices(ensemble_size: int, simulation_mode: bool, runner): num_devices = get_num_devices(runner.kernel_source.lang, simulation_mode=simulation_mode) if num_devices < ensemble_size: warnings.warn("Number of devices is less than the number of strategies in the ensemble. Some strategies will wait until devices are available.", UserWarning) - \ No newline at end of file + +def create_actor_on_device(kernel_source, kernel_options, device_options, iterations, observers, cache_manager, simulation_mode, id): + # Check if Ray is initialized, raise an error if not + if not ray.is_initialized(): + raise RuntimeError("Ray is not initialized. Initialize Ray before creating an actor (remember to include resources).") + + if simulation_mode: + resource_options = {"num_cpus": 1} + else: + resource_options = {"num_gpus": 1} + + # Create the actor with the specified options and resources + return RemoteActor.options(**resource_options).remote(kernel_source, + kernel_options, + device_options, + iterations, + observers, + cache_manager=cache_manager, + simulation_mode=simulation_mode) + +def initialize_ray(): + # Initialize Ray + if not ray.is_initialized(): + ray.init(include_dashboard=True, ignore_reinit_error=True) + From d7fe9b40a919cd6b3f8afbe588ac260a4eb4b393 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Tue, 7 May 2024 14:55:45 +0200 Subject: [PATCH 059/106] updates realted to unification of actors, memetic algo, and reutilization of actors for performance --- kernel_tuner/strategies/ensemble.py | 59 ++++++++++++----------------- 1 file changed, 24 insertions(+), 35 deletions(-) diff --git a/kernel_tuner/strategies/ensemble.py b/kernel_tuner/strategies/ensemble.py index 810c6fb09..f678f7b74 100644 --- a/kernel_tuner/strategies/ensemble.py +++ 
b/kernel_tuner/strategies/ensemble.py @@ -4,13 +4,15 @@ import ray import copy import logging +import warnings +from collections import deque import numpy as np from kernel_tuner import util from kernel_tuner.searchspace import Searchspace from kernel_tuner.strategies import common -from kernel_tuner.strategies.common import CostFunc, scale_from_params, check_num_devices +from kernel_tuner.strategies.common import CostFunc, scale_from_params, check_num_devices, create_actor_on_device, initialize_ray from kernel_tuner.runners.simulation import SimulationRunner from kernel_tuner.runners.ray.remote_actor import RemoteActor from kernel_tuner.util import get_num_devices @@ -50,40 +52,41 @@ "bayes_opt": bayes_opt, } -def tune(searchspace: Searchspace, runner, tuning_options, cache_manager=None): +def tune(searchspace: Searchspace, runner, tuning_options, cache_manager=None, actors=None): simulation_mode = True if isinstance(runner, SimulationRunner) else False num_devices = get_num_devices(runner.kernel_source.lang, simulation_mode=simulation_mode) + print(f"DEBUG: num_devices={num_devices}", file=sys.stderr) + ensemble = [] if "ensemble" in tuning_options: ensemble = tuning_options["ensemble"] else: ensemble = ["greedy_ils", "greedy_ils"] ensemble_size = len(ensemble) + if num_devices < ensemble_size: + warnings.warn("Number of devices is less than the number of strategies in the ensemble. 
Some strategies will wait until devices are available.", UserWarning) + num_actors = num_devices if ensemble_size > num_devices else ensemble_size - # Initialize Ray - if not ray.is_initialized(): - check_num_devices(ensemble_size, simulation_mode, runner) - os.environ["RAY_DEDUP_LOGS"] = "0" - ray.init(include_dashboard=True, ignore_reinit_error=True) + initialize_ray(num_devices) # Create cache manager and actors - kill_cache_manager = False if cache_manager is None: - kill_cache_manager = True cache_manager = CacheManager.remote(tuning_options) - actors = [create_actor(runner, cache_manager, simulation_mode) for _ in range(ensemble_size)] + if actors is None: + runner_attributes = [runner.kernel_source, runner.kernel_options, runner.device_options, runner.iterations, runner.observers] + actors = [create_actor_on_device(*runner_attributes, cache_manager, simulation_mode, id) for id in range(num_actors)] # Execute all actor with one strategy each ensemble = [strategy_map[strategy] for strategy in ensemble] + ensemble_queue = deque(ensemble) pending_tasks = {} - for i in range(ensemble_size): - strategy = ensemble[i] - actor = actors[i] + for actor in actors: + strategy = ensemble_queue.popleft() remote_tuning_options = setup_tuning_options(tuning_options) - task = actor.execute.remote(strategy, searchspace, remote_tuning_options, simulation_mode) + task = actor.execute.remote(strategy=strategy, searchspace=searchspace, tuning_options=remote_tuning_options) pending_tasks[task] = actor - # As soon as an actor is done we need to kill it to give space to other actors + all_results = [] while pending_tasks: done_ids, _ = ray.wait(list(pending_tasks.keys()), num_returns=1) @@ -91,7 +94,12 @@ def tune(searchspace: Searchspace, runner, tuning_options, cache_manager=None): result = ray.get(done_id) all_results.append(result) actor = pending_tasks.pop(done_id) - ray.kill(actor) + + if ensemble_queue: + strategy = ensemble_queue.popleft() + remote_tuning_options = 
setup_tuning_options(tuning_options) + task = actor.execute.remote(strategy=strategy, searchspace=searchspace, tuning_options=remote_tuning_options) + pending_tasks[task] = actor new_tuning_options = ray.get(cache_manager.get_tuning_options.remote()) tuning_options.update(new_tuning_options) @@ -102,21 +110,8 @@ def tune(searchspace: Searchspace, runner, tuning_options, cache_manager=None): if candidates: # for memetic strategy tuning_options.strategy_options["candidates"] = candidates - clean_up(actors, cache_manager, kill_cache_manager) return final_results -def create_actor(runner, cache_manager, simulation_mode): - if simulation_mode: - resource_options= {"num_cpus": 1} - else: - resource_options= {"num_gpus": 1} - return RemoteActor.options(**resource_options).remote(runner.kernel_source, - runner.kernel_options, - runner.device_options, - runner.iterations, - runner.observers, - cache_manager) - def setup_tuning_options(tuning_options): new_tuning_options = copy.deepcopy(tuning_options) if "candidates" in tuning_options.strategy_options: @@ -141,9 +136,3 @@ def process_results(all_results, searchspace): final_results.append(new_result) unique_configs.add(config_signature) return final_results, population, candidates - -def clean_up(actors, cache_manager, kill_cache_manager): - for actor in actors: - ray.kill(actor) - if kill_cache_manager: - ray.kill(cache_manager) From 46fcde17613a9d2529468ece9b0f51a043784eb1 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Tue, 7 May 2024 14:56:37 +0200 Subject: [PATCH 060/106] returning 80% of cpus for simulation mode in get_num_devices --- kernel_tuner/util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel_tuner/util.py b/kernel_tuner/util.py index b9ecf9b3a..434feb6ff 100644 --- a/kernel_tuner/util.py +++ b/kernel_tuner/util.py @@ -1282,7 +1282,7 @@ def cuda_error_check(error): def get_num_devices(lang, simulation_mode=False): num_devices = 0 if simulation_mode: - num_devices = os.cpu_count() + 
num_devices = int(round(os.cpu_count() * 0.8)) # keep resources for the main process and other tasks elif lang.upper() == "CUDA": import pycuda.driver as cuda cuda.init() From d54384808fcebc130d268bc90537179c35160141 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Tue, 7 May 2024 14:57:18 +0200 Subject: [PATCH 061/106] updates realted to actor unification and reutilization of actors for performance --- kernel_tuner/strategies/memetic.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/kernel_tuner/strategies/memetic.py b/kernel_tuner/strategies/memetic.py index 8f18c3076..c8d18887c 100644 --- a/kernel_tuner/strategies/memetic.py +++ b/kernel_tuner/strategies/memetic.py @@ -9,8 +9,9 @@ from kernel_tuner.runners.simulation import SimulationRunner from kernel_tuner.runners.sequential import SequentialRunner from kernel_tuner.runners.ray.cache_manager import CacheManager -from kernel_tuner.strategies.common import check_num_devices +from kernel_tuner.strategies.common import check_num_devices, create_actor_on_device, initialize_ray from kernel_tuner.util import get_num_devices +from kernel_tuner.runners.ray.remote_actor import RemoteActor from kernel_tuner.strategies import ( basinhopping, @@ -70,7 +71,7 @@ def tune(searchspace: Searchspace, runner, tuning_options): simulation_mode = True if isinstance(runner, SimulationRunner) else False local_search = options.get('local_search', 'greedy_ils') global_search = options.get('global_search', "genetic_algorithm") - max_feval = options.get("max_fevals", 100) + max_feval = options.get("max_fevals", 500) alsd = options.get("alsd", 2) # Adaptive Local Search Depth (ALSD) lsd = options.get("lsd", 25) # Local Search Depth (LSD) maxiter = options.get("maxiter", 2) @@ -88,20 +89,17 @@ def tune(searchspace: Searchspace, runner, tuning_options): tuning_options.strategy_options["population"] = searchspace.get_random_sample(popsize) - # Initialize Ray - if not ray.is_initialized(): - 
check_num_devices(popsize, simulation_mode, runner) - os.environ["RAY_DEDUP_LOGS"] = "0" - ray.init(include_dashboard=True, ignore_reinit_error=True) num_gpus = get_num_devices(runner.kernel_source.lang, simulation_mode=simulation_mode) - # Create cache manager and actors + check_num_devices(num_gpus, simulation_mode, runner) + initialize_ray(num_gpus) + # Create cache manager, actors and parallel runner cache_manager = CacheManager.remote(tuning_options) - if simulation_mode: - pop_runner = runner - else: - pop_runner = ParallelRunner(runner.kernel_source, runner.kernel_options, runner.device_options, + num_actors = num_gpus if num_gpus < popsize else popsize + runner_attributes = [runner.kernel_source, runner.kernel_options, runner.device_options, runner.iterations, runner.observers] + actors = [create_actor_on_device(*runner_attributes, cache_manager, simulation_mode, id) for id in range(num_actors)] + pop_runner = ParallelRunner(runner.kernel_source, runner.kernel_options, runner.device_options, runner.iterations, runner.observers, num_gpus=num_gpus, cache_manager=cache_manager, - simulation_mode=simulation_mode) + simulation_mode=simulation_mode, actors=actors) all_results = [] all_results_dict = {} @@ -134,7 +132,7 @@ def tune(searchspace: Searchspace, runner, tuning_options): print(f"DEBUG:=================Local Search=================", file=sys.stderr) tuning_options.strategy_options["max_fevals"] = lsd pop_start_ls = copy.deepcopy(tuning_options.strategy_options["candidates"]) - results = ensemble.tune(searchspace, runner, tuning_options, cache_manager=cache_manager) + results = ensemble.tune(searchspace, runner, tuning_options, cache_manager=cache_manager, actors=actors) add_to_results(all_results, all_results_dict, results, tuning_options.tune_params) feval += lsd @@ -152,6 +150,8 @@ def tune(searchspace: Searchspace, runner, tuning_options): print(f"DEBUG: Adaptive Local Search Depth (ALSD) lsd = {lsd}", file=sys.stderr) ray.kill(cache_manager) + for 
actor in actors: + ray.kill(actor) return results From 15df6ea9b72f4eee09d4ec1cf0f0107cf037cb43 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Tue, 7 May 2024 15:46:46 +0200 Subject: [PATCH 062/106] updates on feval counting and distributing --- kernel_tuner/strategies/memetic.py | 48 +++++++++++++++++++++++------- 1 file changed, 37 insertions(+), 11 deletions(-) diff --git a/kernel_tuner/strategies/memetic.py b/kernel_tuner/strategies/memetic.py index c8d18887c..6ecd779e4 100644 --- a/kernel_tuner/strategies/memetic.py +++ b/kernel_tuner/strategies/memetic.py @@ -71,7 +71,7 @@ def tune(searchspace: Searchspace, runner, tuning_options): simulation_mode = True if isinstance(runner, SimulationRunner) else False local_search = options.get('local_search', 'greedy_ils') global_search = options.get('global_search', "genetic_algorithm") - max_feval = options.get("max_fevals", 500) + max_feval = options.get("max_fevals", 2000) alsd = options.get("alsd", 2) # Adaptive Local Search Depth (ALSD) lsd = options.get("lsd", 25) # Local Search Depth (LSD) maxiter = options.get("maxiter", 2) @@ -91,7 +91,7 @@ def tune(searchspace: Searchspace, runner, tuning_options): num_gpus = get_num_devices(runner.kernel_source.lang, simulation_mode=simulation_mode) check_num_devices(num_gpus, simulation_mode, runner) - initialize_ray(num_gpus) + initialize_ray() # Create cache manager, actors and parallel runner cache_manager = CacheManager.remote(tuning_options) num_actors = num_gpus if num_gpus < popsize else popsize @@ -104,15 +104,10 @@ def tune(searchspace: Searchspace, runner, tuning_options): all_results = [] all_results_dict = {} feval = 0 + afi_gs, afi_ls = None, None while feval < max_feval: print(f"DEBUG: --------------------NEW ITERATION--------feval = {feval}------------", file=sys.stderr) - feval_left = max_feval - feval - if feval_left < lsd + maxiter * popsize: - maxiter = feval_left // popsize - if maxiter == 1: # It doesnt make sense to have one generation for global 
search, so we give all final resources to local search - maxiter = 0 - lsd = feval_left - lsd = feval_left - maxiter * popsize + maxiter, lsd = distribute_feval(feval, max_feval, maxiter, lsd, popsize, afi_gs, afi_ls) print(f"DEBUG: maxiter * popsize = {maxiter * popsize}, lsd = {lsd}", file=sys.stderr) # Global Search (GS) @@ -134,7 +129,7 @@ def tune(searchspace: Searchspace, runner, tuning_options): pop_start_ls = copy.deepcopy(tuning_options.strategy_options["candidates"]) results = ensemble.tune(searchspace, runner, tuning_options, cache_manager=cache_manager, actors=actors) add_to_results(all_results, all_results_dict, results, tuning_options.tune_params) - feval += lsd + feval += lsd * popsize pop_start_ls_res = get_pop_results(pop_start_ls, all_results_dict) pop_end_ls = copy.deepcopy(tuning_options.strategy_options["candidates"]) @@ -147,7 +142,7 @@ def tune(searchspace: Searchspace, runner, tuning_options): lsd += alsd elif afi_ls < afi_gs: lsd -= alsd if lsd - alsd > 5 else 5 - print(f"DEBUG: Adaptive Local Search Depth (ALSD) lsd = {lsd}", file=sys.stderr) + print(f"DEBUG: Adaptive Local Search Depth (ALSD) lsd = {lsd}", file=sys.stderr) ray.kill(cache_manager) for actor in actors: @@ -191,3 +186,34 @@ def add_to_results(all_results, all_results_dict, results, tune_params): key = ",".join(str(result[param]) for param in tune_params) all_results_dict[key] = result["time"] all_results.append(result) + +def distribute_feval(feval, max_feval, maxiter, lsd, popsize, afi_gs, afi_ls): + remaining_feval = max_feval - feval + if remaining_feval < (lsd + maxiter) * popsize: + # Calculate how many full batches of popsize can still be processed + proportion = remaining_feval // popsize + + if afi_gs is None or afi_ls is None: + maxiter = int(proportion * 0.5) + lsd = int(proportion * 0.5) + else: + if afi_gs > afi_ls: + # More evaluations to maxiter + maxiter = int(proportion * 0.6) + lsd = int(proportion * 0.4) + else: + # More evaluations to lsd + maxiter = 
int(proportion * 0.4) + lsd = int(proportion * 0.6) + + # If maxiter ends up being 1, assign all remaining feval to lsd + if maxiter == 1: + lsd = proportion # Give all available batches to lsd + maxiter = 0 + + # Ensure at least one of maxiter or lsd is non-zero if there are still fevals to be used + if maxiter == 0 and lsd == 0 and remaining_feval > 0: + lsd = 1 # Allocate at least one batch to lsd to ensure progress + + return maxiter, lsd + \ No newline at end of file From ec719a209d9ad3ad43dc6cb8fc7e7ca99578db7c Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Fri, 10 May 2024 17:37:36 +0200 Subject: [PATCH 063/106] added logic for time limit stop --- kernel_tuner/strategies/memetic.py | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/kernel_tuner/strategies/memetic.py b/kernel_tuner/strategies/memetic.py index 6ecd779e4..1800bb0b6 100644 --- a/kernel_tuner/strategies/memetic.py +++ b/kernel_tuner/strategies/memetic.py @@ -10,7 +10,7 @@ from kernel_tuner.runners.sequential import SequentialRunner from kernel_tuner.runners.ray.cache_manager import CacheManager from kernel_tuner.strategies.common import check_num_devices, create_actor_on_device, initialize_ray -from kernel_tuner.util import get_num_devices +from kernel_tuner.util import get_num_devices, check_stop_criterion, StopCriterionReached from kernel_tuner.runners.ray.remote_actor import RemoteActor from kernel_tuner.strategies import ( @@ -71,11 +71,11 @@ def tune(searchspace: Searchspace, runner, tuning_options): simulation_mode = True if isinstance(runner, SimulationRunner) else False local_search = options.get('local_search', 'greedy_ils') global_search = options.get('global_search', "genetic_algorithm") - max_feval = options.get("max_fevals", 2000) alsd = options.get("alsd", 2) # Adaptive Local Search Depth (ALSD) lsd = options.get("lsd", 25) # Local Search Depth (LSD) maxiter = options.get("maxiter", 2) popsize = options.get("popsize", 20) + max_feval = 
options.get("max_fevals", None if 'time_limit' in options else 2000) if local_search in ls_strategies_list: tuning_options["ensemble"] = [local_search] * popsize @@ -105,9 +105,10 @@ def tune(searchspace: Searchspace, runner, tuning_options): all_results_dict = {} feval = 0 afi_gs, afi_ls = None, None - while feval < max_feval: + while (max_feval is None) or feval < max_feval: print(f"DEBUG: --------------------NEW ITERATION--------feval = {feval}------------", file=sys.stderr) - maxiter, lsd = distribute_feval(feval, max_feval, maxiter, lsd, popsize, afi_gs, afi_ls) + if max_feval is not None: + maxiter, lsd = distribute_feval(feval, max_feval, maxiter, lsd, popsize, afi_gs, afi_ls) print(f"DEBUG: maxiter * popsize = {maxiter * popsize}, lsd = {lsd}", file=sys.stderr) # Global Search (GS) @@ -117,6 +118,12 @@ def tune(searchspace: Searchspace, runner, tuning_options): results = global_search.tune(searchspace, pop_runner, tuning_options) add_to_results(all_results, all_results_dict, results, tuning_options.tune_params) feval += maxiter * popsize + try: + check_stop_criterion(tuning_options) + except StopCriterionReached as e: + if tuning_options.verbose: + print(e) + break pop_start_gs_res = get_pop_results(pop_start_gs, all_results_dict) pop_end_gs = copy.deepcopy(tuning_options.strategy_options["population"]) @@ -130,6 +137,13 @@ def tune(searchspace: Searchspace, runner, tuning_options): results = ensemble.tune(searchspace, runner, tuning_options, cache_manager=cache_manager, actors=actors) add_to_results(all_results, all_results_dict, results, tuning_options.tune_params) feval += lsd * popsize + try: + print(f"DEBUG: check for sto criterion in memetic algo", file=sys.stderr) + check_stop_criterion(tuning_options) + except StopCriterionReached as e: + if tuning_options.verbose: + print(e) + break pop_start_ls_res = get_pop_results(pop_start_ls, all_results_dict) pop_end_ls = copy.deepcopy(tuning_options.strategy_options["candidates"]) @@ -148,12 +162,13 @@ def 
tune(searchspace: Searchspace, runner, tuning_options): for actor in actors: ray.kill(actor) - return results + return all_results def calculate_afi(pop_before_rs, pop_after_rs, feval, results): # Average Fitness Increment (AFI) + assert(feval >= 0) delta_fitness = fitness_increment(pop_before_rs, pop_after_rs) - afi = delta_fitness / feval if feval > 0 else None + afi = delta_fitness / feval if feval > 0 else 0 print(f"DEBUG:calculate_afi afi: {afi}", file=sys.stderr) return afi @@ -182,6 +197,8 @@ def get_pop_results(pop, results): return times def add_to_results(all_results, all_results_dict, results, tune_params): + print(f"DEBUG:add_to_results results size = {len(results)}", file=sys.stderr) + print(f"DEBUG:add_to_results all_results size = {len(all_results)}", file=sys.stderr) for result in results: key = ",".join(str(result[param]) for param in tune_params) all_results_dict[key] = result["time"] From 6c2a62b7e3db0c8c0455939be1894edbd4ebbd39 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Fri, 10 May 2024 17:40:22 +0200 Subject: [PATCH 064/106] debug prints clean up --- kernel_tuner/strategies/memetic.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/kernel_tuner/strategies/memetic.py b/kernel_tuner/strategies/memetic.py index 1800bb0b6..3f2922f4b 100644 --- a/kernel_tuner/strategies/memetic.py +++ b/kernel_tuner/strategies/memetic.py @@ -138,7 +138,6 @@ def tune(searchspace: Searchspace, runner, tuning_options): add_to_results(all_results, all_results_dict, results, tuning_options.tune_params) feval += lsd * popsize try: - print(f"DEBUG: check for sto criterion in memetic algo", file=sys.stderr) check_stop_criterion(tuning_options) except StopCriterionReached as e: if tuning_options.verbose: @@ -179,7 +178,6 @@ def fitness_increment(pop_before, pop_after): sum_before = sum(t for t in pop_before if isinstance(t, float)) sum_after = sum(t for t in pop_after if isinstance(t, float)) difference_sum = sum_before - sum_after - 
print(f"DEBUG:fitness_increment difference_sum: {difference_sum}", file=sys.stderr) return difference_sum def get_pop_results(pop, results): @@ -197,8 +195,6 @@ def get_pop_results(pop, results): return times def add_to_results(all_results, all_results_dict, results, tune_params): - print(f"DEBUG:add_to_results results size = {len(results)}", file=sys.stderr) - print(f"DEBUG:add_to_results all_results size = {len(all_results)}", file=sys.stderr) for result in results: key = ",".join(str(result[param]) for param in tune_params) all_results_dict[key] = result["time"] From c7fd2af656f331177b4be55dae534b0b32f58faa Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Fri, 10 May 2024 17:41:47 +0200 Subject: [PATCH 065/106] unified parallel tuning and parallel ensemble logic in ParallelRunner --- kernel_tuner/runners/parallel.py | 127 +++++++++++++++++++++-- kernel_tuner/runners/ray/remote_actor.py | 5 +- kernel_tuner/strategies/ensemble.py | 76 ++------------ 3 files changed, 129 insertions(+), 79 deletions(-) diff --git a/kernel_tuner/runners/parallel.py b/kernel_tuner/runners/parallel.py index 76f27c619..e6cdce2ab 100644 --- a/kernel_tuner/runners/parallel.py +++ b/kernel_tuner/runners/parallel.py @@ -4,11 +4,13 @@ import os from ray.util.actor_pool import ActorPool from time import perf_counter +from collections import deque +import copy from kernel_tuner.core import DeviceInterface from kernel_tuner.runners.runner import Runner from kernel_tuner.runners.ray.remote_actor import RemoteActor -from kernel_tuner.util import get_num_devices +from kernel_tuner.util import get_num_devices, get_nested_types from kernel_tuner.runners.ray.cache_manager import CacheManager from kernel_tuner.strategies.common import create_actor_on_device, initialize_ray @@ -20,7 +22,7 @@ def __init__(self, kernel_source, kernel_options, device_options, iterations, ob self.quiet = device_options.quiet self.kernel_source = kernel_source self.warmed_up = False - self.simulation_mode = False + 
self.simulation_mode = simulation_mode self.start_time = perf_counter() self.last_strategy_start_time = self.start_time self.last_strategy_time = 0 @@ -35,7 +37,7 @@ def __init__(self, kernel_source, kernel_options, device_options, iterations, ob if num_gpus is None: self.num_gpus = get_num_devices(kernel_source.lang, simulation_mode=self.simulation_mode) - initialize_ray(num_gpus) + initialize_ray() # Create RemoteActor instances if actors is None: @@ -46,19 +48,126 @@ def get_environment(self, tuning_options): return self.dev.get_environment() - def run(self, parameter_space, tuning_options, cache_manager=None): + def run(self, parameter_space=None, tuning_options=None, ensemble=None, searchspace=None, cache_manager=None): + if tuning_options is None: #HACK as tuning_options can't be the first argument and parameter_space needs to be a default argument + raise ValueError("tuning_options cannot be None") + if self.cache_manager is None: if cache_manager is None: cache_manager = CacheManager.remote(tuning_options) self.cache_manager = cache_manager - # set the cache manager for each actor. Can't be done in constructor because we do not have yet the tuning_options + + # set the cache manager for each actor. Can't be done in constructor because we do not always yet have the tuning_options for actor in self.actors: ray.get(actor.set_cache_manager.remote(self.cache_manager)) - # Create a pool of RemoteActor actors - self.actor_pool = ActorPool(self.actors) - # Distribute execution of the `execute` method across the actor pool with varying parameters and tuning options, collecting the results asynchronously. 
- results = list(self.actor_pool.map_unordered(lambda a, v: a.execute.remote(element=v, tuning_options=tuning_options), parameter_space)) + + # Determine what type of parallelism and run appropriately + if parameter_space and not ensemble and not searchspace: + results, tuning_options_list = self.run_parallel_tuning(tuning_options, parameter_space) + elif ensemble and searchspace and not parameter_space: + results, tuning_options_list = self.run_parallel_ensemble(ensemble, tuning_options, searchspace) + else: + raise ValueError("Invalid arguments to parallel runner run method") + + # Update tuning options new_tuning_options = ray.get(self.cache_manager.get_tuning_options.remote()) tuning_options.update(new_tuning_options) + if self.simulation_mode: + tuning_options.simulated_time += self._calculate_simulated_time(tuning_options_list) + print(f"DEBUG: simulated_time = {tuning_options.simulated_time}", file=sys.stderr) return results + + def run_parallel_ensemble(self, ensemble, tuning_options, searchspace): + """ + Runs strategies from the ensemble in parallel using distributed actors, + manages dynamic task allocation, and collects results. 
+ """ + ensemble_queue = deque(ensemble) + pending_tasks = {} + all_results = [] + + # Start initial tasks for each actor + for actor in self.actors: + strategy = ensemble_queue.popleft() + remote_tuning_options = self._setup_tuning_options(tuning_options) + task = actor.execute.remote(strategy=strategy, searchspace=searchspace, tuning_options=remote_tuning_options) + pending_tasks[task] = actor + + # Manage task completion and redistribution + while pending_tasks: + done_ids, _ = ray.wait(list(pending_tasks.keys()), num_returns=1) + for done_id in done_ids: + result = ray.get(done_id) + all_results.append(result) + actor = pending_tasks.pop(done_id) + + # Reassign actors if strategies remain + if ensemble_queue: + strategy = ensemble_queue.popleft() + remote_tuning_options = self._setup_tuning_options(tuning_options) + task = actor.execute.remote(strategy=strategy, searchspace=searchspace, tuning_options=remote_tuning_options) + pending_tasks[task] = actor + + # Process results to extract population and candidates for further use + results, tuning_options_list, population, candidates = self._process_results_ensemble(all_results) + + # Update tuning options for memetic strategies + if population: + tuning_options.strategy_options["population"] = population + if candidates: + tuning_options.strategy_options["candidates"] = candidates + return results, tuning_options_list + + def _setup_tuning_options(self, tuning_options): + new_tuning_options = copy.deepcopy(tuning_options) + if "candidates" in tuning_options.strategy_options: + if len(tuning_options.strategy_options["candidates"]) > 0: + new_tuning_options.strategy_options["candidate"] = tuning_options.strategy_options["candidates"].pop(0) + return new_tuning_options + + def _process_results_ensemble(self, all_results): + population = [] # for memetic strategy + candidates = [] # for memetic strategy + results = [] + tuning_options_list = [] + + for (strategy_results, tuning_options) in all_results: + if 
"old_candidate" in tuning_options.strategy_options: + candidates.append(tuning_options.strategy_options["old_candidate"]) + if "candidate" in tuning_options.strategy_options: + population.append(tuning_options.strategy_options["candidate"]) + results.extend(strategy_results) + tuning_options_list.append(tuning_options) + + return results, tuning_options_list, population, candidates + + + def run_parallel_tuning(self, tuning_options, parameter_space): + # Create a pool of RemoteActor actors + self.actor_pool = ActorPool(self.actors) + # Distribute execution of the `execute` method across the actor pool with varying parameters and tuning options, collecting the results asynchronously. + all_results = list(self.actor_pool.map_unordered(lambda a, v: a.execute.remote(element=v, tuning_options=tuning_options), parameter_space)) + results = [x[0] for x in all_results] + tuning_options_list = [x[1] for x in all_results] + return results, tuning_options_list + + def _process_results(self, all_results, searchspace): + unique_configs = set() + final_results = [] + + for (strategy_results, tuning_options) in all_results: + for new_result in strategy_results: + config_signature = tuple(new_result[key] for key in searchspace.tune_params) + if config_signature not in unique_configs: + final_results.append(new_result) + unique_configs.add(config_signature) + return final_results + + def _calculate_simulated_time(self, tuning_options_list): + simulated_times = [] + for tuning_options in tuning_options_list: + print(f"DEBUG:_calculate_simulated_time tuning_options.simulated_time = {tuning_options.simulated_time}", file=sys.stderr) + simulated_times.append(tuning_options.simulated_time) + #simulated_times = [tuning_options.simulated_time for tuning_options in tuning_options_list] + return max(simulated_times) \ No newline at end of file diff --git a/kernel_tuner/runners/ray/remote_actor.py b/kernel_tuner/runners/ray/remote_actor.py index 3eceb4414..3956c8648 100644 --- 
a/kernel_tuner/runners/ray/remote_actor.py +++ b/kernel_tuner/runners/ray/remote_actor.py @@ -1,7 +1,9 @@ import ray +import sys from kernel_tuner.runners.sequential import SequentialRunner from kernel_tuner.runners.simulation import SimulationRunner +from kernel_tuner.util import get_nested_types @ray.remote class RemoteActor(): @@ -29,7 +31,8 @@ def execute(self, tuning_options, strategy=None, searchspace=None, element=None) results = strategy.tune(searchspace, self.runner, tuning_options) return results, tuning_options elif element: - return self.runner.run([element], tuning_options)[0] + results = self.runner.run([element], tuning_options)[0] + return results, tuning_options else: raise ValueError("Invalid arguments for ray actor's execute method.") diff --git a/kernel_tuner/strategies/ensemble.py b/kernel_tuner/strategies/ensemble.py index f678f7b74..78fd85001 100644 --- a/kernel_tuner/strategies/ensemble.py +++ b/kernel_tuner/strategies/ensemble.py @@ -17,6 +17,7 @@ from kernel_tuner.runners.ray.remote_actor import RemoteActor from kernel_tuner.util import get_num_devices from kernel_tuner.runners.ray.cache_manager import CacheManager +from kernel_tuner.runners.parallel import ParallelRunner from kernel_tuner.strategies import ( basinhopping, @@ -55,84 +56,21 @@ def tune(searchspace: Searchspace, runner, tuning_options, cache_manager=None, actors=None): simulation_mode = True if isinstance(runner, SimulationRunner) else False num_devices = get_num_devices(runner.kernel_source.lang, simulation_mode=simulation_mode) - print(f"DEBUG: num_devices={num_devices}", file=sys.stderr) ensemble = [] if "ensemble" in tuning_options: - ensemble = tuning_options["ensemble"] + ensemble = tuning_options.ensemble else: ensemble = ["greedy_ils", "greedy_ils"] ensemble_size = len(ensemble) if num_devices < ensemble_size: warnings.warn("Number of devices is less than the number of strategies in the ensemble. 
Some strategies will wait until devices are available.", UserWarning) num_actors = num_devices if ensemble_size > num_devices else ensemble_size - - initialize_ray(num_devices) - - # Create cache manager and actors - if cache_manager is None: - cache_manager = CacheManager.remote(tuning_options) - if actors is None: - runner_attributes = [runner.kernel_source, runner.kernel_options, runner.device_options, runner.iterations, runner.observers] - actors = [create_actor_on_device(*runner_attributes, cache_manager, simulation_mode, id) for id in range(num_actors)] - - # Execute all actor with one strategy each + ensemble = [strategy_map[strategy] for strategy in ensemble] - ensemble_queue = deque(ensemble) - pending_tasks = {} - for actor in actors: - strategy = ensemble_queue.popleft() - remote_tuning_options = setup_tuning_options(tuning_options) - task = actor.execute.remote(strategy=strategy, searchspace=searchspace, tuning_options=remote_tuning_options) - pending_tasks[task] = actor + parallel_runner = ParallelRunner(runner.kernel_source, runner.kernel_options, runner.device_options, + runner.iterations, runner.observers, num_gpus=num_actors, cache_manager=cache_manager, + simulation_mode=simulation_mode, actors=actors) + final_results = parallel_runner.run(tuning_options=tuning_options, ensemble=ensemble, searchspace=searchspace) - - all_results = [] - while pending_tasks: - done_ids, _ = ray.wait(list(pending_tasks.keys()), num_returns=1) - for done_id in done_ids: - result = ray.get(done_id) - all_results.append(result) - actor = pending_tasks.pop(done_id) - - if ensemble_queue: - strategy = ensemble_queue.popleft() - remote_tuning_options = setup_tuning_options(tuning_options) - task = actor.execute.remote(strategy=strategy, searchspace=searchspace, tuning_options=remote_tuning_options) - pending_tasks[task] = actor - - new_tuning_options = ray.get(cache_manager.get_tuning_options.remote()) - tuning_options.update(new_tuning_options) - final_results, 
population, candidates = process_results(all_results, searchspace) - - if population: # for memetic strategy - tuning_options.strategy_options["population"] = population - if candidates: # for memetic strategy - tuning_options.strategy_options["candidates"] = candidates - return final_results - -def setup_tuning_options(tuning_options): - new_tuning_options = copy.deepcopy(tuning_options) - if "candidates" in tuning_options.strategy_options: - if len(tuning_options.strategy_options["candidates"]) > 0: - new_tuning_options.strategy_options["candidate"] = tuning_options.strategy_options["candidates"].pop(0) - return new_tuning_options - -def process_results(all_results, searchspace): - unique_configs = set() - final_results = [] - population = [] # for memetic strategy - candidates = [] # for memetic strategy - - for (strategy_results, tuning_options) in all_results: - if "old_candidate" in tuning_options.strategy_options: - candidates.append(tuning_options.strategy_options["old_candidate"]) - if "candidate" in tuning_options.strategy_options: - population.append(tuning_options.strategy_options["candidate"]) - for new_result in strategy_results: - config_signature = tuple(new_result[key] for key in searchspace.tune_params) - if config_signature not in unique_configs: - final_results.append(new_result) - unique_configs.add(config_signature) - return final_results, population, candidates From af532c544e2bdbe6a0da384b5efdca879250051e Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Tue, 28 May 2024 11:25:55 +0200 Subject: [PATCH 066/106] added self.init_arguments for parallel runner execution --- kernel_tuner/observers/nvml.py | 6 ++++++ kernel_tuner/observers/pmt.py | 3 +++ kernel_tuner/observers/powersensor.py | 3 +++ 3 files changed, 12 insertions(+) diff --git a/kernel_tuner/observers/nvml.py b/kernel_tuner/observers/nvml.py index 0fd812a34..2a496441a 100644 --- a/kernel_tuner/observers/nvml.py +++ b/kernel_tuner/observers/nvml.py @@ -315,6 +315,9 @@ def __init__( 
continous_duration=1, ): """Create an NVMLObserver.""" + # needed for re-initializing observer on ray actor + self.init_arguments = [observables, device, save_all, nvidia_smi_fallback, use_locked_clocks, continous_duration] + if nvidia_smi_fallback: self.nvml = nvml( device, @@ -424,6 +427,9 @@ def __init__(self, observables, parent, nvml_instance, continous_duration=1): self.parent = parent self.nvml = nvml_instance + # needed for re-initializing observer on ray actor + self.init_arguments = [observables, parent, nvml_instance, continous_duration] + supported = ["power_readings", "nvml_power", "nvml_energy"] for obs in observables: if obs not in supported: diff --git a/kernel_tuner/observers/pmt.py b/kernel_tuner/observers/pmt.py index 6efb1209a..750b784bc 100644 --- a/kernel_tuner/observers/pmt.py +++ b/kernel_tuner/observers/pmt.py @@ -33,6 +33,9 @@ class PMTObserver(BenchmarkObserver): def __init__(self, observable=None): if not pmt: raise ImportError("could not import pmt") + + # needed for re-initializing observer on ray actor + self.init_arguments = [observable] # User specifices a dictonary of platforms and corresponding device if type(observable) is dict: diff --git a/kernel_tuner/observers/powersensor.py b/kernel_tuner/observers/powersensor.py index 6d07e8977..e05b854a6 100644 --- a/kernel_tuner/observers/powersensor.py +++ b/kernel_tuner/observers/powersensor.py @@ -27,6 +27,9 @@ class PowerSensorObserver(BenchmarkObserver): def __init__(self, observables=None, device=None): if not powersensor: raise ImportError("could not import powersensor") + + # needed for re-initializing observer on ray actor + self.init_arguments = [observables, device] supported = ["ps_energy", "ps_power"] for obs in observables: From 82d988687f9e7ab9956518c5ef9333a55ffb9e99 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Tue, 28 May 2024 11:28:35 +0200 Subject: [PATCH 067/106] fix about non-pickleable observers and other small adjustments --- kernel_tuner/runners/parallel.py | 
69 ++++++++++++++++++++------------ 1 file changed, 43 insertions(+), 26 deletions(-) diff --git a/kernel_tuner/runners/parallel.py b/kernel_tuner/runners/parallel.py index e6cdce2ab..242d2e1d9 100644 --- a/kernel_tuner/runners/parallel.py +++ b/kernel_tuner/runners/parallel.py @@ -10,7 +10,7 @@ from kernel_tuner.core import DeviceInterface from kernel_tuner.runners.runner import Runner from kernel_tuner.runners.ray.remote_actor import RemoteActor -from kernel_tuner.util import get_num_devices, get_nested_types +from kernel_tuner.util import get_num_devices from kernel_tuner.runners.ray.cache_manager import CacheManager from kernel_tuner.strategies.common import create_actor_on_device, initialize_ray @@ -19,14 +19,11 @@ class ParallelRunner(Runner): def __init__(self, kernel_source, kernel_options, device_options, iterations, observers, num_gpus=None, cache_manager=None, actors=None, simulation_mode=False): self.dev = DeviceInterface(kernel_source, iterations=iterations, observers=observers, **device_options) if not simulation_mode else None - self.quiet = device_options.quiet self.kernel_source = kernel_source - self.warmed_up = False self.simulation_mode = simulation_mode + self.kernel_options = kernel_options self.start_time = perf_counter() self.last_strategy_start_time = self.start_time - self.last_strategy_time = 0 - self.kernel_options = kernel_options self.observers = observers self.iterations = iterations self.device_options = device_options @@ -36,49 +33,52 @@ def __init__(self, kernel_source, kernel_options, device_options, iterations, ob if num_gpus is None: self.num_gpus = get_num_devices(kernel_source.lang, simulation_mode=self.simulation_mode) - + initialize_ray() - # Create RemoteActor instances - if actors is None: - runner_attributes = [self.kernel_source, self.kernel_options, self.device_options, self.iterations, self.observers] - self.actors = [create_actor_on_device(*runner_attributes, self.cache_manager, simulation_mode, id) for id in 
range(self.num_gpus)] - def get_environment(self, tuning_options): return self.dev.get_environment() - def run(self, parameter_space=None, tuning_options=None, ensemble=None, searchspace=None, cache_manager=None): if tuning_options is None: #HACK as tuning_options can't be the first argument and parameter_space needs to be a default argument raise ValueError("tuning_options cannot be None") + # Create RemoteActor instances + if self.actors is None: + runner_attributes = [self.kernel_source, self.kernel_options, self.device_options, self.iterations, self.observers] + self.actors = [create_actor_on_device(*runner_attributes, self.cache_manager, self.simulation_mode, id) for id in range(self.num_gpus)] + if self.cache_manager is None: if cache_manager is None: - cache_manager = CacheManager.remote(tuning_options) + cache_manager = CacheManager.remote(tuning_options.cache, tuning_options.cachefile) self.cache_manager = cache_manager # set the cache manager for each actor. Can't be done in constructor because we do not always yet have the tuning_options for actor in self.actors: ray.get(actor.set_cache_manager.remote(self.cache_manager)) + # Some observers can't be pickled + run_tuning_options = copy.deepcopy(tuning_options) + run_tuning_options['observers'] = None # Determine what type of parallelism and run appropriately if parameter_space and not ensemble and not searchspace: - results, tuning_options_list = self.run_parallel_tuning(tuning_options, parameter_space) + results, tuning_options_list = self.parallel_function_evaluation(run_tuning_options, parameter_space) elif ensemble and searchspace and not parameter_space: - results, tuning_options_list = self.run_parallel_ensemble(ensemble, tuning_options, searchspace) + results, tuning_options_list = self.multi_strategy_parallel_execution(ensemble, run_tuning_options, searchspace) else: raise ValueError("Invalid arguments to parallel runner run method") # Update tuning options - new_tuning_options = 
ray.get(self.cache_manager.get_tuning_options.remote()) - tuning_options.update(new_tuning_options) + # NOTE: tuning options won't have the state of the observers created in the actors as they can't be pickled + cache, cachefile = ray.get(self.cache_manager.get_cache.remote()) + tuning_options.cache = cache + tuning_options.cachefile = cachefile if self.simulation_mode: tuning_options.simulated_time += self._calculate_simulated_time(tuning_options_list) - print(f"DEBUG: simulated_time = {tuning_options.simulated_time}", file=sys.stderr) return results - def run_parallel_ensemble(self, ensemble, tuning_options, searchspace): + def multi_strategy_parallel_execution(self, ensemble, tuning_options, searchspace): """ Runs strategies from the ensemble in parallel using distributed actors, manages dynamic task allocation, and collects results. @@ -86,11 +86,20 @@ def run_parallel_ensemble(self, ensemble, tuning_options, searchspace): ensemble_queue = deque(ensemble) pending_tasks = {} all_results = [] + max_feval = tuning_options.strategy_options["max_fevals"] + num_strategies = len(ensemble) + + # distributing feval to all strategies + base_eval_per_strategy = max_feval // num_strategies + remainder = max_feval % num_strategies + evaluations_per_strategy = [base_eval_per_strategy] * num_strategies + for i in range(remainder): + evaluations_per_strategy[i] += 1 # Start initial tasks for each actor for actor in self.actors: strategy = ensemble_queue.popleft() - remote_tuning_options = self._setup_tuning_options(tuning_options) + remote_tuning_options = self._setup_tuning_options(tuning_options, evaluations_per_strategy) task = actor.execute.remote(strategy=strategy, searchspace=searchspace, tuning_options=remote_tuning_options) pending_tasks[task] = actor @@ -105,7 +114,7 @@ def run_parallel_ensemble(self, ensemble, tuning_options, searchspace): # Reassign actors if strategies remain if ensemble_queue: strategy = ensemble_queue.popleft() - remote_tuning_options = 
self._setup_tuning_options(tuning_options) + remote_tuning_options = self._setup_tuning_options(tuning_options, evaluations_per_strategy) task = actor.execute.remote(strategy=strategy, searchspace=searchspace, tuning_options=remote_tuning_options) pending_tasks[task] = actor @@ -117,13 +126,15 @@ def run_parallel_ensemble(self, ensemble, tuning_options, searchspace): tuning_options.strategy_options["population"] = population if candidates: tuning_options.strategy_options["candidates"] = candidates + return results, tuning_options_list - def _setup_tuning_options(self, tuning_options): + def _setup_tuning_options(self, tuning_options, evaluations_per_strategy): new_tuning_options = copy.deepcopy(tuning_options) if "candidates" in tuning_options.strategy_options: if len(tuning_options.strategy_options["candidates"]) > 0: new_tuning_options.strategy_options["candidate"] = tuning_options.strategy_options["candidates"].pop(0) + new_tuning_options.strategy_options["max_fevals"] = evaluations_per_strategy.pop(0) return new_tuning_options def _process_results_ensemble(self, all_results): @@ -143,11 +154,11 @@ def _process_results_ensemble(self, all_results): return results, tuning_options_list, population, candidates - def run_parallel_tuning(self, tuning_options, parameter_space): + def parallel_function_evaluation(self, tuning_options, parameter_space): # Create a pool of RemoteActor actors self.actor_pool = ActorPool(self.actors) # Distribute execution of the `execute` method across the actor pool with varying parameters and tuning options, collecting the results asynchronously. 
- all_results = list(self.actor_pool.map_unordered(lambda a, v: a.execute.remote(element=v, tuning_options=tuning_options), parameter_space)) + all_results = list(self.actor_pool.map_unordered(lambda a, v: a.execute.remote(tuning_options, element=v), parameter_space)) results = [x[0] for x in all_results] tuning_options_list = [x[1] for x in all_results] return results, tuning_options_list @@ -167,7 +178,13 @@ def _process_results(self, all_results, searchspace): def _calculate_simulated_time(self, tuning_options_list): simulated_times = [] for tuning_options in tuning_options_list: - print(f"DEBUG:_calculate_simulated_time tuning_options.simulated_time = {tuning_options.simulated_time}", file=sys.stderr) simulated_times.append(tuning_options.simulated_time) #simulated_times = [tuning_options.simulated_time for tuning_options in tuning_options_list] - return max(simulated_times) \ No newline at end of file + return max(simulated_times) + + def clean_up_ray(self): + if self.actors is not None: + for actor in self.actors: + ray.kill(actor) + if self.cache_manager is not None: + ray.kill(self.cache_manager) \ No newline at end of file From c6a2f36277c26f8fc60c12dfafce2e2a04a82161 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Tue, 28 May 2024 11:29:54 +0200 Subject: [PATCH 068/106] now the cache manager deals only with the cache and not with the entire tuning option dict --- kernel_tuner/runners/ray/cache_manager.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/kernel_tuner/runners/ray/cache_manager.py b/kernel_tuner/runners/ray/cache_manager.py index 882207f02..9aeb56855 100644 --- a/kernel_tuner/runners/ray/cache_manager.py +++ b/kernel_tuner/runners/ray/cache_manager.py @@ -1,23 +1,23 @@ import ray -import json from kernel_tuner.util import store_cache @ray.remote(num_cpus=1) class CacheManager: - def __init__(self, tuning_options): - self.tuning_options = tuning_options + def __init__(self, cache, cachefile): + from 
kernel_tuner.interface import Options # importing here due to circular import + self.tuning_options = Options({'cache': cache, 'cachefile': cachefile}) def store(self, key, params): store_cache(key, params, self.tuning_options) def check_and_retrieve(self, key): """Checks if a result exists for the given key and returns it if found.""" - if self.tuning_options.cache: - return self.tuning_options.cache.get(key, None) + if self.tuning_options['cache']: + return self.tuning_options['cache'].get(key, None) else: return None - def get_tuning_options(self): + def get_cache(self): """Returns the current tuning options.""" - return self.tuning_options + return self.tuning_options['cache'], self.tuning_options['cachefile'] From 5fe2e56653bb0200a32fa58c33404999b90b2523 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Tue, 28 May 2024 11:30:25 +0200 Subject: [PATCH 069/106] fix related to non-pickleable observers --- kernel_tuner/runners/ray/remote_actor.py | 32 +++++++++++++++++++----- 1 file changed, 26 insertions(+), 6 deletions(-) diff --git a/kernel_tuner/runners/ray/remote_actor.py b/kernel_tuner/runners/ray/remote_actor.py index 3956c8648..96d244c3b 100644 --- a/kernel_tuner/runners/ray/remote_actor.py +++ b/kernel_tuner/runners/ray/remote_actor.py @@ -1,9 +1,11 @@ import ray import sys +import copy from kernel_tuner.runners.sequential import SequentialRunner from kernel_tuner.runners.simulation import SimulationRunner -from kernel_tuner.util import get_nested_types +from kernel_tuner.core import DeviceInterface +from kernel_tuner.observers.register import RegisterObserver @ray.remote class RemoteActor(): @@ -12,26 +14,44 @@ def __init__(self, kernel_options, device_options, iterations, - observers, + observers_type_and_arguments, cache_manager=None, simulation_mode=False): self.kernel_source = kernel_source self.kernel_options = kernel_options self.device_options = device_options self.iterations = iterations - self.observers = observers self.cache_manager = 
cache_manager self.simulation_mode = simulation_mode self.runner = None - + + # observers can't be pickled to the actor so we need to re-initialize them + register_observer = False + self.observers = [] + for (observer, arguments) in observers_type_and_arguments: + if isinstance(observer, RegisterObserver): + register_observer = True + else: + self.observers.append(observer(*arguments)) + # we dont initialize the dev with observers, as this creates a 'invalid resource handle' error down the line + self.dev = DeviceInterface(kernel_source, iterations=iterations, **device_options) if not simulation_mode else None + # the register observer needs dev to be initialized, that's why its done later + if register_observer: + self.observers.append(RegisterObserver(self.dev)) + def execute(self, tuning_options, strategy=None, searchspace=None, element=None): + tuning_options['observers'] = self.observers if self.runner is None: self.init_runner() if strategy and searchspace: - results = strategy.tune(searchspace, self.runner, tuning_options) + results = strategy.tune(searchspace, self.runner, tuning_options) + # observers can't be pickled + tuning_options['observers'] = None return results, tuning_options elif element: - results = self.runner.run([element], tuning_options)[0] + results = self.runner.run([element], tuning_options)[0] + # observers can't be pickled + tuning_options['observers'] = None return results, tuning_options else: raise ValueError("Invalid arguments for ray actor's execute method.") From 3b3317c4ff7e425eec86512e82810bf298937285 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Tue, 28 May 2024 11:31:13 +0200 Subject: [PATCH 070/106] update related to new cache manager --- kernel_tuner/strategies/brute_force.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel_tuner/strategies/brute_force.py b/kernel_tuner/strategies/brute_force.py index ba3d834ad..b08efea03 100644 --- a/kernel_tuner/strategies/brute_force.py +++ 
b/kernel_tuner/strategies/brute_force.py @@ -9,8 +9,8 @@ def tune(searchspace: Searchspace, runner, tuning_options): if isinstance(runner, ParallelRunner): - cache_manager = CacheManager.remote(tuning_options) - return runner.run(searchspace.sorted_list(), tuning_options, cache_manager) + cache_manager = CacheManager.remote(tuning_options.cache, tuning_options.cachefile) + return runner.run(parameter_space=searchspace.sorted_list(), tuning_options=tuning_options, cache_manager=cache_manager) else: return runner.run(searchspace.sorted_list(), tuning_options) From 1593806ba2535864f18e297dd202c623e7ebfea9 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Tue, 28 May 2024 11:32:48 +0200 Subject: [PATCH 071/106] added cleanup at the end of the ensemble --- kernel_tuner/strategies/ensemble.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/kernel_tuner/strategies/ensemble.py b/kernel_tuner/strategies/ensemble.py index 78fd85001..a5268dc18 100644 --- a/kernel_tuner/strategies/ensemble.py +++ b/kernel_tuner/strategies/ensemble.py @@ -54,6 +54,8 @@ } def tune(searchspace: Searchspace, runner, tuning_options, cache_manager=None, actors=None): + clean_up = True if actors is None and cache_manager is None else False + options = tuning_options.strategy_options simulation_mode = True if isinstance(runner, SimulationRunner) else False num_devices = get_num_devices(runner.kernel_source.lang, simulation_mode=simulation_mode) @@ -63,6 +65,9 @@ def tune(searchspace: Searchspace, runner, tuning_options, cache_manager=None, a else: ensemble = ["greedy_ils", "greedy_ils"] ensemble_size = len(ensemble) + + tuning_options.strategy_options["max_fevals"] = options.get("max_fevals", 100 * ensemble_size) + if num_devices < ensemble_size: warnings.warn("Number of devices is less than the number of strategies in the ensemble. 
Some strategies will wait until devices are available.", UserWarning) num_actors = num_devices if ensemble_size > num_devices else ensemble_size @@ -72,5 +77,8 @@ def tune(searchspace: Searchspace, runner, tuning_options, cache_manager=None, a runner.iterations, runner.observers, num_gpus=num_actors, cache_manager=cache_manager, simulation_mode=simulation_mode, actors=actors) final_results = parallel_runner.run(tuning_options=tuning_options, ensemble=ensemble, searchspace=searchspace) + + if clean_up: + parallel_runner.clean_up_ray() return final_results From efd5be20147a6162a19c65057fc6d6c7ff0c86db Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Tue, 28 May 2024 11:34:13 +0200 Subject: [PATCH 072/106] changes to hyperparameters --- kernel_tuner/strategies/memetic.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/kernel_tuner/strategies/memetic.py b/kernel_tuner/strategies/memetic.py index 3f2922f4b..be5c95db4 100644 --- a/kernel_tuner/strategies/memetic.py +++ b/kernel_tuner/strategies/memetic.py @@ -76,6 +76,7 @@ def tune(searchspace: Searchspace, runner, tuning_options): maxiter = options.get("maxiter", 2) popsize = options.get("popsize", 20) max_feval = options.get("max_fevals", None if 'time_limit' in options else 2000) + print(f"DEBUG: local_search={local_search} global_search={global_search} alsd={alsd} lsd={lsd} maxiter={maxiter} popsize={popsize} max_feval={max_feval}", file=sys.stderr) if local_search in ls_strategies_list: tuning_options["ensemble"] = [local_search] * popsize @@ -93,7 +94,7 @@ def tune(searchspace: Searchspace, runner, tuning_options): check_num_devices(num_gpus, simulation_mode, runner) initialize_ray() # Create cache manager, actors and parallel runner - cache_manager = CacheManager.remote(tuning_options) + cache_manager = CacheManager.remote(tuning_options.cache, tuning_options.cachefile) num_actors = num_gpus if num_gpus < popsize else popsize runner_attributes = [runner.kernel_source, 
runner.kernel_options, runner.device_options, runner.iterations, runner.observers] actors = [create_actor_on_device(*runner_attributes, cache_manager, simulation_mode, id) for id in range(num_actors)] @@ -132,7 +133,7 @@ def tune(searchspace: Searchspace, runner, tuning_options): # Local Search (LS) print(f"DEBUG:=================Local Search=================", file=sys.stderr) - tuning_options.strategy_options["max_fevals"] = lsd + tuning_options.strategy_options["max_fevals"] = lsd * popsize pop_start_ls = copy.deepcopy(tuning_options.strategy_options["candidates"]) results = ensemble.tune(searchspace, runner, tuning_options, cache_manager=cache_manager, actors=actors) add_to_results(all_results, all_results_dict, results, tuning_options.tune_params) @@ -154,7 +155,10 @@ def tune(searchspace: Searchspace, runner, tuning_options): if afi_ls > afi_gs: lsd += alsd elif afi_ls < afi_gs: - lsd -= alsd if lsd - alsd > 5 else 5 + lsd -= alsd + # Less than 5 lsd doesn't make sense + if lsd < 5: + lsd = 5 print(f"DEBUG: Adaptive Local Search Depth (ALSD) lsd = {lsd}", file=sys.stderr) ray.kill(cache_manager) From bc66244741d2dc1c41b7f140d2c657f1e8d1d95d Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Tue, 28 May 2024 11:34:46 +0200 Subject: [PATCH 073/106] changes related to non-pickleable observers --- kernel_tuner/strategies/common.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/kernel_tuner/strategies/common.py b/kernel_tuner/strategies/common.py index 787750825..189620649 100644 --- a/kernel_tuner/strategies/common.py +++ b/kernel_tuner/strategies/common.py @@ -10,6 +10,10 @@ from kernel_tuner.searchspace import Searchspace from kernel_tuner.util import get_num_devices from kernel_tuner.runners.ray.remote_actor import RemoteActor +from kernel_tuner.observers.nvml import NVMLObserver, NVMLPowerObserver +from kernel_tuner.observers.pmt import PMTObserver +from kernel_tuner.observers.powersensor import PowerSensorObserver +from 
kernel_tuner.observers.register import RegisterObserver _docstring_template = """ Find the best performing kernel configuration in the parameter space @@ -343,12 +347,22 @@ def create_actor_on_device(kernel_source, kernel_options, device_options, iterat else: resource_options = {"num_gpus": 1} + observers_type_and_arguments = [] + if observers is not None: + # observers can't be pickled so we will re-initialize them in the actors + # observers related to backends will be initialized once we call the device interface inside the actor, that is why we skip them here + for i, observer in enumerate(observers): + if isinstance(observer, (NVMLObserver, NVMLPowerObserver, PMTObserver, PowerSensorObserver)): + observers_type_and_arguments.append((observer.__class__, observer.init_arguments)) + if isinstance(observer, RegisterObserver): + observers_type_and_arguments.append((observer.__class__, [])) + # Create the actor with the specified options and resources return RemoteActor.options(**resource_options).remote(kernel_source, kernel_options, device_options, iterations, - observers, + observers_type_and_arguments=observers_type_and_arguments, cache_manager=cache_manager, simulation_mode=simulation_mode) From 9e9f1afe3ef24696225425aa652f165682e20870 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Fri, 31 May 2024 11:08:35 +0200 Subject: [PATCH 074/106] updated init_arguments to a dict --- kernel_tuner/observers/nvml.py | 17 ++++++++++++++--- kernel_tuner/observers/pmt.py | 4 +++- kernel_tuner/observers/powersensor.py | 5 ++++- 3 files changed, 21 insertions(+), 5 deletions(-) diff --git a/kernel_tuner/observers/nvml.py b/kernel_tuner/observers/nvml.py index 2a496441a..bc93a275b 100644 --- a/kernel_tuner/observers/nvml.py +++ b/kernel_tuner/observers/nvml.py @@ -316,8 +316,14 @@ def __init__( ): """Create an NVMLObserver.""" # needed for re-initializing observer on ray actor - self.init_arguments = [observables, device, save_all, nvidia_smi_fallback, use_locked_clocks, 
continous_duration] - + self.init_arguments = { + "observables": observables, + "device": device, + "save_all": save_all, + "nvidia_smi_fallback": nvidia_smi_fallback, + "use_locked_clocks": use_locked_clocks, + "continous_duration": continous_duration + } if nvidia_smi_fallback: self.nvml = nvml( device, @@ -428,7 +434,12 @@ def __init__(self, observables, parent, nvml_instance, continous_duration=1): self.nvml = nvml_instance # needed for re-initializing observer on ray actor - self.init_arguments = [observables, parent, nvml_instance, continous_duration] + self.init_arguments = { + "observables": observables, + "parent": parent, + "nvml_instance": nvml_instance, + "continous_duration": continous_duration + } supported = ["power_readings", "nvml_power", "nvml_energy"] for obs in observables: diff --git a/kernel_tuner/observers/pmt.py b/kernel_tuner/observers/pmt.py index 750b784bc..f7f652d89 100644 --- a/kernel_tuner/observers/pmt.py +++ b/kernel_tuner/observers/pmt.py @@ -35,7 +35,9 @@ def __init__(self, observable=None): raise ImportError("could not import pmt") # needed for re-initializing observer on ray actor - self.init_arguments = [observable] + self.init_arguments = { + "observable": observable + } # User specifices a dictonary of platforms and corresponding device if type(observable) is dict: diff --git a/kernel_tuner/observers/powersensor.py b/kernel_tuner/observers/powersensor.py index e05b854a6..c946f9d44 100644 --- a/kernel_tuner/observers/powersensor.py +++ b/kernel_tuner/observers/powersensor.py @@ -29,7 +29,10 @@ def __init__(self, observables=None, device=None): raise ImportError("could not import powersensor") # needed for re-initializing observer on ray actor - self.init_arguments = [observables, device] + self.init_arguments = { + "observables": observables, + "device": device + } supported = ["ps_energy", "ps_power"] for obs in observables: From 3fed66cdf87533b11c8833a56170a2cbc811351e Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Fri, 31 
May 2024 11:10:46 +0200 Subject: [PATCH 075/106] updates for searchspace split, ensemble related fix, and observer executing on correct device in parallel mode --- kernel_tuner/runners/parallel.py | 18 ++++++++-- kernel_tuner/runners/ray/remote_actor.py | 35 +++++++++++-------- kernel_tuner/searchspace.py | 44 ++++++++++++++++++++++++ kernel_tuner/strategies/common.py | 5 +-- kernel_tuner/strategies/ensemble.py | 6 +--- kernel_tuner/strategies/memetic.py | 6 ++-- 6 files changed, 88 insertions(+), 26 deletions(-) diff --git a/kernel_tuner/runners/parallel.py b/kernel_tuner/runners/parallel.py index 242d2e1d9..dc579c901 100644 --- a/kernel_tuner/runners/parallel.py +++ b/kernel_tuner/runners/parallel.py @@ -46,7 +46,7 @@ def run(self, parameter_space=None, tuning_options=None, ensemble=None, searchsp # Create RemoteActor instances if self.actors is None: runner_attributes = [self.kernel_source, self.kernel_options, self.device_options, self.iterations, self.observers] - self.actors = [create_actor_on_device(*runner_attributes, self.cache_manager, self.simulation_mode, id) for id in range(self.num_gpus)] + self.actors = [create_actor_on_device(*runner_attributes, id=id, cache_manager=self.cache_manager, simulation_mode=self.simulation_mode) for id in range(self.num_gpus)] if self.cache_manager is None: if cache_manager is None: @@ -86,7 +86,9 @@ def multi_strategy_parallel_execution(self, ensemble, tuning_options, searchspac ensemble_queue = deque(ensemble) pending_tasks = {} all_results = [] - max_feval = tuning_options.strategy_options["max_fevals"] + options = tuning_options.strategy_options + max_feval = options["max_fevals"] + split_searchspace = options["split_searchspace"] if "split_searchspace" in options else False num_strategies = len(ensemble) # distributing feval to all strategies @@ -96,9 +98,17 @@ def multi_strategy_parallel_execution(self, ensemble, tuning_options, searchspac for i in range(remainder): evaluations_per_strategy[i] += 1 + # Ensure we 
always have a list of search spaces + if split_searchspace: + searchspaces = searchspace.split_searchspace(num_strategies) + else: + searchspaces = [searchspace] * num_strategies + searchspaces = deque(searchspaces) + # Start initial tasks for each actor for actor in self.actors: strategy = ensemble_queue.popleft() + searchspace = searchspaces.popleft() remote_tuning_options = self._setup_tuning_options(tuning_options, evaluations_per_strategy) task = actor.execute.remote(strategy=strategy, searchspace=searchspace, tuning_options=remote_tuning_options) pending_tasks[task] = actor @@ -114,6 +124,7 @@ def multi_strategy_parallel_execution(self, ensemble, tuning_options, searchspac # Reassign actors if strategies remain if ensemble_queue: strategy = ensemble_queue.popleft() + searchspace = searchspaces.popleft() remote_tuning_options = self._setup_tuning_options(tuning_options, evaluations_per_strategy) task = actor.execute.remote(strategy=strategy, searchspace=searchspace, tuning_options=remote_tuning_options) pending_tasks[task] = actor @@ -128,6 +139,7 @@ def multi_strategy_parallel_execution(self, ensemble, tuning_options, searchspac tuning_options.strategy_options["candidates"] = candidates return results, tuning_options_list + def _setup_tuning_options(self, tuning_options, evaluations_per_strategy): new_tuning_options = copy.deepcopy(tuning_options) @@ -135,6 +147,8 @@ def _setup_tuning_options(self, tuning_options, evaluations_per_strategy): if len(tuning_options.strategy_options["candidates"]) > 0: new_tuning_options.strategy_options["candidate"] = tuning_options.strategy_options["candidates"].pop(0) new_tuning_options.strategy_options["max_fevals"] = evaluations_per_strategy.pop(0) + # the stop criterion uses the max feval in tuning options for some reason + new_tuning_options["max_fevals"] = new_tuning_options.strategy_options["max_fevals"] return new_tuning_options def _process_results_ensemble(self, all_results): diff --git 
a/kernel_tuner/runners/ray/remote_actor.py b/kernel_tuner/runners/ray/remote_actor.py index 96d244c3b..759a902a1 100644 --- a/kernel_tuner/runners/ray/remote_actor.py +++ b/kernel_tuner/runners/ray/remote_actor.py @@ -15,6 +15,7 @@ def __init__(self, device_options, iterations, observers_type_and_arguments, + id, cache_manager=None, simulation_mode=False): self.kernel_source = kernel_source @@ -24,20 +25,9 @@ def __init__(self, self.cache_manager = cache_manager self.simulation_mode = simulation_mode self.runner = None - - # observers can't be pickled to the actor so we need to re-initialize them - register_observer = False - self.observers = [] - for (observer, arguments) in observers_type_and_arguments: - if isinstance(observer, RegisterObserver): - register_observer = True - else: - self.observers.append(observer(*arguments)) - # we dont initialize the dev with observers, as this creates a 'invalid resource handle' error down the line - self.dev = DeviceInterface(kernel_source, iterations=iterations, **device_options) if not simulation_mode else None - # the register observer needs dev to be initialized, that's why its done later - if register_observer: - self.observers.append(RegisterObserver(self.dev)) + self.id = id + self._reinitialize_observers(observers_type_and_arguments) + def execute(self, tuning_options, strategy=None, searchspace=None, element=None): tuning_options['observers'] = self.observers @@ -72,3 +62,20 @@ def init_runner(self): else: self.runner = SequentialRunner(self.kernel_source, self.kernel_options, self.device_options, self.iterations, self.observers, cache_manager=self.cache_manager) + + def _reinitialize_observers(self, observers_type_and_arguments): + # observers can't be pickled to the actor so we need to re-initialize them + register_observer = False + self.observers = [] + for (observer, arguments) in observers_type_and_arguments: + if "device" in arguments: + arguments["device"] = self.id + if isinstance(observer, 
RegisterObserver): + register_observer = True + else: + self.observers.append(observer(**arguments)) + # we dont initialize the dev with observers, as this creates a 'invalid resource handle' error down the line + self.dev = DeviceInterface(self.kernel_source, iterations=self.iterations, **self.device_options) if not self.simulation_mode else None + # the register observer needs dev to be initialized, that's why its done later + if register_observer: + self.observers.append(RegisterObserver(self.dev)) \ No newline at end of file diff --git a/kernel_tuner/searchspace.py b/kernel_tuner/searchspace.py index 5ee7f7ce2..f68295e93 100644 --- a/kernel_tuner/searchspace.py +++ b/kernel_tuner/searchspace.py @@ -50,6 +50,11 @@ def __init__( restrictions = restrictions if restrictions is not None else [] self.tune_params = tune_params self.restrictions = restrictions + self.max_threads = max_threads + self.block_size_names = block_size_names + self.framework = framework + self.solver_method = solver_method + self.path_to_ATF_cache = path_to_ATF_cache # the searchspace can add commonly used constraints (e.g. 
maxprod(blocks) <= maxthreads) self._modified_restrictions = restrictions self.param_names = list(self.tune_params.keys()) @@ -727,3 +732,42 @@ def order_param_configs( f"The number of ordered parameter configurations ({len(ordered_param_configs)}) differs from the original number of parameter configurations ({len(param_configs)})" ) return ordered_param_configs + + def split_searchspace(self, n: int) -> List['Searchspace']: + """Splits the searchspace into n more or less equal parts using a round-robin approach.""" + if n <= 0: + raise ValueError("Number of parts must be greater than zero.") + if n > self.size: + raise ValueError(f"Cannot split into more parts ({n}) than the size of the searchspace ({self.size}).") + + # Initialize the parts and their corresponding tune_params + parts = [{param: [] for param in self.tune_params} for _ in range(n)] + + # Distribute configurations in a round-robin fashion + for index, config in enumerate(self.list): + part_index = index % n + for j, param in enumerate(self.param_names): + parts[part_index][param].append(config[j]) + + # Remove duplicates and sort parameters within each part + for part_tune_params in parts: + for param in part_tune_params: + part_tune_params[param] = sorted(list(set(part_tune_params[param]))) + + # Create Searchspace objects for each part + searchspace_parts = [] + for part_tune_params in parts: + part_searchspace = Searchspace( + tune_params=part_tune_params, + restrictions=self.restrictions, + max_threads=self.max_threads, + block_size_names=self.block_size_names, + build_neighbors_index=self.build_neighbors_index, + neighbor_method=self.neighbor_method, + framework=self.framework, + solver_method=self.solver_method, + path_to_ATF_cache=self.path_to_ATF_cache + ) + searchspace_parts.append(part_searchspace) + + return searchspace_parts diff --git a/kernel_tuner/strategies/common.py b/kernel_tuner/strategies/common.py index 189620649..47fefd505 100644 --- a/kernel_tuner/strategies/common.py +++ 
b/kernel_tuner/strategies/common.py @@ -53,7 +53,7 @@ def make_strategy_options_doc(strategy_options): def get_options(strategy_options, options): """Get the strategy-specific options or their defaults from user-supplied strategy_options.""" accepted = list(options.keys()) + ["max_fevals", "time_limit", "ensemble", "candidates", "candidate", "population", - "maxiter", "lsd", "popsize", "alsd", ] + "maxiter", "lsd", "popsize", "alsd", "split_searchspace"] for key in strategy_options: if key not in accepted: raise ValueError(f"Unrecognized option {key} in strategy_options") @@ -364,7 +364,8 @@ def create_actor_on_device(kernel_source, kernel_options, device_options, iterat iterations, observers_type_and_arguments=observers_type_and_arguments, cache_manager=cache_manager, - simulation_mode=simulation_mode) + simulation_mode=simulation_mode, + id=id) def initialize_ray(): # Initialize Ray diff --git a/kernel_tuner/strategies/ensemble.py b/kernel_tuner/strategies/ensemble.py index a5268dc18..d8a919399 100644 --- a/kernel_tuner/strategies/ensemble.py +++ b/kernel_tuner/strategies/ensemble.py @@ -59,11 +59,7 @@ def tune(searchspace: Searchspace, runner, tuning_options, cache_manager=None, a simulation_mode = True if isinstance(runner, SimulationRunner) else False num_devices = get_num_devices(runner.kernel_source.lang, simulation_mode=simulation_mode) - ensemble = [] - if "ensemble" in tuning_options: - ensemble = tuning_options.ensemble - else: - ensemble = ["greedy_ils", "greedy_ils"] + ensemble = options.get('ensemble', ["greedy_ils", "greedy_ils"]) ensemble_size = len(ensemble) tuning_options.strategy_options["max_fevals"] = options.get("max_fevals", 100 * ensemble_size) diff --git a/kernel_tuner/strategies/memetic.py b/kernel_tuner/strategies/memetic.py index be5c95db4..3fd5f2b12 100644 --- a/kernel_tuner/strategies/memetic.py +++ b/kernel_tuner/strategies/memetic.py @@ -79,7 +79,7 @@ def tune(searchspace: Searchspace, runner, tuning_options): print(f"DEBUG: 
local_search={local_search} global_search={global_search} alsd={alsd} lsd={lsd} maxiter={maxiter} popsize={popsize} max_feval={max_feval}", file=sys.stderr) if local_search in ls_strategies_list: - tuning_options["ensemble"] = [local_search] * popsize + options["ensemble"] = [local_search] * popsize else: raise ValueError("Provided local search ensemble are not all local search strategies") @@ -88,7 +88,7 @@ def tune(searchspace: Searchspace, runner, tuning_options): else: raise ValueError("Provided population based strategy is not a population based strategy") - tuning_options.strategy_options["population"] = searchspace.get_random_sample(popsize) + options["population"] = searchspace.get_random_sample(popsize) num_gpus = get_num_devices(runner.kernel_source.lang, simulation_mode=simulation_mode) check_num_devices(num_gpus, simulation_mode, runner) @@ -97,7 +97,7 @@ def tune(searchspace: Searchspace, runner, tuning_options): cache_manager = CacheManager.remote(tuning_options.cache, tuning_options.cachefile) num_actors = num_gpus if num_gpus < popsize else popsize runner_attributes = [runner.kernel_source, runner.kernel_options, runner.device_options, runner.iterations, runner.observers] - actors = [create_actor_on_device(*runner_attributes, cache_manager, simulation_mode, id) for id in range(num_actors)] + actors = [create_actor_on_device(*runner_attributes, id=id, cache_manager=cache_manager, simulation_mode=simulation_mode) for id in range(num_actors)] pop_runner = ParallelRunner(runner.kernel_source, runner.kernel_options, runner.device_options, runner.iterations, runner.observers, num_gpus=num_gpus, cache_manager=cache_manager, simulation_mode=simulation_mode, actors=actors) From 86a9b677b6a6b75a22b4be228cd234d0b6cceb00 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Wed, 5 Jun 2024 14:29:11 +0200 Subject: [PATCH 076/106] small corections related to stop criterion for memetic --- kernel_tuner/strategies/greedy_ils.py | 11 ++++++++++- 
kernel_tuner/strategies/memetic.py | 20 ++++---------------- 2 files changed, 14 insertions(+), 17 deletions(-) diff --git a/kernel_tuner/strategies/greedy_ils.py b/kernel_tuner/strategies/greedy_ils.py index 575b89bd2..bbceb76b8 100644 --- a/kernel_tuner/strategies/greedy_ils.py +++ b/kernel_tuner/strategies/greedy_ils.py @@ -35,7 +35,14 @@ def tune(searchspace: Searchspace, runner, tuning_options): if not candidate: candidate = searchspace.get_random_sample(1)[0] old_candidate = candidate # for memetic strategy - best_score = cost_func(candidate, check_restrictions=False) + try: + best_score = cost_func(candidate, check_restrictions=False) + except util.StopCriterionReached as e: + tuning_options.strategy_options["old_candidate"] = old_candidate # for memetic strategy + tuning_options.strategy_options["candidate"] = candidate # for memetic strategy + if tuning_options.verbose: + print(e) + return cost_func.results last_improvement = 0 while fevals < max_fevals: @@ -45,6 +52,8 @@ def tune(searchspace: Searchspace, runner, tuning_options): candidate = base_hillclimb(candidate, neighbor, max_fevals, searchspace, tuning_options, cost_func, restart=restart, randomize=True) new_score = cost_func(candidate, check_restrictions=False) except util.StopCriterionReached as e: + tuning_options.strategy_options["old_candidate"] = old_candidate # for memetic strategy + tuning_options.strategy_options["candidate"] = candidate # for memetic strategy if tuning_options.verbose: print(e) return cost_func.results diff --git a/kernel_tuner/strategies/memetic.py b/kernel_tuner/strategies/memetic.py index 3fd5f2b12..ac42cdecd 100644 --- a/kernel_tuner/strategies/memetic.py +++ b/kernel_tuner/strategies/memetic.py @@ -71,15 +71,15 @@ def tune(searchspace: Searchspace, runner, tuning_options): simulation_mode = True if isinstance(runner, SimulationRunner) else False local_search = options.get('local_search', 'greedy_ils') global_search = options.get('global_search', "genetic_algorithm") - 
alsd = options.get("alsd", 2) # Adaptive Local Search Depth (ALSD) - lsd = options.get("lsd", 25) # Local Search Depth (LSD) - maxiter = options.get("maxiter", 2) + alsd = options.get("alsd", 5) # Adaptive Local Search Depth (ALSD) + lsd = options.get("lsd", 30) # Local Search Depth (LSD) + maxiter = options.get("maxiter", 3) popsize = options.get("popsize", 20) max_feval = options.get("max_fevals", None if 'time_limit' in options else 2000) print(f"DEBUG: local_search={local_search} global_search={global_search} alsd={alsd} lsd={lsd} maxiter={maxiter} popsize={popsize} max_feval={max_feval}", file=sys.stderr) if local_search in ls_strategies_list: - options["ensemble"] = [local_search] * popsize + tuning_options.strategy_options["ensemble"] = [local_search] * popsize else: raise ValueError("Provided local search ensemble are not all local search strategies") @@ -119,12 +119,6 @@ def tune(searchspace: Searchspace, runner, tuning_options): results = global_search.tune(searchspace, pop_runner, tuning_options) add_to_results(all_results, all_results_dict, results, tuning_options.tune_params) feval += maxiter * popsize - try: - check_stop_criterion(tuning_options) - except StopCriterionReached as e: - if tuning_options.verbose: - print(e) - break pop_start_gs_res = get_pop_results(pop_start_gs, all_results_dict) pop_end_gs = copy.deepcopy(tuning_options.strategy_options["population"]) @@ -138,12 +132,6 @@ def tune(searchspace: Searchspace, runner, tuning_options): results = ensemble.tune(searchspace, runner, tuning_options, cache_manager=cache_manager, actors=actors) add_to_results(all_results, all_results_dict, results, tuning_options.tune_params) feval += lsd * popsize - try: - check_stop_criterion(tuning_options) - except StopCriterionReached as e: - if tuning_options.verbose: - print(e) - break pop_start_ls_res = get_pop_results(pop_start_ls, all_results_dict) pop_end_ls = copy.deepcopy(tuning_options.strategy_options["candidates"]) From 
de5fc4948ccda1444605a0eb15001c66de3ca51b Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Fri, 7 Jun 2024 12:22:44 +0200 Subject: [PATCH 077/106] added logic to check if all GPUs are of the same type --- kernel_tuner/runners/parallel.py | 34 ++++++++++++----- kernel_tuner/runners/ray/remote_actor.py | 9 ++++- kernel_tuner/util.py | 47 +++++++++++++----------- 3 files changed, 56 insertions(+), 34 deletions(-) diff --git a/kernel_tuner/runners/parallel.py b/kernel_tuner/runners/parallel.py index dc579c901..e86063ee7 100644 --- a/kernel_tuner/runners/parallel.py +++ b/kernel_tuner/runners/parallel.py @@ -10,7 +10,7 @@ from kernel_tuner.core import DeviceInterface from kernel_tuner.runners.runner import Runner from kernel_tuner.runners.ray.remote_actor import RemoteActor -from kernel_tuner.util import get_num_devices +from kernel_tuner.util import get_num_devices, GPUTypeMismatchError from kernel_tuner.runners.ray.cache_manager import CacheManager from kernel_tuner.strategies.common import create_actor_on_device, initialize_ray @@ -30,12 +30,16 @@ def __init__(self, kernel_source, kernel_options, device_options, iterations, ob self.cache_manager = cache_manager self.num_gpus = num_gpus self.actors = actors - - if num_gpus is None: - self.num_gpus = get_num_devices(kernel_source.lang, simulation_mode=self.simulation_mode) initialize_ray() + if num_gpus is None: + self.num_gpus = get_num_devices(simulation_mode) + + # So we know the number of GPUs in the cache file + if not simulation_mode: + self.dev.name = [self.dev.name] * self.num_gpus + def get_environment(self, tuning_options): return self.dev.get_environment() @@ -46,7 +50,11 @@ def run(self, parameter_space=None, tuning_options=None, ensemble=None, searchsp # Create RemoteActor instances if self.actors is None: runner_attributes = [self.kernel_source, self.kernel_options, self.device_options, self.iterations, self.observers] - self.actors = [create_actor_on_device(*runner_attributes, id=id, 
cache_manager=self.cache_manager, simulation_mode=self.simulation_mode) for id in range(self.num_gpus)] + self.actors = [create_actor_on_device(*runner_attributes, id=_id, cache_manager=self.cache_manager, simulation_mode=self.simulation_mode) for _id in range(self.num_gpus)] + + # Check if all GPUs are of the same type + if not self.simulation_mode and not self._check_gpus_equals(): + raise GPUTypeMismatchError(f"Different GPU types found") if self.cache_manager is None: if cache_manager is None: @@ -88,7 +96,6 @@ def multi_strategy_parallel_execution(self, ensemble, tuning_options, searchspac all_results = [] options = tuning_options.strategy_options max_feval = options["max_fevals"] - split_searchspace = options["split_searchspace"] if "split_searchspace" in options else False num_strategies = len(ensemble) # distributing feval to all strategies @@ -99,10 +106,7 @@ def multi_strategy_parallel_execution(self, ensemble, tuning_options, searchspac evaluations_per_strategy[i] += 1 # Ensure we always have a list of search spaces - if split_searchspace: - searchspaces = searchspace.split_searchspace(num_strategies) - else: - searchspaces = [searchspace] * num_strategies + searchspaces = [searchspace] * num_strategies searchspaces = deque(searchspaces) # Start initial tasks for each actor @@ -196,6 +200,16 @@ def _calculate_simulated_time(self, tuning_options_list): #simulated_times = [tuning_options.simulated_time for tuning_options in tuning_options_list] return max(simulated_times) + def _check_gpus_equals(self): + gpu_types = [] + for actor in self.actors: + gpu_types.append(ray.get(actor.get_gpu_type.remote(self.kernel_source.lang))) + if len(set(gpu_types)) == 1: + print(f"DEBUG: Running on {len(gpu_types)} {gpu_types[0]}", file=sys.stderr) + return True + else: + return False + def clean_up_ray(self): if self.actors is not None: for actor in self.actors: diff --git a/kernel_tuner/runners/ray/remote_actor.py b/kernel_tuner/runners/ray/remote_actor.py index 
759a902a1..8ea23ca1b 100644 --- a/kernel_tuner/runners/ray/remote_actor.py +++ b/kernel_tuner/runners/ray/remote_actor.py @@ -6,6 +6,7 @@ from kernel_tuner.runners.simulation import SimulationRunner from kernel_tuner.core import DeviceInterface from kernel_tuner.observers.register import RegisterObserver +from kernel_tuner.util import get_gpu_id, get_gpu_type @ray.remote class RemoteActor(): @@ -25,7 +26,7 @@ def __init__(self, self.cache_manager = cache_manager self.simulation_mode = simulation_mode self.runner = None - self.id = id + self.id = get_gpu_id(kernel_source.lang) if not simulation_mode else None self._reinitialize_observers(observers_type_and_arguments) @@ -78,4 +79,8 @@ def _reinitialize_observers(self, observers_type_and_arguments): self.dev = DeviceInterface(self.kernel_source, iterations=self.iterations, **self.device_options) if not self.simulation_mode else None # the register observer needs dev to be initialized, that's why its done later if register_observer: - self.observers.append(RegisterObserver(self.dev)) \ No newline at end of file + self.observers.append(RegisterObserver(self.dev)) + + def get_gpu_type(self, lang): + print(f"DEBUG:actor get_gpu_type called", file=sys.stderr) + return get_gpu_type(lang) diff --git a/kernel_tuner/util.py b/kernel_tuner/util.py index 1502521ef..9cd0b0ac2 100644 --- a/kernel_tuner/util.py +++ b/kernel_tuner/util.py @@ -11,6 +11,8 @@ from inspect import signature from types import FunctionType from typing import Optional, Union +import ray +import subprocess import numpy as np from constraint import ( @@ -90,6 +92,9 @@ class SkippableFailure(Exception): class StopCriterionReached(Exception): """Exception thrown when a stop criterion has been reached.""" +class GPUTypeMismatchError(Exception): + """Exception thrown when GPU types are not the same in parallel execution""" + try: import torch @@ -1277,28 +1282,26 @@ def cuda_error_check(error): _, desc = nvrtc.nvrtcGetErrorString(error) raise 
RuntimeError(f"NVRTC error: {desc.decode()}") -def get_num_devices(lang, simulation_mode=False): - num_devices = 0 +def get_num_devices(simulation_mode=False): + resources = ray.cluster_resources() if simulation_mode: - num_devices = int(round(os.cpu_count() * 0.8)) # keep resources for the main process and other tasks - elif lang.upper() == "CUDA": - import pycuda.driver as cuda - cuda.init() - num_devices = cuda.Device.count() - elif lang.upper() == "CUPY": - import cupy - num_devices = cupy.cuda.runtime.getDeviceCount() - elif lang.upper() == "NVCUDA": - import pycuda.driver as cuda - cuda.init() - num_devices = cuda.Device.count() - elif lang.upper() == "OPENCL": - import pyopencl as cl - num_devices = sum(len(platform.get_devices()) for platform in cl.get_platforms()) - elif lang.upper() == "HIP": - from pyhip import hip - num_devices = hip.hipGetDeviceCount() + num_devices = round(resources.get("CPU") * 0.8) else: - raise ValueError(f"Unsupported language: {lang}") + num_devices = resources.get("GPU") + print(f"DEBUG: {num_devices} Ray devices detected", file=sys.stderr) + return int(num_devices) - return num_devices \ No newline at end of file +def get_gpu_id(lang): + if lang == "CUDA" or lang == "CUPY" or lang == "NVCUDA": + gpu_id = os.environ.get("CUDA_VISIBLE_DEVICES") or os.environ.get("NVIDIA_VISIBLE_DEVICES") or "No GPU assigned" + else: + raise NotImplementedError("TODO: implement other languages") + return int(gpu_id) + +def get_gpu_type(lang): + gpu_id = get_gpu_id(lang) + if lang == "CUDA" or lang == "CUPY" or lang == "NVCUDA": + result = subprocess.run(['nvidia-smi', '--query-gpu=gpu_name', '--format=csv,noheader', '-i', str(gpu_id)], capture_output=True, text=True) + return result.stdout.strip() + else: + raise NotImplementedError("TODO: implement other languages") \ No newline at end of file From 1b0adb05b11545a5048cdf40d329e38eae9ef148 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Fri, 7 Jun 2024 12:25:08 +0200 Subject: [PATCH 078/106] 
deleted split searchspace function --- kernel_tuner/searchspace.py | 39 ------------------------------------- 1 file changed, 39 deletions(-) diff --git a/kernel_tuner/searchspace.py b/kernel_tuner/searchspace.py index f68295e93..0317ff434 100644 --- a/kernel_tuner/searchspace.py +++ b/kernel_tuner/searchspace.py @@ -732,42 +732,3 @@ def order_param_configs( f"The number of ordered parameter configurations ({len(ordered_param_configs)}) differs from the original number of parameter configurations ({len(param_configs)})" ) return ordered_param_configs - - def split_searchspace(self, n: int) -> List['Searchspace']: - """Splits the searchspace into n more or less equal parts using a round-robin approach.""" - if n <= 0: - raise ValueError("Number of parts must be greater than zero.") - if n > self.size: - raise ValueError(f"Cannot split into more parts ({n}) than the size of the searchspace ({self.size}).") - - # Initialize the parts and their corresponding tune_params - parts = [{param: [] for param in self.tune_params} for _ in range(n)] - - # Distribute configurations in a round-robin fashion - for index, config in enumerate(self.list): - part_index = index % n - for j, param in enumerate(self.param_names): - parts[part_index][param].append(config[j]) - - # Remove duplicates and sort parameters within each part - for part_tune_params in parts: - for param in part_tune_params: - part_tune_params[param] = sorted(list(set(part_tune_params[param]))) - - # Create Searchspace objects for each part - searchspace_parts = [] - for part_tune_params in parts: - part_searchspace = Searchspace( - tune_params=part_tune_params, - restrictions=self.restrictions, - max_threads=self.max_threads, - block_size_names=self.block_size_names, - build_neighbors_index=self.build_neighbors_index, - neighbor_method=self.neighbor_method, - framework=self.framework, - solver_method=self.solver_method, - path_to_ATF_cache=self.path_to_ATF_cache - ) - searchspace_parts.append(part_searchspace) - 
- return searchspace_parts From 513028679aa2d4a588ec4a33e16399127598978b Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Fri, 7 Jun 2024 12:25:57 +0200 Subject: [PATCH 079/106] changed place where ray is initialized --- kernel_tuner/strategies/memetic.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel_tuner/strategies/memetic.py b/kernel_tuner/strategies/memetic.py index ac42cdecd..083b117a7 100644 --- a/kernel_tuner/strategies/memetic.py +++ b/kernel_tuner/strategies/memetic.py @@ -89,10 +89,10 @@ def tune(searchspace: Searchspace, runner, tuning_options): raise ValueError("Provided population based strategy is not a population based strategy") options["population"] = searchspace.get_random_sample(popsize) - - num_gpus = get_num_devices(runner.kernel_source.lang, simulation_mode=simulation_mode) - check_num_devices(num_gpus, simulation_mode, runner) + initialize_ray() + num_gpus = get_num_devices(simulation_mode=simulation_mode) + check_num_devices(num_gpus, simulation_mode, runner) # Create cache manager, actors and parallel runner cache_manager = CacheManager.remote(tuning_options.cache, tuning_options.cachefile) num_actors = num_gpus if num_gpus < popsize else popsize From 5b9d8178b5df7906baceb1f9b9cd09a6eaedd51b Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Fri, 7 Jun 2024 12:26:44 +0200 Subject: [PATCH 080/106] setting BO to random sampling if needed --- kernel_tuner/strategies/ensemble.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/kernel_tuner/strategies/ensemble.py b/kernel_tuner/strategies/ensemble.py index d8a919399..1ae443240 100644 --- a/kernel_tuner/strategies/ensemble.py +++ b/kernel_tuner/strategies/ensemble.py @@ -57,11 +57,14 @@ def tune(searchspace: Searchspace, runner, tuning_options, cache_manager=None, a clean_up = True if actors is None and cache_manager is None else False options = tuning_options.strategy_options simulation_mode = True if isinstance(runner, SimulationRunner) 
else False - num_devices = get_num_devices(runner.kernel_source.lang, simulation_mode=simulation_mode) + initialize_ray() + num_devices = get_num_devices(simulation_mode=simulation_mode) - ensemble = options.get('ensemble', ["greedy_ils", "greedy_ils"]) + ensemble = options.get('ensemble', ["diff_evo", "diff_evo"]) ensemble_size = len(ensemble) + if 'bayes_opt' in ensemble: # All strategies start from a random sample except for BO + tuning_options.strategy_options["samplingmethod"] = 'random' tuning_options.strategy_options["max_fevals"] = options.get("max_fevals", 100 * ensemble_size) if num_devices < ensemble_size: From 040a57ec5600ee800e92e0fb02556c028ec4dcce Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Fri, 7 Jun 2024 17:39:46 +0200 Subject: [PATCH 081/106] added num_gpus option --- kernel_tuner/interface.py | 10 +++++++--- kernel_tuner/strategies/ensemble.py | 2 +- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/kernel_tuner/interface.py b/kernel_tuner/interface.py index 81ae7de48..63c4c2fff 100644 --- a/kernel_tuner/interface.py +++ b/kernel_tuner/interface.py @@ -618,8 +618,8 @@ def tune_kernel( tuning_options["max_fevals"] = strategy_options["max_fevals"] if strategy_options and "time_limit" in strategy_options: tuning_options["time_limit"] = strategy_options["time_limit"] - if strategy_options and "ensemble" in strategy_options: - tuning_options["ensemble"] = strategy_options["ensemble"] + if strategy_options and "num_gpus" in strategy_options: + tuning_options["num_gpus"] = strategy_options["num_gpus"] logging.debug("tune_kernel called") logging.debug("kernel_options: %s", util.get_config_string(kernel_options)) @@ -661,7 +661,11 @@ def tune_kernel( # select the runner for this job based on input selected_runner = SimulationRunner if simulation_mode else (ParallelRunner if parallel_mode else SequentialRunner) tuning_options.simulated_time = 0 - runner = selected_runner(kernelsource, kernel_options, device_options, iterations, observers) 
+ if parallel_mode: + num_gpus = tuning_options['num_gpus'] if 'num_gpus' in tuning_options else None + runner = selected_runner(kernelsource, kernel_options, device_options, iterations, observers, num_gpus=num_gpus) + else: + runner = selected_runner(kernelsource, kernel_options, device_options, iterations, observers) # the user-specified function may or may not have an optional atol argument; # we normalize it so that it always accepts atol. diff --git a/kernel_tuner/strategies/ensemble.py b/kernel_tuner/strategies/ensemble.py index 1ae443240..4c16b4f8f 100644 --- a/kernel_tuner/strategies/ensemble.py +++ b/kernel_tuner/strategies/ensemble.py @@ -58,7 +58,7 @@ def tune(searchspace: Searchspace, runner, tuning_options, cache_manager=None, a options = tuning_options.strategy_options simulation_mode = True if isinstance(runner, SimulationRunner) else False initialize_ray() - num_devices = get_num_devices(simulation_mode=simulation_mode) + num_devices = tuning_options['num_gpus'] if 'num_gpus' in tuning_options else get_num_devices(simulation_mode=simulation_mode) ensemble = options.get('ensemble', ["diff_evo", "diff_evo"]) ensemble_size = len(ensemble) From acaaeb12a3102fe48e4ead134aa02999e1e5c548 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Mon, 10 Jun 2024 15:38:59 +0200 Subject: [PATCH 082/106] removed debug print --- kernel_tuner/runners/ray/remote_actor.py | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel_tuner/runners/ray/remote_actor.py b/kernel_tuner/runners/ray/remote_actor.py index 8ea23ca1b..219fb6732 100644 --- a/kernel_tuner/runners/ray/remote_actor.py +++ b/kernel_tuner/runners/ray/remote_actor.py @@ -82,5 +82,4 @@ def _reinitialize_observers(self, observers_type_and_arguments): self.observers.append(RegisterObserver(self.dev)) def get_gpu_type(self, lang): - print(f"DEBUG:actor get_gpu_type called", file=sys.stderr) return get_gpu_type(lang) From 63d9f653437398227100d03e9b67c987e1e775cc Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Mon, 
10 Jun 2024 15:39:33 +0200 Subject: [PATCH 083/106] added check_and_retrive strategy option --- kernel_tuner/runners/sequential.py | 2 +- kernel_tuner/strategies/brute_force.py | 1 + kernel_tuner/strategies/ensemble.py | 1 + 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel_tuner/runners/sequential.py b/kernel_tuner/runners/sequential.py index b4fc18c57..3ee43be0f 100644 --- a/kernel_tuner/runners/sequential.py +++ b/kernel_tuner/runners/sequential.py @@ -125,7 +125,7 @@ def run(self, parameter_space, tuning_options): return results def config_in_cache(self, x_int, tuning_options): - if self.cache_manager: + if self.cache_manager and tuning_options.strategy_options['check_and_retrieve']: return ray.get(self.cache_manager.check_and_retrieve.remote(x_int)) elif tuning_options.cache and x_int in tuning_options.cache: return tuning_options.cache[x_int] diff --git a/kernel_tuner/strategies/brute_force.py b/kernel_tuner/strategies/brute_force.py index b08efea03..ac5ae985a 100644 --- a/kernel_tuner/strategies/brute_force.py +++ b/kernel_tuner/strategies/brute_force.py @@ -9,6 +9,7 @@ def tune(searchspace: Searchspace, runner, tuning_options): if isinstance(runner, ParallelRunner): + tuning_options.strategy_options['check_and_retrieve'] = False cache_manager = CacheManager.remote(tuning_options.cache, tuning_options.cachefile) return runner.run(parameter_space=searchspace.sorted_list(), tuning_options=tuning_options, cache_manager=cache_manager) else: diff --git a/kernel_tuner/strategies/ensemble.py b/kernel_tuner/strategies/ensemble.py index 4c16b4f8f..2a19f9f74 100644 --- a/kernel_tuner/strategies/ensemble.py +++ b/kernel_tuner/strategies/ensemble.py @@ -66,6 +66,7 @@ def tune(searchspace: Searchspace, runner, tuning_options, cache_manager=None, a if 'bayes_opt' in ensemble: # All strategies start from a random sample except for BO tuning_options.strategy_options["samplingmethod"] = 'random' tuning_options.strategy_options["max_fevals"] = 
options.get("max_fevals", 100 * ensemble_size) + tuning_options.strategy_options['check_and_retrieve'] = True if num_devices < ensemble_size: warnings.warn("Number of devices is less than the number of strategies in the ensemble. Some strategies will wait until devices are available.", UserWarning) From e604510d7ec5d5d5447ef0953bd23bf1528101b6 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Tue, 18 Jun 2024 10:52:39 +0200 Subject: [PATCH 084/106] moved reinitialization of actor observers to execute method, before was in init --- kernel_tuner/runners/ray/remote_actor.py | 11 ++++++++--- kernel_tuner/strategies/common.py | 2 +- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/kernel_tuner/runners/ray/remote_actor.py b/kernel_tuner/runners/ray/remote_actor.py index 219fb6732..138636def 100644 --- a/kernel_tuner/runners/ray/remote_actor.py +++ b/kernel_tuner/runners/ray/remote_actor.py @@ -27,10 +27,14 @@ def __init__(self, self.simulation_mode = simulation_mode self.runner = None self.id = get_gpu_id(kernel_source.lang) if not simulation_mode else None - self._reinitialize_observers(observers_type_and_arguments) + self.observers_initialized = False + self.observers_type_and_arguments = observers_type_and_arguments def execute(self, tuning_options, strategy=None, searchspace=None, element=None): + if not self.observers_initialized: + self._reinitialize_observers(self.observers_type_and_arguments) + self.observers_initialized = True tuning_options['observers'] = self.observers if self.runner is None: self.init_runner() @@ -65,6 +69,7 @@ def init_runner(self): self.iterations, self.observers, cache_manager=self.cache_manager) def _reinitialize_observers(self, observers_type_and_arguments): + print("DEBUG: reinit observers called", file=sys.stderr) # observers can't be pickled to the actor so we need to re-initialize them register_observer = False self.observers = [] @@ -75,10 +80,10 @@ def _reinitialize_observers(self, observers_type_and_arguments): 
register_observer = True else: self.observers.append(observer(**arguments)) - # we dont initialize the dev with observers, as this creates a 'invalid resource handle' error down the line - self.dev = DeviceInterface(self.kernel_source, iterations=self.iterations, **self.device_options) if not self.simulation_mode else None # the register observer needs dev to be initialized, that's why its done later if register_observer: + # we dont initialize the dev with observers, as this creates a 'invalid resource handle' error down the line + self.dev = DeviceInterface(self.kernel_source, iterations=self.iterations, **self.device_options) if not self.simulation_mode else None self.observers.append(RegisterObserver(self.dev)) def get_gpu_type(self, lang): diff --git a/kernel_tuner/strategies/common.py b/kernel_tuner/strategies/common.py index 47fefd505..6d010a0a9 100644 --- a/kernel_tuner/strategies/common.py +++ b/kernel_tuner/strategies/common.py @@ -53,7 +53,7 @@ def make_strategy_options_doc(strategy_options): def get_options(strategy_options, options): """Get the strategy-specific options or their defaults from user-supplied strategy_options.""" accepted = list(options.keys()) + ["max_fevals", "time_limit", "ensemble", "candidates", "candidate", "population", - "maxiter", "lsd", "popsize", "alsd", "split_searchspace"] + "maxiter", "lsd", "popsize", "alsd", "split_searchspace", "check_and_retrieve"] for key in strategy_options: if key not in accepted: raise ValueError(f"Unrecognized option {key} in strategy_options") From 5933a6974d14b4f5779baa52d374f14e9207378e Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Tue, 18 Jun 2024 18:08:50 +0200 Subject: [PATCH 085/106] changes related to re-initialization of observers in actor init and device interface --- kernel_tuner/runners/parallel.py | 3 ++- kernel_tuner/runners/ray/remote_actor.py | 25 ++++++++++-------------- kernel_tuner/runners/sequential.py | 4 ++-- 3 files changed, 14 insertions(+), 18 deletions(-) diff --git 
a/kernel_tuner/runners/parallel.py b/kernel_tuner/runners/parallel.py index e86063ee7..628c95958 100644 --- a/kernel_tuner/runners/parallel.py +++ b/kernel_tuner/runners/parallel.py @@ -203,7 +203,8 @@ def _calculate_simulated_time(self, tuning_options_list): def _check_gpus_equals(self): gpu_types = [] for actor in self.actors: - gpu_types.append(ray.get(actor.get_gpu_type.remote(self.kernel_source.lang))) + env = ray.get(actor.get_environment.remote()) + gpu_types.append(env["device_name"]) if len(set(gpu_types)) == 1: print(f"DEBUG: Running on {len(gpu_types)} {gpu_types[0]}", file=sys.stderr) return True diff --git a/kernel_tuner/runners/ray/remote_actor.py b/kernel_tuner/runners/ray/remote_actor.py index 138636def..88aac10b6 100644 --- a/kernel_tuner/runners/ray/remote_actor.py +++ b/kernel_tuner/runners/ray/remote_actor.py @@ -26,15 +26,14 @@ def __init__(self, self.cache_manager = cache_manager self.simulation_mode = simulation_mode self.runner = None - self.id = get_gpu_id(kernel_source.lang) if not simulation_mode else None - self.observers_initialized = False - self.observers_type_and_arguments = observers_type_and_arguments - + self.id = None + self._reinitialize_observers(observers_type_and_arguments) + self.dev = DeviceInterface(kernel_source, iterations=iterations, observers=self.observers, **device_options) if not simulation_mode else None + def get_environment(self): + return self.dev.get_environment() + def execute(self, tuning_options, strategy=None, searchspace=None, element=None): - if not self.observers_initialized: - self._reinitialize_observers(self.observers_type_and_arguments) - self.observers_initialized = True tuning_options['observers'] = self.observers if self.runner is None: self.init_runner() @@ -66,25 +65,21 @@ def init_runner(self): self.iterations, self.observers) else: self.runner = SequentialRunner(self.kernel_source, self.kernel_options, self.device_options, - self.iterations, self.observers, cache_manager=self.cache_manager) + 
self.iterations, self.observers, cache_manager=self.cache_manager, dev=self.dev) def _reinitialize_observers(self, observers_type_and_arguments): print("DEBUG: reinit observers called", file=sys.stderr) # observers can't be pickled to the actor so we need to re-initialize them - register_observer = False self.observers = [] for (observer, arguments) in observers_type_and_arguments: if "device" in arguments: + self.id = get_gpu_id(self.kernel_source.lang) if self.id is None else self.id arguments["device"] = self.id if isinstance(observer, RegisterObserver): - register_observer = True + self.observers.append(RegisterObserver()) else: self.observers.append(observer(**arguments)) - # the register observer needs dev to be initialized, that's why its done later - if register_observer: - # we dont initialize the dev with observers, as this creates a 'invalid resource handle' error down the line - self.dev = DeviceInterface(self.kernel_source, iterations=self.iterations, **self.device_options) if not self.simulation_mode else None - self.observers.append(RegisterObserver(self.dev)) + def get_gpu_type(self, lang): return get_gpu_type(lang) diff --git a/kernel_tuner/runners/sequential.py b/kernel_tuner/runners/sequential.py index 3ee43be0f..7fe39858a 100644 --- a/kernel_tuner/runners/sequential.py +++ b/kernel_tuner/runners/sequential.py @@ -12,7 +12,7 @@ class SequentialRunner(Runner): """SequentialRunner is used for tuning with a single process/thread.""" - def __init__(self, kernel_source, kernel_options, device_options, iterations, observers, cache_manager=None): + def __init__(self, kernel_source, kernel_options, device_options, iterations, observers, cache_manager=None, dev=None): """Instantiate the SequentialRunner. 
:param kernel_source: The kernel source @@ -30,7 +30,7 @@ def __init__(self, kernel_source, kernel_options, device_options, iterations, ob :type iterations: int """ #detect language and create high-level device interface - self.dev = DeviceInterface(kernel_source, iterations=iterations, observers=observers, **device_options) + self.dev = DeviceInterface(kernel_source, iterations=iterations, observers=observers, **device_options) if dev is None else dev self.units = self.dev.units self.quiet = device_options.quiet From 4e4c47b42b776ac5a3e13761737264b92d4be902 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Fri, 21 Jun 2024 19:08:11 +0200 Subject: [PATCH 086/106] removed unnecesary blocking ray.get --- kernel_tuner/runners/parallel.py | 13 +++++++++---- kernel_tuner/runners/sequential.py | 2 +- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/kernel_tuner/runners/parallel.py b/kernel_tuner/runners/parallel.py index 628c95958..53bf96160 100644 --- a/kernel_tuner/runners/parallel.py +++ b/kernel_tuner/runners/parallel.py @@ -51,7 +51,11 @@ def run(self, parameter_space=None, tuning_options=None, ensemble=None, searchsp if self.actors is None: runner_attributes = [self.kernel_source, self.kernel_options, self.device_options, self.iterations, self.observers] self.actors = [create_actor_on_device(*runner_attributes, id=_id, cache_manager=self.cache_manager, simulation_mode=self.simulation_mode) for _id in range(self.num_gpus)] - + # actors_ready_futures = [actor.__ray_ready__.remote() for actor in futures] + # ray.wait(actors_ready_futures, num_returns=len(actors_ready_futures), timeout=None) + # self.actors = futures + + # Check if all GPUs are of the same type if not self.simulation_mode and not self._check_gpus_equals(): raise GPUTypeMismatchError(f"Different GPU types found") @@ -63,7 +67,7 @@ def run(self, parameter_space=None, tuning_options=None, ensemble=None, searchsp # set the cache manager for each actor. 
Can't be done in constructor because we do not always yet have the tuning_options for actor in self.actors: - ray.get(actor.set_cache_manager.remote(self.cache_manager)) + actor.set_cache_manager.remote(self.cache_manager) # Some observers can't be pickled run_tuning_options = copy.deepcopy(tuning_options) @@ -202,8 +206,9 @@ def _calculate_simulated_time(self, tuning_options_list): def _check_gpus_equals(self): gpu_types = [] - for actor in self.actors: - env = ray.get(actor.get_environment.remote()) + env_refs = [actor.get_environment.remote() for actor in self.actors] + environments = ray.get(env_refs) + for env in environments: gpu_types.append(env["device_name"]) if len(set(gpu_types)) == 1: print(f"DEBUG: Running on {len(gpu_types)} {gpu_types[0]}", file=sys.stderr) diff --git a/kernel_tuner/runners/sequential.py b/kernel_tuner/runners/sequential.py index 7fe39858a..e19242549 100644 --- a/kernel_tuner/runners/sequential.py +++ b/kernel_tuner/runners/sequential.py @@ -134,6 +134,6 @@ def config_in_cache(self, x_int, tuning_options): def store_in_cache(self, x_int, params, tuning_options): if self.cache_manager: - ray.get(self.cache_manager.store.remote(x_int, params)) + self.cache_manager.store.remote(x_int, params) else: store_cache(x_int, params, tuning_options) \ No newline at end of file From 104205d34560cfbbb7649f014518ce10a7b6ec66 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Mon, 1 Jul 2024 11:33:51 +0200 Subject: [PATCH 087/106] removed debug prints --- kernel_tuner/runners/parallel.py | 2 +- kernel_tuner/runners/ray/remote_actor.py | 1 - kernel_tuner/util.py | 1 - 3 files changed, 1 insertion(+), 3 deletions(-) diff --git a/kernel_tuner/runners/parallel.py b/kernel_tuner/runners/parallel.py index 53bf96160..61b0edf50 100644 --- a/kernel_tuner/runners/parallel.py +++ b/kernel_tuner/runners/parallel.py @@ -211,7 +211,7 @@ def _check_gpus_equals(self): for env in environments: gpu_types.append(env["device_name"]) if len(set(gpu_types)) == 1: - 
print(f"DEBUG: Running on {len(gpu_types)} {gpu_types[0]}", file=sys.stderr) + print(f"Running on {len(gpu_types)} {gpu_types[0]}", file=sys.stderr) return True else: return False diff --git a/kernel_tuner/runners/ray/remote_actor.py b/kernel_tuner/runners/ray/remote_actor.py index 88aac10b6..533dea5b3 100644 --- a/kernel_tuner/runners/ray/remote_actor.py +++ b/kernel_tuner/runners/ray/remote_actor.py @@ -68,7 +68,6 @@ def init_runner(self): self.iterations, self.observers, cache_manager=self.cache_manager, dev=self.dev) def _reinitialize_observers(self, observers_type_and_arguments): - print("DEBUG: reinit observers called", file=sys.stderr) # observers can't be pickled to the actor so we need to re-initialize them self.observers = [] for (observer, arguments) in observers_type_and_arguments: diff --git a/kernel_tuner/util.py b/kernel_tuner/util.py index 9cd0b0ac2..21a6edd08 100644 --- a/kernel_tuner/util.py +++ b/kernel_tuner/util.py @@ -1288,7 +1288,6 @@ def get_num_devices(simulation_mode=False): num_devices = round(resources.get("CPU") * 0.8) else: num_devices = resources.get("GPU") - print(f"DEBUG: {num_devices} Ray devices detected", file=sys.stderr) return int(num_devices) def get_gpu_id(lang): From 123fba516e738f31d552ef167ad02284a276c1e3 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Mon, 1 Jul 2024 14:19:48 +0200 Subject: [PATCH 088/106] added greedy ils esemble instead of default --- test/test_ensemble_tuning.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/test/test_ensemble_tuning.py b/test/test_ensemble_tuning.py index e5c807d43..69efb5a68 100644 --- a/test/test_ensemble_tuning.py +++ b/test/test_ensemble_tuning.py @@ -17,9 +17,11 @@ def env(): kernel_string = """ extern "C" __global__ void vector_add(float *c, float *a, float *b, int n) { - int i = blockIdx.x * block_size_x + threadIdx.x; - if (i 0 \ No newline at end of file From d381011f9ff628f9cdb7552e402134afc2397561 Mon Sep 17 00:00:00 2001 From: Milo 
Lurati Date: Mon, 1 Jul 2024 14:20:23 +0200 Subject: [PATCH 089/106] added check on strategy_options --- kernel_tuner/strategies/brute_force.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel_tuner/strategies/brute_force.py b/kernel_tuner/strategies/brute_force.py index ac5ae985a..1ba83a467 100644 --- a/kernel_tuner/strategies/brute_force.py +++ b/kernel_tuner/strategies/brute_force.py @@ -9,6 +9,8 @@ def tune(searchspace: Searchspace, runner, tuning_options): if isinstance(runner, ParallelRunner): + if tuning_options.strategy_options is None: + tuning_options.strategy_options = {} tuning_options.strategy_options['check_and_retrieve'] = False cache_manager = CacheManager.remote(tuning_options.cache, tuning_options.cachefile) return runner.run(parameter_space=searchspace.sorted_list(), tuning_options=tuning_options, cache_manager=cache_manager) From 7e832e33ca1882f3319dd79911223dbe9be91141 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Mon, 1 Jul 2024 14:20:56 +0200 Subject: [PATCH 090/106] removed all memetic algo related stuff --- kernel_tuner/interface.py | 4 +- kernel_tuner/runners/parallel.py | 28 +-- kernel_tuner/strategies/common.py | 3 +- kernel_tuner/strategies/genetic_algorithm.py | 60 ++--- kernel_tuner/strategies/greedy_ils.py | 25 +-- kernel_tuner/strategies/memetic.py | 224 ------------------- 6 files changed, 27 insertions(+), 317 deletions(-) delete mode 100644 kernel_tuner/strategies/memetic.py diff --git a/kernel_tuner/interface.py b/kernel_tuner/interface.py index 63c4c2fff..e40304d08 100644 --- a/kernel_tuner/interface.py +++ b/kernel_tuner/interface.py @@ -58,8 +58,7 @@ pso, random_sample, simulated_annealing, - ensemble, - memetic + ensemble ) strategy_map = { @@ -79,7 +78,6 @@ "firefly_algorithm": firefly_algorithm, "bayes_opt": bayes_opt, "ensemble": ensemble, - "memetic": memetic, } diff --git a/kernel_tuner/runners/parallel.py b/kernel_tuner/runners/parallel.py index 61b0edf50..8884b89e7 100644 --- 
a/kernel_tuner/runners/parallel.py +++ b/kernel_tuner/runners/parallel.py @@ -51,11 +51,7 @@ def run(self, parameter_space=None, tuning_options=None, ensemble=None, searchsp if self.actors is None: runner_attributes = [self.kernel_source, self.kernel_options, self.device_options, self.iterations, self.observers] self.actors = [create_actor_on_device(*runner_attributes, id=_id, cache_manager=self.cache_manager, simulation_mode=self.simulation_mode) for _id in range(self.num_gpus)] - # actors_ready_futures = [actor.__ray_ready__.remote() for actor in futures] - # ray.wait(actors_ready_futures, num_returns=len(actors_ready_futures), timeout=None) - # self.actors = futures - - + # Check if all GPUs are of the same type if not self.simulation_mode and not self._check_gpus_equals(): raise GPUTypeMismatchError(f"Different GPU types found") @@ -137,43 +133,28 @@ def multi_strategy_parallel_execution(self, ensemble, tuning_options, searchspac task = actor.execute.remote(strategy=strategy, searchspace=searchspace, tuning_options=remote_tuning_options) pending_tasks[task] = actor - # Process results to extract population and candidates for further use - results, tuning_options_list, population, candidates = self._process_results_ensemble(all_results) - - # Update tuning options for memetic strategies - if population: - tuning_options.strategy_options["population"] = population - if candidates: - tuning_options.strategy_options["candidates"] = candidates + # Process results + results, tuning_options_list = self._process_results_ensemble(all_results) return results, tuning_options_list def _setup_tuning_options(self, tuning_options, evaluations_per_strategy): new_tuning_options = copy.deepcopy(tuning_options) - if "candidates" in tuning_options.strategy_options: - if len(tuning_options.strategy_options["candidates"]) > 0: - new_tuning_options.strategy_options["candidate"] = tuning_options.strategy_options["candidates"].pop(0) new_tuning_options.strategy_options["max_fevals"] = 
evaluations_per_strategy.pop(0) # the stop criterion uses the max feval in tuning options for some reason new_tuning_options["max_fevals"] = new_tuning_options.strategy_options["max_fevals"] return new_tuning_options def _process_results_ensemble(self, all_results): - population = [] # for memetic strategy - candidates = [] # for memetic strategy results = [] tuning_options_list = [] for (strategy_results, tuning_options) in all_results: - if "old_candidate" in tuning_options.strategy_options: - candidates.append(tuning_options.strategy_options["old_candidate"]) - if "candidate" in tuning_options.strategy_options: - population.append(tuning_options.strategy_options["candidate"]) results.extend(strategy_results) tuning_options_list.append(tuning_options) - return results, tuning_options_list, population, candidates + return results, tuning_options_list def parallel_function_evaluation(self, tuning_options, parameter_space): @@ -201,7 +182,6 @@ def _calculate_simulated_time(self, tuning_options_list): simulated_times = [] for tuning_options in tuning_options_list: simulated_times.append(tuning_options.simulated_time) - #simulated_times = [tuning_options.simulated_time for tuning_options in tuning_options_list] return max(simulated_times) def _check_gpus_equals(self): diff --git a/kernel_tuner/strategies/common.py b/kernel_tuner/strategies/common.py index 6d010a0a9..7ea022519 100644 --- a/kernel_tuner/strategies/common.py +++ b/kernel_tuner/strategies/common.py @@ -52,8 +52,7 @@ def make_strategy_options_doc(strategy_options): def get_options(strategy_options, options): """Get the strategy-specific options or their defaults from user-supplied strategy_options.""" - accepted = list(options.keys()) + ["max_fevals", "time_limit", "ensemble", "candidates", "candidate", "population", - "maxiter", "lsd", "popsize", "alsd", "split_searchspace", "check_and_retrieve"] + accepted = list(options.keys()) + ["max_fevals", "time_limit", "ensemble", "check_and_retrieve"] for key in 
strategy_options: if key not in accepted: raise ValueError(f"Unrecognized option {key} in strategy_options") diff --git a/kernel_tuner/strategies/genetic_algorithm.py b/kernel_tuner/strategies/genetic_algorithm.py index b082ce3c6..52361a744 100644 --- a/kernel_tuner/strategies/genetic_algorithm.py +++ b/kernel_tuner/strategies/genetic_algorithm.py @@ -7,42 +7,39 @@ from kernel_tuner.searchspace import Searchspace from kernel_tuner.strategies import common from kernel_tuner.strategies.common import CostFunc -from kernel_tuner.runners.parallel import ParallelRunner _options = dict( popsize=("population size", 20), maxiter=("maximum number of generations", 100), method=("crossover method to use, choose any from single_point, two_point, uniform, disruptive_uniform", "uniform"), mutation_chance=("chance to mutate is 1 in mutation_chance", 10), - population=("initial population", None), ) def tune(searchspace: Searchspace, runner, tuning_options): options = tuning_options.strategy_options - pop_size, generations, method, mutation_chance, population = common.get_options(options, _options) + pop_size, generations, method, mutation_chance = common.get_options(options, _options) crossover = supported_methods[method] best_score = 1e20 cost_func = CostFunc(searchspace, tuning_options, runner) - if not population: - population = list(list(p) for p in searchspace.get_random_sample(pop_size)) - else: - pop_size = len(population) - - old_population = population + population = list(list(p) for p in searchspace.get_random_sample(pop_size)) + for generation in range(generations): - # Evaluate the entire population - try: - old_population = population - weighted_population = evaluate_population(runner, cost_func, population) - except util.StopCriterionReached as e: - if tuning_options.verbose: - print(e) - return cost_func.results + # determine fitness of population members + weighted_population = [] + for dna in population: + try: + time = cost_func(dna, check_restrictions=False) + 
except util.StopCriterionReached as e: + if tuning_options.verbose: + print(e) + return cost_func.results + + weighted_population.append((dna, time)) # population is sorted such that better configs have higher chance of reproducing weighted_population.sort(key=lambda x: x[1]) @@ -72,8 +69,7 @@ def tune(searchspace: Searchspace, runner, tuning_options): break # could combine old + new generation here and do a selection - tuning_options.strategy_options["population"] = old_population # for memetic strategy - tuning_options.strategy_options["candidates"] = population # for memetic strategy + return cost_func.results @@ -180,28 +176,4 @@ def disruptive_uniform_crossover(dna1, dna2): "two_point": two_point_crossover, "uniform": uniform_crossover, "disruptive_uniform": disruptive_uniform_crossover, -} - -def evaluate_population(runner, cost_func, population): - """ - Evaluate the population based on the type of runner. - - Parameters: - - runner: The runner (ParallelRunner or SequentialRunner) determining how to process evaluations. - - cost_func: A function capable of evaluating the population. - - population: List of individuals to be evaluated. - - Returns: - - List of tuples (dna, fitness_score) representing the population and their evaluation results. 
- """ - if isinstance(runner, ParallelRunner): - # Process the whole population at once if using a ParallelRunner - results = cost_func(population, check_restrictions=False) - return list(zip(population, results)) - else: - # Process each individual sequentially for SequentialRunner - weighted_population = [] - for dna in population: - time = cost_func(dna, check_restrictions=False) # Cost function called with a single-element list - weighted_population.append((dna, time)) - return weighted_population \ No newline at end of file +} \ No newline at end of file diff --git a/kernel_tuner/strategies/greedy_ils.py b/kernel_tuner/strategies/greedy_ils.py index bbceb76b8..26d15f591 100644 --- a/kernel_tuner/strategies/greedy_ils.py +++ b/kernel_tuner/strategies/greedy_ils.py @@ -9,8 +9,7 @@ _options = dict(neighbor=("Method for selecting neighboring nodes, choose from Hamming or adjacent", "Hamming"), restart=("controls greedyness, i.e. whether to restart from a position as soon as an improvement is found", True), no_improvement=("number of evaluations to exceed without improvement before restarting", 50), - random_walk=("controls greedyness, i.e. whether to restart from a position as soon as an improvement is found", 0.3), - candidate=("initial candidate for the search", None)) + random_walk=("controls greedyness, i.e. 
whether to restart from a position as soon as an improvement is found", 0.3)) def tune(searchspace: Searchspace, runner, tuning_options): @@ -18,7 +17,7 @@ def tune(searchspace: Searchspace, runner, tuning_options): options = tuning_options.strategy_options - neighbor, restart, no_improvement, randomwalk, candidate = common.get_options(options, _options) + neighbor, restart, no_improvement, randomwalk = common.get_options(options, _options) perm_size = int(randomwalk * dna_size) if perm_size == 0: @@ -32,28 +31,16 @@ def tune(searchspace: Searchspace, runner, tuning_options): cost_func = CostFunc(searchspace, tuning_options, runner) #while searching - if not candidate: - candidate = searchspace.get_random_sample(1)[0] - old_candidate = candidate # for memetic strategy - try: - best_score = cost_func(candidate, check_restrictions=False) - except util.StopCriterionReached as e: - tuning_options.strategy_options["old_candidate"] = old_candidate # for memetic strategy - tuning_options.strategy_options["candidate"] = candidate # for memetic strategy - if tuning_options.verbose: - print(e) - return cost_func.results + candidate = searchspace.get_random_sample(1)[0] + best_score = cost_func(candidate, check_restrictions=False) last_improvement = 0 while fevals < max_fevals: try: - old_candidate = candidate # for memetic strategy candidate = base_hillclimb(candidate, neighbor, max_fevals, searchspace, tuning_options, cost_func, restart=restart, randomize=True) new_score = cost_func(candidate, check_restrictions=False) except util.StopCriterionReached as e: - tuning_options.strategy_options["old_candidate"] = old_candidate # for memetic strategy - tuning_options.strategy_options["candidate"] = candidate # for memetic strategy if tuning_options.verbose: print(e) return cost_func.results @@ -66,8 +53,6 @@ def tune(searchspace: Searchspace, runner, tuning_options): # Instead of full restart, permute the starting candidate candidate = random_walk(candidate, perm_size, 
no_improvement, last_improvement, searchspace) - tuning_options.strategy_options["old_candidate"] = old_candidate # for memetic strategy - tuning_options.strategy_options["candidate"] = candidate # for memetic strategy return cost_func.results @@ -78,4 +63,4 @@ def random_walk(indiv, permutation_size, no_improve, last_improve, searchspace: return searchspace.get_random_sample(1)[0] for _ in range(permutation_size): indiv = mutate(indiv, 0, searchspace, cache=False) - return indiv + return indiv \ No newline at end of file diff --git a/kernel_tuner/strategies/memetic.py b/kernel_tuner/strategies/memetic.py deleted file mode 100644 index 083b117a7..000000000 --- a/kernel_tuner/strategies/memetic.py +++ /dev/null @@ -1,224 +0,0 @@ -import logging -import ray -import os -import sys -import copy - -from kernel_tuner.searchspace import Searchspace -from kernel_tuner.runners.parallel import ParallelRunner -from kernel_tuner.runners.simulation import SimulationRunner -from kernel_tuner.runners.sequential import SequentialRunner -from kernel_tuner.runners.ray.cache_manager import CacheManager -from kernel_tuner.strategies.common import check_num_devices, create_actor_on_device, initialize_ray -from kernel_tuner.util import get_num_devices, check_stop_criterion, StopCriterionReached -from kernel_tuner.runners.ray.remote_actor import RemoteActor - -from kernel_tuner.strategies import ( - basinhopping, - bayes_opt, - brute_force, - diff_evo, - dual_annealing, - firefly_algorithm, - genetic_algorithm, - greedy_ils, - greedy_mls, - minimize, - mls, - ordered_greedy_mls, - pso, - random_sample, - simulated_annealing, - ensemble, - memetic -) - -strategy_map = { - "brute_force": brute_force, - "random_sample": random_sample, - "minimize": minimize, - "basinhopping": basinhopping, - "diff_evo": diff_evo, - "genetic_algorithm": genetic_algorithm, - "greedy_mls": greedy_mls, - "ordered_greedy_mls": ordered_greedy_mls, - "greedy_ils": greedy_ils, - "dual_annealing": dual_annealing, - 
"mls": mls, - "pso": pso, - "simulated_annealing": simulated_annealing, - "firefly_algorithm": firefly_algorithm, - "bayes_opt": bayes_opt, -} - -ls_strategies_list = { - "greedy_mls", - "ordered_greedy_mls", - "greedy_ils", - "mls", - "hill_climbing" -} - -pop_based_strategies_list = { - "genetic_algorithm", - "differential_evolution", - "pso" -} - - -def tune(searchspace: Searchspace, runner, tuning_options): - options = tuning_options.strategy_options - simulation_mode = True if isinstance(runner, SimulationRunner) else False - local_search = options.get('local_search', 'greedy_ils') - global_search = options.get('global_search', "genetic_algorithm") - alsd = options.get("alsd", 5) # Adaptive Local Search Depth (ALSD) - lsd = options.get("lsd", 30) # Local Search Depth (LSD) - maxiter = options.get("maxiter", 3) - popsize = options.get("popsize", 20) - max_feval = options.get("max_fevals", None if 'time_limit' in options else 2000) - print(f"DEBUG: local_search={local_search} global_search={global_search} alsd={alsd} lsd={lsd} maxiter={maxiter} popsize={popsize} max_feval={max_feval}", file=sys.stderr) - - if local_search in ls_strategies_list: - tuning_options.strategy_options["ensemble"] = [local_search] * popsize - else: - raise ValueError("Provided local search ensemble are not all local search strategies") - - if global_search in pop_based_strategies_list: - global_search = strategy_map[global_search] - else: - raise ValueError("Provided population based strategy is not a population based strategy") - - options["population"] = searchspace.get_random_sample(popsize) - - initialize_ray() - num_gpus = get_num_devices(simulation_mode=simulation_mode) - check_num_devices(num_gpus, simulation_mode, runner) - # Create cache manager, actors and parallel runner - cache_manager = CacheManager.remote(tuning_options.cache, tuning_options.cachefile) - num_actors = num_gpus if num_gpus < popsize else popsize - runner_attributes = [runner.kernel_source, 
runner.kernel_options, runner.device_options, runner.iterations, runner.observers] - actors = [create_actor_on_device(*runner_attributes, id=id, cache_manager=cache_manager, simulation_mode=simulation_mode) for id in range(num_actors)] - pop_runner = ParallelRunner(runner.kernel_source, runner.kernel_options, runner.device_options, - runner.iterations, runner.observers, num_gpus=num_gpus, cache_manager=cache_manager, - simulation_mode=simulation_mode, actors=actors) - - all_results = [] - all_results_dict = {} - feval = 0 - afi_gs, afi_ls = None, None - while (max_feval is None) or feval < max_feval: - print(f"DEBUG: --------------------NEW ITERATION--------feval = {feval}------------", file=sys.stderr) - if max_feval is not None: - maxiter, lsd = distribute_feval(feval, max_feval, maxiter, lsd, popsize, afi_gs, afi_ls) - print(f"DEBUG: maxiter * popsize = {maxiter * popsize}, lsd = {lsd}", file=sys.stderr) - - # Global Search (GS) - print(f"DEBUG:=================Global Search=================", file=sys.stderr) - tuning_options.strategy_options["maxiter"] = maxiter - pop_start_gs = copy.deepcopy(tuning_options.strategy_options["population"]) - results = global_search.tune(searchspace, pop_runner, tuning_options) - add_to_results(all_results, all_results_dict, results, tuning_options.tune_params) - feval += maxiter * popsize - - pop_start_gs_res = get_pop_results(pop_start_gs, all_results_dict) - pop_end_gs = copy.deepcopy(tuning_options.strategy_options["population"]) - pop_end_gs_res = get_pop_results(pop_end_gs, all_results_dict) - afi_gs = calculate_afi(pop_start_gs_res, pop_end_gs_res, maxiter, all_results_dict) - - # Local Search (LS) - print(f"DEBUG:=================Local Search=================", file=sys.stderr) - tuning_options.strategy_options["max_fevals"] = lsd * popsize - pop_start_ls = copy.deepcopy(tuning_options.strategy_options["candidates"]) - results = ensemble.tune(searchspace, runner, tuning_options, cache_manager=cache_manager, 
actors=actors) - add_to_results(all_results, all_results_dict, results, tuning_options.tune_params) - feval += lsd * popsize - - pop_start_ls_res = get_pop_results(pop_start_ls, all_results_dict) - pop_end_ls = copy.deepcopy(tuning_options.strategy_options["candidates"]) - pop_end_ls_res = get_pop_results(pop_end_ls, all_results_dict) - afi_ls = calculate_afi(pop_start_ls_res, pop_end_ls_res, lsd, all_results_dict) - - # Adaptive Local Search Depth (ALSD) - if afi_gs is not None and afi_ls is not None: - if afi_ls > afi_gs: - lsd += alsd - elif afi_ls < afi_gs: - lsd -= alsd - # Less than 5 lsd doesn't make sense - if lsd < 5: - lsd = 5 - print(f"DEBUG: Adaptive Local Search Depth (ALSD) lsd = {lsd}", file=sys.stderr) - - ray.kill(cache_manager) - for actor in actors: - ray.kill(actor) - - return all_results - -def calculate_afi(pop_before_rs, pop_after_rs, feval, results): - # Average Fitness Increment (AFI) - assert(feval >= 0) - delta_fitness = fitness_increment(pop_before_rs, pop_after_rs) - afi = delta_fitness / feval if feval > 0 else 0 - print(f"DEBUG:calculate_afi afi: {afi}", file=sys.stderr) - return afi - -def fitness_increment(pop_before, pop_after): - if len(pop_before) != len(pop_after): - raise ValueError("populations must have the same size.") - - sum_before = sum(t for t in pop_before if isinstance(t, float)) - sum_after = sum(t for t in pop_after if isinstance(t, float)) - difference_sum = sum_before - sum_after - return difference_sum - -def get_pop_results(pop, results): - print(f"DEBUG:get_pop_results pop = {pop}", file=sys.stderr) - times = [] - for entry in pop: - key = ','.join(map(str, entry)) - if key in results: - time = results[key] - times.append(time) - else: - times.append(None) - - print(f"DEBUG:get_pop_results times = {times}", file=sys.stderr) - return times - -def add_to_results(all_results, all_results_dict, results, tune_params): - for result in results: - key = ",".join(str(result[param]) for param in tune_params) - 
all_results_dict[key] = result["time"] - all_results.append(result) - -def distribute_feval(feval, max_feval, maxiter, lsd, popsize, afi_gs, afi_ls): - remaining_feval = max_feval - feval - if remaining_feval < (lsd + maxiter) * popsize: - # Calculate how many full batches of popsize can still be processed - proportion = remaining_feval // popsize - - if afi_gs is None or afi_ls is None: - maxiter = int(proportion * 0.5) - lsd = int(proportion * 0.5) - else: - if afi_gs > afi_ls: - # More evaluations to maxiter - maxiter = int(proportion * 0.6) - lsd = int(proportion * 0.4) - else: - # More evaluations to lsd - maxiter = int(proportion * 0.4) - lsd = int(proportion * 0.6) - - # If maxiter ends up being 1, assign all remaining feval to lsd - if maxiter == 1: - lsd = proportion # Give all available batches to lsd - maxiter = 0 - - # Ensure at least one of maxiter or lsd is non-zero if there are still fevals to be used - if maxiter == 0 and lsd == 0 and remaining_feval > 0: - lsd = 1 # Allocate at least one batch to lsd to ensure progress - - return maxiter, lsd - \ No newline at end of file From 65d32c1daf2f763b3472608d3be48f9186b25e79 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Mon, 1 Jul 2024 14:29:02 +0200 Subject: [PATCH 091/106] added ray to pyproject.toml --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 3175ed34a..0209574c2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -65,6 +65,7 @@ python-constraint2 = "^2.0.0b5" xmltodict = "*" pandas = ">=2.0.0" scikit-learn = ">=1.0.2" +ray = ">=2.9.1" # Torch can be used with Kernel Tuner, but is not a dependency, should be up to the user to use it # List of optional dependencies for user installation, e.g. `pip install kernel_tuner[cuda]`, used in the below `extras`. 
From 503df1b81b77b40d15d031cf2e8a7acf94540fbc Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Mon, 1 Jul 2024 14:42:40 +0200 Subject: [PATCH 092/106] updated toml file with ray dashboard --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index b48f7e458..721c60e7b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -65,7 +65,7 @@ python-constraint2 = "^2.0.0b5" xmltodict = "*" pandas = ">=2.0.0" scikit-learn = ">=1.0.2" -ray = ">=2.9.1" +ray = { version = ">=2.9.1", extras = ["default"] } # Torch can be used with Kernel Tuner, but is not a dependency, should be up to the user to use it # List of optional dependencies for user installation, e.g. `pip install kernel_tuner[cuda]`, used in the below `extras`. From c126a011e19a51d5eb31767638c80b1644d7c202 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Mon, 1 Jul 2024 18:20:09 +0200 Subject: [PATCH 093/106] fix small bug in _evaluate_configs --- kernel_tuner/strategies/common.py | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel_tuner/strategies/common.py b/kernel_tuner/strategies/common.py index 7ea022519..5e4dba354 100644 --- a/kernel_tuner/strategies/common.py +++ b/kernel_tuner/strategies/common.py @@ -194,7 +194,6 @@ def _evaluate_configs(self, configs): # in case of stop creterion reached, save the results so far self.results.append(result) - self.results.extend(final_results) # upon returning from this function control will be given back to the strategy, so reset the start time self.runner.last_strategy_start_time = perf_counter() From 4df1b0d872f85a0a32593620a9f2068ff9ce9e62 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Mon, 1 Jul 2024 18:33:49 +0200 Subject: [PATCH 094/106] adapted test for ensemble --- test/strategies/test_strategies.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/strategies/test_strategies.py b/test/strategies/test_strategies.py index 096be38b0..1001aabec 100644 --- 
a/test/strategies/test_strategies.py +++ b/test/strategies/test_strategies.py @@ -36,7 +36,7 @@ def vector_add(): @pytest.mark.parametrize('strategy', strategy_map) def test_strategies(vector_add, strategy): - options = dict(popsize=5, neighbor='adjacent') + options = dict(popsize=5) print(f"testing {strategy}") From 29a507cc44812c2d40998c696fb79ea3a1151f9c Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Mon, 1 Jul 2024 18:46:36 +0200 Subject: [PATCH 095/106] cleaned up not used imports --- kernel_tuner/runners/parallel.py | 3 --- kernel_tuner/runners/ray/remote_actor.py | 2 -- kernel_tuner/strategies/ensemble.py | 15 +-------------- 3 files changed, 1 insertion(+), 19 deletions(-) diff --git a/kernel_tuner/runners/parallel.py b/kernel_tuner/runners/parallel.py index 8884b89e7..e81341160 100644 --- a/kernel_tuner/runners/parallel.py +++ b/kernel_tuner/runners/parallel.py @@ -1,7 +1,5 @@ -import logging import ray import sys -import os from ray.util.actor_pool import ActorPool from time import perf_counter from collections import deque @@ -9,7 +7,6 @@ from kernel_tuner.core import DeviceInterface from kernel_tuner.runners.runner import Runner -from kernel_tuner.runners.ray.remote_actor import RemoteActor from kernel_tuner.util import get_num_devices, GPUTypeMismatchError from kernel_tuner.runners.ray.cache_manager import CacheManager from kernel_tuner.strategies.common import create_actor_on_device, initialize_ray diff --git a/kernel_tuner/runners/ray/remote_actor.py b/kernel_tuner/runners/ray/remote_actor.py index 533dea5b3..c0743ad22 100644 --- a/kernel_tuner/runners/ray/remote_actor.py +++ b/kernel_tuner/runners/ray/remote_actor.py @@ -1,6 +1,4 @@ import ray -import sys -import copy from kernel_tuner.runners.sequential import SequentialRunner from kernel_tuner.runners.simulation import SimulationRunner diff --git a/kernel_tuner/strategies/ensemble.py b/kernel_tuner/strategies/ensemble.py index 2a19f9f74..9cdc0b90e 100644 --- a/kernel_tuner/strategies/ensemble.py 
+++ b/kernel_tuner/strategies/ensemble.py @@ -1,22 +1,9 @@ -import random -import sys -import os -import ray -import copy -import logging import warnings -from collections import deque -import numpy as np - -from kernel_tuner import util from kernel_tuner.searchspace import Searchspace -from kernel_tuner.strategies import common -from kernel_tuner.strategies.common import CostFunc, scale_from_params, check_num_devices, create_actor_on_device, initialize_ray +from kernel_tuner.strategies.common import initialize_ray from kernel_tuner.runners.simulation import SimulationRunner -from kernel_tuner.runners.ray.remote_actor import RemoteActor from kernel_tuner.util import get_num_devices -from kernel_tuner.runners.ray.cache_manager import CacheManager from kernel_tuner.runners.parallel import ParallelRunner from kernel_tuner.strategies import ( From 7c49a29da61886af990abc8c72fb3128619f272e Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Mon, 1 Jul 2024 18:56:53 +0200 Subject: [PATCH 096/106] added comments --- kernel_tuner/runners/parallel.py | 18 ++++++++++++++++++ kernel_tuner/strategies/ensemble.py | 4 ++++ 2 files changed, 22 insertions(+) diff --git a/kernel_tuner/runners/parallel.py b/kernel_tuner/runners/parallel.py index e81341160..871b93228 100644 --- a/kernel_tuner/runners/parallel.py +++ b/kernel_tuner/runners/parallel.py @@ -144,6 +144,9 @@ def _setup_tuning_options(self, tuning_options, evaluations_per_strategy): return new_tuning_options def _process_results_ensemble(self, all_results): + """ + Process the results from the ensemble execution. + """ results = [] tuning_options_list = [] @@ -155,6 +158,9 @@ def _process_results_ensemble(self, all_results): def parallel_function_evaluation(self, tuning_options, parameter_space): + """ + Perform parallel function evaluation. 
+ """ # Create a pool of RemoteActor actors self.actor_pool = ActorPool(self.actors) # Distribute execution of the `execute` method across the actor pool with varying parameters and tuning options, collecting the results asynchronously. @@ -164,6 +170,9 @@ def parallel_function_evaluation(self, tuning_options, parameter_space): return results, tuning_options_list def _process_results(self, all_results, searchspace): + """ + Process the results and remove duplicates based on the searchspace. + """ unique_configs = set() final_results = [] @@ -176,12 +185,18 @@ def _process_results(self, all_results, searchspace): return final_results def _calculate_simulated_time(self, tuning_options_list): + """ + Calculate the maximum simulated time from the list of tuning options. + """ simulated_times = [] for tuning_options in tuning_options_list: simulated_times.append(tuning_options.simulated_time) return max(simulated_times) def _check_gpus_equals(self): + """ + Check if all GPUs are of the same type. + """ gpu_types = [] env_refs = [actor.get_environment.remote() for actor in self.actors] environments = ray.get(env_refs) @@ -194,6 +209,9 @@ def _check_gpus_equals(self): return False def clean_up_ray(self): + """ + Clean up Ray actors and cache manager. 
+ """ if self.actors is not None: for actor in self.actors: ray.kill(actor) diff --git a/kernel_tuner/strategies/ensemble.py b/kernel_tuner/strategies/ensemble.py index 9cdc0b90e..7e66f0360 100644 --- a/kernel_tuner/strategies/ensemble.py +++ b/kernel_tuner/strategies/ensemble.py @@ -50,19 +50,23 @@ def tune(searchspace: Searchspace, runner, tuning_options, cache_manager=None, a ensemble = options.get('ensemble', ["diff_evo", "diff_evo"]) ensemble_size = len(ensemble) + # setup strategy options if 'bayes_opt' in ensemble: # All strategies start from a random sample except for BO tuning_options.strategy_options["samplingmethod"] = 'random' tuning_options.strategy_options["max_fevals"] = options.get("max_fevals", 100 * ensemble_size) tuning_options.strategy_options['check_and_retrieve'] = True + # define number of ray actors needed if num_devices < ensemble_size: warnings.warn("Number of devices is less than the number of strategies in the ensemble. Some strategies will wait until devices are available.", UserWarning) num_actors = num_devices if ensemble_size > num_devices else ensemble_size ensemble = [strategy_map[strategy] for strategy in ensemble] + parallel_runner = ParallelRunner(runner.kernel_source, runner.kernel_options, runner.device_options, runner.iterations, runner.observers, num_gpus=num_actors, cache_manager=cache_manager, simulation_mode=simulation_mode, actors=actors) + final_results = parallel_runner.run(tuning_options=tuning_options, ensemble=ensemble, searchspace=searchspace) if clean_up: From eb5db41fb0bee44ed7a606e4e88c9ecc3d77f877 Mon Sep 17 00:00:00 2001 From: Milo Lurati Date: Thu, 4 Jul 2024 16:52:37 +0200 Subject: [PATCH 097/106] added documentation and related fixes --- doc/source/optimization.rst | 1 + kernel_tuner/interface.py | 1 + kernel_tuner/runners/parallel.py | 99 ++++++++++++++++++++++++-- kernel_tuner/runners/sequential.py | 6 ++ kernel_tuner/strategies/brute_force.py | 2 +- kernel_tuner/strategies/ensemble.py | 21 ++++-- 6 files 
changed, 119 insertions(+), 11 deletions(-) diff --git a/doc/source/optimization.rst b/doc/source/optimization.rst index 59219ad51..2b8dd8987 100644 --- a/doc/source/optimization.rst +++ b/doc/source/optimization.rst @@ -25,6 +25,7 @@ the ``strategy=`` optional argument of ``tune_kernel()``. Kernel Tuner currently * "pso" particle swarm optimization * "random_sample" takes a random sample of the search space * "simulated_annealing" simulated annealing strategy + * "ensemble" ensemble strategy Most strategies have some mechanism built in to detect when to stop tuning, which may be controlled through specific parameters that can be passed to the strategies using the ``strategy_options=`` optional argument of ``tune_kernel()``. You diff --git a/kernel_tuner/interface.py b/kernel_tuner/interface.py index e40304d08..0be907737 100644 --- a/kernel_tuner/interface.py +++ b/kernel_tuner/interface.py @@ -467,6 +467,7 @@ def __deepcopy__(self, _): ), ("metrics", ("specifies user-defined metrics, please see :ref:`metrics`.", "dict")), ("simulation_mode", ("Simulate an auto-tuning search from an existing cachefile", "bool")), + ("parallel_mode", ("Run the auto-tuning on multiple devices (brute-force execution)", "bool")), ("observers", ("""A list of Observers to use during tuning, please see :ref:`observers`.""", "list")), ] ) diff --git a/kernel_tuner/runners/parallel.py b/kernel_tuner/runners/parallel.py index 871b93228..a7f2d95fc 100644 --- a/kernel_tuner/runners/parallel.py +++ b/kernel_tuner/runners/parallel.py @@ -12,9 +12,41 @@ from kernel_tuner.strategies.common import create_actor_on_device, initialize_ray class ParallelRunner(Runner): + """ParallelRunner is used for tuning with multiple processes/threads using Ray for distributed computing.""" def __init__(self, kernel_source, kernel_options, device_options, iterations, observers, num_gpus=None, cache_manager=None, actors=None, simulation_mode=False): + """Instantiate the ParallelRunner. 
+ + :param kernel_source: The kernel source + :type kernel_source: kernel_tuner.core.KernelSource + + :param kernel_options: A dictionary with all options for the kernel. + :type kernel_options: kernel_tuner.interface.Options + + :param device_options: A dictionary with all options for the device + on which the kernel should be tuned. + :type device_options: kernel_tuner.interface.Options + + :param iterations: The number of iterations used for benchmarking + each kernel instance. + :type iterations: int + + :param observers: List of observers. + :type observers: list + + :param num_gpus: Number of GPUs to use. Defaults to None. + :type num_gpus: int, optional + + :param cache_manager: Cache manager instance. Defaults to None. + :type cache_manager: kernel_tuner.runners.ray.cache_manager.CacheManager, optional + + :param actors: List of pre-initialized actors. Defaults to None. + :type actors: list, optional + + :param simulation_mode: Flag to indicate simulation mode. Defaults to False. + :type simulation_mode: bool, optional + """ self.dev = DeviceInterface(kernel_source, iterations=iterations, observers=observers, **device_options) if not simulation_mode else None self.kernel_source = kernel_source self.simulation_mode = simulation_mode @@ -41,6 +73,26 @@ def get_environment(self, tuning_options): return self.dev.get_environment() def run(self, parameter_space=None, tuning_options=None, ensemble=None, searchspace=None, cache_manager=None): + """Run the tuning process with parallel execution. + + :param parameter_space: The parameter space to explore. + :type parameter_space: iterable + + :param tuning_options: Tuning options. Defaults to None. + :type tuning_options: dict, optional + + :param ensemble: List of strategies for ensemble. Defaults to None. + :type ensemble: list, optional + + :param searchspace: The search space to explore. Defaults to None. 
+ :type searchspace: kernel_tuner.searchspace.Searchspace, optional + + :param cache_manager: Cache manager instance. Defaults to None. + :type cache_manager: kernel_tuner.runners.ray.cache_manager.CacheManager, optional + + :returns: Results of the tuning process. + :rtype: list of dict + """ if tuning_options is None: #HACK as tuning_options can't be the first argument and parameter_space needs to be a default argument raise ValueError("tuning_options cannot be None") @@ -84,9 +136,20 @@ def run(self, parameter_space=None, tuning_options=None, ensemble=None, searchsp return results def multi_strategy_parallel_execution(self, ensemble, tuning_options, searchspace): - """ - Runs strategies from the ensemble in parallel using distributed actors, + """Runs strategies from the ensemble in parallel using distributed actors, manages dynamic task allocation, and collects results. + + :param ensemble: List of strategies to execute. + :type ensemble: list + + :param tuning_options: Tuning options. + :type tuning_options: dict + + :param searchspace: Search space to explore. + :type searchspace: kernel_tuner.searchspace.Searchspace + + :returns: Processed results and tuning options list. + :rtype: tuple """ ensemble_queue = deque(ensemble) pending_tasks = {} @@ -137,6 +200,17 @@ def multi_strategy_parallel_execution(self, ensemble, tuning_options, searchspac def _setup_tuning_options(self, tuning_options, evaluations_per_strategy): + """Set up tuning options for each strategy in the ensemble. + + :param tuning_options: Original tuning options. + :type tuning_options: dict + + :param evaluations_per_strategy: Number of evaluations per strategy. + :type evaluations_per_strategy: list + + :returns: Modified tuning options. 
+ :rtype: dict + """ new_tuning_options = copy.deepcopy(tuning_options) new_tuning_options.strategy_options["max_fevals"] = evaluations_per_strategy.pop(0) # the stop criterion uses the max feval in tuning options for some reason @@ -144,8 +218,13 @@ def _setup_tuning_options(self, tuning_options, evaluations_per_strategy): return new_tuning_options def _process_results_ensemble(self, all_results): - """ - Process the results from the ensemble execution. + """Process the results from the ensemble execution. + + :param all_results: List of results from all strategies. + :type all_results: list + + :returns: Processed results and tuning options list. + :rtype: tuple """ results = [] tuning_options_list = [] @@ -158,8 +237,16 @@ def _process_results_ensemble(self, all_results): def parallel_function_evaluation(self, tuning_options, parameter_space): - """ - Perform parallel function evaluation. + """Perform parallel function evaluation. + + :param tuning_options: Tuning options. + :type tuning_options: dict + + :param parameter_space: Parameter space to explore. + :type parameter_space: list + + :returns: Results and tuning options list. + :rtype: tuple """ # Create a pool of RemoteActor actors self.actor_pool = ActorPool(self.actors) diff --git a/kernel_tuner/runners/sequential.py b/kernel_tuner/runners/sequential.py index e19242549..46ba17e0a 100644 --- a/kernel_tuner/runners/sequential.py +++ b/kernel_tuner/runners/sequential.py @@ -28,6 +28,12 @@ def __init__(self, kernel_source, kernel_options, device_options, iterations, ob :param iterations: The number of iterations used for benchmarking each kernel instance. :type iterations: int + + :param observers: List of observers. + :type observers: list + + :param cache_manager: Cache manager instance. Defaults to None. 
+ :type cache_manager: kernel_tuner.runners.ray.cache_manager.CacheManager, optional """ #detect language and create high-level device interface self.dev = DeviceInterface(kernel_source, iterations=iterations, observers=observers, **device_options) if dev is None else dev diff --git a/kernel_tuner/strategies/brute_force.py b/kernel_tuner/strategies/brute_force.py index 1ba83a467..cf6ba521b 100644 --- a/kernel_tuner/strategies/brute_force.py +++ b/kernel_tuner/strategies/brute_force.py @@ -4,7 +4,7 @@ from kernel_tuner.runners.parallel import ParallelRunner from kernel_tuner.runners.ray.cache_manager import CacheManager -_options = {} +_options = dict(num_gpus=("Number of gpus to run parallel execution", None)) def tune(searchspace: Searchspace, runner, tuning_options): diff --git a/kernel_tuner/strategies/ensemble.py b/kernel_tuner/strategies/ensemble.py index 7e66f0360..2dab125f4 100644 --- a/kernel_tuner/strategies/ensemble.py +++ b/kernel_tuner/strategies/ensemble.py @@ -1,6 +1,11 @@ +""" +The ensemble strategy that optimizes the search through the parameter space using a combination of multiple strategies. 
+""" + import warnings from kernel_tuner.searchspace import Searchspace +from kernel_tuner.strategies import common from kernel_tuner.strategies.common import initialize_ray from kernel_tuner.runners.simulation import SimulationRunner from kernel_tuner.util import get_num_devices @@ -40,20 +45,26 @@ "bayes_opt": bayes_opt, } +_options = dict( + ensemble=("List of strategies to be used in the ensemble", ["random_sample", "random_sample"]), + max_fevals=("Maximum number of function evaluations", None), + num_gpus=("Number of gpus to run the parallel ensemble on", None) +) + def tune(searchspace: Searchspace, runner, tuning_options, cache_manager=None, actors=None): clean_up = True if actors is None and cache_manager is None else False options = tuning_options.strategy_options simulation_mode = True if isinstance(runner, SimulationRunner) else False initialize_ray() - num_devices = tuning_options['num_gpus'] if 'num_gpus' in tuning_options else get_num_devices(simulation_mode=simulation_mode) - - ensemble = options.get('ensemble', ["diff_evo", "diff_evo"]) + + ensemble, max_fevals, num_gpus =common.get_options(tuning_options.strategy_options, _options) + num_devices = num_gpus if num_gpus is not None else get_num_devices(simulation_mode=simulation_mode) ensemble_size = len(ensemble) # setup strategy options if 'bayes_opt' in ensemble: # All strategies start from a random sample except for BO tuning_options.strategy_options["samplingmethod"] = 'random' - tuning_options.strategy_options["max_fevals"] = options.get("max_fevals", 100 * ensemble_size) + tuning_options.strategy_options["max_fevals"] = 100 * ensemble_size if max_fevals is None else max_fevals tuning_options.strategy_options['check_and_retrieve'] = True # define number of ray actors needed @@ -73,3 +84,5 @@ def tune(searchspace: Searchspace, runner, tuning_options, cache_manager=None, a parallel_runner.clean_up_ray() return final_results + +tune.__doc__ = common.get_strategy_docstring("Ensemble", _options) \ 
No newline at end of file From bab28ef9c293f9380ac1b37db774c58d0be83d63 Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Tue, 25 Mar 2025 15:19:40 +0100 Subject: [PATCH 098/106] Typo. --- kernel_tuner/observers/nvml.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel_tuner/observers/nvml.py b/kernel_tuner/observers/nvml.py index 7d1809ef2..d468d6391 100644 --- a/kernel_tuner/observers/nvml.py +++ b/kernel_tuner/observers/nvml.py @@ -333,7 +333,7 @@ def __init__( "save_all": save_all, "nvidia_smi_fallback": nvidia_smi_fallback, "use_locked_clocks": use_locked_clocks, - "continous_duration": continous_duration + "continous_duration": continuous_duration } if nvidia_smi_fallback: self.nvml = nvml( From 5b93b9c01fba2c34ecab4323b739146743d86d0a Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Tue, 25 Mar 2025 15:41:19 +0100 Subject: [PATCH 099/106] Remove Observer that does not exist anymore. --- kernel_tuner/strategies/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel_tuner/strategies/common.py b/kernel_tuner/strategies/common.py index 5e4dba354..db5f36e4d 100644 --- a/kernel_tuner/strategies/common.py +++ b/kernel_tuner/strategies/common.py @@ -10,7 +10,7 @@ from kernel_tuner.searchspace import Searchspace from kernel_tuner.util import get_num_devices from kernel_tuner.runners.ray.remote_actor import RemoteActor -from kernel_tuner.observers.nvml import NVMLObserver, NVMLPowerObserver +from kernel_tuner.observers.nvml import NVMLObserver from kernel_tuner.observers.pmt import PMTObserver from kernel_tuner.observers.powersensor import PowerSensorObserver from kernel_tuner.observers.register import RegisterObserver From 20b7c96854ebe8fa11de37fb509286af36a7568b Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Tue, 25 Mar 2025 15:44:30 +0100 Subject: [PATCH 100/106] Remove spurious parameter to function. 
--- kernel_tuner/strategies/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel_tuner/strategies/common.py b/kernel_tuner/strategies/common.py index db5f36e4d..6d5fdb892 100644 --- a/kernel_tuner/strategies/common.py +++ b/kernel_tuner/strategies/common.py @@ -331,7 +331,7 @@ def scale_from_params(params, tune_params, eps): def check_num_devices(ensemble_size: int, simulation_mode: bool, runner): - num_devices = get_num_devices(runner.kernel_source.lang, simulation_mode=simulation_mode) + num_devices = get_num_devices(simulation_mode=simulation_mode) if num_devices < ensemble_size: warnings.warn("Number of devices is less than the number of strategies in the ensemble. Some strategies will wait until devices are available.", UserWarning) From 50473a2e7c3b4bc129b9f53dcf990c9f870f5634 Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Tue, 25 Mar 2025 15:48:32 +0100 Subject: [PATCH 101/106] Remove nonexistent observer. --- kernel_tuner/strategies/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel_tuner/strategies/common.py b/kernel_tuner/strategies/common.py index 6d5fdb892..c4e00bb8d 100644 --- a/kernel_tuner/strategies/common.py +++ b/kernel_tuner/strategies/common.py @@ -350,7 +350,7 @@ def create_actor_on_device(kernel_source, kernel_options, device_options, iterat # observers can't be pickled so we will re-initialize them in the actors # observers related to backends will be initialized once we call the device interface inside the actor, that is why we skip them here for i, observer in enumerate(observers): - if isinstance(observer, (NVMLObserver, NVMLPowerObserver, PMTObserver, PowerSensorObserver)): + if isinstance(observer, (NVMLObserver, PMTObserver, PowerSensorObserver)): observers_type_and_arguments.append((observer.__class__, observer.init_arguments)) if isinstance(observer, RegisterObserver): observers_type_and_arguments.append((observer.__class__, [])) From
ceb0996418b3b3276ac8ff4c3d352b9c1aeed7de Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Tue, 25 Mar 2025 15:57:11 +0100 Subject: [PATCH 102/106] Fix Runner interface. --- kernel_tuner/runners/runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel_tuner/runners/runner.py b/kernel_tuner/runners/runner.py index 80ab32146..8c4de22d7 100644 --- a/kernel_tuner/runners/runner.py +++ b/kernel_tuner/runners/runner.py @@ -14,7 +14,7 @@ def __init__( pass @abstractmethod - def get_environment(self): + def get_environment(self, tuning_options): pass @abstractmethod From 9739495f425d58161b741cad08560652961d9f5d Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Tue, 25 Mar 2025 16:11:06 +0100 Subject: [PATCH 103/106] Reformat with black. --- kernel_tuner/accuracy.py | 15 +- kernel_tuner/backends/compiler.py | 12 +- kernel_tuner/backends/cupy.py | 8 +- kernel_tuner/backends/nvcuda.py | 24 +- kernel_tuner/backends/opencl.py | 16 +- kernel_tuner/backends/pycuda.py | 17 +- kernel_tuner/core.py | 158 +++------ kernel_tuner/energy/energy.py | 67 +++- kernel_tuner/hyper.py | 17 +- kernel_tuner/integration.py | 324 +++++++++--------- kernel_tuner/interface.py | 8 +- kernel_tuner/kernelbuilder.py | 70 ++-- kernel_tuner/observers/hip.py | 4 +- kernel_tuner/observers/ncu.py | 23 +- kernel_tuner/observers/nvml.py | 8 +- kernel_tuner/observers/observer.py | 11 +- kernel_tuner/observers/pmt.py | 15 +- kernel_tuner/observers/powersensor.py | 15 +- kernel_tuner/observers/tegra.py | 46 +-- kernel_tuner/runners/parallel.py | 94 +++-- kernel_tuner/runners/ray/cache_manager.py | 14 +- kernel_tuner/runners/ray/remote_actor.py | 64 ++-- kernel_tuner/runners/runner.py | 4 +- kernel_tuner/runners/sequential.py | 59 ++-- kernel_tuner/runners/simulation.py | 34 +- kernel_tuner/searchspace.py | 29 +- kernel_tuner/strategies/basinhopping.py | 15 +- kernel_tuner/strategies/brute_force.py | 8 +- kernel_tuner/strategies/common.py | 104 +++--- 
kernel_tuner/strategies/diff_evo.py | 37 +- kernel_tuner/strategies/dual_annealing.py | 9 +- kernel_tuner/strategies/ensemble.py | 39 ++- kernel_tuner/strategies/firefly_algorithm.py | 20 +- kernel_tuner/strategies/genetic_algorithm.py | 8 +- kernel_tuner/strategies/greedy_ils.py | 25 +- kernel_tuner/strategies/greedy_mls.py | 26 +- kernel_tuner/strategies/hillclimbers.py | 15 +- kernel_tuner/strategies/minimize.py | 2 +- kernel_tuner/strategies/mls.py | 12 +- kernel_tuner/strategies/ordered_greedy_mls.py | 12 +- kernel_tuner/strategies/pso.py | 22 +- .../strategies/simulated_annealing.py | 31 +- kernel_tuner/util.py | 13 +- 43 files changed, 826 insertions(+), 728 deletions(-) diff --git a/kernel_tuner/accuracy.py b/kernel_tuner/accuracy.py index 491541909..84c346ea1 100644 --- a/kernel_tuner/accuracy.py +++ b/kernel_tuner/accuracy.py @@ -46,9 +46,7 @@ def select_for_configuration(self, params): if option not in self.data: list = ", ".join(map(str, self.data.keys())) - raise KeyError( - f"'{option}' is not a valid parameter value, should be one of: {list}" - ) + raise KeyError(f"'{option}' is not a valid parameter value, should be one of: {list}") return self.data[option] @@ -60,12 +58,14 @@ def _find_bfloat16_if_available(): # Try to get bfloat16 if available. try: from bfloat16 import bfloat16 + return bfloat16 except ImportError: pass try: from tensorflow import bfloat16 + return bfloat16.as_numpy_dtype except ImportError: pass @@ -102,9 +102,7 @@ def _to_float_dtype(x: str) -> np.dtype: class TunablePrecision(Tunable): - def __init__( - self, param_key: str, array: np.ndarray, dtypes: Dict[str, np.dtype] = None - ): + def __init__(self, param_key: str, array: np.ndarray, dtypes: Dict[str, np.dtype] = None): """The ``Tunable`` object can be used as an input argument when tuning kernels. 
It is a container that internally holds several arrays containing the same data, but stored in using different levels of @@ -135,7 +133,6 @@ def __init__( if bfloat16 is not None: dtypes["bfloat16"] = bfloat16 - # If dtype is a list, convert it to a dictionary if isinstance(dtypes, (list, tuple)): dtypes = dict((name, _to_float_dtype(name)) for name in dtypes) @@ -257,9 +254,7 @@ def metric(a, b): raise ValueError(f"invalid error metric provided: {user_key}") # cast both arguments to f64 before passing them to the metric - return lambda a, b: metric( - a.astype(np.float64, copy=False), b.astype(np.float64, copy=False) - ) + return lambda a, b: metric(a.astype(np.float64, copy=False), b.astype(np.float64, copy=False)) class AccuracyObserver(OutputObserver): diff --git a/kernel_tuner/backends/compiler.py b/kernel_tuner/backends/compiler.py index 730710489..b5aaf749a 100644 --- a/kernel_tuner/backends/compiler.py +++ b/kernel_tuner/backends/compiler.py @@ -34,7 +34,7 @@ try: from hip._util.types import DeviceArray except ImportError: - Pointer = Exception # using Exception here as a type that will never be among kernel arguments + Pointer = Exception # using Exception here as a type that will never be among kernel arguments DeviceArray = Exception @@ -157,7 +157,9 @@ def ready_argument_list(self, arguments): for i, arg in enumerate(arguments): if not (isinstance(arg, (np.ndarray, np.number, DeviceArray)) or is_cupy_array(arg)): - raise TypeError(f"Argument is not numpy or cupy ndarray or numpy scalar or HIP Python DeviceArray but a {type(arg)}") + raise TypeError( + f"Argument is not numpy or cupy ndarray or numpy scalar or HIP Python DeviceArray but a {type(arg)}" + ) dtype_str = arg.typestr if isinstance(arg, DeviceArray) else str(arg.dtype) if isinstance(arg, np.ndarray): if dtype_str in dtype_map.keys(): @@ -288,7 +290,7 @@ def compile(self, kernel_instance): stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, - check=True + check=True, ) subprocess.run( @@ 
-299,7 +301,7 @@ def compile(self, kernel_instance): stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, - check=True + check=True, ) self.lib = np.ctypeslib.load_library(filename, ".") @@ -439,7 +441,7 @@ def cleanup_lib(self): """unload the previously loaded shared library""" if self.lib is None: return - + if not self.using_openmp and not self.using_openacc: # this if statement is necessary because shared libraries that use # OpenMP will core dump when unloaded, this is a well-known issue with OpenMP diff --git a/kernel_tuner/backends/cupy.py b/kernel_tuner/backends/cupy.py index 914f211a7..e6fbdefcd 100644 --- a/kernel_tuner/backends/cupy.py +++ b/kernel_tuner/backends/cupy.py @@ -70,9 +70,7 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None # collect environment information env = dict() cupy_info = str(cp._cupyx.get_runtime_info()).split("\n")[:-1] - info_dict = { - s.split(":")[0].strip(): s.split(":")[1].strip() for s in cupy_info - } + info_dict = {s.split(":")[0].strip(): s.split(":")[1].strip() for s in cupy_info} env["device_name"] = info_dict[f"Device {device} Name"] env["cuda_version"] = cp.cuda.runtime.driverGetVersion() @@ -129,9 +127,7 @@ def compile(self, kernel_instance): options = tuple(compiler_options) - self.current_module = cp.RawModule( - code=kernel_string, options=options, name_expressions=[kernel_name] - ) + self.current_module = cp.RawModule(code=kernel_string, options=options, name_expressions=[kernel_name]) self.func = self.current_module.get_function(kernel_name) self.num_regs = self.func.num_regs diff --git a/kernel_tuner/backends/nvcuda.py b/kernel_tuner/backends/nvcuda.py index 15259cb23..1eaad1d45 100644 --- a/kernel_tuner/backends/nvcuda.py +++ b/kernel_tuner/backends/nvcuda.py @@ -56,13 +56,9 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None CudaFunctions.last_selected_device = device # compute capabilities and device properties - err, major = 
cudart.cudaDeviceGetAttribute( - cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor, device - ) + err, major = cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor, device) cuda_error_check(err) - err, minor = cudart.cudaDeviceGetAttribute( - cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMinor, device - ) + err, minor = cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMinor, device) cuda_error_check(err) err, self.max_threads = cudart.cudaDeviceGetAttribute( cudart.cudaDeviceAttr.cudaDevAttrMaxThreadsPerBlock, device @@ -164,20 +160,14 @@ def compile(self, kernel_instance): if not any(["--std=" in opt for opt in self.compiler_options]): self.compiler_options.append("--std=c++11") if not any([b"--gpu-architecture=" in opt or b"-arch" in opt for opt in compiler_options]): - compiler_options.append( - f"--gpu-architecture=compute_{to_valid_nvrtc_gpu_arch_cc(self.cc)}".encode("UTF-8") - ) + compiler_options.append(f"--gpu-architecture=compute_{to_valid_nvrtc_gpu_arch_cc(self.cc)}".encode("UTF-8")) if not any(["--gpu-architecture=" in opt or "-arch" in opt for opt in self.compiler_options]): self.compiler_options.append(f"--gpu-architecture=compute_{to_valid_nvrtc_gpu_arch_cc(self.cc)}") - err, program = nvrtc.nvrtcCreateProgram( - str.encode(kernel_string), b"CUDAProgram", 0, [], [] - ) + err, program = nvrtc.nvrtcCreateProgram(str.encode(kernel_string), b"CUDAProgram", 0, [], []) try: cuda_error_check(err) - err = nvrtc.nvrtcCompileProgram( - program, len(compiler_options), compiler_options - ) + err = nvrtc.nvrtcCompileProgram(program, len(compiler_options), compiler_options) cuda_error_check(err) err, size = nvrtc.nvrtcGetPTXSize(program) cuda_error_check(err) @@ -189,9 +179,7 @@ def compile(self, kernel_instance): raise SkippableFailure("uses too much shared data") else: cuda_error_check(err) - err, self.func = cuda.cuModuleGetFunction( - self.current_module, str.encode(kernel_name) - ) + err, 
self.func = cuda.cuModuleGetFunction(self.current_module, str.encode(kernel_name)) cuda_error_check(err) # get the number of registers per thread used in this kernel diff --git a/kernel_tuner/backends/opencl.py b/kernel_tuner/backends/opencl.py index af3be1c00..feb7cf938 100644 --- a/kernel_tuner/backends/opencl.py +++ b/kernel_tuner/backends/opencl.py @@ -16,9 +16,7 @@ class OpenCLFunctions(GPUBackend): """Class that groups the OpenCL functions on maintains some state about the device.""" - def __init__( - self, device=0, platform=0, iterations=7, compiler_options=None, observers=None - ): + def __init__(self, device=0, platform=0, iterations=7, compiler_options=None, observers=None): """Creates OpenCL device context and reads device properties. :param device: The ID of the OpenCL device to use for benchmarking @@ -37,14 +35,10 @@ def __init__( platforms = cl.get_platforms() self.ctx = cl.Context(devices=[platforms[platform].get_devices()[device]]) - self.queue = cl.CommandQueue( - self.ctx, properties=cl.command_queue_properties.PROFILING_ENABLE - ) + self.queue = cl.CommandQueue(self.ctx, properties=cl.command_queue_properties.PROFILING_ENABLE) self.mf = cl.mem_flags # inspect device properties - self.max_threads = self.ctx.devices[0].get_info( - cl.device_info.MAX_WORK_GROUP_SIZE - ) + self.max_threads = self.ctx.devices[0].get_info(cl.device_info.MAX_WORK_GROUP_SIZE) self.compiler_options = compiler_options or [] # observer stuff @@ -108,9 +102,7 @@ def compile(self, kernel_instance): :returns: An OpenCL kernel that can be called directly. 
:rtype: pyopencl.Kernel """ - prg = cl.Program(self.ctx, kernel_instance.kernel_string).build( - options=self.compiler_options - ) + prg = cl.Program(self.ctx, kernel_instance.kernel_string).build(options=self.compiler_options) func = getattr(prg, kernel_instance.name) return func diff --git a/kernel_tuner/backends/pycuda.py b/kernel_tuner/backends/pycuda.py index 7fddc9393..57e7c07da 100644 --- a/kernel_tuner/backends/pycuda.py +++ b/kernel_tuner/backends/pycuda.py @@ -97,13 +97,9 @@ def _finish_up(): PyCudaFunctions.last_selected_context = self.context # inspect device properties - devprops = { - str(k): v for (k, v) in self.context.get_device().get_attributes().items() - } + devprops = {str(k): v for (k, v) in self.context.get_device().get_attributes().items()} self.max_threads = devprops["MAX_THREADS_PER_BLOCK"] - cc = str(devprops.get("COMPUTE_CAPABILITY_MAJOR", "0")) + str( - devprops.get("COMPUTE_CAPABILITY_MINOR", "0") - ) + cc = str(devprops.get("COMPUTE_CAPABILITY_MAJOR", "0")) + str(devprops.get("COMPUTE_CAPABILITY_MINOR", "0")) if cc == "00": cc = self.context.get_device().compute_capability() self.cc = str(cc[0]) + str(cc[1]) @@ -347,14 +343,7 @@ def run_kernel(self, func, gpu_args, threads, grid, stream=None): """ if stream is None: stream = self.stream - func( - *gpu_args, - block=threads, - grid=grid, - stream=stream, - shared=self.smem_size, - texrefs=self.texrefs - ) + func(*gpu_args, block=threads, grid=grid, stream=stream, shared=self.smem_size, texrefs=self.texrefs) def memset(self, allocation, value, size): """Set the memory in allocation to the value in value. 
diff --git a/kernel_tuner/core.py b/kernel_tuner/core.py index 655779337..714f81b83 100644 --- a/kernel_tuner/core.py +++ b/kernel_tuner/core.py @@ -32,7 +32,7 @@ try: from hip._util.types import DeviceArray except ImportError: - DeviceArray = Exception # using Exception here as a type that will never be among kernel arguments + DeviceArray = Exception # using Exception here as a type that will never be among kernel arguments _KernelInstance = namedtuple( "_KernelInstance", @@ -84,9 +84,7 @@ def __init__(self, kernel_name, kernel_sources, lang, defines=None): self.defines = defines if lang is None: if callable(self.kernel_sources[0]): - raise TypeError( - "Please specify language when using a code generator function" - ) + raise TypeError("Please specify language when using a code generator function") kernel_string = self.get_kernel_string(0) lang = util.detect_language(kernel_string) @@ -113,9 +111,7 @@ def get_kernel_string(self, index=0, params=None): kernel_source = self.kernel_sources[index] return util.get_kernel_string(kernel_source, params) - def prepare_list_of_files( - self, kernel_name, params, grid, threads, block_size_names - ): + def prepare_list_of_files(self, kernel_name, params, grid, threads, block_size_names): """prepare the kernel string along with any additional files The first file in the list is allowed to include or read in the others @@ -151,9 +147,7 @@ def prepare_list_of_files( for i, f in enumerate(self.kernel_sources): if i > 0 and not util.looks_like_a_filename(f): - raise ValueError( - "When passing multiple kernel sources, the secondary entries must be filenames" - ) + raise ValueError("When passing multiple kernel sources, the secondary entries must be filenames") ks = self.get_kernel_string(i, params) # add preprocessor statements @@ -187,9 +181,7 @@ def prepare_list_of_files( def get_user_suffix(self, index=0): """Get the suffix of the kernel filename, if the user specified one. 
Return None otherwise.""" - if util.looks_like_a_filename(self.kernel_sources[index]) and ( - "." in self.kernel_sources[index] - ): + if util.looks_like_a_filename(self.kernel_sources[index]) and ("." in self.kernel_sources[index]): return "." + self.kernel_sources[index].split(".")[-1] return None @@ -218,13 +210,9 @@ def check_argument_lists(self, kernel_name, arguments): """ for i, f in enumerate(self.kernel_sources): if not callable(f): - util.check_argument_list( - kernel_name, self.get_kernel_string(i), arguments - ) + util.check_argument_list(kernel_name, self.get_kernel_string(i), arguments) else: - logging.debug( - "Checking of arguments list not supported yet for code generators." - ) + logging.debug("Checking of arguments list not supported yet for code generators.") class DeviceInterface(object): @@ -317,7 +305,9 @@ def __init__( observers=observers, ) else: - raise ValueError("Sorry, support for languages other than CUDA, OpenCL, HIP, C, and Fortran is not implemented yet") + raise ValueError( + "Sorry, support for languages other than CUDA, OpenCL, HIP, C, and Fortran is not implemented yet" + ) self.dev = dev # look for NVMLObserver and TegraObserver in observers, if present, enable special tunable parameters through nvml/tegra @@ -390,7 +380,6 @@ def benchmark_default(self, func, gpu_args, threads, grid, result): for obs in self.benchmark_observers: result.update(obs.get_results()) - def benchmark_continuous(self, func, gpu_args, threads, grid, result, duration): """Benchmark continuously for at least 'duration' seconds""" iterations = int(np.ceil(duration / (result["time"] / 1000))) @@ -414,7 +403,6 @@ def benchmark_continuous(self, func, gpu_args, threads, grid, result, duration): for obs in self.continuous_observers: result.update(obs.get_results()) - def set_nvml_parameters(self, instance): """Set the NVML parameters. 
Avoids setting time leaking into benchmark time.""" if self.use_nvml: @@ -433,7 +421,6 @@ def set_nvml_parameters(self, instance): if "tegra_gr_clock" in instance.params: self.tegra.gr_clock = instance.params["tegra_gr_clock"] - def benchmark(self, func, gpu_args, instance, verbose, objective, skip_nvml_setting=False): """Benchmark the kernel instance.""" logging.debug("benchmark " + instance.name) @@ -458,9 +445,7 @@ def benchmark(self, func, gpu_args, instance, verbose, objective, skip_nvml_sett obs.results = result duration = max(duration, obs.continuous_duration) - self.benchmark_continuous( - func, gpu_args, instance.threads, instance.grid, result, duration - ) + self.benchmark_continuous(func, gpu_args, instance.threads, instance.grid, result, duration) except Exception as e: # some launches may fail because too many registers are required @@ -473,9 +458,7 @@ def benchmark(self, func, gpu_args, instance, verbose, objective, skip_nvml_sett "INVALID_WORK_GROUP_SIZE", ] if any([skip_str in str(e) for skip_str in skippable_exceptions]): - logging.debug( - "benchmark fails due to runtime failure too many resources required" - ) + logging.debug("benchmark fails due to runtime failure too many resources required") if verbose: print( f"skipping config {util.get_instance_string(instance.params)} reason: too many resources requested for launch" @@ -487,20 +470,20 @@ def benchmark(self, func, gpu_args, instance, verbose, objective, skip_nvml_sett raise e return result - def check_kernel_output( - self, func, gpu_args, instance, answer, atol, verify, verbose - ): + def check_kernel_output(self, func, gpu_args, instance, answer, atol, verify, verbose): """runs the kernel once and checks the result against answer""" logging.debug("check_kernel_output") - #if not using custom verify function, check if the length is the same + # if not using custom verify function, check if the length is the same if answer: if len(instance.arguments) != len(answer): raise TypeError("The 
length of argument list and provided results do not match.") should_sync = [answer[i] is not None for i, arg in enumerate(instance.arguments)] else: - should_sync = [isinstance(arg, (np.ndarray, cp.ndarray, torch.Tensor, DeviceArray)) for arg in instance.arguments] + should_sync = [ + isinstance(arg, (np.ndarray, cp.ndarray, torch.Tensor, DeviceArray)) for arg in instance.arguments + ] # re-copy original contents of output arguments to GPU memory, to overwrite any changes # by earlier kernel runs @@ -522,7 +505,7 @@ def check_kernel_output( self.dev.memcpy_dtoh(result_host[-1], gpu_args[i]) elif isinstance(arg, torch.Tensor) and isinstance(answer[i], torch.Tensor): if not answer[i].is_cuda: - #if the answer is on the host, copy gpu output to host as well + # if the answer is on the host, copy gpu output to host as well result_host.append(torch.zeros_like(answer[i])) self.dev.memcpy_dtoh(result_host[-1], gpu_args[i].tensor) else: @@ -550,10 +533,7 @@ def check_kernel_output( correct = True if not correct: - raise RuntimeError( - "Kernel result verification failed for: " - + util.get_config_string(instance.params) - ) + raise RuntimeError("Kernel result verification failed for: " + util.get_config_string(instance.params)) def compile_and_benchmark(self, kernel_source, gpu_args, params, kernel_options, to): # reset previous timers @@ -567,7 +547,7 @@ def compile_and_benchmark(self, kernel_source, gpu_args, params, kernel_options, # Compile and benchmark a kernel instance based on kernel strings and parameters instance_string = util.get_instance_string(params) - logging.debug('compile_and_benchmark ' + instance_string) + logging.debug("compile_and_benchmark " + instance_string) instance = self.create_kernel_instance(kernel_source, kernel_options, params, verbose) if isinstance(instance, util.ErrorConfig): @@ -585,9 +565,7 @@ def compile_and_benchmark(self, kernel_source, gpu_args, params, kernel_options, else: # add shared memory arguments to compiled module if 
kernel_options.smem_args is not None: - self.dev.copy_shared_memory_args( - util.get_smem_args(kernel_options.smem_args, params) - ) + self.dev.copy_shared_memory_args(util.get_smem_args(kernel_options.smem_args, params)) # add constant memory arguments to compiled module if kernel_options.cmem_args is not None: self.dev.copy_constant_memory_args(kernel_options.cmem_args) @@ -601,12 +579,8 @@ def compile_and_benchmark(self, kernel_source, gpu_args, params, kernel_options, # test kernel for correctness if func and (to.answer or to.verify or self.output_observers): start_verification = time.perf_counter() - self.check_kernel_output( - func, gpu_args, instance, to.answer, to.atol, to.verify, verbose - ) - last_verification_time = 1000 * ( - time.perf_counter() - start_verification - ) + self.check_kernel_output(func, gpu_args, instance, to.answer, to.atol, to.verify, verbose) + last_verification_time = 1000 * (time.perf_counter() - start_verification) # benchmark if func: @@ -622,10 +596,7 @@ def compile_and_benchmark(self, kernel_source, gpu_args, params, kernel_options, except Exception as e: # dump kernel sources to temp file temp_filenames = instance.prepare_temp_files_for_error_msg() - print( - "Error while compiling or benchmarking, see source files: " - + " ".join(temp_filenames) - ) + print("Error while compiling or benchmarking, see source files: " + " ".join(temp_filenames)) raise e # clean up any temporary files, if no error occured @@ -656,9 +627,7 @@ def compile_kernel(self, instance, verbose): ] error_message = str(e.stderr) if hasattr(e, "stderr") else str(e) if any(re.search(msg, error_message) for msg in shared_mem_error_messages): - logging.debug( - "compile_kernel failed due to kernel using too much shared memory" - ) + logging.debug("compile_kernel failed due to kernel using too much shared memory") if verbose: print( f"skipping config {util.get_instance_string(instance.params)} reason: too much shared memory used" @@ -671,7 +640,7 @@ def 
compile_kernel(self, instance, verbose): @staticmethod def preprocess_gpu_arguments(old_arguments, params): - """ Get a flat list of arguments based on the configuration given by `params` """ + """Get a flat list of arguments based on the configuration given by `params`""" return _preprocess_gpu_arguments(old_arguments, params) def copy_shared_memory_args(self, smem_args): @@ -707,9 +676,7 @@ def create_kernel_instance(self, kernel_source, kernel_options, params, verbose) ) if np.prod(threads) > self.dev.max_threads: if verbose: - print( - f"skipping config {util.get_instance_string(params)} reason: too many threads per block" - ) + print(f"skipping config {util.get_instance_string(params)} reason: too many threads per block") return util.InvalidConfig() # obtain the kernel_string and prepare additional files, if any @@ -728,7 +695,7 @@ def create_kernel_instance(self, kernel_source, kernel_options, params, verbose) # Preprocess GPU arguments. Require for handling `Tunable` arguments arguments = _preprocess_gpu_arguments(kernel_options.arguments, params) - #collect everything we know about this instance and return it + # collect everything we know about this instance and return it return KernelInstance(name, kernel_source, kernel_string, temp_files, threads, grid, params, arguments) def get_environment(self): @@ -775,12 +742,8 @@ def run_kernel(self, func, gpu_args, instance): try: self.dev.run_kernel(func, gpu_args, instance.threads, instance.grid) except Exception as e: - if "too many resources requested for launch" in str( - e - ) or "OUT_OF_RESOURCES" in str(e): - logging.debug( - "ignoring runtime failure due to too many resources required" - ) + if "too many resources requested for launch" in str(e) or "OUT_OF_RESOURCES" in str(e): + logging.debug("ignoring runtime failure due to too many resources required") return False else: logging.debug("encountered unexpected runtime failure: " + str(e)) @@ -789,7 +752,7 @@ def run_kernel(self, func, gpu_args, 
instance): def _preprocess_gpu_arguments(old_arguments, params): - """ Get a flat list of arguments based on the configuration given by `params` """ + """Get a flat list of arguments based on the configuration given by `params`""" new_arguments = [] for argument in old_arguments: @@ -806,15 +769,11 @@ def _default_verify_function(instance, answer, result_host, atol, verbose): # first check if the length is the same if len(instance.arguments) != len(answer): - raise TypeError( - "The length of argument list and provided results do not match." - ) + raise TypeError("The length of argument list and provided results do not match.") # for each element in the argument list, check if the types match for i, arg in enumerate(instance.arguments): if answer[i] is not None: # skip None elements in the answer list - if isinstance(answer[i], (np.ndarray, cp.ndarray)) and isinstance( - arg, (np.ndarray, cp.ndarray) - ): + if isinstance(answer[i], (np.ndarray, cp.ndarray)) and isinstance(arg, (np.ndarray, cp.ndarray)): if answer[i].dtype != arg.dtype: raise TypeError( f"Element {i} of the expected results list is not of the same dtype as the kernel output: " @@ -862,16 +821,14 @@ def _default_verify_function(instance, answer, result_host, atol, verbose): ) else: # either answer[i] and argument have different types or answer[i] is not a numpy type - if not isinstance( - answer[i], (np.ndarray, cp.ndarray, torch.Tensor) - ) or not isinstance(answer[i], np.number): + if not isinstance(answer[i], (np.ndarray, cp.ndarray, torch.Tensor)) or not isinstance( + answer[i], np.number + ): raise TypeError( f"Element {i} of expected results list is not a numpy/cupy ndarray, torch Tensor or numpy scalar." ) else: - raise TypeError( - f"Element {i} of expected results list and kernel arguments have different types." 
- ) + raise TypeError(f"Element {i} of expected results list and kernel arguments have different types.") def _ravel(a): if hasattr(a, "ravel") and len(a.shape) > 1: @@ -891,26 +848,15 @@ def _flatten(a): expected = _flatten(expected) if any([isinstance(array, cp.ndarray) for array in [expected, result]]): output_test = cp.allclose(expected, result, atol=atol) - elif isinstance(expected, torch.Tensor) and isinstance( - result, torch.Tensor - ): + elif isinstance(expected, torch.Tensor) and isinstance(result, torch.Tensor): output_test = torch.allclose(expected, result, atol=atol) else: output_test = np.allclose(expected, result, atol=atol) if not output_test and verbose: - print( - "Error: " - + util.get_config_string(instance.params) - + " detected during correctness check" - ) - print( - "this error occured when checking value of the %oth kernel argument" - % (i,) - ) - print( - "Printing kernel output and expected result, set verbose=False to suppress this debug print" - ) + print("Error: " + util.get_config_string(instance.params) + " detected during correctness check") + print("this error occured when checking value of the %oth kernel argument" % (i,)) + print("Printing kernel output and expected result, set verbose=False to suppress this debug print") np.set_printoptions(edgeitems=50) print("Kernel output:") print(result) @@ -945,11 +891,7 @@ def apply_template_typenames(type_list, templated_typenames): def replace_typename_token(matchobj): """function for a whitespace preserving token regex replace""" # replace only the match, leaving the whitespace around it as is - return ( - matchobj.group(1) - + templated_typenames[matchobj.group(2)] - + matchobj.group(3) - ) + return matchobj.group(1) + templated_typenames[matchobj.group(2)] + matchobj.group(3) for i, arg_type in enumerate(type_list): for k, v in templated_typenames.items(): @@ -980,9 +922,7 @@ def wrap_templated_kernel(kernel_string, kernel_name): # relatively strict regex that does not allow nested 
template parameters like vector # within the template parameter list regex = ( - r"template\s*<([^>]*?)>\s*__global__\s+void\s+(__launch_bounds__\([^\)]+?\)\s+)?" - + name - + r"\s*\((.*?)\)\s*\{" + r"template\s*<([^>]*?)>\s*__global__\s+void\s+(__launch_bounds__\([^\)]+?\)\s+)?" + name + r"\s*\((.*?)\)\s*\{" ) match = re.search(regex, kernel_string, re.S) if not match: @@ -990,15 +930,11 @@ def wrap_templated_kernel(kernel_string, kernel_name): template_parameters = match.group(1).split(",") argument_list = match.group(3).split(",") - argument_list = [ - s.strip() for s in argument_list - ] # remove extra whitespace around 'type name' strings + argument_list = [s.strip() for s in argument_list] # remove extra whitespace around 'type name' strings type_list, name_list = split_argument_list(argument_list) - templated_typenames = get_templated_typenames( - template_parameters, template_arguments - ) + templated_typenames = get_templated_typenames(template_parameters, template_arguments) apply_template_typenames(type_list, templated_typenames) # replace __global__ with __device__ in the templated kernel definition @@ -1012,9 +948,7 @@ def wrap_templated_kernel(kernel_string, kernel_name): launch_bounds = match.group(2) # generate code for the compile-time template instantiation - template_instantiation = ( - f"template __device__ void {kernel_name}(" + ", ".join(type_list) + ");\n" - ) + template_instantiation = f"template __device__ void {kernel_name}(" + ", ".join(type_list) + ");\n" # generate code for the wrapper kernel new_arg_list = ", ".join([" ".join((a, b)) for a, b in zip(type_list, name_list)]) diff --git a/kernel_tuner/energy/energy.py b/kernel_tuner/energy/energy.py index ab0582c52..40bcbe080 100644 --- a/kernel_tuner/energy/energy.py +++ b/kernel_tuner/energy/energy.py @@ -37,7 +37,10 @@ } """ -def get_frequency_power_relation_fp32(device, n_samples=10, nvidia_smi_fallback=None, use_locked_clocks=False, cache=None, simulation_mode=None): + +def 
get_frequency_power_relation_fp32( + device, n_samples=10, nvidia_smi_fallback=None, use_locked_clocks=False, cache=None, simulation_mode=None +): """Use NVML and PyCUDA with a synthetic kernel to obtain samples of frequency-power pairs.""" # get some numbers about the device if not cache: @@ -46,7 +49,7 @@ def get_frequency_power_relation_fp32(device, n_samples=10, nvidia_smi_fallback= drv.init() dev = drv.Device(device) - device_name = dev.name().replace(' ', '_') + device_name = dev.name().replace(" ", "_") multiprocessor_count = dev.get_attribute(drv.device_attribute.MULTIPROCESSOR_COUNT) max_block_dim_x = dev.get_attribute(drv.device_attribute.MAX_BLOCK_DIM_X) @@ -76,12 +79,28 @@ def get_frequency_power_relation_fp32(device, n_samples=10, nvidia_smi_fallback= metrics["f"] = lambda p: p["core_freq"] nvmlobserver = NVMLObserver( - ["core_freq", "nvml_power"], device=device, nvidia_smi_fallback=nvidia_smi_fallback, use_locked_clocks=use_locked_clocks) - - results, _ = tune_kernel("fp32_kernel", fp32_kernel_string, problem_size=(multiprocessor_count, 64), - arguments=arguments, tune_params=tune_params, observers=[nvmlobserver], - verbose=False, quiet=True, metrics=metrics, iterations=10, simulation_mode=simulation_mode, - grid_div_x=[], grid_div_y=[], cache=cache or f"synthetic_fp32_cache_{device_name}.json") + ["core_freq", "nvml_power"], + device=device, + nvidia_smi_fallback=nvidia_smi_fallback, + use_locked_clocks=use_locked_clocks, + ) + + results, _ = tune_kernel( + "fp32_kernel", + fp32_kernel_string, + problem_size=(multiprocessor_count, 64), + arguments=arguments, + tune_params=tune_params, + observers=[nvmlobserver], + verbose=False, + quiet=True, + metrics=metrics, + iterations=10, + simulation_mode=simulation_mode, + grid_div_x=[], + grid_div_y=[], + cache=cache or f"synthetic_fp32_cache_{device_name}.json", + ) freqs = np.array([res["core_freq"] for res in results]) nvml_power = np.array([res["nvml_power"] for res in results]) @@ -91,7 +110,7 @@ def 
get_frequency_power_relation_fp32(device, n_samples=10, nvidia_smi_fallback= def estimated_voltage(clocks, clock_threshold, voltage_scale): """Estimate voltage based on clock_threshold and voltage_scale.""" - return [1 + ((clock > clock_threshold) * (1e-3 * voltage_scale * (clock-clock_threshold))) for clock in clocks] + return [1 + ((clock > clock_threshold) * (1e-3 * voltage_scale * (clock - clock_threshold))) for clock in clocks] def estimated_power(clocks, clock_threshold, voltage_scale, clock_scale, power_max): @@ -131,18 +150,24 @@ def fit_power_frequency_model(freqs, nvml_power): # fit the model p0 = (clock_threshold, voltage_scale, clock_scale, power_max) - bounds = ([clock_min, 0, 0, 0.9*power_max], - [clock_max, 1, 1, 1.1*power_max]) + bounds = ([clock_min, 0, 0, 0.9 * power_max], [clock_max, 1, 1, 1.1 * power_max]) res = optimize.curve_fit(estimated_power, x, y, p0=p0, bounds=bounds) - clock_threshold, voltage_scale, clock_scale, power_max = np.round( - res[0], 2) + clock_threshold, voltage_scale, clock_scale, power_max = np.round(res[0], 2) fit_parameters = (clock_threshold, voltage_scale, clock_scale, power_max) scale_parameters = (clock_min, min(nvml_power)) return clock_threshold + clock_min, fit_parameters, scale_parameters -def create_power_frequency_model(device=0, n_samples=10, verbose=False, nvidia_smi_fallback=None, use_locked_clocks=False, cache=None, simulation_mode=None): +def create_power_frequency_model( + device=0, + n_samples=10, + verbose=False, + nvidia_smi_fallback=None, + use_locked_clocks=False, + cache=None, + simulation_mode=None, +): """Calculate the most energy-efficient clock frequency of device. 
This function uses a performance model to fit the power-frequency curve @@ -176,7 +201,9 @@ def create_power_frequency_model(device=0, n_samples=10, verbose=False, nvidia_s :rtype: float """ - freqs, nvml_power = get_frequency_power_relation_fp32(device, n_samples, nvidia_smi_fallback, use_locked_clocks, cache=cache, simulation_mode=simulation_mode) + freqs, nvml_power = get_frequency_power_relation_fp32( + device, n_samples, nvidia_smi_fallback, use_locked_clocks, cache=cache, simulation_mode=simulation_mode + ) if verbose: print("Clock frequencies:", freqs.tolist()) @@ -187,7 +214,7 @@ def create_power_frequency_model(device=0, n_samples=10, verbose=False, nvidia_s if verbose: print(f"Modelled most energy efficient frequency: {ridge_frequency} MHz") - all_frequencies = np.array(get_nvml_gr_clocks(device, quiet=True)['nvml_gr_clock']) + all_frequencies = np.array(get_nvml_gr_clocks(device, quiet=True)["nvml_gr_clock"]) ridge_frequency_final = all_frequencies[np.argmin(abs(all_frequencies - ridge_frequency))] if verbose: @@ -200,8 +227,12 @@ def get_frequency_range_around_ridge(ridge_frequency, all_frequencies, freq_rang """Return number_of_freqs frequencies in a freq_range percentage around the ridge_frequency from among all_frequencies.""" min_freq = 1e-2 * (100 - int(freq_range)) * ridge_frequency max_freq = 1e-2 * (100 + int(freq_range)) * ridge_frequency - frequency_selection = np.unique([all_frequencies[np.argmin(abs( - all_frequencies - f))] for f in np.linspace(min_freq, max_freq, int(number_of_freqs))]).tolist() + frequency_selection = np.unique( + [ + all_frequencies[np.argmin(abs(all_frequencies - f))] + for f in np.linspace(min_freq, max_freq, int(number_of_freqs)) + ] + ).tolist() if verbose: print(f"Suggested range of frequencies to auto-tune: {frequency_selection} MHz") diff --git a/kernel_tuner/hyper.py b/kernel_tuner/hyper.py index f002882f3..b661609fd 100644 --- a/kernel_tuner/hyper.py +++ b/kernel_tuner/hyper.py @@ -9,7 +9,7 @@ def 
tune_hyper_params(target_strategy, hyper_params, *args, **kwargs): - """ Tune hyperparameters for a given strategy and kernel + """Tune hyperparameters for a given strategy and kernel This function is to be called just like tune_kernel, except that you specify a strategy and a dictionary with hyperparameters in front of the arguments you pass to tune_kernel. @@ -41,17 +41,17 @@ def put_if_not_present(target_dict, key, value): put_if_not_present(kwargs, "verbose", False) put_if_not_present(kwargs, "quiet", True) put_if_not_present(kwargs, "simulation_mode", True) - kwargs['strategy'] = 'brute_force' + kwargs["strategy"] = "brute_force" - #last position argument is tune_params + # last position argument is tune_params tune_params = args[-1] - #find optimum + # find optimum kwargs["strategy"] = "brute_force" results, _ = kernel_tuner.tune_kernel(*args, **kwargs) optimum = min(results, key=lambda p: p["time"])["time"] - #could throw a warning for the kwargs that will be overwritten, strategy(_options) + # could throw a warning for the kwargs that will be overwritten, strategy(_options) kwargs["strategy"] = target_strategy parameter_space = itertools.product(*hyper_params.values()) @@ -65,14 +65,13 @@ def put_if_not_present(target_dict, key, value): fevals = [] p_of_opt = [] for _ in range(100): - #measure + # measure with warnings.catch_warnings(): warnings.simplefilter("ignore") results, _ = kernel_tuner.tune_kernel(*args, **kwargs) - #get unique function evaluations - unique_fevals = {",".join([str(v) for k, v in record.items() if k in tune_params]) - for record in results} + # get unique function evaluations + unique_fevals = {",".join([str(v) for k, v in record.items() if k in tune_params]) for record in results} fevals.append(len(unique_fevals)) p_of_opt.append(min(results, key=lambda p: p["time"])["time"] / optimum * 100) diff --git a/kernel_tuner/integration.py b/kernel_tuner/integration.py index d3219ba87..01f072694 100644 --- a/kernel_tuner/integration.py +++ 
b/kernel_tuner/integration.py @@ -6,8 +6,8 @@ from kernel_tuner import util -#specifies for a number of pre-defined objectives whether -#the objective should be minimized or maximized (boolean value denotes higher is better) +# specifies for a number of pre-defined objectives whether +# the objective should be minimized or maximized (boolean value denotes higher is better) objective_default_map = { "time": False, "energy": False, @@ -18,11 +18,12 @@ "GFLOPS/W": True, "TFLOPS/W": True, "GFLOP/J": True, - "TFLOP/J": True + "TFLOP/J": True, } + def get_objective_defaults(objective, objective_higher_is_better): - """ Uses time as default objective and attempts to lookup objective_higher_is_better for known objectives """ + """Uses time as default objective and attempts to lookup objective_higher_is_better for known objectives""" objective = objective or "time" if objective_higher_is_better is None: if objective in objective_default_map: @@ -31,6 +32,7 @@ def get_objective_defaults(objective, objective_higher_is_better): raise ValueError(f"Please specify objective_higher_is_better for objective {objective}") return objective, objective_higher_is_better + schema_v1_0 = { "$schema": "https://json-schema.org/draft-07/schema#", "type": "object", @@ -45,25 +47,20 @@ def get_objective_defaults(objective, objective_higher_is_better): "type": "array", "items": { "type": "object", - "properties": { - "device_name": {"type": "string"}, - "problem_size": {"type": "string"} - }, - "required": ["device_name", "problem_size", "tunable_parameters"] + "properties": {"device_name": {"type": "string"}, "problem_size": {"type": "string"}}, + "required": ["device_name", "problem_size", "tunable_parameters"], }, }, }, - "required": ["version_number", "tunable_parameters", "kernel_name", "objective", "data"] + "required": ["version_number", "tunable_parameters", "kernel_name", "objective", "data"], } - - -class TuneResults(): - """ Object to represent the tuning results stored to file """ 
+class TuneResults: + """Object to represent the tuning results stored to file""" def __init__(self, results_filename): - #open results file + # open results file if not os.path.isfile(results_filename): raise ValueError("Error: results_filename does not exist") meta, data = _read_results_file(results_filename) @@ -75,28 +72,28 @@ def __init__(self, results_filename): self.objective_higher_is_better = meta.get("objective_higher_is_better", False) def get_best_config(self, gpu_name="default", problem_size=None): - """ get the best config based on these tuning results + """get the best config based on these tuning results - This function returns the overall best performing kernel configuration - based on the tuning results for a given gpu_name and problem_size. + This function returns the overall best performing kernel configuration + based on the tuning results for a given gpu_name and problem_size. - If problem_size is not given this function will select a default configuration - based on the tuning results for all problem_sizes and the given gpu_name. + If problem_size is not given this function will select a default configuration + based on the tuning results for all problem_sizes and the given gpu_name. - If gpu_name is not given this function will select a default configuration - based on all tuning results. + If gpu_name is not given this function will select a default configuration + based on all tuning results. - :param gpu_name: Name of the GPU for which the best configuration - needs to be retrieved. - :type gpu_name: string + :param gpu_name: Name of the GPU for which the best configuration + needs to be retrieved. + :type gpu_name: string - :param problem_size: The problem size for which the best configuration - on the given gpu_name needs to be retrieved. - :type problem_size: tuple, int, or string + :param problem_size: The problem size for which the best configuration + on the given gpu_name needs to be retrieved. 
+ :type problem_size: tuple, int, or string - :returns: A dictionary with tunable parameters of the selected kernel - kernel configuration. - :rtype: dict + :returns: A dictionary with tunable parameters of the selected kernel + kernel configuration. + :rtype: dict """ gpu_name = gpu_name.replace("-", "_").replace(" ", "_") @@ -111,88 +108,102 @@ def get_best_config(self, gpu_name="default", problem_size=None): gpu_match = [result for result in self.data if result["device_name"] == gpu_name] if gpu_match: - gpu_ps_match = [result for result in gpu_match if problem_size and result["problem_size"] == problem_size_str] + gpu_ps_match = [ + result for result in gpu_match if problem_size and result["problem_size"] == problem_size_str + ] if gpu_ps_match: return _get_best_config_from_list(gpu_ps_match, self.objective, self.objective_higher_is_better) - #problem size is not given or not among the results, so return a good default + # problem size is not given or not among the results, so return a good default return _select_best_common_config(gpu_match, self.objective, self.objective_higher_is_better) - #gpu is not among the results, so return a good default + # gpu is not among the results, so return a good default return _select_best_common_config(self.data, self.objective, self.objective_higher_is_better) -def store_results(results_filename, kernel_name, kernel_string, tune_params, problem_size, results, env, top=3, objective=None, objective_higher_is_better=None): - """ stores tuning results to a JSON file - - Stores the top (3% by default) best kernel configurations in a JSON file. - The results are stored for a specific device (retrieved using env['device_name']) - and for a specific problem_size. If the file already exists, new results for - this device and problem_size will be appended. Any previous results already stored - in the file for this specific device and problem_size will be overwritten. 
- - :param results_filename: Filename of the JSON file in which the results will be stored. - Results will be appended if the file already exists. Existing results within the - file for the same device and problem_size will be overwritten. - :type results_filename: string - - :param tune_params: The tunable parameters of this kernel. - :type tune_params: dict - - :param problem_size: The problem_size this kernel was tuned for - :type problem_size: tuple - - :param results: A list of dictionaries of all executed kernel configurations and their - execution times, and possibly other user-defined metrics, as returned by - tune_kernel(). - :type results: list(dict) - - :param env: A dictionary with information about the environment - in which the tuning took place. This records device name, properties, - version info, and so on. Typicaly this dictionary is returned by tune_kernel(). - :type env: dict - - :param top: Denotes the top percentage of results to store in the results file - :type top: float - - :param objective: Optimization objective to sort results on, consisting of a string - that also occurs in results as a metric. - :type objective: string - - :param objective_higher_is_better: A boolean that specifies whether the objective should - be maximized or minimized. - :type objective_higher_is_better: bool +def store_results( + results_filename, + kernel_name, + kernel_string, + tune_params, + problem_size, + results, + env, + top=3, + objective=None, + objective_higher_is_better=None, +): + """stores tuning results to a JSON file + + Stores the top (3% by default) best kernel configurations in a JSON file. + The results are stored for a specific device (retrieved using env['device_name']) + and for a specific problem_size. If the file already exists, new results for + this device and problem_size will be appended. Any previous results already stored + in the file for this specific device and problem_size will be overwritten. 
+ + :param results_filename: Filename of the JSON file in which the results will be stored. + Results will be appended if the file already exists. Existing results within the + file for the same device and problem_size will be overwritten. + :type results_filename: string + + :param tune_params: The tunable parameters of this kernel. + :type tune_params: dict + + :param problem_size: The problem_size this kernel was tuned for + :type problem_size: tuple + + :param results: A list of dictionaries of all executed kernel configurations and their + execution times, and possibly other user-defined metrics, as returned by + tune_kernel(). + :type results: list(dict) + + :param env: A dictionary with information about the environment + in which the tuning took place. This records device name, properties, + version info, and so on. Typicaly this dictionary is returned by tune_kernel(). + :type env: dict + + :param top: Denotes the top percentage of results to store in the results file + :type top: float + + :param objective: Optimization objective to sort results on, consisting of a string + that also occurs in results as a metric. + :type objective: string + + :param objective_higher_is_better: A boolean that specifies whether the objective should + be maximized or minimized. 
+ :type objective_higher_is_better: bool """ objective, objective_higher_is_better = get_objective_defaults(objective, objective_higher_is_better) - #filter results to only those that contain the objective + # filter results to only those that contain the objective results_filtered = [item for item in results if objective in item] - #get top results + # get top results if objective_higher_is_better: best_config = max(results_filtered, key=lambda x: x[objective]) else: best_config = min(results_filtered, key=lambda x: x[objective]) best = best_config[objective] - top_range = top/100.0 + top_range = top / 100.0 def top_result(item): current = item[objective] if objective_higher_is_better: - return current > best * (1-top_range) - return current < best * (1+top_range) + return current > best * (1 - top_range) + return current < best * (1 + top_range) + top_results = [item for item in results_filtered if top_result(item)] - #filter result items to just the tunable parameters and the objective + # filter result items to just the tunable parameters and the objective filter_keys = list(tune_params.keys()) + [objective] - top_results = [{k:item[k] for k in filter_keys} for item in top_results] + top_results = [{k: item[k] for k in filter_keys} for item in top_results] - #read existing results file + # read existing results file if os.path.isfile(results_filename): meta, data = _read_results_file(results_filename) - #validate consistency between arguments and results file + # validate consistency between arguments and results file if not kernel_name == meta["kernel_name"]: raise ValueError("Mismatch between given kernel_name and results file") if not all([param in meta["tunable_parameters"] for param in tune_params]): @@ -200,7 +211,7 @@ def top_result(item): if not objective == meta["objective"]: raise ValueError("Mismatch between given objective and results file") else: - #new file + # new file meta = {} meta["version_number"] = "1.0" meta["kernel_name"] = kernel_name @@ 
-214,18 +225,18 @@ def top_result(item): meta["tunable_parameters"] = list(tune_params.keys()) data = [] - #insert new results into the list + # insert new results into the list if not isinstance(problem_size, (list, tuple)): problem_size = (problem_size,) problem_size_str = "x".join(str(i) for i in problem_size) - #replace all non alphanumeric characters with underscore - dev_name = re.sub('[^0-9a-zA-Z]+', '_', env["device_name"].strip()) + # replace all non alphanumeric characters with underscore + dev_name = re.sub("[^0-9a-zA-Z]+", "_", env["device_name"].strip()) - #remove existing entries for this GPU and problem_size combination from the results if any + # remove existing entries for this GPU and problem_size combination from the results if any data = [d for d in data if not (d["device_name"] == dev_name and d["problem_size"] == problem_size_str)] - #extend the results with the top_results + # extend the results with the top_results results = [] for result in top_results: record = {"device_name": dev_name, "problem_size": problem_size_str, "tunable_parameters": {}} @@ -236,65 +247,65 @@ def top_result(item): results.append(record) data.extend(results) - #write output file + # write output file meta["data"] = data - with open(results_filename, 'w') as fh: + with open(results_filename, "w") as fh: fh.write(json.dumps(meta, indent="")) def create_device_targets(header_filename, results_filename, objective=None, objective_higher_is_better=None): - """ create a header with device targets + """create a header with device targets - This function generates a header file with device targets for compiling - a kernel with different parameters on different devices. The tuning - results are stored in a JSON file created by store_results. Existing - header_filename will be overwritten. + This function generates a header file with device targets for compiling + a kernel with different parameters on different devices. 
The tuning + results are stored in a JSON file created by store_results. Existing + header_filename will be overwritten. - This function only creates device targets and does not create problem_size - specific targets. Instead it searches for configurations that perform well - for different problem sizes and selects a single configuration to use - for the kernel. + This function only creates device targets and does not create problem_size + specific targets. Instead it searches for configurations that perform well + for different problem sizes and selects a single configuration to use + for the kernel. - The header file can be included in a kernel source file using: - ``#include "header_filename.h"`` + The header file can be included in a kernel source file using: + ``#include "header_filename.h"`` - The kernel can then be compiled for a specific device using: - ``-DTARGET_GPU="name_of_gpu"`` + The kernel can then be compiled for a specific device using: + ``-DTARGET_GPU="name_of_gpu"`` - The header will also include a default value, which is chosen to perform well - on different devices. + The header will also include a default value, which is chosen to perform well + on different devices. - :param header_filename: Filename of the to be created header file. - :type header_filename: string + :param header_filename: Filename of the to be created header file. + :type header_filename: string - :param results_filename: Filename of the JSON file that stores the tuning results. - :type results_filename: string + :param results_filename: Filename of the JSON file that stores the tuning results. + :type results_filename: string - :param objective: Optimization objective to sort results on, consisting of a string - that also occurs in results as a metric. - :type objective: string + :param objective: Optimization objective to sort results on, consisting of a string + that also occurs in results as a metric. 
+ :type objective: string - :param objective_higher_is_better: A boolean that specifies whether the objective should - be maximized or minimized. - :type objective_higher_is_better: bool + :param objective_higher_is_better: A boolean that specifies whether the objective should + be maximized or minimized. + :type objective_higher_is_better: bool """ objective, objective_higher_is_better = get_objective_defaults(objective, objective_higher_is_better) - #open results file + # open results file results = TuneResults(results_filename) data = results.data - #collect data for the if-block + # collect data for the if-block gpu_targets = list({r["device_name"] for r in data}) targets = {} for gpu_name in gpu_targets: targets[gpu_name] = results.get_best_config(gpu_name) - #select a good default from all good configs + # select a good default from all good configs default_params = results.get_best_config() - #write the header output file + # write the header output file if_block = "" first = True for gpu_name, params in targets.items(): @@ -303,10 +314,10 @@ def create_device_targets(header_filename, results_filename, objective=None, obj first = False else: if_block += f"\n#elif TARGET_{gpu_name}\n" - if_block += "\n".join([f"#define {k} {v}" for k,v in params.items()]) + if_block += "\n".join([f"#define {k} {v}" for k, v in params.items()]) if_block += "\n" - default_config = "\n".join([f"#define {k} {v}" for k,v in default_params.items()]) + default_config = "\n".join([f"#define {k} {v}" for k, v in default_params.items()]) template_header_file = f"""/* header file generated by Kernel Tuner, do not modify by hand */ #pragma once @@ -320,79 +331,75 @@ def create_device_targets(header_filename, results_filename, objective=None, obj #endif /* kernel_tuner */ """ - with open(header_filename, 'w') as fh: + with open(header_filename, "w") as fh: fh.write(template_header_file) - - def _select_best_common_config(results, objective, objective_higher_is_better): - """ return the 
most common config among results obtained on different problem sizes """ + """return the most common config among results obtained on different problem sizes""" results_table = {} total_performance = {} inverse_table = {} - #for each configuration in the list + # for each configuration in the list for config in results: params = config["tunable_parameters"] config_str = util.get_instance_string(params) - #count occurances - results_table[config_str] = results_table.get(config_str,0) + 1 - #add to performance - total_performance[config_str] = total_performance.get(config_str,0) + config[objective] - #store mapping from config_str to the parameters + # count occurances + results_table[config_str] = results_table.get(config_str, 0) + 1 + # add to performance + total_performance[config_str] = total_performance.get(config_str, 0) + config[objective] + # store mapping from config_str to the parameters inverse_table[config_str] = params - #look for best config + # look for best config top_freq = max(results_table.values()) best_configs = [k for k in results_table if results_table[k] == top_freq] - #intersect total_performance with the best_configs - total_performance = {k:total_performance[k] for k in total_performance if k in best_configs} + # intersect total_performance with the best_configs + total_performance = {k: total_performance[k] for k in total_performance if k in best_configs} - #get the best config from this intersection + # get the best config from this intersection if objective_higher_is_better: best_config_str = max(total_performance.keys(), key=lambda x: total_performance[x]) else: best_config_str = min(total_performance.keys(), key=lambda x: total_performance[x]) - #lookup the tunable parameters of this configuration in the inverse table and return result + # lookup the tunable parameters of this configuration in the inverse table and return result return inverse_table[best_config_str] def _get_best_config_from_list(configs, objective, 
objective_higher_is_better): - """ return the tunable parameters of the best config from a list of configs """ + """return the tunable parameters of the best config from a list of configs""" if objective_higher_is_better: best_config = max(configs, key=lambda x: x[objective]) else: best_config = min(configs, key=lambda x: x[objective]) - best_config_params = {k:best_config[k] for k in best_config if k != objective} + best_config_params = {k: best_config[k] for k in best_config if k != objective} return best_config_params - - def _read_results_file(results_filename): - """ Reader for results file - - File format 1.0 specifies the following metadata - "version_number": string e.g. "1.0" - "tunable_parameters": list of strings - "kernel_name": string - "kernel_string": string with kernel code, optional - "objective": string - "objective_higher_is_better": True or False, default False - "data": list of dicts - each dict consists of the following keys: - - "device_name": device name as reported by the device, with all non-alphanumeric characters replaced with "_" - - "problem_size": a concatenated string of problem dimensions using "x" as separator - - "tunable_parameters": a dict with all tunable parameters - - "objective" as specified in the "objective" metadata + """Reader for results file + + File format 1.0 specifies the following metadata + "version_number": string e.g. 
"1.0" + "tunable_parameters": list of strings + "kernel_name": string + "kernel_string": string with kernel code, optional + "objective": string + "objective_higher_is_better": True or False, default False + "data": list of dicts + each dict consists of the following keys: + - "device_name": device name as reported by the device, with all non-alphanumeric characters replaced with "_" + - "problem_size": a concatenated string of problem dimensions using "x" as separator + - "tunable_parameters": a dict with all tunable parameters + - "objective" as specified in the "objective" metadata """ - with open(results_filename, 'r') as fh: + with open(results_filename, "r") as fh: data = json.loads(fh.read()) if "version_number" in data: @@ -402,7 +409,6 @@ def _read_results_file(results_filename): raise ValueError("Results fileformat not recognized") - def _parse_results_file_version_1_0(data): validate(instance=data, schema=schema_v1_0) @@ -412,7 +418,7 @@ def _parse_results_file_version_1_0(data): meta["kernel_string"] = data.get("kernel_string", "") entries = data["data"] - #do some final checks against the metadata that cannot be handled by the JSON schema + # do some final checks against the metadata that cannot be handled by the JSON schema entry_keys = ["tunable_parameters"] + [meta["objective"]] + ["device_name", "problem_size"] for entry in entries: if not all([k in entry for k in entry_keys]): diff --git a/kernel_tuner/interface.py b/kernel_tuner/interface.py index 0be907737..2cf27403a 100644 --- a/kernel_tuner/interface.py +++ b/kernel_tuner/interface.py @@ -58,7 +58,7 @@ pso, random_sample, simulated_annealing, - ensemble + ensemble, ) strategy_map = { @@ -661,8 +661,8 @@ def tune_kernel( selected_runner = SimulationRunner if simulation_mode else (ParallelRunner if parallel_mode else SequentialRunner) tuning_options.simulated_time = 0 if parallel_mode: - num_gpus = tuning_options['num_gpus'] if 'num_gpus' in tuning_options else None - runner = 
selected_runner(kernelsource, kernel_options, device_options, iterations, observers, num_gpus=num_gpus) + num_gpus = tuning_options["num_gpus"] if "num_gpus" in tuning_options else None + runner = selected_runner(kernelsource, kernel_options, device_options, iterations, observers, num_gpus=num_gpus) else: runner = selected_runner(kernelsource, kernel_options, device_options, iterations, observers) @@ -696,7 +696,7 @@ def tune_kernel( if results: # checks if results is not empty best_config = util.get_best_config(results, objective, objective_higher_is_better) # add the best configuration to env - env['best_config'] = best_config + env["best_config"] = best_config if not device_options.quiet: units = getattr(runner, "units", None) print("best performing configuration:") diff --git a/kernel_tuner/kernelbuilder.py b/kernel_tuner/kernelbuilder.py index 0f3f6154f..1e46ac811 100644 --- a/kernel_tuner/kernelbuilder.py +++ b/kernel_tuner/kernelbuilder.py @@ -5,53 +5,69 @@ from kernel_tuner.integration import TuneResults -class PythonKernel(object): - - def __init__(self, kernel_name, kernel_string, problem_size, arguments, params=None, inputs=None, outputs=None, device=0, platform=0, - block_size_names=None, grid_div_x=None, grid_div_y=None, grid_div_z=None, verbose=True, lang=None, - results_file=None): - """ Construct Python helper object to compile and call the kernel from Python - - This object compiles a GPU kernel parameterized using the parameters in params. - GPU memory is allocated for each argument using its size and type as listed in arguments. - The object can be called directly as a function with the kernel arguments as function arguments. - Kernel arguments marked as inputs will be copied to the GPU on every kernel launch. - Only the kernel arguments marked as outputs will be returned, note that the result is always - returned in a list, even when there is only one output. 
- Most of the arguments to this function are the same as with tune_kernel or run_kernel in Kernel Tuner, - and are therefore not duplicated here. The two new arguments are: - - :param inputs: a boolean list of length arguments to signal whether an argument is input to the kernel - :type inputs: list(bool) - - :param outputs: a boolean list of length arguments to signal whether an argument is output of the kernel - :type outputs: list(bool) +class PythonKernel(object): + def __init__( + self, + kernel_name, + kernel_string, + problem_size, + arguments, + params=None, + inputs=None, + outputs=None, + device=0, + platform=0, + block_size_names=None, + grid_div_x=None, + grid_div_y=None, + grid_div_z=None, + verbose=True, + lang=None, + results_file=None, + ): + """Construct Python helper object to compile and call the kernel from Python + + This object compiles a GPU kernel parameterized using the parameters in params. + GPU memory is allocated for each argument using its size and type as listed in arguments. + The object can be called directly as a function with the kernel arguments as function arguments. + Kernel arguments marked as inputs will be copied to the GPU on every kernel launch. + Only the kernel arguments marked as outputs will be returned, note that the result is always + returned in a list, even when there is only one output. + + Most of the arguments to this function are the same as with tune_kernel or run_kernel in Kernel Tuner, + and are therefore not duplicated here. 
The two new arguments are: + + :param inputs: a boolean list of length arguments to signal whether an argument is input to the kernel + :type inputs: list(bool) + + :param outputs: a boolean list of length arguments to signal whether an argument is output of the kernel + :type outputs: list(bool) """ - #construct device interface + # construct device interface kernel_source = core.KernelSource(kernel_name, kernel_string, lang) self.dev = core.DeviceInterface(kernel_source, device=device, quiet=True) if not params: params = {} - #if results_file is passed use the results file to lookup tunable parameters + # if results_file is passed use the results file to lookup tunable parameters if results_file: results = TuneResults(results_file) params.update(results.get_best_config(self.dev.name, problem_size)) self.params = params - #construct kernel_options to hold information about the kernel + # construct kernel_options to hold information about the kernel opts = locals() kernel_options = Options([(k, opts[k]) for k in _kernel_options.keys() if k in opts.keys()]) - #instantiate the kernel given the parameters in params + # instantiate the kernel given the parameters in params self.kernel_instance = self.dev.create_kernel_instance(kernel_source, kernel_options, params, verbose) - #compile the kernel + # compile the kernel self.func = self.dev.compile_kernel(self.kernel_instance, verbose) - #setup GPU memory + # setup GPU memory self.gpu_args = self.dev.ready_argument_list(arguments) if inputs: self.inputs = inputs diff --git a/kernel_tuner/observers/hip.py b/kernel_tuner/observers/hip.py index c536cf965..a21bb18bd 100644 --- a/kernel_tuner/observers/hip.py +++ b/kernel_tuner/observers/hip.py @@ -14,7 +14,9 @@ class HipRuntimeObserver(BenchmarkObserver): def __init__(self, dev): if not hip or not hiprtc: - raise ImportError("Unable to import HIP Python, or check https://kerneltuner.github.io/kernel_tuner/stable/install.html#hip-and-hip-python.") + raise ImportError( + 
"Unable to import HIP Python, or check https://kerneltuner.github.io/kernel_tuner/stable/install.html#hip-and-hip-python." + ) self.dev = dev self.stream = dev.stream diff --git a/kernel_tuner/observers/ncu.py b/kernel_tuner/observers/ncu.py index c727e1e30..0956dd2a7 100644 --- a/kernel_tuner/observers/ncu.py +++ b/kernel_tuner/observers/ncu.py @@ -2,24 +2,25 @@ try: import nvmetrics -except (ImportError): +except ImportError: nvmetrics = None + class NCUObserver(PrologueObserver): """``NCUObserver`` measures performance counters. - The exact performance counters supported differ per GPU, some examples: + The exact performance counters supported differ per GPU, some examples: - * "dram__bytes.sum", # Counter byte # of bytes accessed in DRAM - * "dram__bytes_read.sum", # Counter byte # of bytes read from DRAM - * "dram__bytes_write.sum", # Counter byte # of bytes written to DRAM - * "smsp__sass_thread_inst_executed_op_fadd_pred_on.sum", # Counter inst # of FADD thread instructions executed where all predicates were true - * "smsp__sass_thread_inst_executed_op_ffma_pred_on.sum", # Counter inst # of FFMA thread instructions executed where all predicates were true - * "smsp__sass_thread_inst_executed_op_fmul_pred_on.sum", # Counter inst # of FMUL thread instructions executed where all predicates were true + * "dram__bytes.sum", # Counter byte # of bytes accessed in DRAM + * "dram__bytes_read.sum", # Counter byte # of bytes read from DRAM + * "dram__bytes_write.sum", # Counter byte # of bytes written to DRAM + * "smsp__sass_thread_inst_executed_op_fadd_pred_on.sum", # Counter inst # of FADD thread instructions executed where all predicates were true + * "smsp__sass_thread_inst_executed_op_ffma_pred_on.sum", # Counter inst # of FFMA thread instructions executed where all predicates were true + * "smsp__sass_thread_inst_executed_op_fmul_pred_on.sum", # Counter inst # of FMUL thread instructions executed where all predicates were true - :param metrics: The metrics to 
observe. This should be a list of strings. - You can use ``ncu --query-metrics`` to get a list of valid metrics. - :type metrics: list[str] + :param metrics: The metrics to observe. This should be a list of strings. + You can use ``ncu --query-metrics`` to get a list of valid metrics. + :type metrics: list[str] """ diff --git a/kernel_tuner/observers/nvml.py b/kernel_tuner/observers/nvml.py index d468d6391..8b8529aa2 100644 --- a/kernel_tuner/observers/nvml.py +++ b/kernel_tuner/observers/nvml.py @@ -333,7 +333,7 @@ def __init__( "save_all": save_all, "nvidia_smi_fallback": nvidia_smi_fallback, "use_locked_clocks": use_locked_clocks, - "continous_duration": continuous_duration + "continous_duration": continuous_duration, } if nvidia_smi_fallback: self.nvml = nvml( @@ -364,7 +364,9 @@ def __init__( if any([obs in self.needs_power for obs in observables]): self.measure_power = True power_observables = [obs for obs in observables if obs in self.needs_power] - self.continuous_observer = ContinuousObserver("nvml", power_observables, self, continuous_duration=continuous_duration) + self.continuous_observer = ContinuousObserver( + "nvml", power_observables, self, continuous_duration=continuous_duration + ) # remove power observables self.observables = [obs for obs in observables if obs not in self.needs_power] @@ -383,7 +385,7 @@ def __init__( self.iteration = {obs: [] for obs in self.during_obs} def read_power(self): - """ Return power in Watt """ + """Return power in Watt""" return self.nvml.pwr_usage() / 1e3 def before_start(self): diff --git a/kernel_tuner/observers/observer.py b/kernel_tuner/observers/observer.py index bcf661c8a..545e3130f 100644 --- a/kernel_tuner/observers/observer.py +++ b/kernel_tuner/observers/observer.py @@ -2,6 +2,7 @@ import time import numpy as np + class BenchmarkObserver(ABC): """Base class for Benchmark Observers""" @@ -47,9 +48,10 @@ class IterationObserver(BenchmarkObserver): class ContinuousObserver(BenchmarkObserver): """Generic 
observer that measures power while and continuous benchmarking. - To support continuous benchmarking an Observer should support: - a .read_power() method, which the ContinuousObserver can call to read power in Watt + To support continuous benchmarking an Observer should support: + a .read_power() method, which the ContinuousObserver can call to read power in Watt """ + def __init__(self, name, observables, parent, continuous_duration=1): self.parent = parent self.name = name @@ -89,8 +91,7 @@ def during(self): timestamp = time.perf_counter() - self.t0 # only store the result if we get a new measurement from the GPU if len(self.power_readings) == 0 or ( - self.power_readings[-1][1] != power_usage - or timestamp - self.power_readings[-1][0] > 0.01 + self.power_readings[-1][1] != power_usage or timestamp - self.power_readings[-1][0] > 0.01 ): self.power_readings.append([timestamp, power_usage]) @@ -118,6 +119,7 @@ def get_results(self): results["power_readings"] = self.power_readings return results + class OutputObserver(BenchmarkObserver): """Observer that can verify or measure something about the output produced by a kernel.""" @@ -128,6 +130,7 @@ def process_output(self, answer, output): """ pass + class PrologueObserver(BenchmarkObserver): """Observer that measures something in a seperate kernel invocation prior to the normal benchmark.""" diff --git a/kernel_tuner/observers/pmt.py b/kernel_tuner/observers/pmt.py index 970e9bd22..268d1177d 100644 --- a/kernel_tuner/observers/pmt.py +++ b/kernel_tuner/observers/pmt.py @@ -49,11 +49,9 @@ class PMTObserver(BenchmarkObserver): def __init__(self, observable=None, use_continuous_observer=False, continuous_duration=1): if not pmt: raise ImportError("could not import pmt") - + # needed for re-initializing observer on ray actor - self.init_arguments = { - "observable": observable - } + self.init_arguments = {"observable": observable} # User specifices a dictonary of platforms and corresponding device if type(observable) is 
dict: @@ -111,18 +109,19 @@ def get_results(self): class PMTContinuousObserver(ContinuousObserver): """Generic observer that measures power while and continuous benchmarking. - To support continuous benchmarking an Observer should support: - a .read_power() method, which the ContinuousObserver can call to read power in Watt + To support continuous benchmarking an Observer should support: + a .read_power() method, which the ContinuousObserver can call to read power in Watt """ + def before_start(self): - """ Override default method in ContinuousObserver """ + """Override default method in ContinuousObserver""" pass def after_start(self): self.parent.after_start() def during(self): - """ Override default method in ContinuousObserver """ + """Override default method in ContinuousObserver""" pass def after_finish(self): diff --git a/kernel_tuner/observers/powersensor.py b/kernel_tuner/observers/powersensor.py index c946f9d44..889071a9f 100644 --- a/kernel_tuner/observers/powersensor.py +++ b/kernel_tuner/observers/powersensor.py @@ -27,12 +27,9 @@ class PowerSensorObserver(BenchmarkObserver): def __init__(self, observables=None, device=None): if not powersensor: raise ImportError("could not import powersensor") - + # needed for re-initializing observer on ray actor - self.init_arguments = { - "observables": observables, - "device": device - } + self.init_arguments = {"observables": observables, "device": device} supported = ["ps_energy", "ps_power"] for obs in observables: @@ -52,14 +49,10 @@ def after_start(self): def after_finish(self): end_state = self.ps.read() if "ps_energy" in self.observables: - ps_measured_e = powersensor.Joules( - self.begin_state, end_state, -1 - ) # Joules + ps_measured_e = powersensor.Joules(self.begin_state, end_state, -1) # Joules self.results["ps_energy"].append(ps_measured_e) if "ps_power" in self.observables: - ps_measured_t = ( - end_state.time_at_read - self.begin_state.time_at_read - ) # seconds + ps_measured_t = 
end_state.time_at_read - self.begin_state.time_at_read # seconds self.results["ps_power"].append(ps_measured_e / ps_measured_t) # Watt def get_results(self): diff --git a/kernel_tuner/observers/tegra.py b/kernel_tuner/observers/tegra.py index efc83048c..84495b1de 100644 --- a/kernel_tuner/observers/tegra.py +++ b/kernel_tuner/observers/tegra.py @@ -83,11 +83,11 @@ def get_gpu_channel(self): # Iterate over all channels in the of_node dir of the power path to # find the channel which holds GPU power information for channel_dir in Path(self.gpu_power_path + "/of_node/").iterdir(): - if("channel@" in channel_dir.name): + if "channel@" in channel_dir.name: with open(channel_dir / Path("label")) as fp: channel_label = fp.read().strip() if "GPU" in channel_label: - return str(int(channel_dir.name[-1])+1) + return str(int(channel_dir.name[-1]) + 1) # If this statement is reached, no channel for the GPU was found raise FileNotFoundError("No channel found with GPU power readings") @@ -103,12 +103,7 @@ def _write_railgate_file(self, value): if value not in (0, 1): raise ValueError(f"Illegal governor value {value}, must be 0 or 1") full_path = self.dev_path / Path("device/railgate_enable") - args = [ - "sudo", - "sh", - "-c", - f"echo {value} > {str(full_path)}" - ] + args = ["sudo", "sh", "-c", f"echo {value} > {str(full_path)}"] subprocess.run(args, check=True) def _read_clock_file(self, fname): @@ -132,12 +127,7 @@ def _write_clock_file(self, fname, value): raise ValueError(f"Illegal frequency value {value}, must be one of {self.supported_gr_clocks}") full_path = self.dev_path / Path(fname) - args = [ - "sudo", - "sh", - "-c", - f"echo {value} > {str(full_path)}" - ] + args = ["sudo", "sh", "-c", f"echo {value} > {str(full_path)}"] subprocess.run(args, check=True) @property @@ -157,8 +147,8 @@ def gr_clock(self, new_clock): self._write_clock_file("min_freq", new_clock) self._write_clock_file("max_freq", new_clock) # wait for the new clock to be applied - while 
(self._read_clock_file("cur_freq") != new_clock): - time.sleep(.001) + while self._read_clock_file("cur_freq") != new_clock: + time.sleep(0.001) def reset_clock(self): """Reset the core clock frequency to the original values""" @@ -180,9 +170,13 @@ def read_gpu_temp(self): def read_gpu_power(self): """Read the current and voltage to calculate and return the power int watt""" - result_cur = subprocess.run(["sudo", "cat", f"{self.gpu_power_path}/curr{self.gpu_channel}_input"], capture_output=True, text=True) + result_cur = subprocess.run( + ["sudo", "cat", f"{self.gpu_power_path}/curr{self.gpu_channel}_input"], capture_output=True, text=True + ) current = int(result_cur.stdout.strip()) / 1000 - result_vol = subprocess.run(["sudo", "cat", f"{self.gpu_power_path}/in{self.gpu_channel}_input"], capture_output=True, text=True) + result_vol = subprocess.run( + ["sudo", "cat", f"{self.gpu_power_path}/in{self.gpu_channel}_input"], capture_output=True, text=True + ) voltage = int(result_vol.stdout.strip()) / 1000 return current * voltage @@ -200,13 +194,7 @@ class TegraObserver(BenchmarkObserver): """ - def __init__( - self, - observables, - save_all=False, - power_path="", - temp_path="" - ): + def __init__(self, observables, save_all=False, power_path="", temp_path=""): """Create a TegraObserver""" self.tegra = tegra(power_path=power_path, temp_path=temp_path) self.save_all = save_all @@ -233,19 +221,13 @@ def __init__( for obs in self.observables: self.results[obs + "s"] = [] - self.during_obs = [ - obs - for obs in observables - if obs in ["core_freq", "tegra_temp"] - ] + self.during_obs = [obs for obs in observables if obs in ["core_freq", "tegra_temp"]] self.iteration = {obs: [] for obs in self.during_obs} - def read_power(self): return self.tegra.read_gpu_power() - def before_start(self): # clear results of the observables for next measurement self.iteration = {obs: [] for obs in self.during_obs} diff --git a/kernel_tuner/runners/parallel.py 
b/kernel_tuner/runners/parallel.py index a7f2d95fc..0b563a546 100644 --- a/kernel_tuner/runners/parallel.py +++ b/kernel_tuner/runners/parallel.py @@ -11,11 +11,22 @@ from kernel_tuner.runners.ray.cache_manager import CacheManager from kernel_tuner.strategies.common import create_actor_on_device, initialize_ray + class ParallelRunner(Runner): """ParallelRunner is used for tuning with multiple processes/threads using Ray for distributed computing.""" - def __init__(self, kernel_source, kernel_options, device_options, iterations, observers, - num_gpus=None, cache_manager=None, actors=None, simulation_mode=False): + def __init__( + self, + kernel_source, + kernel_options, + device_options, + iterations, + observers, + num_gpus=None, + cache_manager=None, + actors=None, + simulation_mode=False, + ): """Instantiate the ParallelRunner. :param kernel_source: The kernel source @@ -47,7 +58,11 @@ def __init__(self, kernel_source, kernel_options, device_options, iterations, ob :param simulation_mode: Flag to indicate simulation mode. Defaults to False. :type simulation_mode: bool, optional """ - self.dev = DeviceInterface(kernel_source, iterations=iterations, observers=observers, **device_options) if not simulation_mode else None + self.dev = ( + DeviceInterface(kernel_source, iterations=iterations, observers=observers, **device_options) + if not simulation_mode + else None + ) self.kernel_source = kernel_source self.simulation_mode = simulation_mode self.kernel_options = kernel_options @@ -59,7 +74,7 @@ def __init__(self, kernel_source, kernel_options, device_options, iterations, ob self.cache_manager = cache_manager self.num_gpus = num_gpus self.actors = actors - + initialize_ray() if num_gpus is None: @@ -93,38 +108,53 @@ def run(self, parameter_space=None, tuning_options=None, ensemble=None, searchsp :returns: Results of the tuning process. 
:rtype: list of dict """ - if tuning_options is None: #HACK as tuning_options can't be the first argument and parameter_space needs to be a default argument + if ( + tuning_options is None + ): # HACK as tuning_options can't be the first argument and parameter_space needs to be a default argument raise ValueError("tuning_options cannot be None") - + # Create RemoteActor instances if self.actors is None: - runner_attributes = [self.kernel_source, self.kernel_options, self.device_options, self.iterations, self.observers] - self.actors = [create_actor_on_device(*runner_attributes, id=_id, cache_manager=self.cache_manager, simulation_mode=self.simulation_mode) for _id in range(self.num_gpus)] - + runner_attributes = [ + self.kernel_source, + self.kernel_options, + self.device_options, + self.iterations, + self.observers, + ] + self.actors = [ + create_actor_on_device( + *runner_attributes, id=_id, cache_manager=self.cache_manager, simulation_mode=self.simulation_mode + ) + for _id in range(self.num_gpus) + ] + # Check if all GPUs are of the same type if not self.simulation_mode and not self._check_gpus_equals(): - raise GPUTypeMismatchError(f"Different GPU types found") + raise GPUTypeMismatchError(f"Different GPU types found") if self.cache_manager is None: if cache_manager is None: cache_manager = CacheManager.remote(tuning_options.cache, tuning_options.cachefile) self.cache_manager = cache_manager - + # set the cache manager for each actor. 
Can't be done in constructor because we do not always yet have the tuning_options for actor in self.actors: actor.set_cache_manager.remote(self.cache_manager) - + # Some observers can't be pickled run_tuning_options = copy.deepcopy(tuning_options) - run_tuning_options['observers'] = None + run_tuning_options["observers"] = None # Determine what type of parallelism and run appropriately if parameter_space and not ensemble and not searchspace: results, tuning_options_list = self.parallel_function_evaluation(run_tuning_options, parameter_space) elif ensemble and searchspace and not parameter_space: - results, tuning_options_list = self.multi_strategy_parallel_execution(ensemble, run_tuning_options, searchspace) + results, tuning_options_list = self.multi_strategy_parallel_execution( + ensemble, run_tuning_options, searchspace + ) else: raise ValueError("Invalid arguments to parallel runner run method") - + # Update tuning options # NOTE: tuning options won't have the state of the observers created in the actors as they can't be pickled cache, cachefile = ray.get(self.cache_manager.get_cache.remote()) @@ -132,7 +162,7 @@ def run(self, parameter_space=None, tuning_options=None, ensemble=None, searchsp tuning_options.cachefile = cachefile if self.simulation_mode: tuning_options.simulated_time += self._calculate_simulated_time(tuning_options_list) - + return results def multi_strategy_parallel_execution(self, ensemble, tuning_options, searchspace): @@ -174,9 +204,11 @@ def multi_strategy_parallel_execution(self, ensemble, tuning_options, searchspac strategy = ensemble_queue.popleft() searchspace = searchspaces.popleft() remote_tuning_options = self._setup_tuning_options(tuning_options, evaluations_per_strategy) - task = actor.execute.remote(strategy=strategy, searchspace=searchspace, tuning_options=remote_tuning_options) + task = actor.execute.remote( + strategy=strategy, searchspace=searchspace, tuning_options=remote_tuning_options + ) pending_tasks[task] = actor - + # 
Manage task completion and redistribution while pending_tasks: done_ids, _ = ray.wait(list(pending_tasks.keys()), num_returns=1) @@ -190,15 +222,16 @@ def multi_strategy_parallel_execution(self, ensemble, tuning_options, searchspac strategy = ensemble_queue.popleft() searchspace = searchspaces.popleft() remote_tuning_options = self._setup_tuning_options(tuning_options, evaluations_per_strategy) - task = actor.execute.remote(strategy=strategy, searchspace=searchspace, tuning_options=remote_tuning_options) + task = actor.execute.remote( + strategy=strategy, searchspace=searchspace, tuning_options=remote_tuning_options + ) pending_tasks[task] = actor - + # Process results results, tuning_options_list = self._process_results_ensemble(all_results) - + return results, tuning_options_list - def _setup_tuning_options(self, tuning_options, evaluations_per_strategy): """Set up tuning options for each strategy in the ensemble. @@ -216,7 +249,7 @@ def _setup_tuning_options(self, tuning_options, evaluations_per_strategy): # the stop criterion uses the max feval in tuning options for some reason new_tuning_options["max_fevals"] = new_tuning_options.strategy_options["max_fevals"] return new_tuning_options - + def _process_results_ensemble(self, all_results): """Process the results from the ensemble execution. @@ -229,13 +262,12 @@ def _process_results_ensemble(self, all_results): results = [] tuning_options_list = [] - for (strategy_results, tuning_options) in all_results: + for strategy_results, tuning_options in all_results: results.extend(strategy_results) tuning_options_list.append(tuning_options) return results, tuning_options_list - def parallel_function_evaluation(self, tuning_options, parameter_space): """Perform parallel function evaluation. 
@@ -251,11 +283,13 @@ def parallel_function_evaluation(self, tuning_options, parameter_space): # Create a pool of RemoteActor actors self.actor_pool = ActorPool(self.actors) # Distribute execution of the `execute` method across the actor pool with varying parameters and tuning options, collecting the results asynchronously. - all_results = list(self.actor_pool.map_unordered(lambda a, v: a.execute.remote(tuning_options, element=v), parameter_space)) + all_results = list( + self.actor_pool.map_unordered(lambda a, v: a.execute.remote(tuning_options, element=v), parameter_space) + ) results = [x[0] for x in all_results] tuning_options_list = [x[1] for x in all_results] return results, tuning_options_list - + def _process_results(self, all_results, searchspace): """ Process the results and remove duplicates based on the searchspace. @@ -263,14 +297,14 @@ def _process_results(self, all_results, searchspace): unique_configs = set() final_results = [] - for (strategy_results, tuning_options) in all_results: + for strategy_results, tuning_options in all_results: for new_result in strategy_results: config_signature = tuple(new_result[key] for key in searchspace.tune_params) if config_signature not in unique_configs: final_results.append(new_result) unique_configs.add(config_signature) return final_results - + def _calculate_simulated_time(self, tuning_options_list): """ Calculate the maximum simulated time from the list of tuning options. 
@@ -303,4 +337,4 @@ def clean_up_ray(self): for actor in self.actors: ray.kill(actor) if self.cache_manager is not None: - ray.kill(self.cache_manager) \ No newline at end of file + ray.kill(self.cache_manager) diff --git a/kernel_tuner/runners/ray/cache_manager.py b/kernel_tuner/runners/ray/cache_manager.py index 9aeb56855..7e1370754 100644 --- a/kernel_tuner/runners/ray/cache_manager.py +++ b/kernel_tuner/runners/ray/cache_manager.py @@ -2,22 +2,24 @@ from kernel_tuner.util import store_cache + @ray.remote(num_cpus=1) class CacheManager: def __init__(self, cache, cachefile): - from kernel_tuner.interface import Options # importing here due to circular import - self.tuning_options = Options({'cache': cache, 'cachefile': cachefile}) + from kernel_tuner.interface import Options # importing here due to circular import + + self.tuning_options = Options({"cache": cache, "cachefile": cachefile}) def store(self, key, params): store_cache(key, params, self.tuning_options) def check_and_retrieve(self, key): """Checks if a result exists for the given key and returns it if found.""" - if self.tuning_options['cache']: - return self.tuning_options['cache'].get(key, None) + if self.tuning_options["cache"]: + return self.tuning_options["cache"].get(key, None) else: return None - + def get_cache(self): """Returns the current tuning options.""" - return self.tuning_options['cache'], self.tuning_options['cachefile'] + return self.tuning_options["cache"], self.tuning_options["cachefile"] diff --git a/kernel_tuner/runners/ray/remote_actor.py b/kernel_tuner/runners/ray/remote_actor.py index c0743ad22..bd732ebab 100644 --- a/kernel_tuner/runners/ray/remote_actor.py +++ b/kernel_tuner/runners/ray/remote_actor.py @@ -6,17 +6,20 @@ from kernel_tuner.observers.register import RegisterObserver from kernel_tuner.util import get_gpu_id, get_gpu_type + @ray.remote -class RemoteActor(): - def __init__(self, - kernel_source, - kernel_options, - device_options, - iterations, - 
observers_type_and_arguments, - id, - cache_manager=None, - simulation_mode=False): +class RemoteActor: + def __init__( + self, + kernel_source, + kernel_options, + device_options, + iterations, + observers_type_and_arguments, + id, + cache_manager=None, + simulation_mode=False, + ): self.kernel_source = kernel_source self.kernel_options = kernel_options self.device_options = device_options @@ -26,49 +29,61 @@ def __init__(self, self.runner = None self.id = None self._reinitialize_observers(observers_type_and_arguments) - self.dev = DeviceInterface(kernel_source, iterations=iterations, observers=self.observers, **device_options) if not simulation_mode else None + self.dev = ( + DeviceInterface(kernel_source, iterations=iterations, observers=self.observers, **device_options) + if not simulation_mode + else None + ) def get_environment(self): return self.dev.get_environment() - + def execute(self, tuning_options, strategy=None, searchspace=None, element=None): - tuning_options['observers'] = self.observers + tuning_options["observers"] = self.observers if self.runner is None: self.init_runner() if strategy and searchspace: - results = strategy.tune(searchspace, self.runner, tuning_options) + results = strategy.tune(searchspace, self.runner, tuning_options) # observers can't be pickled - tuning_options['observers'] = None + tuning_options["observers"] = None return results, tuning_options elif element: - results = self.runner.run([element], tuning_options)[0] + results = self.runner.run([element], tuning_options)[0] # observers can't be pickled - tuning_options['observers'] = None + tuning_options["observers"] = None return results, tuning_options else: raise ValueError("Invalid arguments for ray actor's execute method.") - + def set_cache_manager(self, cache_manager): if self.cache_manager is None: self.cache_manager = cache_manager def get_cache_magaer(self): return self.cache_manager - + def init_runner(self): if self.cache_manager is None: raise ValueError("Cache 
manager is not set.") if self.simulation_mode: - self.runner = SimulationRunner(self.kernel_source, self.kernel_options, self.device_options, - self.iterations, self.observers) + self.runner = SimulationRunner( + self.kernel_source, self.kernel_options, self.device_options, self.iterations, self.observers + ) else: - self.runner = SequentialRunner(self.kernel_source, self.kernel_options, self.device_options, - self.iterations, self.observers, cache_manager=self.cache_manager, dev=self.dev) + self.runner = SequentialRunner( + self.kernel_source, + self.kernel_options, + self.device_options, + self.iterations, + self.observers, + cache_manager=self.cache_manager, + dev=self.dev, + ) def _reinitialize_observers(self, observers_type_and_arguments): # observers can't be pickled to the actor so we need to re-initialize them self.observers = [] - for (observer, arguments) in observers_type_and_arguments: + for observer, arguments in observers_type_and_arguments: if "device" in arguments: self.id = get_gpu_id(self.kernel_source.lang) if self.id is None else self.id arguments["device"] = self.id @@ -76,7 +91,6 @@ def _reinitialize_observers(self, observers_type_and_arguments): self.observers.append(RegisterObserver()) else: self.observers.append(observer(**arguments)) - def get_gpu_type(self, lang): return get_gpu_type(lang) diff --git a/kernel_tuner/runners/runner.py b/kernel_tuner/runners/runner.py index 8c4de22d7..0929c9bbf 100644 --- a/kernel_tuner/runners/runner.py +++ b/kernel_tuner/runners/runner.py @@ -8,9 +8,7 @@ class Runner(ABC): """Base class for kernel_tuner runners""" @abstractmethod - def __init__( - self, kernel_source, kernel_options, device_options, iterations, observers - ): + def __init__(self, kernel_source, kernel_options, device_options, iterations, observers): pass @abstractmethod diff --git a/kernel_tuner/runners/sequential.py b/kernel_tuner/runners/sequential.py index 46ba17e0a..21369eaba 100644 --- a/kernel_tuner/runners/sequential.py +++ 
b/kernel_tuner/runners/sequential.py @@ -12,7 +12,9 @@ class SequentialRunner(Runner): """SequentialRunner is used for tuning with a single process/thread.""" - def __init__(self, kernel_source, kernel_options, device_options, iterations, observers, cache_manager=None, dev=None): + def __init__( + self, kernel_source, kernel_options, device_options, iterations, observers, cache_manager=None, dev=None + ): """Instantiate the SequentialRunner. :param kernel_source: The kernel source @@ -35,8 +37,12 @@ def __init__(self, kernel_source, kernel_options, device_options, iterations, ob :param cache_manager: Cache manager instance. Defaults to None. :type cache_manager: kernel_tuner.runners.ray.cache_manager.CacheManager, optional """ - #detect language and create high-level device interface - self.dev = DeviceInterface(kernel_source, iterations=iterations, observers=observers, **device_options) if dev is None else dev + # detect language and create high-level device interface + self.dev = ( + DeviceInterface(kernel_source, iterations=iterations, observers=observers, **device_options) + if dev is None + else dev + ) self.units = self.dev.units self.quiet = device_options.quiet @@ -47,12 +53,12 @@ def __init__(self, kernel_source, kernel_options, device_options, iterations, ob self.last_strategy_start_time = self.start_time self.last_strategy_time = 0 self.kernel_options = kernel_options - self.device_options = device_options # needed for the ensemble strategy down the line - self.iterations = iterations # needed for the ensemble strategy down the line - self.observers = observers # needed for the ensemble strategy down the line + self.device_options = device_options # needed for the ensemble strategy down the line + self.iterations = iterations # needed for the ensemble strategy down the line + self.observers = observers # needed for the ensemble strategy down the line self.cache_manager = cache_manager - #move data to the GPU + # move data to the GPU self.gpu_args = 
self.dev.ready_argument_list(kernel_options.arguments) def get_environment(self, tuning_options): @@ -73,7 +79,7 @@ def run(self, parameter_space, tuning_options): :rtype: dict()) """ - logging.debug('sequential runner started for ' + self.kernel_options.kernel_name) + logging.debug("sequential runner started for " + self.kernel_options.kernel_name) results = [] @@ -89,33 +95,46 @@ def run(self, parameter_space, tuning_options): cache_result = self.config_in_cache(x_int, tuning_options) if cache_result: params.update(cache_result) - params['compile_time'] = 0 - params['verification_time'] = 0 - params['benchmark_time'] = 0 + params["compile_time"] = 0 + params["verification_time"] = 0 + params["benchmark_time"] = 0 else: # attempt to warmup the GPU by running the first config in the parameter space and ignoring the result if not self.warmed_up: warmup_time = perf_counter() - self.dev.compile_and_benchmark(self.kernel_source, self.gpu_args, params, self.kernel_options, tuning_options) + self.dev.compile_and_benchmark( + self.kernel_source, self.gpu_args, params, self.kernel_options, tuning_options + ) self.warmed_up = True warmup_time = 1e3 * (perf_counter() - warmup_time) - result = self.dev.compile_and_benchmark(self.kernel_source, self.gpu_args, params, self.kernel_options, tuning_options) + result = self.dev.compile_and_benchmark( + self.kernel_source, self.gpu_args, params, self.kernel_options, tuning_options + ) params.update(result) if tuning_options.objective in result and isinstance(result[tuning_options.objective], ErrorConfig): - logging.debug('kernel configuration was skipped silently due to compile or runtime failure') + logging.debug("kernel configuration was skipped silently due to compile or runtime failure") # only compute metrics on configs that have not errored if tuning_options.metrics and not isinstance(params.get(tuning_options.objective), ErrorConfig): params = process_metrics(params, tuning_options.metrics) # get the framework time by 
estimating based on other times - total_time = 1000 * ((perf_counter() - self.start_time) - warmup_time) - params['strategy_time'] = self.last_strategy_time - params['framework_time'] = max(total_time - (params['compile_time'] + params['verification_time'] + params['benchmark_time'] + params['strategy_time']), 0) - params['timestamp'] = str(datetime.now(timezone.utc)) + total_time = 1000 * ((perf_counter() - self.start_time) - warmup_time) + params["strategy_time"] = self.last_strategy_time + params["framework_time"] = max( + total_time + - ( + params["compile_time"] + + params["verification_time"] + + params["benchmark_time"] + + params["strategy_time"] + ), + 0, + ) + params["timestamp"] = str(datetime.now(timezone.utc)) self.start_time = perf_counter() if result: @@ -131,7 +150,7 @@ def run(self, parameter_space, tuning_options): return results def config_in_cache(self, x_int, tuning_options): - if self.cache_manager and tuning_options.strategy_options['check_and_retrieve']: + if self.cache_manager and tuning_options.strategy_options["check_and_retrieve"]: return ray.get(self.cache_manager.check_and_retrieve.remote(x_int)) elif tuning_options.cache and x_int in tuning_options.cache: return tuning_options.cache[x_int] @@ -142,4 +161,4 @@ def store_in_cache(self, x_int, params, tuning_options): if self.cache_manager: self.cache_manager.store.remote(x_int, params) else: - store_cache(x_int, params, tuning_options) \ No newline at end of file + store_cache(x_int, params, tuning_options) diff --git a/kernel_tuner/runners/simulation.py b/kernel_tuner/runners/simulation.py index f354333b6..1cf489834 100644 --- a/kernel_tuner/runners/simulation.py +++ b/kernel_tuner/runners/simulation.py @@ -14,11 +14,11 @@ class SimulationDevice(_SimulationDevice): @property def name(self): - return self.env['device_name'] + return self.env["device_name"] @name.setter def name(self, value): - self.env['device_name'] = value + self.env["device_name"] = value if not self.quiet: 
print("Simulating: " + value) @@ -58,9 +58,9 @@ def __init__(self, kernel_source, kernel_options, device_options, iterations, ob self.last_strategy_time = 0 self.units = {} - self.device_options = device_options # needed for the ensemble strategy down the line - self.iterations = iterations # needed for the ensemble strategy down the line - self.observers = observers # needed for the ensemble strategy down the line + self.device_options = device_options # needed for the ensemble strategy down the line + self.iterations = iterations # needed for the ensemble strategy down the line + self.observers = observers # needed for the ensemble strategy down the line def get_environment(self, tuning_options): env = self.dev.get_environment() @@ -82,13 +82,12 @@ def run(self, parameter_space, tuning_options): execution times. :rtype: dict() """ - logging.debug('simulation runner started for ' + self.kernel_options.kernel_name) + logging.debug("simulation runner started for " + self.kernel_options.kernel_name) results = [] # iterate over parameter space for element in parameter_space: - # check if element is in the cache x_int = ",".join([str(i) for i in element]) if tuning_options.cache and x_int in tuning_options.cache: @@ -102,21 +101,22 @@ def run(self, parameter_space, tuning_options): # configuration is already counted towards the unique_results. # It is the responsibility of cost_func to add configs to unique_results. 
if x_int in tuning_options.unique_results: - - result['compile_time'] = 0 - result['verification_time'] = 0 - result['benchmark_time'] = 0 + result["compile_time"] = 0 + result["verification_time"] = 0 + result["benchmark_time"] = 0 else: # configuration is evaluated for the first time, print to the console - util.print_config_output(tuning_options.tune_params, result, self.quiet, tuning_options.metrics, self.units) + util.print_config_output( + tuning_options.tune_params, result, self.quiet, tuning_options.metrics, self.units + ) # Everything but the strategy time and framework time are simulated, # self.last_strategy_time is set by cost_func - result['strategy_time'] = self.last_strategy_time + result["strategy_time"] = self.last_strategy_time try: - simulated_time = result['compile_time'] + result['verification_time'] + result['benchmark_time'] + simulated_time = result["compile_time"] + result["verification_time"] + result["benchmark_time"] tuning_options.simulated_time += simulated_time except KeyError: if "time_limit" in tuning_options: @@ -126,13 +126,15 @@ def run(self, parameter_space, tuning_options): total_time = 1000 * (perf_counter() - self.start_time) self.start_time = perf_counter() - result['framework_time'] = total_time - self.last_strategy_time + result["framework_time"] = total_time - self.last_strategy_time results.append(result) continue # if the element is not in the cache, raise an error - check = util.check_restrictions(tuning_options.restrictions, dict(zip(tuning_options['tune_params'].keys(), element)), True) + check = util.check_restrictions( + tuning_options.restrictions, dict(zip(tuning_options["tune_params"].keys(), element)), True + ) err_string = f"kernel configuration {element} not in cache, does {'' if check else 'not '}pass extra restriction check ({check})" logging.debug(err_string) raise ValueError(f"{err_string} - in simulation mode, all configurations must be present in the cache") diff --git a/kernel_tuner/searchspace.py 
b/kernel_tuner/searchspace.py index a1ab0a616..cb6d80d04 100644 --- a/kernel_tuner/searchspace.py +++ b/kernel_tuner/searchspace.py @@ -150,7 +150,7 @@ def __init__( # num_solutions: int = csp.n_solutions() # number of solutions # solutions = [csp.values(sol=i) for i in range(num_solutions)] # list of solutions - def __build_searchspace_bruteforce(self, block_size_names: list, max_threads: int, solver = None): + def __build_searchspace_bruteforce(self, block_size_names: list, max_threads: int, solver=None): # bruteforce solving of the searchspace from itertools import product @@ -172,9 +172,15 @@ def __build_searchspace_bruteforce(self, block_size_names: list, max_threads: in restrictions = [restrictions] block_size_restriction_spaced = f"{' * '.join(used_block_size_names)} <= {max_threads}" block_size_restriction_unspaced = f"{'*'.join(used_block_size_names)} <= {max_threads}" - if block_size_restriction_spaced not in restrictions and block_size_restriction_unspaced not in restrictions: + if ( + block_size_restriction_spaced not in restrictions + and block_size_restriction_unspaced not in restrictions + ): restrictions.append(block_size_restriction_spaced) - if isinstance(self._modified_restrictions, list) and block_size_restriction_spaced not in self._modified_restrictions: + if ( + isinstance(self._modified_restrictions, list) + and block_size_restriction_spaced not in self._modified_restrictions + ): self._modified_restrictions.append(block_size_restriction_spaced) if isinstance(self.restrictions, list): self.restrictions.append(block_size_restriction_spaced) @@ -269,12 +275,7 @@ def get_params(): TP(key, Set(values)) return params - tuning_result = ( - Tuner() - .tuning_parameters(*get_params()) - .search_technique(Exhaustive()) - .tune(costfunc) - ) + tuning_result = Tuner().tuning_parameters(*get_params()).search_technique(Exhaustive()).tune(costfunc) return tuning_result def __build_searchspace_ATF_cache(self, block_size_names: list, max_threads: int, 
solver: Solver): @@ -328,7 +329,10 @@ def __build_searchspace(self, block_size_names: list, max_threads: int, solver: if len(valid_block_size_names) > 0: parameter_space.addConstraint(MaxProdConstraint(max_threads), valid_block_size_names) max_block_size_product = f"{' * '.join(valid_block_size_names)} <= {max_threads}" - if isinstance(self._modified_restrictions, list) and max_block_size_product not in self._modified_restrictions: + if ( + isinstance(self._modified_restrictions, list) + and max_block_size_product not in self._modified_restrictions + ): self._modified_restrictions.append(max_block_size_product) if isinstance(self.restrictions, list): self.restrictions.append((MaxProdConstraint(max_threads), valid_block_size_names)) @@ -353,10 +357,7 @@ def __add_restrictions(self, parameter_space: Problem) -> Problem: parameter_space.addConstraint(restriction, required_params) elif isinstance(restriction, Constraint): all_params_required = all(param_name in required_params for param_name in self.param_names) - parameter_space.addConstraint( - restriction, - None if all_params_required else required_params - ) + parameter_space.addConstraint(restriction, None if all_params_required else required_params) else: raise ValueError(f"Unrecognized restriction {restriction}") diff --git a/kernel_tuner/strategies/basinhopping.py b/kernel_tuner/strategies/basinhopping.py index 20e800f6e..291987a81 100644 --- a/kernel_tuner/strategies/basinhopping.py +++ b/kernel_tuner/strategies/basinhopping.py @@ -8,8 +8,11 @@ supported_methods = ["Nelder-Mead", "Powell", "CG", "BFGS", "L-BFGS-B", "TNC", "COBYLA", "SLSQP"] -_options = dict(method=(f"Local optimization algorithm to use, choose any from {supported_methods}", "L-BFGS-B"), - T=("Temperature parameter for the accept or reject criterion", 1.0)) +_options = dict( + method=(f"Local optimization algorithm to use, choose any from {supported_methods}", "L-BFGS-B"), + T=("Temperature parameter for the accept or reject criterion", 1.0), 
+) + def tune(searchspace: Searchspace, runner, tuning_options): method, T = common.get_options(tuning_options.strategy_options, _options) @@ -21,16 +24,16 @@ def tune(searchspace: Searchspace, runner, tuning_options): kwargs = setup_method_arguments(method, bounds) options = setup_method_options(method, tuning_options) - kwargs['options'] = options - + kwargs["options"] = options minimizer_kwargs = dict(**kwargs) minimizer_kwargs["method"] = method opt_result = None try: - opt_result = scipy.optimize.basinhopping(cost_func, x0, T=T, stepsize=eps, - minimizer_kwargs=minimizer_kwargs, disp=tuning_options.verbose) + opt_result = scipy.optimize.basinhopping( + cost_func, x0, T=T, stepsize=eps, minimizer_kwargs=minimizer_kwargs, disp=tuning_options.verbose + ) except util.StopCriterionReached as e: if tuning_options.verbose: print(e) diff --git a/kernel_tuner/strategies/brute_force.py b/kernel_tuner/strategies/brute_force.py index cf6ba521b..9b2284969 100644 --- a/kernel_tuner/strategies/brute_force.py +++ b/kernel_tuner/strategies/brute_force.py @@ -6,14 +6,16 @@ _options = dict(num_gpus=("Number of gpus to run parallel execution", None)) -def tune(searchspace: Searchspace, runner, tuning_options): +def tune(searchspace: Searchspace, runner, tuning_options): if isinstance(runner, ParallelRunner): if tuning_options.strategy_options is None: tuning_options.strategy_options = {} - tuning_options.strategy_options['check_and_retrieve'] = False + tuning_options.strategy_options["check_and_retrieve"] = False cache_manager = CacheManager.remote(tuning_options.cache, tuning_options.cachefile) - return runner.run(parameter_space=searchspace.sorted_list(), tuning_options=tuning_options, cache_manager=cache_manager) + return runner.run( + parameter_space=searchspace.sorted_list(), tuning_options=tuning_options, cache_manager=cache_manager + ) else: return runner.run(searchspace.sorted_list(), tuning_options) diff --git a/kernel_tuner/strategies/common.py 
b/kernel_tuner/strategies/common.py index c4e00bb8d..bc9834063 100644 --- a/kernel_tuner/strategies/common.py +++ b/kernel_tuner/strategies/common.py @@ -38,7 +38,9 @@ def get_strategy_docstring(name, strategy_options): """Generate docstring for a 'tune' method of a strategy.""" - return _docstring_template.replace("$NAME$", name).replace("$STRAT_OPT$", make_strategy_options_doc(strategy_options)) + return _docstring_template.replace("$NAME$", name).replace( + "$STRAT_OPT$", make_strategy_options_doc(strategy_options) + ) def make_strategy_options_doc(strategy_options): @@ -74,20 +76,20 @@ def __call__(self, x, check_restrictions=True): self.runner.last_strategy_time = 1000 * (perf_counter() - self.runner.last_strategy_start_time) # error value to return for numeric optimizers that need a numerical value - logging.debug('_cost_func called') - logging.debug('x: ' + str(x)) + logging.debug("_cost_func called") + logging.debug("x: " + str(x)) # check if max_fevals is reached or time limit is exceeded util.check_stop_criterion(self.tuning_options) x_list = [x] if self._is_single_configuration(x) else x configs = [self._prepare_config(cfg) for cfg in x_list] - + legal_configs = configs illegal_results = [] if check_restrictions and self.searchspace.restrictions: legal_configs, illegal_results = self._get_legal_configs(configs) - + final_results = self._evaluate_configs(legal_configs) if len(legal_configs) > 0 else [] # get numerical return values, taking optimization direction into account all_results = final_results + illegal_results @@ -95,31 +97,31 @@ def __call__(self, x, check_restrictions=True): for result in all_results: return_value = result[self.tuning_options.objective] or sys.float_info.max return_values.append(return_value if not self.tuning_options.objective_higher_is_better else -return_value) - + if len(return_values) == 1: return return_values[0] return return_values - + def _is_single_configuration(self, x): """ Determines if the input is a single 
configuration based on its type and composition. - + Parameters: x: The input to check, which can be an int, float, numpy array, list, or tuple. Returns: - bool: True if `x` is a single configuration, which includes being a singular int or float, + bool: True if `x` is a single configuration, which includes being a singular int or float, a numpy array of ints or floats, or a list or tuple where all elements are ints or floats. Otherwise, returns False. """ if isinstance(x, (int, float)): return True if isinstance(x, np.ndarray): - return x.dtype.kind in 'if' # Checks for data type being integer ('i') or float ('f') + return x.dtype.kind in "if" # Checks for data type being integer ('i') or float ('f') if isinstance(x, (list, tuple)): return all(isinstance(item, (int, float)) for item in x) return False - + def _prepare_config(self, x): """ Prepare a single configuration by snapping to nearest values and/or scaling. @@ -139,11 +141,11 @@ def _prepare_config(self, x): else: params = x return params - + def _get_legal_configs(self, configs): """ - Filters and categorizes configurations into legal and illegal based on defined restrictions. - Configurations are checked against restrictions; illegal ones are modified to indicate an invalid state and + Filters and categorizes configurations into legal and illegal based on defined restrictions. + Configurations are checked against restrictions; illegal ones are modified to indicate an invalid state and included in the results. Legal configurations are collected and returned for potential use. Parameters: @@ -163,11 +165,11 @@ def _get_legal_configs(self, configs): else: legal_configs.append(config) return legal_configs, results - + def _evaluate_configs(self, configs): """ - Evaluate and manage configurations based on tuning options. Results are sorted by timestamp to maintain - order during parallel processing. 
The function ensures no duplicates in results and checks for stop criteria + Evaluate and manage configurations based on tuning options. Results are sorted by timestamp to maintain + order during parallel processing. The function ensures no duplicates in results and checks for stop criteria post-processing. Strategy start time is updated upon completion. Parameters: @@ -179,7 +181,7 @@ def _evaluate_configs(self, configs): results = self.runner.run(configs, self.tuning_options) # sort based on timestamp, needed because of parallel tuning of populations and restrospective stop criterion check if "timestamp" in results[0]: - results.sort(key=lambda x: x['timestamp']) + results.sort(key=lambda x: x["timestamp"]) final_results = [] for result in results: @@ -231,10 +233,10 @@ def get_bounds_x0_eps(self): eps = min(eps, np.amin(np.gradient(vals))) self.tuning_options["eps"] = eps - logging.debug('get_bounds_x0_eps called') - logging.debug('bounds ' + str(bounds)) - logging.debug('x0 ' + str(x0)) - logging.debug('eps ' + str(eps)) + logging.debug("get_bounds_x0_eps called") + logging.debug("bounds " + str(bounds)) + logging.debug("x0 " + str(x0)) + logging.debug("eps " + str(eps)) return bounds, x0, eps @@ -252,7 +254,7 @@ def setup_method_arguments(method, bounds): kwargs = {} # pass bounds to methods that support it if method in ["L-BFGS-B", "TNC", "SLSQP"]: - kwargs['bounds'] = bounds + kwargs["bounds"] = bounds return kwargs @@ -265,21 +267,21 @@ def setup_method_options(method, tuning_options): maxiter = tuning_options.strategy_options.maxiter else: maxiter = 100 - kwargs['maxiter'] = maxiter + kwargs["maxiter"] = maxiter if method in ["Nelder-Mead", "Powell"]: - kwargs['maxfev'] = maxiter + kwargs["maxfev"] = maxiter elif method == "L-BFGS-B": - kwargs['maxfun'] = maxiter + kwargs["maxfun"] = maxiter # pass eps to methods that support it if method in ["CG", "BFGS", "L-BFGS-B", "TNC", "SLSQP"]: - kwargs['eps'] = tuning_options.eps + kwargs["eps"] = 
tuning_options.eps elif method == "COBYLA": - kwargs['rhobeg'] = tuning_options.eps + kwargs["rhobeg"] = tuning_options.eps # not all methods support 'disp' option - if method not in ['TNC']: - kwargs['disp'] = tuning_options.verbose + if method not in ["TNC"]: + kwargs["disp"] = tuning_options.verbose return kwargs @@ -326,25 +328,33 @@ def scale_from_params(params, tune_params, eps): """Helper func to do the inverse of the 'unscale' function.""" x = np.zeros(len(params)) for i, v in enumerate(tune_params.values()): - x[i] = 0.5 * eps + v.index(params[i])*eps + x[i] = 0.5 * eps + v.index(params[i]) * eps return x + def check_num_devices(ensemble_size: int, simulation_mode: bool, runner): - num_devices = get_num_devices(simulation_mode=simulation_mode) if num_devices < ensemble_size: - warnings.warn("Number of devices is less than the number of strategies in the ensemble. Some strategies will wait until devices are available.", UserWarning) + warnings.warn( + "Number of devices is less than the number of strategies in the ensemble. Some strategies will wait until devices are available.", + UserWarning, + ) -def create_actor_on_device(kernel_source, kernel_options, device_options, iterations, observers, cache_manager, simulation_mode, id): + +def create_actor_on_device( + kernel_source, kernel_options, device_options, iterations, observers, cache_manager, simulation_mode, id +): # Check if Ray is initialized, raise an error if not if not ray.is_initialized(): - raise RuntimeError("Ray is not initialized. Initialize Ray before creating an actor (remember to include resources).") + raise RuntimeError( + "Ray is not initialized. Initialize Ray before creating an actor (remember to include resources)." 
+ ) if simulation_mode: resource_options = {"num_cpus": 1} else: resource_options = {"num_gpus": 1} - + observers_type_and_arguments = [] if observers is not None: # observers can't be pickled so we will re-initialize them in the actors @@ -354,19 +364,21 @@ def create_actor_on_device(kernel_source, kernel_options, device_options, iterat observers_type_and_arguments.append((observer.__class__, observer.init_arguments)) if isinstance(observer, RegisterObserver): observers_type_and_arguments.append((observer.__class__, [])) - + # Create the actor with the specified options and resources - return RemoteActor.options(**resource_options).remote(kernel_source, - kernel_options, - device_options, - iterations, - observers_type_and_arguments=observers_type_and_arguments, - cache_manager=cache_manager, - simulation_mode=simulation_mode, - id=id) + return RemoteActor.options(**resource_options).remote( + kernel_source, + kernel_options, + device_options, + iterations, + observers_type_and_arguments=observers_type_and_arguments, + cache_manager=cache_manager, + simulation_mode=simulation_mode, + id=id, + ) + def initialize_ray(): # Initialize Ray if not ray.is_initialized(): ray.init(include_dashboard=True, ignore_reinit_error=True) - diff --git a/kernel_tuner/strategies/diff_evo.py b/kernel_tuner/strategies/diff_evo.py index 5ad2b9474..7aa717b26 100644 --- a/kernel_tuner/strategies/diff_evo.py +++ b/kernel_tuner/strategies/diff_evo.py @@ -6,16 +6,27 @@ from kernel_tuner.strategies import common from kernel_tuner.strategies.common import CostFunc -supported_methods = ["best1bin", "best1exp", "rand1exp", "randtobest1exp", "best2exp", "rand2exp", "randtobest1bin", "best2bin", "rand2bin", "rand1bin"] - -_options = dict(method=(f"Creation method for new population, any of {supported_methods}", "best1bin"), - popsize=("Population size", 20), - maxiter=("Number of generations", 100)) +supported_methods = [ + "best1bin", + "best1exp", + "rand1exp", + "randtobest1exp", + "best2exp", 
+ "rand2exp", + "randtobest1bin", + "best2bin", + "rand2bin", + "rand1bin", +] + +_options = dict( + method=(f"Creation method for new population, any of {supported_methods}", "best1bin"), + popsize=("Population size", 20), + maxiter=("Number of generations", 100), +) def tune(searchspace: Searchspace, runner, tuning_options): - - method, popsize, maxiter = common.get_options(tuning_options.strategy_options, _options) # build a bounds array as needed for the optimizer @@ -28,8 +39,16 @@ def tune(searchspace: Searchspace, runner, tuning_options): # call the differential evolution optimizer opt_result = None try: - opt_result = differential_evolution(cost_func, bounds, maxiter=maxiter, popsize=popsize, init=population, - polish=False, strategy=method, disp=tuning_options.verbose) + opt_result = differential_evolution( + cost_func, + bounds, + maxiter=maxiter, + popsize=popsize, + init=population, + polish=False, + strategy=method, + disp=tuning_options.verbose, + ) except util.StopCriterionReached as e: if tuning_options.verbose: print(e) diff --git a/kernel_tuner/strategies/dual_annealing.py b/kernel_tuner/strategies/dual_annealing.py index 0f44bd849..bbb8ffa48 100644 --- a/kernel_tuner/strategies/dual_annealing.py +++ b/kernel_tuner/strategies/dual_annealing.py @@ -6,23 +6,22 @@ from kernel_tuner.strategies import common from kernel_tuner.strategies.common import CostFunc, setup_method_arguments, setup_method_options -supported_methods = ['COBYLA', 'L-BFGS-B', 'SLSQP', 'CG', 'Powell', 'Nelder-Mead', 'BFGS', 'trust-constr'] +supported_methods = ["COBYLA", "L-BFGS-B", "SLSQP", "CG", "Powell", "Nelder-Mead", "BFGS", "trust-constr"] _options = dict(method=(f"Local optimization method to use, choose any from {supported_methods}", "Powell")) -def tune(searchspace: Searchspace, runner, tuning_options): +def tune(searchspace: Searchspace, runner, tuning_options): method = common.get_options(tuning_options.strategy_options, _options)[0] - #scale variables in x to make 'eps' 
relevant for multiple variables + # scale variables in x to make 'eps' relevant for multiple variables cost_func = CostFunc(searchspace, tuning_options, runner, scaling=True) bounds, x0, _ = cost_func.get_bounds_x0_eps() kwargs = setup_method_arguments(method, bounds) options = setup_method_options(method, tuning_options) - kwargs['options'] = options - + kwargs["options"] = options minimizer_kwargs = {} minimizer_kwargs["method"] = method diff --git a/kernel_tuner/strategies/ensemble.py b/kernel_tuner/strategies/ensemble.py index 2dab125f4..53d3b5410 100644 --- a/kernel_tuner/strategies/ensemble.py +++ b/kernel_tuner/strategies/ensemble.py @@ -48,41 +48,54 @@ _options = dict( ensemble=("List of strategies to be used in the ensemble", ["random_sample", "random_sample"]), max_fevals=("Maximum number of function evaluations", None), - num_gpus=("Number of gpus to run the parallel ensemble on", None) + num_gpus=("Number of gpus to run the parallel ensemble on", None), ) + def tune(searchspace: Searchspace, runner, tuning_options, cache_manager=None, actors=None): clean_up = True if actors is None and cache_manager is None else False options = tuning_options.strategy_options simulation_mode = True if isinstance(runner, SimulationRunner) else False initialize_ray() - ensemble, max_fevals, num_gpus =common.get_options(tuning_options.strategy_options, _options) - num_devices = num_gpus if num_gpus is not None else get_num_devices(simulation_mode=simulation_mode) + ensemble, max_fevals, num_gpus = common.get_options(tuning_options.strategy_options, _options) + num_devices = num_gpus if num_gpus is not None else get_num_devices(simulation_mode=simulation_mode) ensemble_size = len(ensemble) # setup strategy options - if 'bayes_opt' in ensemble: # All strategies start from a random sample except for BO - tuning_options.strategy_options["samplingmethod"] = 'random' + if "bayes_opt" in ensemble: # All strategies start from a random sample except for BO + 
tuning_options.strategy_options["samplingmethod"] = "random" tuning_options.strategy_options["max_fevals"] = 100 * ensemble_size if max_fevals is None else max_fevals - tuning_options.strategy_options['check_and_retrieve'] = True + tuning_options.strategy_options["check_and_retrieve"] = True # define number of ray actors needed if num_devices < ensemble_size: - warnings.warn("Number of devices is less than the number of strategies in the ensemble. Some strategies will wait until devices are available.", UserWarning) + warnings.warn( + "Number of devices is less than the number of strategies in the ensemble. Some strategies will wait until devices are available.", + UserWarning, + ) num_actors = num_devices if ensemble_size > num_devices else ensemble_size ensemble = [strategy_map[strategy] for strategy in ensemble] - parallel_runner = ParallelRunner(runner.kernel_source, runner.kernel_options, runner.device_options, - runner.iterations, runner.observers, num_gpus=num_actors, cache_manager=cache_manager, - simulation_mode=simulation_mode, actors=actors) - + parallel_runner = ParallelRunner( + runner.kernel_source, + runner.kernel_options, + runner.device_options, + runner.iterations, + runner.observers, + num_gpus=num_actors, + cache_manager=cache_manager, + simulation_mode=simulation_mode, + actors=actors, + ) + final_results = parallel_runner.run(tuning_options=tuning_options, ensemble=ensemble, searchspace=searchspace) if clean_up: parallel_runner.clean_up_ray() - + return final_results -tune.__doc__ = common.get_strategy_docstring("Ensemble", _options) \ No newline at end of file + +tune.__doc__ = common.get_strategy_docstring("Ensemble", _options) diff --git a/kernel_tuner/strategies/firefly_algorithm.py b/kernel_tuner/strategies/firefly_algorithm.py index dc43aae6f..429a338fa 100644 --- a/kernel_tuner/strategies/firefly_algorithm.py +++ b/kernel_tuner/strategies/firefly_algorithm.py @@ -9,14 +9,16 @@ from kernel_tuner.strategies.common import CostFunc, 
scale_from_params from kernel_tuner.strategies.pso import Particle -_options = dict(popsize=("Population size", 20), - maxiter=("Maximum number of iterations", 100), - B0=("Maximum attractiveness", 1.0), - gamma=("Light absorption coefficient", 1.0), - alpha=("Randomization parameter", 0.2)) +_options = dict( + popsize=("Population size", 20), + maxiter=("Maximum number of iterations", 100), + B0=("Maximum attractiveness", 1.0), + gamma=("Light absorption coefficient", 1.0), + alpha=("Randomization parameter", 0.2), +) -def tune(searchspace: Searchspace, runner, tuning_options): +def tune(searchspace: Searchspace, runner, tuning_options): # scale variables in x because PSO works with velocities to visit different configurations cost_func = CostFunc(searchspace, tuning_options, runner, scaling=True) @@ -57,7 +59,6 @@ def tune(searchspace: Searchspace, runner, tuning_options): # compare all to all and compute attractiveness for i in range(num_particles): for j in range(num_particles): - if swarm[i].intensity < swarm[j].intensity: dist = swarm[i].distance_to(swarm[j]) beta = B0 * np.exp(-gamma * dist * dist) @@ -78,7 +79,7 @@ def tune(searchspace: Searchspace, runner, tuning_options): swarm.sort(key=lambda x: x.score) if tuning_options.verbose: - print('Final result:') + print("Final result:") print(best_position_global) print(best_score_global) @@ -87,6 +88,7 @@ def tune(searchspace: Searchspace, runner, tuning_options): tune.__doc__ = common.get_strategy_docstring("firefly algorithm", _options) + class Firefly(Particle): """Firefly object for use in the Firefly Algorithm.""" @@ -98,7 +100,7 @@ def __init__(self, bounds): def distance_to(self, other): """Return Euclidian distance between self and other Firefly.""" - return np.linalg.norm(self.position-other.position) + return np.linalg.norm(self.position - other.position) def compute_intensity(self, fun): """Evaluate cost function and compute intensity at this position.""" diff --git 
a/kernel_tuner/strategies/genetic_algorithm.py b/kernel_tuner/strategies/genetic_algorithm.py index 52361a744..913e449e9 100644 --- a/kernel_tuner/strategies/genetic_algorithm.py +++ b/kernel_tuner/strategies/genetic_algorithm.py @@ -17,7 +17,6 @@ def tune(searchspace: Searchspace, runner, tuning_options): - options = tuning_options.strategy_options pop_size, generations, method, mutation_chance = common.get_options(options, _options) crossover = supported_methods[method] @@ -28,7 +27,6 @@ def tune(searchspace: Searchspace, runner, tuning_options): population = list(list(p) for p in searchspace.get_random_sample(pop_size)) for generation in range(generations): - # determine fitness of population members weighted_population = [] for dna in population: @@ -46,7 +44,9 @@ def tune(searchspace: Searchspace, runner, tuning_options): # 'best_score' is used only for printing if tuning_options.verbose and cost_func.results: - best_score = util.get_best_config(cost_func.results, tuning_options.objective, tuning_options.objective_higher_is_better)[tuning_options.objective] + best_score = util.get_best_config( + cost_func.results, tuning_options.objective, tuning_options.objective_higher_is_better + )[tuning_options.objective] if tuning_options.verbose: print("Generation %d, best_score %f" % (generation, best_score)) @@ -176,4 +176,4 @@ def disruptive_uniform_crossover(dna1, dna2): "two_point": two_point_crossover, "uniform": uniform_crossover, "disruptive_uniform": disruptive_uniform_crossover, -} \ No newline at end of file +} diff --git a/kernel_tuner/strategies/greedy_ils.py b/kernel_tuner/strategies/greedy_ils.py index 26d15f591..b134dff47 100644 --- a/kernel_tuner/strategies/greedy_ils.py +++ b/kernel_tuner/strategies/greedy_ils.py @@ -6,13 +6,18 @@ from kernel_tuner.strategies.genetic_algorithm import mutate from kernel_tuner.strategies.hillclimbers import base_hillclimb -_options = dict(neighbor=("Method for selecting neighboring nodes, choose from Hamming or 
adjacent", "Hamming"), - restart=("controls greedyness, i.e. whether to restart from a position as soon as an improvement is found", True), - no_improvement=("number of evaluations to exceed without improvement before restarting", 50), - random_walk=("controls greedyness, i.e. whether to restart from a position as soon as an improvement is found", 0.3)) +_options = dict( + neighbor=("Method for selecting neighboring nodes, choose from Hamming or adjacent", "Hamming"), + restart=("controls greedyness, i.e. whether to restart from a position as soon as an improvement is found", True), + no_improvement=("number of evaluations to exceed without improvement before restarting", 50), + random_walk=( + "controls greedyness, i.e. whether to restart from a position as soon as an improvement is found", + 0.3, + ), +) -def tune(searchspace: Searchspace, runner, tuning_options): +def tune(searchspace: Searchspace, runner, tuning_options): dna_size = len(searchspace.tune_params.keys()) options = tuning_options.strategy_options @@ -30,15 +35,16 @@ def tune(searchspace: Searchspace, runner, tuning_options): fevals = 0 cost_func = CostFunc(searchspace, tuning_options, runner) - #while searching + # while searching candidate = searchspace.get_random_sample(1)[0] best_score = cost_func(candidate, check_restrictions=False) last_improvement = 0 while fevals < max_fevals: - try: - candidate = base_hillclimb(candidate, neighbor, max_fevals, searchspace, tuning_options, cost_func, restart=restart, randomize=True) + candidate = base_hillclimb( + candidate, neighbor, max_fevals, searchspace, tuning_options, cost_func, restart=restart, randomize=True + ) new_score = cost_func(candidate, check_restrictions=False) except util.StopCriterionReached as e: if tuning_options.verbose: @@ -58,9 +64,10 @@ def tune(searchspace: Searchspace, runner, tuning_options): tune.__doc__ = common.get_strategy_docstring("Greedy Iterative Local Search (ILS)", _options) + def random_walk(indiv, permutation_size, 
no_improve, last_improve, searchspace: Searchspace): if last_improve >= no_improve: return searchspace.get_random_sample(1)[0] for _ in range(permutation_size): indiv = mutate(indiv, 0, searchspace, cache=False) - return indiv \ No newline at end of file + return indiv diff --git a/kernel_tuner/strategies/greedy_mls.py b/kernel_tuner/strategies/greedy_mls.py index 1b34da501..cf90a7df9 100644 --- a/kernel_tuner/strategies/greedy_mls.py +++ b/kernel_tuner/strategies/greedy_mls.py @@ -4,13 +4,15 @@ from kernel_tuner.strategies import common from kernel_tuner.strategies.hillclimbers import base_hillclimb -_options = dict(neighbor=("Method for selecting neighboring nodes, choose from Hamming or adjacent", "Hamming"), - restart=("controls greedyness, i.e. whether to restart from a position as soon as an improvement is found", True), - order=("set a user-specified order to search among dimensions while hillclimbing", None), - randomize=("use a random order to search among dimensions while hillclimbing", True)) +_options = dict( + neighbor=("Method for selecting neighboring nodes, choose from Hamming or adjacent", "Hamming"), + restart=("controls greedyness, i.e. 
whether to restart from a position as soon as an improvement is found", True), + order=("set a user-specified order to search among dimensions while hillclimbing", None), + randomize=("use a random order to search among dimensions while hillclimbing", True), +) -def tune(searchspace: Searchspace, runner, tuning_options): +def tune(searchspace: Searchspace, runner, tuning_options): # retrieve options with defaults options = tuning_options.strategy_options neighbor, restart, order, randomize = common.get_options(options, _options) @@ -24,12 +26,22 @@ def tune(searchspace: Searchspace, runner, tuning_options): fevals = 0 - #while searching + # while searching while fevals < max_fevals: candidate = searchspace.get_random_sample(1)[0] try: - base_hillclimb(candidate, neighbor, max_fevals, searchspace, tuning_options, cost_func, restart=restart, randomize=randomize, order=order) + base_hillclimb( + candidate, + neighbor, + max_fevals, + searchspace, + tuning_options, + cost_func, + restart=restart, + randomize=randomize, + order=order, + ) except util.StopCriterionReached as e: if tuning_options.verbose: print(e) diff --git a/kernel_tuner/strategies/hillclimbers.py b/kernel_tuner/strategies/hillclimbers.py index b64e7d733..602d27d0e 100644 --- a/kernel_tuner/strategies/hillclimbers.py +++ b/kernel_tuner/strategies/hillclimbers.py @@ -5,9 +5,18 @@ from kernel_tuner.strategies.common import CostFunc -def base_hillclimb(base_sol: tuple, neighbor_method: str, max_fevals: int, searchspace: Searchspace, tuning_options, - cost_func: CostFunc, restart=True, randomize=True, order=None): - """ Hillclimbing search until max_fevals is reached or no improvement is found +def base_hillclimb( + base_sol: tuple, + neighbor_method: str, + max_fevals: int, + searchspace: Searchspace, + tuning_options, + cost_func: CostFunc, + restart=True, + randomize=True, + order=None, +): + """Hillclimbing search until max_fevals is reached or no improvement is found Base hillclimber that evaluates 
neighbouring solutions in a random or fixed order and possibly immediately moves to the neighbour if it is an improvement. diff --git a/kernel_tuner/strategies/minimize.py b/kernel_tuner/strategies/minimize.py index 80c1c6f82..58a93e0b1 100644 --- a/kernel_tuner/strategies/minimize.py +++ b/kernel_tuner/strategies/minimize.py @@ -16,8 +16,8 @@ _options = dict(method=(f"Local optimization algorithm to use, choose any from {supported_methods}", "L-BFGS-B")) -def tune(searchspace: Searchspace, runner, tuning_options): +def tune(searchspace: Searchspace, runner, tuning_options): method = get_options(tuning_options.strategy_options, _options)[0] # scale variables in x to make 'eps' relevant for multiple variables diff --git a/kernel_tuner/strategies/mls.py b/kernel_tuner/strategies/mls.py index b8ecf030c..7f0601378 100644 --- a/kernel_tuner/strategies/mls.py +++ b/kernel_tuner/strategies/mls.py @@ -3,13 +3,15 @@ from kernel_tuner.strategies import common from kernel_tuner.strategies.greedy_mls import tune as mls_tune -_options = dict(neighbor=("Method for selecting neighboring nodes, choose from Hamming or adjacent", "Hamming"), - restart=("controls greedyness, i.e. whether to restart from a position as soon as an improvement is found", False), - order=("set a user-specified order to search among dimensions while hillclimbing", None), - randomize=("use a random order to search among dimensions while hillclimbing", True)) +_options = dict( + neighbor=("Method for selecting neighboring nodes, choose from Hamming or adjacent", "Hamming"), + restart=("controls greedyness, i.e. 
whether to restart from a position as soon as an improvement is found", False), + order=("set a user-specified order to search among dimensions while hillclimbing", None), + randomize=("use a random order to search among dimensions while hillclimbing", True), +) -def tune(searchspace: Searchspace, runner, tuning_options): +def tune(searchspace: Searchspace, runner, tuning_options): # Default MLS uses 'best improvement' hillclimbing, so greedy hillclimbing is disabled with restart defaulting to False _, restart, _, _ = common.get_options(tuning_options.strategy_options, _options) diff --git a/kernel_tuner/strategies/ordered_greedy_mls.py b/kernel_tuner/strategies/ordered_greedy_mls.py index cd40ba778..f72257020 100644 --- a/kernel_tuner/strategies/ordered_greedy_mls.py +++ b/kernel_tuner/strategies/ordered_greedy_mls.py @@ -3,13 +3,15 @@ from kernel_tuner.strategies import common from kernel_tuner.strategies.greedy_mls import tune as mls_tune -_options = dict(neighbor=("Method for selecting neighboring nodes, choose from Hamming or adjacent", "Hamming"), - restart=("controls greedyness, i.e. whether to restart from a position as soon as an improvement is found", True), - order=("set a user-specified order to search among dimensions while hillclimbing", None), - randomize=("use a random order to search among dimensions while hillclimbing", False)) +_options = dict( + neighbor=("Method for selecting neighboring nodes, choose from Hamming or adjacent", "Hamming"), + restart=("controls greedyness, i.e. 
whether to restart from a position as soon as an improvement is found", True), + order=("set a user-specified order to search among dimensions while hillclimbing", None), + randomize=("use a random order to search among dimensions while hillclimbing", False), +) -def tune(searchspace: Searchspace, runner, tuning_options): +def tune(searchspace: Searchspace, runner, tuning_options): _, restart, _, randomize = common.get_options(tuning_options.strategy_options, _options) # Delegate to Greedy MLS, but make sure our defaults are used if not overwritten by the user diff --git a/kernel_tuner/strategies/pso.py b/kernel_tuner/strategies/pso.py index 5b0df1429..ec92c1094 100644 --- a/kernel_tuner/strategies/pso.py +++ b/kernel_tuner/strategies/pso.py @@ -9,21 +9,22 @@ from kernel_tuner.strategies import common from kernel_tuner.strategies.common import CostFunc, scale_from_params -_options = dict(popsize=("Population size", 20), - maxiter=("Maximum number of iterations", 100), - w=("Inertia weight constant", 0.5), - c1=("Cognitive constant", 2.0), - c2=("Social constant", 1.0)) +_options = dict( + popsize=("Population size", 20), + maxiter=("Maximum number of iterations", 100), + w=("Inertia weight constant", 0.5), + c1=("Cognitive constant", 2.0), + c2=("Social constant", 1.0), +) -def tune(searchspace: Searchspace, runner, tuning_options): - #scale variables in x because PSO works with velocities to visit different configurations +def tune(searchspace: Searchspace, runner, tuning_options): + # scale variables in x because PSO works with velocities to visit different configurations cost_func = CostFunc(searchspace, tuning_options, runner, scaling=True) - #using this instead of get_bounds because scaling is used + # using this instead of get_bounds because scaling is used bounds, _, eps = cost_func.get_bounds_x0_eps() - num_particles, maxiter, w, c1, c2 = common.get_options(tuning_options.strategy_options, _options) best_score_global = sys.float_info.max @@ -64,7 +65,7 @@ 
def tune(searchspace: Searchspace, runner, tuning_options): swarm[j].update_position(bounds) if tuning_options.verbose: - print('Final result:') + print("Final result:") print(best_position_global) print(best_score_global) @@ -73,6 +74,7 @@ def tune(searchspace: Searchspace, runner, tuning_options): tune.__doc__ = common.get_strategy_docstring("Particle Swarm Optimization (PSO)", _options) + class Particle: def __init__(self, bounds): self.ndim = len(bounds) diff --git a/kernel_tuner/strategies/simulated_annealing.py b/kernel_tuner/strategies/simulated_annealing.py index dce929b7b..8a4e43348 100644 --- a/kernel_tuner/strategies/simulated_annealing.py +++ b/kernel_tuner/strategies/simulated_annealing.py @@ -9,10 +9,13 @@ from kernel_tuner.strategies import common from kernel_tuner.strategies.common import CostFunc -_options = dict(T=("Starting temperature", 1.0), - T_min=("End temperature", 0.001), - alpha=("Alpha parameter", 0.995), - maxiter=("Number of iterations within each annealing step", 1)) +_options = dict( + T=("Starting temperature", 1.0), + T_min=("End temperature", 0.001), + alpha=("Alpha parameter", 0.995), + maxiter=("Number of iterations within each annealing step", 1), +) + def tune(searchspace: Searchspace, runner, tuning_options): # SA works with real parameter values and does not need scaling @@ -23,7 +26,7 @@ def tune(searchspace: Searchspace, runner, tuning_options): T_start = T # compute how many iterations would be needed to complete the annealing schedule - max_iter = int(np.ceil(np.log(T_min)/np.log(alpha))) + max_iter = int(np.ceil(np.log(T_min) / np.log(alpha))) # if user supplied max_fevals that is lower then max_iter we will # scale the annealing schedule to fit max_fevals @@ -45,7 +48,6 @@ def tune(searchspace: Searchspace, runner, tuning_options): iteration += 1 for _ in range(niter): - new_pos = neighbor(pos, searchspace) try: new_cost = cost_func(new_pos, check_restrictions=False) @@ -59,12 +61,12 @@ def tune(searchspace: 
Searchspace, runner, tuning_options): if ap > r: if tuning_options.verbose: - print("new position accepted", new_pos, new_cost, 'old:', pos, old_cost, 'ap', ap, 'r', r, 'T', T) + print("new position accepted", new_pos, new_cost, "old:", pos, old_cost, "ap", ap, "r", r, "T", T) pos = new_pos old_cost = new_cost c = len(tuning_options.unique_results) - T = T_start * alpha**(max_iter/max_feval*c) + T = T_start * alpha ** (max_iter / max_feval * c) # check if solver gets stuck and if so restart from random position if c == c_old: @@ -77,7 +79,7 @@ def tune(searchspace: Searchspace, runner, tuning_options): stuck = 0 # safeguard - if iteration > 10*max_iter: + if iteration > 10 * max_iter: break return cost_func.results @@ -85,6 +87,7 @@ def tune(searchspace: Searchspace, runner, tuning_options): tune.__doc__ = common.get_strategy_docstring("Simulated Annealing", _options) + def acceptance_prob(old_cost, new_cost, T, tuning_options): """Annealing equation, with modifications to work towards a lower value.""" error_val = sys.float_info.max if not tuning_options.objective_higher_is_better else -sys.float_info.max @@ -99,14 +102,18 @@ def acceptance_prob(old_cost, new_cost, T, tuning_options): return 1.0 # maybe move if old cost is better than new cost depending on T and random value if tuning_options.objective_higher_is_better: - return np.exp(((new_cost-old_cost)/new_cost)/T) - return np.exp(((old_cost-new_cost)/old_cost)/T) + return np.exp(((new_cost - old_cost) / new_cost) / T) + return np.exp(((old_cost - new_cost) / old_cost) / T) def neighbor(pos, searchspace: Searchspace): """Return a random neighbor of pos.""" # Note: this is not the same as the previous implementation, because it is possible that non-edge parameters remain the same, but suggested configurations will all be within restrictions - neighbors = searchspace.get_neighbors(tuple(pos), neighbor_method='Hamming') if random.random() < 0.2 else searchspace.get_neighbors(tuple(pos), 
neighbor_method='strictly-adjacent') + neighbors = ( + searchspace.get_neighbors(tuple(pos), neighbor_method="Hamming") + if random.random() < 0.2 + else searchspace.get_neighbors(tuple(pos), neighbor_method="strictly-adjacent") + ) if len(neighbors) > 0: return list(random.choice(neighbors)) # if there are no neighbors, return a random configuration diff --git a/kernel_tuner/util.py b/kernel_tuner/util.py index 6be5a270b..f2faf0469 100644 --- a/kernel_tuner/util.py +++ b/kernel_tuner/util.py @@ -93,6 +93,7 @@ class SkippableFailure(Exception): class StopCriterionReached(Exception): """Exception thrown when a stop criterion has been reached.""" + class GPUTypeMismatchError(Exception): """Exception thrown when GPU types are not the same in parallel execution""" @@ -908,7 +909,6 @@ def is_or_evals_to_number(s: str) -> Optional[Union[int, float]]: except Exception: # it's not a solvable subexpression, return None return None - # either the left or right side of the equation must evaluate to a constant number left_num = is_or_evals_to_number(left) @@ -1295,6 +1295,7 @@ def cuda_error_check(error): _, desc = nvrtc.nvrtcGetErrorString(error) raise RuntimeError(f"NVRTC error: {desc.decode()}") + def get_num_devices(simulation_mode=False): resources = ray.cluster_resources() if simulation_mode: @@ -1303,6 +1304,7 @@ def get_num_devices(simulation_mode=False): num_devices = resources.get("GPU") return int(num_devices) + def get_gpu_id(lang): if lang == "CUDA" or lang == "CUPY" or lang == "NVCUDA": gpu_id = os.environ.get("CUDA_VISIBLE_DEVICES") or os.environ.get("NVIDIA_VISIBLE_DEVICES") or "No GPU assigned" @@ -1310,10 +1312,15 @@ def get_gpu_id(lang): raise NotImplementedError("TODO: implement other languages") return int(gpu_id) + def get_gpu_type(lang): gpu_id = get_gpu_id(lang) if lang == "CUDA" or lang == "CUPY" or lang == "NVCUDA": - result = subprocess.run(['nvidia-smi', '--query-gpu=gpu_name', '--format=csv,noheader', '-i', str(gpu_id)], capture_output=True, 
text=True) + result = subprocess.run( + ["nvidia-smi", "--query-gpu=gpu_name", "--format=csv,noheader", "-i", str(gpu_id)], + capture_output=True, + text=True, + ) return result.stdout.strip() else: - raise NotImplementedError("TODO: implement other languages") \ No newline at end of file + raise NotImplementedError("TODO: implement other languages") From d560a30cf865d054f07b7e683d98e6db8c35f200 Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Wed, 26 Mar 2025 16:31:09 +0100 Subject: [PATCH 104/106] Fix a bunch of SonarCloud warnings. --- kernel_tuner/core.py | 3 ++- kernel_tuner/interface.py | 6 +++++- kernel_tuner/runners/parallel.py | 8 +++++--- kernel_tuner/runners/ray/cache_manager.py | 2 ++ kernel_tuner/runners/ray/remote_actor.py | 2 ++ kernel_tuner/strategies/common.py | 12 +++++++----- kernel_tuner/strategies/ensemble.py | 4 ++-- 7 files changed, 25 insertions(+), 12 deletions(-) diff --git a/kernel_tuner/core.py b/kernel_tuner/core.py index 714f81b83..e4579b60e 100644 --- a/kernel_tuner/core.py +++ b/kernel_tuner/core.py @@ -930,7 +930,8 @@ def wrap_templated_kernel(kernel_string, kernel_name): template_parameters = match.group(1).split(",") argument_list = match.group(3).split(",") - argument_list = [s.strip() for s in argument_list] # remove extra whitespace around 'type name' strings + # remove extra whitespace around 'type name' strings + argument_list = [s.strip() for s in argument_list] type_list, name_list = split_argument_list(argument_list) diff --git a/kernel_tuner/interface.py b/kernel_tuner/interface.py index 2cf27403a..dc7d452a8 100644 --- a/kernel_tuner/interface.py +++ b/kernel_tuner/interface.py @@ -658,7 +658,11 @@ def tune_kernel( strategy = brute_force # select the runner for this job based on input - selected_runner = SimulationRunner if simulation_mode else (ParallelRunner if parallel_mode else SequentialRunner) + selected_runner = SequentialRunner + if simulation_mode: + selected_runner = SimulationRunner + elif parallel_mode: + 
selected_runner = ParallelRunner tuning_options.simulated_time = 0 if parallel_mode: num_gpus = tuning_options["num_gpus"] if "num_gpus" in tuning_options else None diff --git a/kernel_tuner/runners/parallel.py b/kernel_tuner/runners/parallel.py index 0b563a546..fcd5e3634 100644 --- a/kernel_tuner/runners/parallel.py +++ b/kernel_tuner/runners/parallel.py @@ -1,3 +1,4 @@ +"""Parallel runner""" import ray import sys from ray.util.actor_pool import ActorPool @@ -110,7 +111,8 @@ def run(self, parameter_space=None, tuning_options=None, ensemble=None, searchsp """ if ( tuning_options is None - ): # HACK as tuning_options can't be the first argument and parameter_space needs to be a default argument + ): + # HACK as tuning_options can't be the first argument and parameter_space needs to be a default argument raise ValueError("tuning_options cannot be None") # Create RemoteActor instances @@ -124,14 +126,14 @@ def run(self, parameter_space=None, tuning_options=None, ensemble=None, searchsp ] self.actors = [ create_actor_on_device( - *runner_attributes, id=_id, cache_manager=self.cache_manager, simulation_mode=self.simulation_mode + *runner_attributes, identifier=_id, cache_manager=self.cache_manager, simulation_mode=self.simulation_mode ) for _id in range(self.num_gpus) ] # Check if all GPUs are of the same type if not self.simulation_mode and not self._check_gpus_equals(): - raise GPUTypeMismatchError(f"Different GPU types found") + raise GPUTypeMismatchError("Different GPU types found") if self.cache_manager is None: if cache_manager is None: diff --git a/kernel_tuner/runners/ray/cache_manager.py b/kernel_tuner/runners/ray/cache_manager.py index 7e1370754..c0f20abe0 100644 --- a/kernel_tuner/runners/ray/cache_manager.py +++ b/kernel_tuner/runners/ray/cache_manager.py @@ -1,3 +1,4 @@ +"""Ray caching module.""" import ray from kernel_tuner.util import store_cache @@ -5,6 +6,7 @@ @ray.remote(num_cpus=1) class CacheManager: + """Manage the cache used by the parallel 
actors.""" def __init__(self, cache, cachefile): from kernel_tuner.interface import Options # importing here due to circular import diff --git a/kernel_tuner/runners/ray/remote_actor.py b/kernel_tuner/runners/ray/remote_actor.py index bd732ebab..0792d63f3 100644 --- a/kernel_tuner/runners/ray/remote_actor.py +++ b/kernel_tuner/runners/ray/remote_actor.py @@ -1,3 +1,4 @@ +"""Ray actors module.""" import ray from kernel_tuner.runners.sequential import SequentialRunner @@ -9,6 +10,7 @@ @ray.remote class RemoteActor: + """Remote actor for the parallel tuner.""" def __init__( self, kernel_source, diff --git a/kernel_tuner/strategies/common.py b/kernel_tuner/strategies/common.py index bc9834063..94f73c958 100644 --- a/kernel_tuner/strategies/common.py +++ b/kernel_tuner/strategies/common.py @@ -102,7 +102,8 @@ def __call__(self, x, check_restrictions=True): return return_values[0] return return_values - def _is_single_configuration(self, x): + @staticmethod + def _is_single_configuration(x): """ Determines if the input is a single configuration based on its type and composition. 
@@ -117,7 +118,8 @@ def _is_single_configuration(self, x): if isinstance(x, (int, float)): return True if isinstance(x, np.ndarray): - return x.dtype.kind in "if" # Checks for data type being integer ('i') or float ('f') + # Checks for data type being integer ('i') or float ('f') + return x.dtype.kind in "if" if isinstance(x, (list, tuple)): return all(isinstance(item, (int, float)) for item in x) return False @@ -332,7 +334,7 @@ def scale_from_params(params, tune_params, eps): return x -def check_num_devices(ensemble_size: int, simulation_mode: bool, runner): +def check_num_devices(ensemble_size: int, simulation_mode: bool): num_devices = get_num_devices(simulation_mode=simulation_mode) if num_devices < ensemble_size: warnings.warn( @@ -342,7 +344,7 @@ def check_num_devices(ensemble_size: int, simulation_mode: bool, runner): def create_actor_on_device( - kernel_source, kernel_options, device_options, iterations, observers, cache_manager, simulation_mode, id + kernel_source, kernel_options, device_options, iterations, observers, cache_manager, simulation_mode, identifier ): # Check if Ray is initialized, raise an error if not if not ray.is_initialized(): @@ -374,7 +376,7 @@ def create_actor_on_device( observers_type_and_arguments=observers_type_and_arguments, cache_manager=cache_manager, simulation_mode=simulation_mode, - id=id, + identifier=identifier, ) diff --git a/kernel_tuner/strategies/ensemble.py b/kernel_tuner/strategies/ensemble.py index 53d3b5410..d3160abd5 100644 --- a/kernel_tuner/strategies/ensemble.py +++ b/kernel_tuner/strategies/ensemble.py @@ -54,7 +54,6 @@ def tune(searchspace: Searchspace, runner, tuning_options, cache_manager=None, actors=None): clean_up = True if actors is None and cache_manager is None else False - options = tuning_options.strategy_options simulation_mode = True if isinstance(runner, SimulationRunner) else False initialize_ray() @@ -63,7 +62,8 @@ def tune(searchspace: Searchspace, runner, tuning_options, cache_manager=None, a 
ensemble_size = len(ensemble) # setup strategy options - if "bayes_opt" in ensemble: # All strategies start from a random sample except for BO + if "bayes_opt" in ensemble: + # All strategies start from a random sample except for BO tuning_options.strategy_options["samplingmethod"] = "random" tuning_options.strategy_options["max_fevals"] = 100 * ensemble_size if max_fevals is None else max_fevals tuning_options.strategy_options["check_and_retrieve"] = True From f1f872e608de5c650aacab849645bfb6e91f5a8e Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Wed, 26 Mar 2025 16:42:38 +0100 Subject: [PATCH 105/106] Fix some tests failing. --- kernel_tuner/runners/ray/remote_actor.py | 8 ++++---- kernel_tuner/strategies/bayes_opt.py | 8 ++++---- test/strategies/test_bayesian_optimization.py | 2 +- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/kernel_tuner/runners/ray/remote_actor.py b/kernel_tuner/runners/ray/remote_actor.py index 0792d63f3..ed1154ebd 100644 --- a/kernel_tuner/runners/ray/remote_actor.py +++ b/kernel_tuner/runners/ray/remote_actor.py @@ -18,7 +18,7 @@ def __init__( device_options, iterations, observers_type_and_arguments, - id, + identifier, cache_manager=None, simulation_mode=False, ): @@ -29,7 +29,7 @@ def __init__( self.cache_manager = cache_manager self.simulation_mode = simulation_mode self.runner = None - self.id = None + self.identifier = None self._reinitialize_observers(observers_type_and_arguments) self.dev = ( DeviceInterface(kernel_source, iterations=iterations, observers=self.observers, **device_options) @@ -87,8 +87,8 @@ def _reinitialize_observers(self, observers_type_and_arguments): self.observers = [] for observer, arguments in observers_type_and_arguments: if "device" in arguments: - self.id = get_gpu_id(self.kernel_source.lang) if self.id is None else self.id - arguments["device"] = self.id + self.identifier = get_gpu_id(self.kernel_source.lang) if self.identifier is None else self.identifier + arguments["device"] = 
self.identifier if isinstance(observer, RegisterObserver): self.observers.append(RegisterObserver()) else: diff --git a/kernel_tuner/strategies/bayes_opt.py b/kernel_tuner/strategies/bayes_opt.py index bd20e29a9..d4dac0524 100644 --- a/kernel_tuner/strategies/bayes_opt.py +++ b/kernel_tuner/strategies/bayes_opt.py @@ -235,10 +235,10 @@ def get_hyperparam(name: str, default, supported_values=list()): self.invalid_value = 1e20 self.opt_direction = opt_direction if opt_direction == "min": - self.worst_value = np.PINF + self.worst_value = np.inf self.argopt = np.argmin elif opt_direction == "max": - self.worst_value = np.NINF + self.worst_value = -np.inf self.argopt = np.argmax else: raise ValueError("Invalid optimization direction '{}'".format(opt_direction)) @@ -262,7 +262,7 @@ def get_hyperparam(name: str, default, supported_values=list()): self.__visited_num = 0 self.__visited_valid_num = 0 self.__visited_searchspace_indices = [False] * self.searchspace_size - self.__observations = [np.NaN] * self.searchspace_size + self.__observations = [np.nan] * self.searchspace_size self.__valid_observation_indices = [False] * self.searchspace_size self.__valid_params = list() self.__valid_observations = list() @@ -311,7 +311,7 @@ def is_not_visited(self, index: int) -> bool: def is_valid(self, observation: float) -> bool: """Returns whether an observation is valid.""" - return not (observation is None or observation == self.invalid_value or observation == np.NaN) + return not (observation is None or observation == self.invalid_value or observation == np.nan) def get_af_by_name(self, name: str): """Get the basic acquisition functions by their name.""" diff --git a/test/strategies/test_bayesian_optimization.py b/test/strategies/test_bayesian_optimization.py index dd206a37b..8d929054a 100644 --- a/test/strategies/test_bayesian_optimization.py +++ b/test/strategies/test_bayesian_optimization.py @@ -74,7 +74,7 @@ def test_bo_initialization(): assert BO.searchspace == 
pruned_parameter_space assert BO.unvisited_cache == pruned_parameter_space assert len(BO.observations) == len(pruned_parameter_space) - assert BO.current_optimum == np.PINF + assert BO.current_optimum == np.inf def test_bo_initial_sample_lhs(): sample = BO.draw_latin_hypercube_samples(num_samples=1) From de27d90a351f22abcfc99f4b70c2d8d3761b55e1 Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Wed, 26 Mar 2025 16:54:15 +0100 Subject: [PATCH 106/106] Fix SonarQube warning and format. --- kernel_tuner/integration.py | 2 +- kernel_tuner/runners/parallel.py | 9 +- kernel_tuner/runners/ray/cache_manager.py | 6 +- kernel_tuner/runners/ray/remote_actor.py | 8 +- kernel_tuner/runners/sequential.py | 9 +- kernel_tuner/runners/simulation.py | 9 +- test/context.py | 25 +-- test/strategies/test_bayesian_optimization.py | 35 ++-- test/strategies/test_common.py | 37 +++-- test/strategies/test_strategies.py | 36 ++-- test/test_accuracy.py | 4 +- test/test_common.py | 37 ++--- test/test_compiler_functions.py | 16 +- test/test_core.py | 65 +++++--- test/test_cuda_functions.py | 2 - test/test_cupy_functions.py | 2 - test/test_energy.py | 8 +- test/test_ensemble_tuning.py | 9 +- test/test_file_utils.py | 11 +- test/test_hip_functions.py | 22 ++- test/test_hyper.py | 2 - test/test_integration.py | 105 +++++++----- test/test_kernelbuilder.py | 18 +- test/test_nvml_mocked.py | 34 ++-- test/test_observers.py | 16 +- test/test_opencl_functions.py | 6 +- test/test_parallel_tuning.py | 4 +- test/test_pycuda_functions.py | 4 - test/test_pycuda_mocked.py | 63 +++---- test/test_runners.py | 157 +++++++----------- test/test_searchspace.py | 118 ++++++------- test/test_util_functions.py | 76 +++------ 32 files changed, 480 insertions(+), 475 deletions(-) diff --git a/kernel_tuner/integration.py b/kernel_tuner/integration.py index 01f072694..4f92d9582 100644 --- a/kernel_tuner/integration.py +++ b/kernel_tuner/integration.py @@ -56,7 +56,7 @@ def get_objective_defaults(objective, 
objective_higher_is_better): } -class TuneResults: +class TuneResults(object): """Object to represent the tuning results stored to file""" def __init__(self, results_filename): diff --git a/kernel_tuner/runners/parallel.py b/kernel_tuner/runners/parallel.py index fcd5e3634..f5266dde1 100644 --- a/kernel_tuner/runners/parallel.py +++ b/kernel_tuner/runners/parallel.py @@ -109,9 +109,7 @@ def run(self, parameter_space=None, tuning_options=None, ensemble=None, searchsp :returns: Results of the tuning process. :rtype: list of dict """ - if ( - tuning_options is None - ): + if tuning_options is None: # HACK as tuning_options can't be the first argument and parameter_space needs to be a default argument raise ValueError("tuning_options cannot be None") @@ -126,7 +124,10 @@ def run(self, parameter_space=None, tuning_options=None, ensemble=None, searchsp ] self.actors = [ create_actor_on_device( - *runner_attributes, identifier=_id, cache_manager=self.cache_manager, simulation_mode=self.simulation_mode + *runner_attributes, + identifier=_id, + cache_manager=self.cache_manager, + simulation_mode=self.simulation_mode, ) for _id in range(self.num_gpus) ] diff --git a/kernel_tuner/runners/ray/cache_manager.py b/kernel_tuner/runners/ray/cache_manager.py index c0f20abe0..9e19fabec 100644 --- a/kernel_tuner/runners/ray/cache_manager.py +++ b/kernel_tuner/runners/ray/cache_manager.py @@ -5,10 +5,12 @@ @ray.remote(num_cpus=1) -class CacheManager: +class CacheManager(object): """Manage the cache used by the parallel actors.""" + def __init__(self, cache, cachefile): - from kernel_tuner.interface import Options # importing here due to circular import + # importing here due to circular import + from kernel_tuner.interface import Options self.tuning_options = Options({"cache": cache, "cachefile": cachefile}) diff --git a/kernel_tuner/runners/ray/remote_actor.py b/kernel_tuner/runners/ray/remote_actor.py index ed1154ebd..0d6fe3c39 100644 --- a/kernel_tuner/runners/ray/remote_actor.py 
+++ b/kernel_tuner/runners/ray/remote_actor.py @@ -9,8 +9,9 @@ @ray.remote -class RemoteActor: +class RemoteActor(object): """Remote actor for the parallel tuner.""" + def __init__( self, kernel_source, @@ -61,7 +62,7 @@ def set_cache_manager(self, cache_manager): if self.cache_manager is None: self.cache_manager = cache_manager - def get_cache_magaer(self): + def get_cache_manager(self): return self.cache_manager def init_runner(self): @@ -94,5 +95,6 @@ def _reinitialize_observers(self, observers_type_and_arguments): else: self.observers.append(observer(**arguments)) - def get_gpu_type(self, lang): + @staticmethod + def get_gpu_type(lang): return get_gpu_type(lang) diff --git a/kernel_tuner/runners/sequential.py b/kernel_tuner/runners/sequential.py index 21369eaba..9660623dd 100644 --- a/kernel_tuner/runners/sequential.py +++ b/kernel_tuner/runners/sequential.py @@ -53,9 +53,12 @@ def __init__( self.last_strategy_start_time = self.start_time self.last_strategy_time = 0 self.kernel_options = kernel_options - self.device_options = device_options # needed for the ensemble strategy down the line - self.iterations = iterations # needed for the ensemble strategy down the line - self.observers = observers # needed for the ensemble strategy down the line + # needed for the ensemble strategy down the line + self.device_options = device_options + # needed for the ensemble strategy down the line + self.iterations = iterations + # needed for the ensemble strategy down the line + self.observers = observers self.cache_manager = cache_manager # move data to the GPU diff --git a/kernel_tuner/runners/simulation.py b/kernel_tuner/runners/simulation.py index 1cf489834..edf9a40e0 100644 --- a/kernel_tuner/runners/simulation.py +++ b/kernel_tuner/runners/simulation.py @@ -58,9 +58,12 @@ def __init__(self, kernel_source, kernel_options, device_options, iterations, ob self.last_strategy_time = 0 self.units = {} - self.device_options = device_options # needed for the ensemble strategy 
down the line - self.iterations = iterations # needed for the ensemble strategy down the line - self.observers = observers # needed for the ensemble strategy down the line + # needed for the ensemble strategy down the line + self.device_options = device_options + # needed for the ensemble strategy down the line + self.iterations = iterations + # needed for the ensemble strategy down the line + self.observers = observers def get_environment(self, tuning_options): env = self.dev.get_environment() diff --git a/test/context.py b/test/context.py index d1cbcf3c3..f932ad670 100644 --- a/test/context.py +++ b/test/context.py @@ -39,9 +39,7 @@ try: import cupy - cupy.cuda.Device( - 0 - ).attributes # triggers exception if there are no CUDA-capable devices + cupy.cuda.Device(0).attributes # triggers exception if there are no CUDA-capable devices cupy_present = True except Exception: cupy_present = False @@ -55,27 +53,18 @@ try: from hip import hip + hip_present = True except ImportError: hip_present = False -skip_if_no_pycuda = pytest.mark.skipif( - not pycuda_present, reason="PyCuda not installed or no CUDA device detected" -) +skip_if_no_pycuda = pytest.mark.skipif(not pycuda_present, reason="PyCuda not installed or no CUDA device detected") skip_if_no_pynvml = pytest.mark.skipif(not pynvml_present, reason="NVML not installed") -skip_if_no_cupy = pytest.mark.skipif( - not cupy_present, reason="CuPy not installed or no CUDA device detected" -) -skip_if_no_cuda = pytest.mark.skipif( - not cuda_present, reason="NVIDIA CUDA not installed" -) -skip_if_no_opencl = pytest.mark.skipif( - not opencl_present, reason="PyOpenCL not installed or no OpenCL device detected" -) +skip_if_no_cupy = pytest.mark.skipif(not cupy_present, reason="CuPy not installed or no CUDA device detected") +skip_if_no_cuda = pytest.mark.skipif(not cuda_present, reason="NVIDIA CUDA not installed") +skip_if_no_opencl = pytest.mark.skipif(not opencl_present, reason="PyOpenCL not installed or no OpenCL device 
detected") skip_if_no_gcc = pytest.mark.skipif(not gcc_present, reason="No gcc on PATH") -skip_if_no_gfortran = pytest.mark.skipif( - not gfortran_present, reason="No gfortran on PATH" -) +skip_if_no_gfortran = pytest.mark.skipif(not gfortran_present, reason="No gfortran on PATH") skip_if_no_openmp = pytest.mark.skipif(not openmp_present, reason="No OpenMP found") skip_if_no_openacc = pytest.mark.skipif(not openacc_present, reason="No nvc++ on PATH") skip_if_no_hip = pytest.mark.skipif(not hip_present, reason="No HIP Python found") diff --git a/test/strategies/test_bayesian_optimization.py b/test/strategies/test_bayesian_optimization.py index 8d929054a..1aaa853d1 100644 --- a/test/strategies/test_bayesian_optimization.py +++ b/test/strategies/test_bayesian_optimization.py @@ -23,10 +23,10 @@ max_threads = 1024 searchspace = Searchspace(tune_params, [], max_threads) -dev_dict = {'max_threads': max_threads} -dev = namedtuple('Struct', dev_dict.keys())(*dev_dict.values()) -runner_dict = {'dev': dev} -runner = namedtuple('Struct', runner_dict.keys())(*runner_dict.values()) +dev_dict = {"max_threads": max_threads} +dev = namedtuple("Struct", dev_dict.keys())(*dev_dict.values()) +runner_dict = {"dev": dev} +runner = namedtuple("Struct", runner_dict.keys())(*runner_dict.values()) cost_func = CostFunc(searchspace, tuning_options, runner) # initialize required data @@ -34,10 +34,19 @@ _, _, eps = cost_func.get_bounds_x0_eps() original_to_normalized, normalized_to_original = bayes_opt.generate_normalized_param_dicts(tune_params, eps) normalized_parameter_space = bayes_opt.normalize_parameter_space(parameter_space, tune_params, original_to_normalized) -pruned_parameter_space, removed_tune_params = bayes_opt.prune_parameter_space(normalized_parameter_space, tuning_options, tune_params, original_to_normalized) +pruned_parameter_space, removed_tune_params = bayes_opt.prune_parameter_space( + normalized_parameter_space, tuning_options, tune_params, original_to_normalized +) # 
initialize BO -BO = BayesianOptimization(pruned_parameter_space, removed_tune_params, tuning_options, original_to_normalized, normalized_to_original, cost_func) +BO = BayesianOptimization( + pruned_parameter_space, + removed_tune_params, + tuning_options, + original_to_normalized, + normalized_to_original, + cost_func, +) predictions, _, std = BO.predict_list(BO.unvisited_cache) @@ -61,7 +70,7 @@ def test_normalize_parameter_space(): def test_prune_parameter_space(): - assert removed_tune_params == [None, None, list(normalized_to_original['z'].keys())[0]] + assert removed_tune_params == [None, None, list(normalized_to_original["z"].keys())[0]] for index in range(len(pruned_parameter_space)): assert len(pruned_parameter_space[index]) <= len(parameter_space[index]) assert len(parameter_space[index]) - len(pruned_parameter_space[index]) == 1 @@ -76,6 +85,7 @@ def test_bo_initialization(): assert len(BO.observations) == len(pruned_parameter_space) assert BO.current_optimum == np.inf + def test_bo_initial_sample_lhs(): sample = BO.draw_latin_hypercube_samples(num_samples=1) print(sample) @@ -85,18 +95,19 @@ def test_bo_initial_sample_lhs(): assert len(sample[0]) == 2 assert isinstance(sample[0][0], tuple) assert isinstance(sample[0][1], int) - assert len(sample[0][0]) == 2 # tune_params["z"] is dropped because it only has a single value + assert len(sample[0][0]) == 2 # tune_params["z"] is dropped because it only has a single value assert isinstance(sample[0][0][0], float) samples = BO.draw_latin_hypercube_samples(num_samples=3) assert len(samples) == 3 with raises(ValueError): samples = BO.draw_latin_hypercube_samples(num_samples=30) + def test_bo_is_better_than(): - BO.opt_direction = 'max' + BO.opt_direction = "max" assert BO.is_better_than(2, 1) assert BO.is_better_than(-0.1, -0.2) - BO.opt_direction = 'min' + BO.opt_direction = "min" assert BO.is_better_than(1, 2) assert BO.is_better_than(-0.2, -0.1) @@ -107,12 +118,12 @@ def test_bo_is_not_visited(): def 
test_bo_get_af_by_name(): - for basic_af in ['ei', 'poi', 'lcb']: + for basic_af in ["ei", "poi", "lcb"]: assert callable(BO.get_af_by_name(basic_af)) def test_bo_set_acquisition_function(): - BO.set_acquisition_function('multi-fast') + BO.set_acquisition_function("multi-fast") assert callable(BO.optimize) diff --git a/test/strategies/test_common.py b/test/strategies/test_common.py index 29ead8615..b769a107d 100644 --- a/test/strategies/test_common.py +++ b/test/strategies/test_common.py @@ -13,9 +13,7 @@ def fake_runner(): - fake_result = { - 'time': 5 - } + fake_result = {"time": 5} runner = Mock() runner.last_strategy_start_time = perf_counter() runner.run.return_value = [fake_result] @@ -27,9 +25,18 @@ def fake_runner(): def test_cost_func(): x = [1, 4] - tuning_options = Options(scaling=False, snap=False, tune_params=tune_params, - restrictions=None, strategy_options={}, cache={}, unique_results={}, - objective="time", objective_higher_is_better=False, metrics=None) + tuning_options = Options( + scaling=False, + snap=False, + tune_params=tune_params, + restrictions=None, + strategy_options={}, + cache={}, + unique_results={}, + objective="time", + objective_higher_is_better=False, + metrics=None, + ) runner = fake_runner() time = CostFunc(Searchspace(tune_params, None, 1024), tuning_options, runner)(x) @@ -38,10 +45,20 @@ def test_cost_func(): # check if restrictions are properly handled def restrictions(_): return False - tuning_options = Options(scaling=False, snap=False, tune_params=tune_params, - restrictions=restrictions, strategy_options={}, - verbose=True, cache={}, unique_results={}, - objective="time", objective_higher_is_better=False, metrics=None) + + tuning_options = Options( + scaling=False, + snap=False, + tune_params=tune_params, + restrictions=restrictions, + strategy_options={}, + verbose=True, + cache={}, + unique_results={}, + objective="time", + objective_higher_is_better=False, + metrics=None, + ) time = CostFunc(Searchspace(tune_params, 
restrictions, 1024), tuning_options, runner)(x) assert time == sys.float_info.max diff --git a/test/strategies/test_strategies.py b/test/strategies/test_strategies.py index 1001aabec..e3d8009d3 100644 --- a/test/strategies/test_strategies.py +++ b/test/strategies/test_strategies.py @@ -9,6 +9,7 @@ cache_filename = os.path.dirname(os.path.realpath(__file__)) + "/../test_cache_file.json" + @pytest.fixture def vector_add(): kernel_string = """ @@ -33,21 +34,28 @@ def vector_add(): return ["vector_add", kernel_string, size, args, tune_params] -@pytest.mark.parametrize('strategy', strategy_map) +@pytest.mark.parametrize("strategy", strategy_map) def test_strategies(vector_add, strategy): - options = dict(popsize=5) print(f"testing {strategy}") if hasattr(kernel_tuner.interface.strategy_map[strategy], "_options"): - filter_options = {opt:val for opt, val in options.items() if opt in kernel_tuner.interface.strategy_map[strategy]._options} + filter_options = { + opt: val for opt, val in options.items() if opt in kernel_tuner.interface.strategy_map[strategy]._options + } else: filter_options = options filter_options["max_fevals"] = 10 - results, _ = kernel_tuner.tune_kernel(*vector_add, strategy=strategy, strategy_options=filter_options, - verbose=False, cache=cache_filename, simulation_mode=True) + results, _ = kernel_tuner.tune_kernel( + *vector_add, + strategy=strategy, + strategy_options=filter_options, + verbose=False, + cache=cache_filename, + simulation_mode=True, + ) assert len(results) > 0 @@ -63,15 +71,15 @@ def test_strategies(vector_add, strategy): # check whether the returned dictionaries contain exactly the expected keys and the appropriate type expected_items = { - 'block_size_x': int, - 'time': (float, int), - 'times': list, - 'compile_time': (float, int), - 'verification_time': (float, int), - 'benchmark_time': (float, int), - 'strategy_time': (float, int), - 'framework_time': (float, int), - 'timestamp': str + "block_size_x": int, + "time": (float, int), + 
"times": list, + "compile_time": (float, int), + "verification_time": (float, int), + "benchmark_time": (float, int), + "strategy_time": (float, int), + "framework_time": (float, int), + "timestamp": str, } for res in results: assert len(res) == len(expected_items) diff --git a/test/test_accuracy.py b/test/test_accuracy.py index 1e5070637..17d6f5156 100644 --- a/test/test_accuracy.py +++ b/test/test_accuracy.py @@ -47,9 +47,7 @@ def test_tunable_precision(): from kernel_tuner.accuracy import TunablePrecision inputs = np.array([1, 2, 3], dtype=np.float64) - x = TunablePrecision( - "foo", inputs, dict(float16=np.half, float32=np.float32, float64=np.double) - ) + x = TunablePrecision("foo", inputs, dict(float16=np.half, float32=np.float32, float64=np.double)) assert np.all(x(dict(foo="float16")) == inputs) assert x(dict(foo="float16")).dtype == np.half diff --git a/test/test_common.py b/test/test_common.py index 7c1bd6838..e4500fc73 100644 --- a/test/test_common.py +++ b/test/test_common.py @@ -9,7 +9,7 @@ def test_get_bounds_x0_eps(): tune_params = dict() - tune_params['x'] = [0, 1, 2, 3, 4] + tune_params["x"] = [0, 1, 2, 3, 4] searchspace = Searchspace(tune_params, [], 1024) tuning_options = Options() @@ -28,11 +28,10 @@ def test_get_bounds_x0_eps(): def test_get_bounds(): - tune_params = dict() - tune_params['x'] = [0, 1, 2, 3, 4] - tune_params['y'] = [i for i in range(0, 10000, 100)] - tune_params['z'] = [-11.2, 55.67, 123.27] + tune_params["x"] = [0, 1, 2, 3, 4] + tune_params["y"] = [i for i in range(0, 10000, 100)] + tune_params["z"] = [-11.2, 55.67, 123.27] for k in tune_params.keys(): random.shuffle(tune_params[k]) @@ -45,30 +44,28 @@ def test_get_bounds(): def test_snap_to_nearest_config(): - tune_params = dict() - tune_params['x'] = [0, 1, 2, 3, 4, 5] - tune_params['y'] = [0, 1, 2, 3, 4, 5] - tune_params['z'] = [0, 1, 2, 3, 4, 5] - tune_params['w'] = ['a', 'b', 'c'] + tune_params["x"] = [0, 1, 2, 3, 4, 5] + tune_params["y"] = [0, 1, 2, 3, 4, 5] + 
tune_params["z"] = [0, 1, 2, 3, 4, 5] + tune_params["w"] = ["a", "b", "c"] - x = [-5.7, 3.14, 1e6, 'b'] - expected = [0, 3, 5, 'b'] + x = [-5.7, 3.14, 1e6, "b"] + expected = [0, 3, 5, "b"] answer = common.snap_to_nearest_config(x, tune_params) assert answer == expected def test_unscale(): - params = dict() - params['x'] = [2**i for i in range(4, 9)] - eps = 1.0 / len(params['x']) + params["x"] = [2**i for i in range(4, 9)] + eps = 1.0 / len(params["x"]) - assert common.unscale_and_snap_to_nearest([0], params, eps)[0] == params['x'][0] - assert common.unscale_and_snap_to_nearest([1], params, eps)[0] == params['x'][-1] + assert common.unscale_and_snap_to_nearest([0], params, eps)[0] == params["x"][0] + assert common.unscale_and_snap_to_nearest([1], params, eps)[0] == params["x"][-1] - intervals = np.linspace(0, 1, len(params['x']) * 10) + intervals = np.linspace(0, 1, len(params["x"]) * 10) freq = dict() for i in intervals: @@ -82,6 +79,6 @@ def test_unscale(): print(freq) for v in freq.values(): - assert v == freq[params['x'][0]] + assert v == freq[params["x"][0]] - assert len(freq.keys()) == len(params['x']) + assert len(freq.keys()) == len(params["x"]) diff --git a/test/test_compiler_functions.py b/test/test_compiler_functions.py index 913fee85d..99a0071fc 100644 --- a/test/test_compiler_functions.py +++ b/test/test_compiler_functions.py @@ -159,9 +159,7 @@ def test_compile(npct, subprocess): kernel_string = "this is a fake C program" kernel_name = "blabla" kernel_sources = KernelSource(kernel_name, kernel_string, "C") - kernel_instance = KernelInstance( - kernel_name, kernel_sources, kernel_string, [], None, None, dict(), [] - ) + kernel_instance = KernelInstance(kernel_name, kernel_sources, kernel_string, [], None, None, dict(), []) cfunc = CompilerFunctions() f = cfunc.compile(kernel_instance) @@ -191,9 +189,7 @@ def test_compile_detects_device_code(npct, subprocess): kernel_string = "this code clearly contains device code __global__ kernel(float* arg){ return; 
}" kernel_name = "blabla" kernel_sources = KernelSource(kernel_name, kernel_string, "C") - kernel_instance = KernelInstance( - kernel_name, kernel_sources, kernel_string, [], None, None, dict(), [] - ) + kernel_instance = KernelInstance(kernel_name, kernel_sources, kernel_string, [], None, None, dict(), []) cfunc = CompilerFunctions() cfunc.compile(kernel_instance) @@ -347,9 +343,7 @@ def test_complies_fortran_function_no_module(): """ kernel_name = "my_test_function" kernel_sources = KernelSource(kernel_name, kernel_string, "C") - kernel_instance = KernelInstance( - kernel_name, kernel_sources, kernel_string, [], None, None, dict(), [] - ) + kernel_instance = KernelInstance(kernel_name, kernel_sources, kernel_string, [], None, None, dict(), []) cfunc = CompilerFunctions(compiler="gfortran") func = cfunc.compile(kernel_instance) @@ -378,9 +372,7 @@ def test_complies_fortran_function_with_module(): """ kernel_name = "my_test_function" kernel_sources = KernelSource(kernel_name, kernel_string, "C") - kernel_instance = KernelInstance( - kernel_name, kernel_sources, kernel_string, [], None, None, dict(), [] - ) + kernel_instance = KernelInstance(kernel_name, kernel_sources, kernel_string, [], None, None, dict(), []) try: cfunc = CompilerFunctions(compiler="gfortran") diff --git a/test/test_core.py b/test/test_core.py index a8624470e..6156afbf2 100644 --- a/test/test_core.py +++ b/test/test_core.py @@ -14,9 +14,12 @@ from .context import skip_if_no_pycuda -mock_config = {"return_value.compile.return_value": "compile", - "return_value.ready_argument_list.return_value": "ready_argument_list", - "return_value.max_threads": 1024} +mock_config = { + "return_value.compile.return_value": "compile", + "return_value.ready_argument_list.return_value": "ready_argument_list", + "return_value.max_threads": 1024, +} + def get_vector_add_args(): size = int(1e6) @@ -42,9 +45,19 @@ def env(): lang = "CUDA" kernel_source = core.KernelSource(kernel_name, kernel_string, lang) verbose = True 
- kernel_options = Options(kernel_name=kernel_name, kernel_string=kernel_string, problem_size=args[-1], - arguments=args, lang=lang, grid_div_x=None, grid_div_y=None, grid_div_z=None, - cmem_args=None, texmem_args=None, block_size_names=None) + kernel_options = Options( + kernel_name=kernel_name, + kernel_string=kernel_string, + problem_size=args[-1], + arguments=args, + lang=lang, + grid_div_x=None, + grid_div_y=None, + grid_div_z=None, + cmem_args=None, + texmem_args=None, + block_size_names=None, + ) device_options = Options(device=0, platform=0, quiet=False, compiler=None, compiler_options=None) dev = core.DeviceInterface(kernel_source, iterations=7, **device_options) instance = dev.create_kernel_instance(kernel_source, kernel_options, params, verbose) @@ -54,7 +67,6 @@ def env(): @skip_if_no_pycuda def test_default_verify_function(env): - # gpu_args = dev.ready_argument_list(args) # func = dev.compile_kernel(instance, verbose) @@ -93,7 +105,7 @@ def test_default_verify_function(env): assert True -@patch('kernel_tuner.core.PyCudaFunctions') +@patch("kernel_tuner.core.PyCudaFunctions") def test_check_kernel_output(dev_func_interface): dev_func_interface.configure_mock(**mock_config) @@ -105,17 +117,17 @@ def test_check_kernel_output(dev_func_interface): wrong = [np.array([1, 2, 3, 4]).astype(np.float32)] atol = 1e-6 - dev.check_kernel_output('func', answer, instance, answer, atol, None, True) + dev.check_kernel_output("func", answer, instance, answer, atol, None, True) dfi.memcpy_htod.assert_called_once_with(answer[0], answer[0]) - dfi.run_kernel.assert_called_once_with('func', answer, (256, 1, 1), (1, 1, 1)) + dfi.run_kernel.assert_called_once_with("func", answer, (256, 1, 1), (1, 1, 1)) print(dfi.mock_calls) assert dfi.memcpy_dtoh.called == 1 for name, args, _ in dfi.mock_calls: - if name == 'memcpy_dtoh': + if name == "memcpy_dtoh": assert all(args[0] == answer[0]) assert all(args[1] == answer[0]) @@ -124,7 +136,7 @@ def 
test_check_kernel_output(dev_func_interface): # obviously does not result in the result_host array containing anything # non-zero try: - dev.check_kernel_output('func', wrong, instance, wrong, atol, None, True) + dev.check_kernel_output("func", wrong, instance, wrong, atol, None, True) print("check_kernel_output failed to throw an exception") assert False except Exception: @@ -132,7 +144,6 @@ def test_check_kernel_output(dev_func_interface): def test_default_verify_function_arrays(): - answer = [np.zeros(4).astype(np.float32), None, np.ones(5).astype(np.int32)] answer_type_error1 = [np.zeros(4).astype(np.float32)] @@ -157,7 +168,6 @@ def test_default_verify_function_arrays(): def test_default_verify_function_scalar(): - answer = [np.zeros(4).astype(np.float32), None, np.int64(42)] instance = core.KernelInstance("name", None, "kernel_string", [], (256, 1, 1), (1, 1, 1), {}, answer) @@ -198,16 +208,18 @@ def test_preprocess_gpu_arguments(): def test_split_argument_list(): test_string = "T *c, const T *__restrict__ a, T\n *\n b\n , int n" - ans1, ans2 = core.split_argument_list([s.strip() for s in test_string.split(',')]) + ans1, ans2 = core.split_argument_list([s.strip() for s in test_string.split(",")]) assert ans1 == ["T *", "const T *__restrict__", "T *", "int"] assert ans2 == ["c", "a", "b", "n"] + def test_apply_template_typenames(): type_list = ["T *", "CONST __restrict__", "double"] templated_typenames = {"T": "test"} core.apply_template_typenames(type_list, templated_typenames) assert type_list == ["test *", "CONST __restrict__", "double"] + def test_get_templated_typenames(): template_arguments = ["double", "32"] template_parameters = ["typename TF", "test1", "test2"] @@ -217,6 +229,7 @@ def test_get_templated_typenames(): assert len(ans) == 1 assert ans["TF"] == "double" + def test_wrap_templated_kernel(): kernel_string = """ template __global__ void vector_add(TF *c, const TF *__restrict__ a, TF * b , int n) { @@ -228,15 +241,16 @@ def 
test_wrap_templated_kernel(): """ kernel_name = "vector_add" ans, _ = core.wrap_templated_kernel(kernel_string, kernel_name) - #check __global__ in templated definition is replaced with __device__ + # check __global__ in templated definition is replaced with __device__ assert "template __device__ void vector_add" in ans - #check if template instantiation is inserted + # check if template instantiation is inserted assert "template __device__ void vector_add(float *, const float *__restrict__, float *, int);" in ans - #check if wrapper functions with C linkage is inserted - assert "extern \"C\" __global__ void vector_add" in ans - #check if original kernel is called + # check if wrapper functions with C linkage is inserted + assert 'extern "C" __global__ void vector_add' in ans + # check if original kernel is called assert "vector_add(c, a, b, n);" in ans + def test_wrap_templated_kernel2(): kernel_string = """ template __global__ void __launch_bounds__(THREADS_PER_BLOCK, BLOCKS_PER_SM) vector_add(TF *c, const TF *__restrict__ a, TF * b , int n) { @@ -251,6 +265,7 @@ def test_wrap_templated_kernel2(): ans, _ = core.wrap_templated_kernel(kernel_string, kernel_name) assert True + def test_wrap_templated_kernel3(): kernel_string = """ template __global__ void __launch_bounds__(THREADS_PER_BLOCK, BLOCKS_PER_SM) vector_add1(TF *c, const TF *__restrict__ a, TF * b , int n) { @@ -271,7 +286,10 @@ def test_wrap_templated_kernel3(): ans, _ = core.wrap_templated_kernel(kernel_string, kernel_name) # test that the template wrapper matches the right kernel (the first and not the second) - assert 'extern "C" __global__ void __launch_bounds__(THREADS_PER_BLOCK, BLOCKS_PER_SM) vector_add1_wrapper(float * c, const float *__restrict__ a, float * b, int n)' in ans + assert ( + 'extern "C" __global__ void __launch_bounds__(THREADS_PER_BLOCK, BLOCKS_PER_SM) vector_add1_wrapper(float * c, const float *__restrict__ a, float * b, int n)' + in ans + ) def test_wrap_templated_kernel4(): @@ 
-295,4 +313,7 @@ def test_wrap_templated_kernel4(): ans, _ = core.wrap_templated_kernel(kernel_string, kernel_name) # test that the template wrapper matches the right kernel (the second not the first) - assert 'extern "C" __global__ void __launch_bounds__(THREADS_PER_BLOCK, BLOCKS_PER_SM) vector_add1_wrapper(float * c, const float *__restrict__ a, float * b, int n)' in ans + assert ( + 'extern "C" __global__ void __launch_bounds__(THREADS_PER_BLOCK, BLOCKS_PER_SM) vector_add1_wrapper(float * c, const float *__restrict__ a, float * b, int n)' + in ans + ) diff --git a/test/test_cuda_functions.py b/test/test_cuda_functions.py index 1dc68652d..47200cf30 100644 --- a/test/test_cuda_functions.py +++ b/test/test_cuda_functions.py @@ -16,7 +16,6 @@ @skip_if_no_cuda def test_ready_argument_list(): - size = 1000 a = np.int32(75) b = np.random.randn(size).astype(np.float32) @@ -34,7 +33,6 @@ def test_ready_argument_list(): @skip_if_no_cuda def test_compile(): - kernel_string = """ extern "C" __global__ void vector_add(float *c, float *a, float *b, int n) { int i = blockIdx.x * blockDim.x + threadIdx.x; diff --git a/test/test_cupy_functions.py b/test/test_cupy_functions.py index 4bb4d16f4..be781f368 100644 --- a/test/test_cupy_functions.py +++ b/test/test_cupy_functions.py @@ -1,4 +1,3 @@ - import kernel_tuner from .context import skip_if_no_cupy @@ -9,4 +8,3 @@ def test_tune_kernel(env): result, _ = kernel_tuner.tune_kernel(*env, lang="cupy", verbose=True) assert len(result) > 0 - diff --git a/test/test_energy.py b/test/test_energy.py index 187ac1cdc..3ca9f1e27 100644 --- a/test/test_energy.py +++ b/test/test_energy.py @@ -6,11 +6,13 @@ cache_filename = os.path.dirname(os.path.realpath(__file__)) + "/synthetic_fp32_cache_NVIDIA_RTX_A4000.json" + @skip_if_no_pycuda @skip_if_no_pynvml def test_create_power_frequency_model(): - - ridge_frequency, freqs, nvml_power, fitted_params, scaling = energy.create_power_frequency_model(cache=cache_filename, simulation_mode=True) + 
ridge_frequency, freqs, nvml_power, fitted_params, scaling = energy.create_power_frequency_model( + cache=cache_filename, simulation_mode=True + ) target_value = 1350 tolerance = 0.05 - assert target_value * (1-tolerance) <= ridge_frequency <= target_value * (1+tolerance) + assert target_value * (1 - tolerance) <= ridge_frequency <= target_value * (1 + tolerance) diff --git a/test/test_ensemble_tuning.py b/test/test_ensemble_tuning.py index 69efb5a68..9f8cc75df 100644 --- a/test/test_ensemble_tuning.py +++ b/test/test_ensemble_tuning.py @@ -13,6 +13,7 @@ except Exception: pass + @pytest.fixture def env(): kernel_string = """ @@ -41,9 +42,11 @@ def env(): return ["vector_add", kernel_string, size, args, tune_params] + @skip_if_no_pycuda def test_parallel_tune_kernel(env): strategy_options = {"ensemble": ["greedy_ils", "greedy_ils"]} - result, _ = tune_kernel(*env, lang="CUDA", verbose=True, strategy="ensemble", - parallel_mode=True, strategy_options=strategy_options) - assert len(result) > 0 \ No newline at end of file + result, _ = tune_kernel( + *env, lang="CUDA", verbose=True, strategy="ensemble", parallel_mode=True, strategy_options=strategy_options + ) + assert len(result) > 0 diff --git a/test/test_file_utils.py b/test/test_file_utils.py index 622e06b44..4e181729c 100644 --- a/test/test_file_utils.py +++ b/test/test_file_utils.py @@ -5,6 +5,7 @@ from jsonschema import validate import numpy as np import warnings + try: from hip import hip except: @@ -64,6 +65,7 @@ def test_store_metadata_file(): # clean up delete_temp_file(filename) + def hip_check(call_result): err = call_result[0] result = call_result[1:] @@ -73,6 +75,7 @@ def hip_check(call_result): raise RuntimeError(str(err)) return result + @skip_if_no_hip def test_check_argument_list_device_array(): """Test check_argument_list with DeviceArray""" @@ -84,12 +87,8 @@ def test_check_argument_list_device_array(): host_array = np.ones((100,), dtype=np.float32) num_bytes = host_array.size * host_array.itemsize 
device_array = hip_check(hip.hipMalloc(num_bytes)) - device_array.configure( - typestr="float32", - shape=host_array.shape, - itemsize=host_array.itemsize - ) - + device_array.configure(typestr="float32", shape=host_array.shape, itemsize=host_array.itemsize) + with warnings.catch_warnings(): warnings.simplefilter("error") check_argument_list("simple_kernel", float_kernel, [device_array]) diff --git a/test/test_hip_functions.py b/test/test_hip_functions.py index e192223ed..eb587c618 100644 --- a/test/test_hip_functions.py +++ b/test/test_hip_functions.py @@ -10,10 +10,12 @@ try: from hip import hip, hiprtc + hip_present = True except ImportError: pass + def hip_check(call_result): err = call_result[0] result = call_result[1:] @@ -25,6 +27,7 @@ def hip_check(call_result): raise RuntimeError(str(err)) return result + @pytest.fixture def env(): kernel_string = """ @@ -48,6 +51,7 @@ def env(): return ["vector_add", kernel_string, size, args, tune_params] + @skip_if_no_hip def test_ready_argument_list(): size = 1000 @@ -67,6 +71,7 @@ def test_ready_argument_list(): assert gpu_args[1].value == a assert gpu_args[3].value == c + @skip_if_no_hip def test_compile(): kernel_string = """ @@ -87,6 +92,7 @@ def test_compile(): except Exception as e: pytest.fail("Did not expect any exception:" + str(e)) + @skip_if_no_hip def test_memset_and_memcpy_dtoh(): a = [1, 2, 3, 4] @@ -101,6 +107,7 @@ def test_memset_and_memcpy_dtoh(): assert all(output == np.full(4, 4)) + @skip_if_no_hip def test_memcpy_htod(): a = [1, 2, 3, 4] @@ -114,6 +121,7 @@ def test_memcpy_htod(): assert all(output == x) + @skip_if_no_hip def test_copy_constant_memory_args(): kernel_string = """ @@ -133,7 +141,7 @@ def test_copy_constant_memory_args(): kernel = dev.compile(kernel_instance) my_constant_data = np.full(100, 23).astype(np.float32) - cmem_args = {'my_constant_data': my_constant_data} + cmem_args = {"my_constant_data": my_constant_data} dev.copy_constant_memory_args(cmem_args) output = np.full(100, 
0).astype(np.float32) @@ -147,16 +155,12 @@ def test_copy_constant_memory_args(): assert (my_constant_data == output).all() + @skip_if_no_hip def test_smem_args(env): - result, _ = tune_kernel(*env, - smem_args=dict(size="block_size_x*4"), - verbose=True, lang="HIP") + result, _ = tune_kernel(*env, smem_args=dict(size="block_size_x*4"), verbose=True, lang="HIP") tune_params = env[-1] assert len(result) == len(tune_params["block_size_x"]) - result, _ = tune_kernel( - *env, - smem_args=dict(size=lambda p: p['block_size_x'] * 4), - verbose=True, lang="HIP") + result, _ = tune_kernel(*env, smem_args=dict(size=lambda p: p["block_size_x"] * 4), verbose=True, lang="HIP") tune_params = env[-1] - assert len(result) == len(tune_params["block_size_x"]) \ No newline at end of file + assert len(result) == len(tune_params["block_size_x"]) diff --git a/test/test_hyper.py b/test/test_hyper.py index 9d1dc55df..9b54b66bd 100644 --- a/test/test_hyper.py +++ b/test/test_hyper.py @@ -4,7 +4,6 @@ def test_hyper(env): - hyper_params = dict() hyper_params["popsize"] = [5] hyper_params["maxiter"] = [5, 10] @@ -15,4 +14,3 @@ def test_hyper(env): result = tune_hyper_params(target_strategy, hyper_params, *env, verbose=True, cache=cache_filename) assert len(result) > 0 - diff --git a/test/test_integration.py b/test/test_integration.py index aafb437f1..30c3e26ac 100644 --- a/test/test_integration.py +++ b/test/test_integration.py @@ -11,11 +11,8 @@ @pytest.fixture() def fake_results(): - #create fake results for testing - tune_params = { - "a": [1, 2, 4], - "b": [4, 5, 6] - } + # create fake results for testing + tune_params = {"a": [1, 2, 4], "b": [4, 5, 6]} problem_size = 100 parameter_space = itertools.product(*tune_params.values()) results = [dict(zip(tune_params.keys(), element)) for element in parameter_space] @@ -28,42 +25,50 @@ def fake_results(): r["strategy_time"] = 20.0 + (i / 5) r["verification_time"] = 20.0 + (i / 5) r["timestamp"] = str(datetime.now(timezone.utc)) - env = { - 
"device_name": "My GPU" - } + env = {"device_name": "My GPU"} return "fake_kernel", "fake_string", tune_params, problem_size, parameter_space, results, env def test_store_results(fake_results): - filename = "temp_test_results_file.json" kernel_name, kernel_string, tune_params, problem_size, parameter_space, results, env = fake_results try: - #test basic operation + # test basic operation integration.store_results(filename, kernel_name, kernel_string, tune_params, problem_size, results, env, top=3) meta, stored_data = integration._read_results_file(filename) assert len([d for d in stored_data if d["device_name"] == "My_GPU" and d["problem_size"] == "100"]) == 3 - #test if results for a different problem_size values are added + # test if results for a different problem_size values are added integration.store_results(filename, kernel_name, kernel_string, tune_params, 1000, results, env, top=3) meta, stored_data = integration._read_results_file(filename) assert len([d for d in stored_data if d["device_name"] == "My_GPU" and d["problem_size"] == "100"]) == 3 assert len([d for d in stored_data if d["device_name"] == "My_GPU" and d["problem_size"] == "1000"]) == 3 - #test if results for a different GPU can be added - integration.store_results(filename, kernel_name, kernel_string, tune_params, problem_size, results, { "device_name": "Another GPU"}, top=3) + # test if results for a different GPU can be added + integration.store_results( + filename, + kernel_name, + kernel_string, + tune_params, + problem_size, + results, + {"device_name": "Another GPU"}, + top=3, + ) meta, stored_data = integration._read_results_file(filename) assert len(set([d["device_name"] for d in stored_data])) == 2 - #test if overwriting results works + # test if overwriting results works for i, r in enumerate(results): r["time"] = 50.0 + i - integration.store_results(filename, kernel_name, kernel_string, tune_params, problem_size, results, env, top=0.1) + integration.store_results( + filename, 
kernel_name, kernel_string, tune_params, problem_size, results, env, top=0.1 + ) meta, stored_data = integration._read_results_file(filename) my_gpu_100_data = [d for d in stored_data if d["device_name"] == "My_GPU" and d["problem_size"] == "100"] @@ -75,58 +80,59 @@ def test_store_results(fake_results): def test_setup_device_targets(fake_results): - results_filename = "temp_test_results_file.json" header_filename = "temp_test_header_file.h" kernel_name, kernel_string, tune_params, problem_size, parameter_space, results, env = fake_results try: - integration.store_results(results_filename, kernel_name, kernel_string, tune_params, problem_size, results, env, top=3) - #results file - #{'My_GPU': {'100': [{'a': 1, 'b': 4, 'time': 100.0}, {'a': 1, 'b': 5, 'time': 101.0}, {'a': 1, 'b': 6, 'time': 102.0}]}} + integration.store_results( + results_filename, kernel_name, kernel_string, tune_params, problem_size, results, env, top=3 + ) + # results file + # {'My_GPU': {'100': [{'a': 1, 'b': 4, 'time': 100.0}, {'a': 1, 'b': 5, 'time': 101.0}, {'a': 1, 'b': 6, 'time': 102.0}]}} integration.create_device_targets(header_filename, results_filename) - with open(header_filename, 'r') as fh: + with open(header_filename, "r") as fh: output_str = fh.read() assert "#ifdef TARGET_My_GPU" in output_str assert "#define a 1" in output_str assert "#define b 4" in output_str - #test output when more then one problem size is used, and best configuration is different + # test output when more then one problem size is used, and best configuration is different for i, e in enumerate(results): - if e['a'] == 1 and e['b'] == 4: - e['time'] += 100 + if e["a"] == 1 and e["b"] == 4: + e["time"] += 100 integration.store_results(results_filename, kernel_name, kernel_string, tune_params, 1000, results, env, top=3) integration.create_device_targets(header_filename, results_filename, objective="time") - with open(header_filename, 'r') as fh: + with open(header_filename, "r") as fh: output_str = fh.read() 
expected = "\n".join(["TARGET_My_GPU", "#define a 1", "#define b 5"]) assert expected in output_str - #test output when more then one problem size is used, and best configuration depends on total time + # test output when more then one problem size is used, and best configuration depends on total time for i, e in enumerate(results): - if e['a'] == 1 and e['b'] == 6: - e['time'] -= 3 + if e["a"] == 1 and e["b"] == 6: + e["time"] -= 3 integration.store_results(results_filename, kernel_name, kernel_string, tune_params, 1000, results, env, top=3) integration.create_device_targets(header_filename, results_filename, objective="time") - with open(header_filename, 'r') as fh: + with open(header_filename, "r") as fh: output_str = fh.read() expected = "\n".join(["TARGET_My_GPU", "#define a 1", "#define b 6"]) assert expected in output_str - #test output when more then one GPU is used + # test output when more then one GPU is used for i, e in enumerate(results): - if e['a'] == 1 and e['b'] == 6: - e['time'] += 3.1 - env['device_name'] = "My_GPU2" + if e["a"] == 1 and e["b"] == 6: + e["time"] += 3.1 + env["device_name"] = "My_GPU2" integration.store_results(results_filename, kernel_name, kernel_string, tune_params, 1000, results, env, top=3) integration.create_device_targets(header_filename, results_filename, objective="time") - with open(header_filename, 'r') as fh: + with open(header_filename, "r") as fh: output_str = fh.read() expected = "\n".join(["TARGET_My_GPU", "#define a 1", "#define b 6"]) assert expected in output_str @@ -141,34 +147,45 @@ def test_setup_device_targets(fake_results): def test_setup_device_targets_max(fake_results): - results_filename = "temp_test_results_file.json" header_filename = "temp_test_header_file.h" kernel_name, kernel_string, tune_params, problem_size, parameter_space, results, env = fake_results - #add GFLOP/s as metric + # add GFLOP/s as metric for i, e in enumerate(results): - e['GFLOP/s'] = 1e5 / e['time'] + e["GFLOP/s"] = 1e5 / 
e["time"] try: - integration.store_results(results_filename, kernel_name, kernel_string, tune_params, problem_size, results, env, top=3, objective="GFLOP/s") + integration.store_results( + results_filename, + kernel_name, + kernel_string, + tune_params, + problem_size, + results, + env, + top=3, + objective="GFLOP/s", + ) integration.create_device_targets(header_filename, results_filename, objective="GFLOP/s") - with open(header_filename, 'r') as fh: + with open(header_filename, "r") as fh: output_str = fh.read() assert "TARGET_My_GPU" in output_str assert "#define a 1" in output_str assert "#define b 4" in output_str - #test output when more then one problem size is used, and best configuration is different + # test output when more then one problem size is used, and best configuration is different for i, e in enumerate(results): - if e['a'] == 1 and e['b'] == 4: - e['time'] += 100 - e['GFLOP/s'] = 1e5 / e['time'] - integration.store_results(results_filename, kernel_name, kernel_string, tune_params, 1000, results, env, top=3, objective="GFLOP/s") + if e["a"] == 1 and e["b"] == 4: + e["time"] += 100 + e["GFLOP/s"] = 1e5 / e["time"] + integration.store_results( + results_filename, kernel_name, kernel_string, tune_params, 1000, results, env, top=3, objective="GFLOP/s" + ) integration.create_device_targets(header_filename, results_filename, objective="GFLOP/s") - with open(header_filename, 'r') as fh: + with open(header_filename, "r") as fh: output_str = fh.read() expected = "\n".join(["TARGET_My_GPU", "#define a 1", "#define b 5"]) assert expected in output_str diff --git a/test/test_kernelbuilder.py b/test/test_kernelbuilder.py index c706e3953..6ef7c9d4a 100644 --- a/test/test_kernelbuilder.py +++ b/test/test_kernelbuilder.py @@ -32,7 +32,7 @@ def test_PythonKernel(test_kernel, backend): kernel_name, kernel_string, n, args, params = test_kernel kernel_function = kernelbuilder.PythonKernel(*test_kernel, lang=backend) reference = kernel_function(*args) - assert 
np.allclose(reference[0], args[1]+args[2]) + assert np.allclose(reference[0], args[1] + args[2]) @pytest.mark.parametrize("backend", backends) @@ -42,21 +42,23 @@ def test_PythonKernel_tuned(test_kernel, backend): c, a, b, n = args test_results_file = "test_results_file.json" results = params.copy() - results['time'] = 1.0 + results["time"] = 1.0 env = {"device_name": "bogus GPU"} try: - #create a fake results file + # create a fake results file integration.store_results(test_results_file, kernel_name, kernel_string, params, n, [results], env) - #create a kernel using the results - kernel_function = kernelbuilder.PythonKernel(kernel_name, kernel_string, n, args, results_file=test_results_file, lang=backend) + # create a kernel using the results + kernel_function = kernelbuilder.PythonKernel( + kernel_name, kernel_string, n, args, results_file=test_results_file, lang=backend + ) - #test if params were retrieved correctly + # test if params were retrieved correctly assert kernel_function.params["block_size_x"] == 384 - #see if it functions properly + # see if it functions properly reference = kernel_function(c, a, b, n) - assert np.allclose(reference[0], a+b) + assert np.allclose(reference[0], a + b) finally: util.delete_temp_file(test_results_file) diff --git a/test/test_nvml_mocked.py b/test/test_nvml_mocked.py index b986f6686..43b0ec294 100644 --- a/test/test_nvml_mocked.py +++ b/test/test_nvml_mocked.py @@ -8,49 +8,49 @@ from kernel_tuner.observers.nvml import get_nvml_pwr_limits, get_nvml_gr_clocks, get_nvml_mem_clocks, get_idle_power - def setup_mock(nvml): - nvml.return_value.configure_mock(pwr_constraints=(90000, 150000), - supported_mem_clocks=[2100], - supported_gr_clocks={2100: [1000, 2000, 3000]}, - pwr_usage=lambda : 5000) + nvml.return_value.configure_mock( + pwr_constraints=(90000, 150000), + supported_mem_clocks=[2100], + supported_gr_clocks={2100: [1000, 2000, 3000]}, + pwr_usage=lambda: 5000, + ) return nvml 
-@patch('kernel_tuner.observers.nvml.nvml') +@patch("kernel_tuner.observers.nvml.nvml") def test_get_nvml_pwr_limits(nvml): nvml = setup_mock(nvml) result = get_nvml_pwr_limits(0, quiet=True) - assert result['nvml_pwr_limit'] == [90, 95, 100, 105, 110, 115, 120, 125, 130, 135, 140, 145, 150] + assert result["nvml_pwr_limit"] == [90, 95, 100, 105, 110, 115, 120, 125, 130, 135, 140, 145, 150] result = get_nvml_pwr_limits(0, n=5, quiet=True) - assert len(result['nvml_pwr_limit']) == 5 - assert result['nvml_pwr_limit'][0] == 90 - assert result['nvml_pwr_limit'][-1] == 150 + assert len(result["nvml_pwr_limit"]) == 5 + assert result["nvml_pwr_limit"][0] == 90 + assert result["nvml_pwr_limit"][-1] == 150 -@patch('kernel_tuner.observers.nvml.nvml') +@patch("kernel_tuner.observers.nvml.nvml") def test_get_nvml_gr_clocks(nvml): nvml = setup_mock(nvml) result = get_nvml_gr_clocks(0, quiet=True) - assert result['nvml_gr_clock'] == [1000, 2000, 3000] + assert result["nvml_gr_clock"] == [1000, 2000, 3000] result = get_nvml_gr_clocks(0, n=2, quiet=True) - assert result['nvml_gr_clock'] == [1000, 3000] + assert result["nvml_gr_clock"] == [1000, 3000] -@patch('kernel_tuner.observers.nvml.nvml') +@patch("kernel_tuner.observers.nvml.nvml") def test_get_nvml_mem_clocks(nvml): nvml = setup_mock(nvml) result = get_nvml_mem_clocks(0, quiet=False) print(result) - assert result['nvml_mem_clock'] == [2100] + assert result["nvml_mem_clock"] == [2100] -@patch('kernel_tuner.observers.nvml.nvml') +@patch("kernel_tuner.observers.nvml.nvml") def test_get_idle_power(nvml): nvml = setup_mock(nvml) result = get_idle_power(0) assert np.isclose(result, 5) - diff --git a/test/test_observers.py b/test/test_observers.py index 97928b477..a20cec89c 100644 --- a/test/test_observers.py +++ b/test/test_observers.py @@ -30,6 +30,7 @@ def test_nvml_observer(env): assert "temperature" in result[0] assert result[0]["temperature"] > 0 + @skip_if_no_pycuda def test_custom_observer(env): env[-1]["block_size_x"] = 
[128] @@ -43,34 +44,39 @@ def get_results(self): assert "name" in result[0] assert len(result[0]["name"]) > 0 + @skip_if_no_pycuda def test_register_observer_pycuda(env): - result, _ = kernel_tuner.tune_kernel(*env, observers=[RegisterObserver()], lang='CUDA') + result, _ = kernel_tuner.tune_kernel(*env, observers=[RegisterObserver()], lang="CUDA") assert "num_regs" in result[0] assert result[0]["num_regs"] > 0 + @skip_if_no_cupy def test_register_observer_cupy(env): - result, _ = kernel_tuner.tune_kernel(*env, observers=[RegisterObserver()], lang='CuPy') + result, _ = kernel_tuner.tune_kernel(*env, observers=[RegisterObserver()], lang="CuPy") assert "num_regs" in result[0] assert result[0]["num_regs"] > 0 + @skip_if_no_cuda def test_register_observer_nvcuda(env): - result, _ = kernel_tuner.tune_kernel(*env, observers=[RegisterObserver()], lang='NVCUDA') + result, _ = kernel_tuner.tune_kernel(*env, observers=[RegisterObserver()], lang="NVCUDA") assert "num_regs" in result[0] assert result[0]["num_regs"] > 0 + @skip_if_no_opencl def test_register_observer_opencl(env_opencl): with raises(NotImplementedError) as err: - kernel_tuner.tune_kernel(*env_opencl, observers=[RegisterObserver()], lang='OpenCL') + kernel_tuner.tune_kernel(*env_opencl, observers=[RegisterObserver()], lang="OpenCL") assert err.errisinstance(NotImplementedError) assert "OpenCL" in str(err.value) + @skip_if_no_hip def test_register_observer_hip(env_hip): with raises(NotImplementedError) as err: - kernel_tuner.tune_kernel(*env_hip, observers=[RegisterObserver()], lang='HIP') + kernel_tuner.tune_kernel(*env_hip, observers=[RegisterObserver()], lang="HIP") assert err.errisinstance(NotImplementedError) assert "Hip" in str(err.value) diff --git a/test/test_opencl_functions.py b/test/test_opencl_functions.py index 644c5dc08..5fafb0b03 100644 --- a/test/test_opencl_functions.py +++ b/test/test_opencl_functions.py @@ -15,7 +15,6 @@ @skip_if_no_opencl def test_ready_argument_list(): - size = 1000 a = 
np.int32(75) b = np.random.randn(size).astype(np.float32) @@ -36,7 +35,6 @@ def test_ready_argument_list(): @skip_if_no_opencl def test_compile(): - original_kernel = """ __kernel void sum(__global const float *a_g, __global const float *b_g, __global float *res_g) { int gid = get_global_id(0); @@ -58,13 +56,13 @@ def test_compile(): @skip_if_no_opencl def test_run_kernel(): - threads = (1, 2, 3) grid = (4, 5, 1) def test_func(queue, global_size, local_size, arg): assert all(global_size == np.array([4, 10, 3])) - return type('Event', (object,), {'wait': lambda self: 0})() + return type("Event", (object,), {"wait": lambda self: 0})() + dev = opencl.OpenCLFunctions(0) dev.run_kernel(test_func, [0], threads, grid) diff --git a/test/test_parallel_tuning.py b/test/test_parallel_tuning.py index bbe4d96b7..a169c3ddc 100644 --- a/test/test_parallel_tuning.py +++ b/test/test_parallel_tuning.py @@ -13,6 +13,7 @@ except Exception: pass + @pytest.fixture def env(): kernel_string = """ @@ -36,7 +37,8 @@ def env(): return ["vector_add", kernel_string, size, args, tune_params] + @skip_if_no_pycuda def test_parallel_tune_kernel(env): result, _ = tune_kernel(*env, lang="CUDA", verbose=True, parallel_mode=True) - assert len(result) > 0 \ No newline at end of file + assert len(result) > 0 diff --git a/test/test_pycuda_functions.py b/test/test_pycuda_functions.py index 3581a43dd..02da8dcec 100644 --- a/test/test_pycuda_functions.py +++ b/test/test_pycuda_functions.py @@ -13,7 +13,6 @@ @skip_if_no_pycuda def test_ready_argument_list(): - size = 1000 a = np.int32(75) b = np.random.randn(size).astype(np.float32) @@ -33,7 +32,6 @@ def test_ready_argument_list(): @skip_if_no_pycuda def test_compile(): - kernel_string = """ __global__ void vector_add(float *c, float *a, float *b, int n) { int i = blockIdx.x * blockDim.x + threadIdx.x; @@ -55,5 +53,3 @@ def test_compile(): def dummy_func(a, b, block=0, grid=0, stream=None, shared=0, texrefs=None): pass - - diff --git 
a/test/test_pycuda_mocked.py b/test/test_pycuda_mocked.py index 6bdfeef07..2d94ca4aa 100644 --- a/test/test_pycuda_mocked.py +++ b/test/test_pycuda_mocked.py @@ -11,19 +11,21 @@ def setup_mock(drv): context = Mock() - devprops = {'MAX_THREADS_PER_BLOCK': 1024, - 'COMPUTE_CAPABILITY_MAJOR': 5, - 'COMPUTE_CAPABILITY_MINOR': 5,} + devprops = { + "MAX_THREADS_PER_BLOCK": 1024, + "COMPUTE_CAPABILITY_MAJOR": 5, + "COMPUTE_CAPABILITY_MINOR": 5, + } context.return_value.get_device.return_value.get_attributes.return_value = devprops context.return_value.get_device.return_value.compute_capability.return_value = "55" drv.Device.return_value.retain_primary_context.return_value = context() - drv.mem_alloc.return_value = 'mem_alloc' + drv.mem_alloc.return_value = "mem_alloc" return drv -@patch('kernel_tuner.backends.pycuda.nvml') -@patch('kernel_tuner.backends.pycuda.DynamicSourceModule') -@patch('kernel_tuner.backends.pycuda.drv') +@patch("kernel_tuner.backends.pycuda.nvml") +@patch("kernel_tuner.backends.pycuda.DynamicSourceModule") +@patch("kernel_tuner.backends.pycuda.drv") def test_ready_argument_list(drv, *args): drv = setup_mock(drv) @@ -39,21 +41,20 @@ def test_ready_argument_list(drv, *args): print(gpu_args) drv.mem_alloc.assert_called_once_with(20) - drv.memcpy_htod.assert_called_once_with('mem_alloc', b) + drv.memcpy_htod.assert_called_once_with("mem_alloc", b) assert isinstance(gpu_args[0], np.int32) -@patch('kernel_tuner.backends.pycuda.nvml') -@patch('kernel_tuner.backends.pycuda.DynamicSourceModule') -@patch('kernel_tuner.backends.pycuda.drv') +@patch("kernel_tuner.backends.pycuda.nvml") +@patch("kernel_tuner.backends.pycuda.DynamicSourceModule") +@patch("kernel_tuner.backends.pycuda.drv") def test_compile(drv, *args): - # setup mocked stuff drv = setup_mock(drv) dev = pycuda.PyCudaFunctions(0) dev.source_mod = Mock() - dev.source_mod.return_value.get_function.return_value = 'func' + dev.source_mod.return_value.get_function.return_value = "func" # call compile 
kernel_string = "__global__ void vector_add()" @@ -65,45 +66,45 @@ def test_compile(drv, *args): # verify behavior assert dev.source_mod.call_count == 1 assert dev.current_module is dev.source_mod.return_value - assert func == 'func' + assert func == "func" assert kernel_string == list(dev.source_mod.mock_calls[0])[1][0] optional_args = list(dev.source_mod.mock_calls[0])[2] - assert optional_args['code'] == 'sm_55' - assert optional_args['arch'] == 'compute_55' + assert optional_args["code"] == "sm_55" + assert optional_args["arch"] == "compute_55" def dummy_func(a, b, block=0, grid=0, shared=0, stream=None, texrefs=None): pass -@patch('kernel_tuner.backends.pycuda.nvml') -@patch('kernel_tuner.backends.pycuda.DynamicSourceModule') -@patch('kernel_tuner.backends.pycuda.drv') +@patch("kernel_tuner.backends.pycuda.nvml") +@patch("kernel_tuner.backends.pycuda.DynamicSourceModule") +@patch("kernel_tuner.backends.pycuda.drv") def test_copy_constant_memory_args(drv, *args): drv = setup_mock(drv) fake_array = np.zeros(10).astype(np.float32) - cmem_args = {'fake_array': fake_array} + cmem_args = {"fake_array": fake_array} dev = pycuda.PyCudaFunctions(0) dev.current_module = Mock() - dev.current_module.get_global.return_value = ['get_global'] + dev.current_module.get_global.return_value = ["get_global"] dev.copy_constant_memory_args(cmem_args) - drv.memcpy_htod.assert_called_once_with('get_global', fake_array) - dev.current_module.get_global.assert_called_once_with('fake_array') + drv.memcpy_htod.assert_called_once_with("get_global", fake_array) + dev.current_module.get_global.assert_called_once_with("fake_array") -@patch('kernel_tuner.backends.pycuda.nvml') -@patch('kernel_tuner.backends.pycuda.DynamicSourceModule') -@patch('kernel_tuner.backends.pycuda.drv') +@patch("kernel_tuner.backends.pycuda.nvml") +@patch("kernel_tuner.backends.pycuda.DynamicSourceModule") +@patch("kernel_tuner.backends.pycuda.drv") def test_copy_texture_memory_args(drv, *args): drv = setup_mock(drv) 
fake_array = np.zeros(10).astype(np.float32) - texmem_args = {'fake_tex': fake_array} + texmem_args = {"fake_tex": fake_array} texref = Mock() @@ -114,13 +115,13 @@ def test_copy_texture_memory_args(drv, *args): dev.copy_texture_memory_args(texmem_args) drv.matrix_to_texref.assert_called_once_with(fake_array, texref, order="C") - dev.current_module.get_texref.assert_called_once_with('fake_tex') + dev.current_module.get_texref.assert_called_once_with("fake_tex") - texmem_args = {'fake_tex2': {'array': fake_array, 'filter_mode': 'linear', 'address_mode': ['border', 'clamp']}} + texmem_args = {"fake_tex2": {"array": fake_array, "filter_mode": "linear", "address_mode": ["border", "clamp"]}} dev.copy_texture_memory_args(texmem_args) drv.matrix_to_texref.assert_called_with(fake_array, texref, order="C") - dev.current_module.get_texref.assert_called_with('fake_tex2') + dev.current_module.get_texref.assert_called_with("fake_tex2") texref.set_filter_mode.assert_called_once_with(drv.filter_mode.LINEAR) texref.set_address_mode.assert_any_call(0, drv.address_mode.BORDER) - texref.set_address_mode.assert_any_call(1, drv.address_mode.CLAMP) \ No newline at end of file + texref.set_address_mode.assert_any_call(1, drv.address_mode.CLAMP) diff --git a/test/test_runners.py b/test/test_runners.py index 527c1d252..01eebd0e2 100644 --- a/test/test_runners.py +++ b/test/test_runners.py @@ -10,8 +10,7 @@ from .context import skip_if_no_pycuda -cache_filename = os.path.dirname( - os.path.realpath(__file__)) + "/test_cache_file.json" +cache_filename = os.path.dirname(os.path.realpath(__file__)) + "/test_cache_file.json" @pytest.fixture @@ -40,7 +39,6 @@ def env(): @skip_if_no_pycuda def test_sequential_runner_alt_block_size_names(env): - kernel_string = """__global__ void vector_add(float *c, float *a, float *b, int n) { int i = blockIdx.x * block_dim_x + threadIdx.x; if (i 0 + assert recorded_time_including_simulation - res_env["simulated_time"] > 0 # ensure difference between recorded 
time and actual time + simulated less then 10ms - max_time = actual_time + res_env['simulated_time'] + max_time = actual_time + res_env["simulated_time"] assert max_time - recorded_time_including_simulation < 10 def test_diff_evo(env): - result, _ = tune_kernel(*env, - strategy="diff_evo", - strategy_options=dict(popsize=5), - verbose=True, - cache=cache_filename, - simulation_mode=True) + result, _ = tune_kernel( + *env, + strategy="diff_evo", + strategy_options=dict(popsize=5), + verbose=True, + cache=cache_filename, + simulation_mode=True + ) assert len(result) > 0 @@ -145,24 +138,20 @@ def test_time_keeping(env): kernel_name, kernel_string, size, args, tune_params = env answer = [args[1] + args[2], None, None, None] - options = dict(method="uniform", - popsize=10, - maxiter=1, - mutation_chance=1, - max_fevals=10) + options = dict(method="uniform", popsize=10, maxiter=1, mutation_chance=1, max_fevals=10) start = time.perf_counter() - result, env = tune_kernel(*env, - strategy="genetic_algorithm", - strategy_options=options, - verbose=True, - answer=answer) + result, env = tune_kernel(*env, strategy="genetic_algorithm", strategy_options=options, verbose=True, answer=answer) max_time = (time.perf_counter() - start) * 1e3 # ms assert len(result) >= 10 timings = [ - 'total_framework_time', 'total_strategy_time', 'total_compile_time', - 'total_verification_time', 'total_benchmark_time', 'overhead_time' + "total_framework_time", + "total_strategy_time", + "total_compile_time", + "total_verification_time", + "total_benchmark_time", + "overhead_time", ] # ensure all keys are there and non zero @@ -178,32 +167,29 @@ def test_time_keeping(env): def test_bayesian_optimization(env): - for method in [ - "poi", "ei", "lcb", "lcb-srinivas", "multi", "multi-advanced", - "multi-fast" - ]: + for method in ["poi", "ei", "lcb", "lcb-srinivas", "multi", "multi-advanced", "multi-fast"]: print(method, flush=True) options = dict(popsize=5, max_fevals=10, method=method) - result, _ = 
tune_kernel(*env, - strategy="bayes_opt", - strategy_options=options, - verbose=True, - cache=cache_filename, - simulation_mode=True) + result, _ = tune_kernel( + *env, + strategy="bayes_opt", + strategy_options=options, + verbose=True, + cache=cache_filename, + simulation_mode=True + ) assert len(result) > 0 def test_random_sample(env): - result, _ = tune_kernel(*env, - strategy="random_sample", - strategy_options={"fraction": 0.1}, - cache=cache_filename, - simulation_mode=True) + result, _ = tune_kernel( + *env, strategy="random_sample", strategy_options={"fraction": 0.1}, cache=cache_filename, simulation_mode=True + ) # check that number of benchmarked kernels is 10% (rounded up) assert len(result) == 2 # check all returned results make sense for v in result: - assert v['time'] > 0.0 and v['time'] < 1.0 + assert v["time"] > 0.0 and v["time"] < 1.0 @skip_if_no_pycuda @@ -230,29 +216,18 @@ def test_interface_handles_compile_failures(env): } """ - results, env = tune_kernel(kernel_name, - kernel_string, - size, - args, - tune_params, - verbose=True) + results, env = tune_kernel(kernel_name, kernel_string, size, args, tune_params, verbose=True) - failed_config = [ - record for record in results if record["block_size_x"] == 256 - ][0] + failed_config = [record for record in results if record["block_size_x"] == 256][0] assert isinstance(failed_config["time"], util.CompilationFailedConfig) @skip_if_no_pycuda def test_runner(env): - kernel_name, kernel_source, problem_size, arguments, tune_params = env # create KernelSource - kernelsource = core.KernelSource(kernel_name, - kernel_source, - lang=None, - defines=None) + kernelsource = core.KernelSource(kernel_name, kernel_source, lang=None, defines=None) # create option bags device = 0 @@ -263,20 +238,13 @@ def test_runner(env): objective = "GFLOP/s" metrics = dict({objective: lambda p: 1}) opts = locals() - kernel_options = Options([(k, opts.get(k, None)) - for k in _kernel_options.keys()]) - tuning_options = 
Options([(k, opts.get(k, None)) - for k in _tuning_options.keys()]) - device_options = Options([(k, opts.get(k, None)) - for k in _device_options.keys()]) + kernel_options = Options([(k, opts.get(k, None)) for k in _kernel_options.keys()]) + tuning_options = Options([(k, opts.get(k, None)) for k in _tuning_options.keys()]) + device_options = Options([(k, opts.get(k, None)) for k in _device_options.keys()]) tuning_options.cachefile = None # create runner - runner = SequentialRunner(kernelsource, - kernel_options, - device_options, - iterations, - observers=None) + runner = SequentialRunner(kernelsource, kernel_options, device_options, iterations, observers=None) runner.warmed_up = True # disable warm up for this test # select a config to run @@ -285,12 +253,11 @@ def test_runner(env): # insert configurations to run with this runner in this list # each configuration is described as a list of values, one for each tunable parameter # the order should correspond to the order of parameters specified in tune_params - searchspace.append( - [32]) # vector_add only has one tunable parameter (block_size_x) + searchspace.append([32]) # vector_add only has one tunable parameter (block_size_x) # call the runner results = runner.run(searchspace, tuning_options) assert len(results) == 1 - assert results[0]['block_size_x'] == 32 - assert len(results[0]['times']) == iterations + assert results[0]["block_size_x"] == 32 + assert len(results[0]["times"]) == iterations diff --git a/test/test_searchspace.py b/test/test_searchspace.py index 8672c1d03..f31b052ba 100644 --- a/test/test_searchspace.py +++ b/test/test_searchspace.py @@ -37,6 +37,7 @@ # each GPU must have at least one layer and the sum of all layers must not exceed the total number of layers + def _min_func(gpu1, gpu2, gpu3, gpu4): return min([gpu1, gpu2, gpu3, gpu4]) >= 1 @@ -79,12 +80,13 @@ def test_internal_representation(): for index, dict_config in enumerate(searchspace.get_list_dict().keys()): assert dict_config == 
searchspace.list[index] + def test_check_restrictions(): """Test whether the outcome of restrictions is as expected when using check_restrictions.""" from kernel_tuner.util import check_restrictions - param_config_false = {'x': 1, 'y': 4, 'z': "string_1" } - param_config_true = {'x': 3, 'y': 4, 'z': "string_1" } + param_config_false = {"x": 1, "y": 4, "z": "string_1"} + param_config_true = {"x": 3, "y": 4, "z": "string_1"} assert check_restrictions(simple_searchspace.restrictions, param_config_false, verbose=False) is False assert check_restrictions(simple_searchspace.restrictions, param_config_true, verbose=False) is True @@ -95,12 +97,11 @@ def test_against_bruteforce(): compare_two_searchspace_objects(simple_searchspace, simple_searchspace_bruteforce) compare_two_searchspace_objects(searchspace, searchspace_bruteforce) + def test_sort(): """Test that the sort searchspace option works as expected.""" simple_searchspace_sort = Searchspace( - simple_tuning_options.tune_params, - simple_tuning_options.restrictions, - max_threads + simple_tuning_options.tune_params, simple_tuning_options.restrictions, max_threads ) expected = [ @@ -130,9 +131,7 @@ def test_sort(): def test_sort_reversed(): """Test that the sort searchspace option with the sort_last_param_first option enabled works as expected.""" simple_searchspace_sort_reversed = Searchspace( - simple_tuning_options.tune_params, - simple_tuning_options.restrictions, - max_threads + simple_tuning_options.tune_params, simple_tuning_options.restrictions, max_threads ) expected = [ @@ -200,7 +199,9 @@ def test_random_sample(): print(value_error_expectation_message) assert False except ValueError as e: - assert "number of samples requested" in str(e) and "is greater than the searchspace size" in str(e), f"Expected string not in error {e}" + assert "number of samples requested" in str(e) and "is greater than the searchspace size" in str( + e + ), f"Expected string not in error {e}" except Exception: 
print(value_error_expectation_message) assert False @@ -238,8 +239,8 @@ def test_neighbors_hamming(): """Test whether the neighbors with Hamming distance are as expected.""" test_config = tuple([1, 4, "string_1"]) expected_neighbors = [ - (1.5, 4, 'string_1'), - (3, 4, 'string_1'), + (1.5, 4, "string_1"), + (3, 4, "string_1"), ] __test_neighbors(test_config, expected_neighbors, "Hamming") @@ -249,10 +250,10 @@ def test_neighbors_strictlyadjacent(): """Test whether the strictly adjacent neighbors are as expected.""" test_config = tuple([1, 4, "string_1"]) expected_neighbors = [ - (1.5, 4, 'string_1'), - (1.5, 4, 'string_2'), - (1.5, 5.5, 'string_1'), - (1.5, 5.5, 'string_2'), + (1.5, 4, "string_1"), + (1.5, 4, "string_2"), + (1.5, 5.5, "string_1"), + (1.5, 5.5, "string_2"), ] __test_neighbors(test_config, expected_neighbors, "strictly-adjacent") @@ -262,10 +263,10 @@ def test_neighbors_adjacent(): """Test whether the adjacent neighbors are as expected.""" test_config = tuple([1, 4, "string_1"]) expected_neighbors = [ - (1.5, 4, 'string_1'), - (1.5, 4, 'string_2'), - (1.5, 5.5, 'string_1'), - (1.5, 5.5, 'string_2'), + (1.5, 4, "string_1"), + (1.5, 4, "string_2"), + (1.5, 5.5, "string_1"), + (1.5, 5.5, "string_2"), ] __test_neighbors(test_config, expected_neighbors, "adjacent") @@ -275,22 +276,18 @@ def test_neighbors_fictious(): """Test whether the neighbors are as expected for a fictious parameter configuration (i.e. 
not existing in the search space due to restrictions).""" test_config = tuple([1.5, 4, "string_1"]) expected_neighbors_hamming = [ - (1.5, 4, 'string_2'), - (1.5, 5.5, 'string_1'), - (3, 4, 'string_1'), - ] - expected_neighbors_strictlyadjacent = [ - (1.5, 5.5, 'string_2'), - (1.5, 5.5, 'string_1'), - (1.5, 4, 'string_2') + (1.5, 4, "string_2"), + (1.5, 5.5, "string_1"), + (3, 4, "string_1"), ] + expected_neighbors_strictlyadjacent = [(1.5, 5.5, "string_2"), (1.5, 5.5, "string_1"), (1.5, 4, "string_2")] expected_neighbors_adjacent = [ - (1.5, 5.5, 'string_2'), - (1.5, 5.5, 'string_1'), - (1.5, 4, 'string_2'), - (3, 4, 'string_1'), - (3, 4, 'string_2'), + (1.5, 5.5, "string_2"), + (1.5, 5.5, "string_1"), + (1.5, 4, "string_2"), + (3, 4, "string_1"), + (3, 4, "string_2"), ] __test_neighbors_direct(test_config, expected_neighbors_hamming, "Hamming") @@ -301,10 +298,7 @@ def test_neighbors_fictious(): def test_neighbors_cached(): """Test whether retrieving a set of neighbors twice returns the cached version.""" simple_searchspace_duplicate = Searchspace( - simple_tuning_options.tune_params, - simple_tuning_options.restrictions, - max_threads, - neighbor_method="Hamming" + simple_tuning_options.tune_params, simple_tuning_options.restrictions, max_threads, neighbor_method="Hamming" ) test_configs = simple_searchspace_duplicate.get_random_sample(5) @@ -333,12 +327,7 @@ def test_order_param_configs(): """Test whether the ordering of parameter configurations according to parameter index happens as expected.""" test_order = [1, 2, 0] test_config = tuple([1, 4, "string_1"]) - expected_order = [ - (1.5, 5.5, 'string_2'), - (1.5, 4, 'string_2'), - (1.5, 4, 'string_1'), - (1.5, 5.5, 'string_1') - ] + expected_order = [(1.5, 5.5, "string_2"), (1.5, 4, "string_2"), (1.5, 4, "string_1"), (1.5, 5.5, "string_1")] neighbors = simple_searchspace.get_neighbors_no_cache(test_config, "adjacent") # test failsafe too few indices @@ -391,9 +380,9 @@ def test_small_searchspace(): """Test a 
small real-world searchspace and the usage of the `max_threads` parameter.""" max_threads = 1024 tune_params = dict() - tune_params["block_size_x"] = [1, 2, 4, 8, 16] + [32*i for i in range(1,33)] + tune_params["block_size_x"] = [1, 2, 4, 8, 16] + [32 * i for i in range(1, 33)] tune_params["block_size_y"] = [2**i for i in range(6)] - tune_params["tile_size_x"] = [i for i in range(1,11)] + tune_params["tile_size_x"] = [i for i in range(1, 11)] restrictions = [ "block_size_x*block_size_y >= 32", f"block_size_x*block_size_y <= {max_threads}", @@ -402,42 +391,45 @@ def test_small_searchspace(): searchspace_bruteforce = Searchspace(tune_params, restrictions, max_threads, framework="bruteforce") compare_two_searchspace_objects(searchspace, searchspace_bruteforce) + def test_full_searchspace(compare_against_bruteforce=False): """Tests a full real-world searchspace (expdist). If `compare_against_bruteforce`, the searcspace will be bruteforced to compare against, this can take a long time!.""" # device characteristics dev = { - 'device_name': 'NVIDIA A40', - 'max_threads': 1024, - 'max_shared_memory_per_block': 49152, - 'max_shared_memory': 102400 + "device_name": "NVIDIA A40", + "max_threads": 1024, + "max_shared_memory_per_block": 49152, + "max_shared_memory": 102400, } # tunable parameters and restrictions tune_params = dict() - tune_params["block_size_x"] = [1, 2, 4, 8, 16] + [32*i for i in range(1,33)] + tune_params["block_size_x"] = [1, 2, 4, 8, 16] + [32 * i for i in range(1, 33)] tune_params["block_size_y"] = [2**i for i in range(6)] - tune_params["tile_size_x"] = [i for i in range(1,11)] - tune_params["tile_size_y"] = [i for i in range(1,11)] - tune_params["temporal_tiling_factor"] = [i for i in range(1,11)] + tune_params["tile_size_x"] = [i for i in range(1, 11)] + tune_params["tile_size_y"] = [i for i in range(1, 11)] + tune_params["temporal_tiling_factor"] = [i for i in range(1, 11)] max_tfactor = max(tune_params["temporal_tiling_factor"]) 
tune_params["max_tfactor"] = [max_tfactor] - tune_params["loop_unroll_factor_t"] = [i for i in range(1,max_tfactor+1)] - tune_params["sh_power"] = [0,1] - tune_params["blocks_per_sm"] = [0,1,2,3,4] + tune_params["loop_unroll_factor_t"] = [i for i in range(1, max_tfactor + 1)] + tune_params["sh_power"] = [0, 1] + tune_params["blocks_per_sm"] = [0, 1, 2, 3, 4] restrictions = [ - "block_size_x*block_size_y >= 32", - "temporal_tiling_factor % loop_unroll_factor_t == 0", - f"block_size_x*block_size_y <= {dev['max_threads']}", - f"(block_size_x*tile_size_x + temporal_tiling_factor * 2) * (block_size_y*tile_size_y + temporal_tiling_factor * 2) * (2+sh_power) * 4 <= {dev['max_shared_memory_per_block']}", - f"blocks_per_sm == 0 or (((block_size_x*tile_size_x + temporal_tiling_factor * 2) * (block_size_y*tile_size_y + temporal_tiling_factor * 2) * (2+sh_power) * 4) * blocks_per_sm <= {dev['max_shared_memory']})" - ] + "block_size_x*block_size_y >= 32", + "temporal_tiling_factor % loop_unroll_factor_t == 0", + f"block_size_x*block_size_y <= {dev['max_threads']}", + f"(block_size_x*tile_size_x + temporal_tiling_factor * 2) * (block_size_y*tile_size_y + temporal_tiling_factor * 2) * (2+sh_power) * 4 <= {dev['max_shared_memory_per_block']}", + f"blocks_per_sm == 0 or (((block_size_x*tile_size_x + temporal_tiling_factor * 2) * (block_size_y*tile_size_y + temporal_tiling_factor * 2) * (2+sh_power) * 4) * blocks_per_sm <= {dev['max_shared_memory']})", + ] # build the searchspace - searchspace = Searchspace(tune_params, restrictions, max_threads=dev['max_threads']) + searchspace = Searchspace(tune_params, restrictions, max_threads=dev["max_threads"]) if compare_against_bruteforce: - searchspace_bruteforce = Searchspace(tune_params, restrictions, max_threads=dev['max_threads'], framework='bruteforce') + searchspace_bruteforce = Searchspace( + tune_params, restrictions, max_threads=dev["max_threads"], framework="bruteforce" + ) compare_two_searchspace_objects(searchspace, 
searchspace_bruteforce) else: assert searchspace.size == len(searchspace.list) == 349853 diff --git a/test/test_util_functions.py b/test/test_util_functions.py index f3431991b..e90da8965 100644 --- a/test/test_util_functions.py +++ b/test/test_util_functions.py @@ -35,17 +35,13 @@ def test_get_grid_dimensions1(): assert grid[1] == 28 assert grid[2] == 1 - grid = get_grid_dimensions( - problem_size, params, (grid_div[0], None, None), block_size_names - ) + grid = get_grid_dimensions(problem_size, params, (grid_div[0], None, None), block_size_names) assert grid[0] == 25 assert grid[1] == 1024 assert grid[2] == 1 - grid = get_grid_dimensions( - problem_size, params, (None, grid_div[1], None), block_size_names - ) + grid = get_grid_dimensions(problem_size, params, (None, grid_div[1], None), block_size_names) assert grid[0] == 1024 assert grid[1] == 28 @@ -67,9 +63,7 @@ def test_get_grid_dimensions2(): grid_div_x = ["block_x*8"] grid_div_y = ["(block_y+2)/8"] - grid = get_grid_dimensions( - problem_size, params, (grid_div_x, grid_div_y, None), block_size_names - ) + grid = get_grid_dimensions(problem_size, params, (grid_div_x, grid_div_y, None), block_size_names) assert grid[0] == 4 assert grid[1] == 256 @@ -83,9 +77,7 @@ def test_get_grid_dimensions3(): grid_div_y = ["(block_y+2)/8"] def assert_grid_dimensions(problem_size): - grid = get_grid_dimensions( - problem_size, params, (grid_div_x, grid_div_y, None), block_size_names - ) + grid = get_grid_dimensions(problem_size, params, (grid_div_x, grid_div_y, None), block_size_names) assert grid[0] == 1 assert grid[1] == 256 assert grid[2] == 1 @@ -187,9 +179,7 @@ def test_prepare_kernel_string(): # Throw exception on invalid name (for instance, a space in the name) invalid_defines = {"invalid name": "1"} with pytest.raises(ValueError): - prepare_kernel_string( - "this", kernel, params, grid, threads, block_size_names, "", invalid_defines - ) + prepare_kernel_string("this", kernel, params, grid, threads, block_size_names, 
"", invalid_defines) def test_prepare_kernel_string_partial_loop_unrolling(): @@ -204,9 +194,7 @@ def test_prepare_kernel_string_partial_loop_unrolling(): params = dict() params["loop_unroll_factor_monkey"] = 8 - _, output = prepare_kernel_string( - "this", kernel, params, grid, threads, block_size_names, "CUDA", None - ) + _, output = prepare_kernel_string("this", kernel, params, grid, threads, block_size_names, "CUDA", None) assert "constexpr int loop_unroll_factor_monkey = 8;" in output params["loop_unroll_factor_monkey"] = 0 @@ -214,6 +202,7 @@ def test_prepare_kernel_string_partial_loop_unrolling(): assert "constexpr int loop_unroll_factor_monkey" not in output assert "#pragma unroll loop_unroll_factor_monkey" not in output + def test_replace_param_occurrences(): kernel = "this is a weird kernel" params = dict() @@ -221,9 +210,7 @@ def test_replace_param_occurrences(): params["weird"] = 14 new_kernel = replace_param_occurrences(kernel, params) - assert ( - new_kernel == "this 8 a 14 kernel" - ) # Note: The "is" in "this" should not be replaced + assert new_kernel == "this 8 a 14 kernel" # Note: The "is" in "this" should not be replaced new_kernel = replace_param_occurrences(kernel, dict()) assert kernel == new_kernel @@ -351,9 +338,7 @@ def test_check_argument_list3(): } """ args = [np.uint16(42), np.float16([3, 4, 6]), np.int32([300])] - assert_user_warning( - check_argument_list, [kernel_name, kernel_string, args], "at position 2" - ) + assert_user_warning(check_argument_list, [kernel_name, kernel_string, args], "at position 2") def test_check_argument_list4(): @@ -363,9 +348,7 @@ def test_check_argument_list4(): } """ args = [np.uint16(42), np.float16([3, 4, 6]), np.int64([300]), np.ubyte(32)] - assert_user_warning( - check_argument_list, [kernel_name, kernel_string, args], "do not match in size" - ) + assert_user_warning(check_argument_list, [kernel_name, kernel_string, args], "do not match in size") def test_check_argument_list5(): @@ -483,18 +466,12 @@ 
def test_warnings(function, args, number, warning_type): # check warning does not triger when nondefault block size names are used correctly block_size_names = ["block_size_a", "block_size_b"] - tune_params = dict( - zip(["block_size_a", "block_size_b", "many_other_things"], [1, 2, 3]) - ) - test_warnings( - check_block_size_params_names_list, [block_size_names, tune_params], 0, None - ) + tune_params = dict(zip(["block_size_a", "block_size_b", "many_other_things"], [1, 2, 3])) + test_warnings(check_block_size_params_names_list, [block_size_names, tune_params], 0, None) # check that a warning is issued when none of the default names are used and no alternative names are specified block_size_names = None - tune_params = dict( - zip(["block_size_a", "block_size_b", "many_other_things"], [1, 2, 3]) - ) + tune_params = dict(zip(["block_size_a", "block_size_b", "many_other_things"], [1, 2, 3])) test_warnings( check_block_size_params_names_list, [block_size_names, tune_params], @@ -504,12 +481,8 @@ def test_warnings(function, args, number, warning_type): # check that no error is raised when any of the default block size names is being used block_size_names = None - tune_params = dict( - zip(["block_size_x", "several_other_things"], [[1, 2, 3, 4], [2, 4]]) - ) - test_warnings( - check_block_size_params_names_list, [block_size_names, tune_params], 0, None - ) + tune_params = dict(zip(["block_size_x", "several_other_things"], [[1, 2, 3, 4], [2, 4]])) + test_warnings(check_block_size_params_names_list, [block_size_names, tune_params], 0, None) def test_get_kernel_string_func(): @@ -691,10 +664,7 @@ def test_process_metrics(): # assert params["b"] == 15 # test if a metric overrides any existing metrics - params = { - "x": 15, - "b": 12 - } + params = {"x": 15, "b": 12} metrics = dict() metrics["b"] = "x" params = process_metrics(params, metrics) @@ -704,7 +674,11 @@ def test_process_metrics(): def test_parse_restrictions(): tune_params = {"block_size_x": [50, 100], 
"use_padding": [0, 1]} restrict = ["block_size_x != 320"] - restrictions = ["block_size_x != 320", "use_padding == 0 or block_size_x % 32 != 0", "50 <= block_size_x * use_padding < 100"] + restrictions = [ + "block_size_x != 320", + "use_padding == 0 or block_size_x % 32 != 0", + "50 <= block_size_x * use_padding < 100", + ] # test the monolithic parsed function parsed = parse_restrictions(restrict, tune_params, monolithic=True)[0] @@ -746,11 +720,15 @@ def test_parse_restrictions(): rw_tune_params = dict() rw_tune_params["tile_size_x"] = [1, 2, 3, 4, 5, 6, 7, 8] rw_tune_params["tile_size_y"] = [1, 2, 3, 4, 5, 6, 7, 8] - parsed_constraint, params_constraint = parse_restrictions(["tile_size_x*tile_size_y<30"], rw_tune_params, try_to_constraint=True)[0] + parsed_constraint, params_constraint = parse_restrictions( + ["tile_size_x*tile_size_y<30"], rw_tune_params, try_to_constraint=True + )[0] assert all(param in rw_tune_params for param in params_constraint) assert isinstance(parsed_constraint, MaxProdConstraint) assert parsed_constraint._maxprod == 29 - parsed_constraint, params_constraint = parse_restrictions(["30