Make elements_per_thread tunable in the Kernel Tuner example

stijnh · stijnh · commit 5addecff2715 · 2024-04-26T15:25:32.000+02:00
diff --git a/kernel_tuner/vector_add.cu b/kernel_tuner/vector_add.cu
@@ -1,9 +1,14 @@
 #include "kernel_float.h"
 namespace kf = kernel_float;
 
-__global__ void vector_add(kf::vec<float_type, 1>* c, const kf::vec<float_type, 1>* a, const kf::vec<float_type, 1>* b, int n) {
+__global__ void vector_add(
+        kf::vec<float_type, elements_per_thread>* c,
+        const kf::vec<float_type, elements_per_thread>* a,
+        const kf::vec<float_type, elements_per_thread>* b,
+        int n
+) {
     int i = blockIdx.x * blockDim.x + threadIdx.x;
-    if (i < n) {
+    if (i * elements_per_thread < n) {
         c[i] = a[i] + b[i];
     }
 }
diff --git a/kernel_tuner/vector_add.py b/kernel_tuner/vector_add.py
@@ -34,6 +34,7 @@ def tune():
     tune_params = dict()
     tune_params["block_size_x"] = [64, 128, 256, 512]
     tune_params["float_type"] = ["half", "float", "double"]
+    tune_params["elements_per_thread"] = [1, 2, 4, 8]
 
     # Observers will measure the error using either RMSE or MRE as error metric
     observers = [