Skip to content

Commit 5addecf

Browse files
committed
Make elements_per_thread tunable in KT example
1 parent: 94c51dc; commit: 5addecf

File tree

2 files changed

+8
-2
lines changed

2 files changed

+8
-2
lines changed

kernel_tuner/vector_add.cu

Lines changed: 7 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,9 +1,14 @@
11
#include "kernel_float.h"
22
namespace kf = kernel_float;
33

4-
__global__ void vector_add(kf::vec<float_type, 1>* c, const kf::vec<float_type, 1>* a, const kf::vec<float_type, 1>* b, int n) {
4+
__global__ void vector_add(
5+
kf::vec<float_type, elements_per_thread>* c,
6+
const kf::vec<float_type, elements_per_thread>* a,
7+
const kf::vec<float_type, elements_per_thread>* b,
8+
int n
9+
) {
510
int i = blockIdx.x * blockDim.x + threadIdx.x;
6-
if (i < n) {
11+
if (i * elements_per_thread < n) {
712
c[i] = a[i] + b[i];
813
}
914
}

kernel_tuner/vector_add.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -34,6 +34,7 @@ def tune():
3434
tune_params = dict()
3535
tune_params["block_size_x"] = [64, 128, 256, 512]
3636
tune_params["float_type"] = ["half", "float", "double"]
37+
tune_params["elements_per_thread"] = [1, 2, 4, 8]
3738

3839
# Observers will measure the error using either RMSE or MRE as error metric
3940
observers = [

0 commit comments

Comments
 (0)