Skip to content

Commit ce289f4

Browse files
committed
fix: support iluvatar TG-V200
1 parent f53154d commit ce289f4

File tree

4 files changed

+8
-1
lines changed

4 files changed

+8
-1
lines changed

src/infiniop/devices/nvidia/nvidia_kernel_common.cuh

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -15,6 +15,7 @@
1515
#define CUDA_BLOCK_SIZE_4096 4096
1616
#define CUDA_BLOCK_SIZE_1024 1024
1717
#define CUDA_BLOCK_SIZE_512 512
18+
#define CUDA_BLOCK_SIZE_2048 2048
1819

1920
#define CHECK_CUDA(API) CHECK_INTERNAL(API, cudaSuccess)
2021

src/infiniop/ops/causal_softmax/nvidia/causal_softmax_nvidia.cu

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -88,6 +88,10 @@ infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size,
8888
CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_4096>(
8989
y, x, _info.dtype, _info.batch_size, _info.seq_len, _info.total_seq_len,
9090
_info.y_stride_b, _info.y_stride_i, _info.x_stride_b, _info.x_stride_i, stream));
91+
} else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_2048) {
92+
CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_2048>(
93+
y, x, _info.dtype, _info.batch_size, _info.seq_len, _info.total_seq_len,
94+
_info.y_stride_b, _info.y_stride_i, _info.x_stride_b, _info.x_stride_i, stream));
9195
} else {
9296
return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
9397
}

src/infiniop/ops/rms_norm/nvidia/rms_norm_nvidia.cu

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -123,6 +123,8 @@ infiniStatus_t Descriptor::calculate(
123123
CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_512>(batch_size, nhead, dim, y, _info.atype, stride_y_batch, stride_y_nhead, x, stride_x_batch, stride_x_nhead, w, _info.wtype, _info.epsilon, cuda_stream));
124124
} else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) {
125125
CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_4096>(batch_size, nhead, dim, y, _info.atype, stride_y_batch, stride_y_nhead, x, stride_x_batch, stride_x_nhead, w, _info.wtype, _info.epsilon, cuda_stream));
126+
} else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_2048) {
127+
CHECK_STATUS(launchKernel<256>(batch_size, nhead, dim, y, _info.atype, stride_y_batch, stride_y_nhead, x, stride_x_batch, stride_x_nhead, w, _info.wtype, _info.epsilon, cuda_stream));
126128
} else {
127129
return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
128130
}

xmake/iluvatar.lua

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -43,7 +43,7 @@ target("infiniop-iluvatar")
4343

4444
set_warnings("all", "error")
4545
add_cuflags("-Wno-error=unused-private-field")
46-
add_cuflags("-fPIC", "-x", "ivcore", "-std=c++17", {force = true})
46+
add_cuflags("-fPIC", "-x", "ivcore", "-std=c++17", "--cuda-gpu-arch=ivcore20", {force = true})
4747
add_culdflags("-fPIC")
4848
add_cxflags("-fPIC")
4949

0 commit comments

Comments (0)