Fix KernelElem test hang caused by reading garbage Fault Tolerance state

Regina8023 · meta-codesync[bot] · commit 515ffb328e6f · 2025-11-04T00:00:43.000-08:00
Summary: Similar to D85974514, the KernelElem test kernels rely on a properly initialized value of `shmDevState.enableCancellableWaits` because they call `KernelTestHostAbort` in the kernel. Manually set the value at the start of each test kernel, before anything else runs.

Reviewed By: arttianezhu

Differential Revision: D86158576

fbshipit-source-id: f7dfbb065bda209faa2bb47ed5850d554c97cf5d
diff --git a/comms/ctran/gpe/tests/KernelElemPoolUT.cc b/comms/ctran/gpe/tests/KernelElemPoolUT.cc
@@ -6,6 +6,7 @@
 #include "comms/ctran/Ctran.h"
 #include "comms/ctran/gpe/CtranGpeDev.h"
 #include "comms/ctran/gpe/CtranGpeImpl.h"
+#include "comms/ctran/utils/CudaWrap.h"
 // FIXME [REBASE]: update the path once moved to fbcode/comms
 #include "comms/ctran/gpe/tests/KernelElemPoolUTKernels.h"
 #include "comms/ctran/tests/CtranXPlatUtUtils.h"
@@ -149,12 +150,17 @@ TEST_F(KernelElemPoolTest, PostRevokeComplete) {
   dim3 grid = {ngroups, 1, 1};
   dim3 blocks = {640, 1, 1};
   void* args[] = {&elemList, &unuseIdx};
+
+  CUDACHECK_TEST(cudaFuncSetAttribute(
+      (void*)KElemPostRevokeKernel,
+      cudaFuncAttributeMaxDynamicSharedMemorySize,
+      sizeof(CtranAlgoDeviceState)));
   CUDACHECK_TEST(cudaLaunchKernel(
       reinterpret_cast<void*>(KElemPostRevokeKernel),
       grid,
       blocks,
       args,
-      0,
+      sizeof(CtranAlgoDeviceState),
       0));
 
   // Now host side posts the elems, revoke only 1 elem in the middle
@@ -224,8 +230,18 @@ TEST_F(KernelElemPoolTest, PostWait) {
   dim3 grid = {ngroups, 1, 1};
   dim3 blocks = {640, 1, 1};
   void* args[] = {&elem, &count, &vec1, &vec2};
+
+  CUDACHECK_TEST(cudaFuncSetAttribute(
+      (void*)KElemPostWaitKernel,
+      cudaFuncAttributeMaxDynamicSharedMemorySize,
+      sizeof(CtranAlgoDeviceState)));
   CUDACHECK_TEST(cudaLaunchKernel(
-      reinterpret_cast<void*>(KElemPostWaitKernel), grid, blocks, args, 0, 0));
+      reinterpret_cast<void*>(KElemPostWaitKernel),
+      grid,
+      blocks,
+      args,
+      sizeof(CtranAlgoDeviceState),
+      0));
 
   // Host side posts the elems
   elem->post();
@@ -282,12 +298,17 @@ TEST_F(KernelElemPoolTest, PostMultiGroupSets) {
   dim3 grid = {nGroups * nGroupsSets, 1, 1};
   dim3 blocks = {256, 1, 1};
   void* args[] = {&elemList, &countPerGroupSet, &nGroupsSets, &vec1, &vec2};
+
+  CUDACHECK_TEST(cudaFuncSetAttribute(
+      (void*)KElemPostMultiGroupsKernel,
+      cudaFuncAttributeMaxDynamicSharedMemorySize,
+      sizeof(CtranAlgoDeviceState)));
   CUDACHECK_TEST(cudaLaunchKernel(
       reinterpret_cast<void*>(KElemPostMultiGroupsKernel),
       grid,
       blocks,
       args,
-      0,
+      sizeof(CtranAlgoDeviceState),
       0));
 
   auto elem = elemList;
diff --git a/comms/ctran/gpe/tests/KernelElemPoolUTKernels.cu b/comms/ctran/gpe/tests/KernelElemPoolUTKernels.cu
@@ -1,6 +1,7 @@
 // Copyright (c) Meta Platforms, Inc. and affiliates.
 
 #include "comms/ctran/algos/DevCommon.cuh"
+#include "comms/ctran/algos/DevShmState.cuh"
 // FIXME [REBASE]: update the path once moved to fbcode/comms
 #include "comms/ctran/gpe/tests/KernelElemPoolUTKernels.h"
 
@@ -13,6 +14,9 @@ __global__ void KElemConsumerKernel(KernelElem* elemList) {
 }
 
 __global__ void KElemPostRevokeKernel(KernelElem* elemList, int unuseIdx) {
+  // TODO(T243528798): remove this preload of devstate by splitting h2d/d2h
+  // channels.
+  shmDevState.enableCancellableWaits = false;
   KernelElem* elem = elemList;
   int i = 0;
   while (elem) {
@@ -42,6 +46,9 @@ __global__ void KElemPostRevokeKernel(KernelElem* elemList, int unuseIdx) {
 
 __global__ void
 KElemPostWaitKernel(KernelElem* elem, size_t count, int* vec1, int* vec2) {
+  // TODO(T243528798): remove this preload of devstate by splitting h2d/d2h
+  // channels.
+  shmDevState.enableCancellableWaits = false;
   bool revoked = false;
   elemWaitPostOrRevokeByGroup(elem, blockIdx.x, &revoked);
 
@@ -63,6 +70,9 @@ __global__ void KElemPostMultiGroupsKernel(
     int nGroupSets,
     int* vec1,
     int* vec2) {
+  // TODO(T243528798): remove this preload of devstate by splitting h2d/d2h
+  // channels.
+  shmDevState.enableCancellableWaits = false;
   bool revoked = false;
   auto nGroupsPerSet = gridDim.x / nGroupSets;
   auto groupSetId = blockIdx.x / nGroupsPerSet;