Remove s2fft callbacks

ASKabalan · ASKabalan · commit 1ac35416b85b · 2025-06-30T11:10:40.000+02:00
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -53,16 +53,15 @@ if(CMAKE_CUDA_COMPILER)
       STABLE_ABI
       ${CMAKE_CURRENT_LIST_DIR}/lib/src/extensions.cc
       ${CMAKE_CURRENT_LIST_DIR}/lib/src/s2fft.cu
-      ${CMAKE_CURRENT_LIST_DIR}/lib/src/s2fft_callbacks.cu
       ${CMAKE_CURRENT_LIST_DIR}/lib/src/plan_cache.cc
       ${CMAKE_CURRENT_LIST_DIR}/lib/src/s2fft_kernels.cu)
 
-    target_link_libraries(_s2fft PRIVATE CUDA::cudart_static CUDA::cufft_static
-                                         CUDA::culibos)
+    target_link_libraries(_s2fft PRIVATE CUDA::cudart_static CUDA::cufft_static CUDA::culibos)
     target_include_directories(
-      _s2fft PUBLIC ${CMAKE_CURRENT_LIST_DIR}/lib/include ${XLA_DIR})
+      _s2fft PUBLIC ${CMAKE_CURRENT_LIST_DIR}/lib/include ${XLA_DIR} ${CUDAToolkit_INCLUDE_DIRS})
     set_target_properties(_s2fft PROPERTIES LINKER_LANGUAGE CUDA
                                             CUDA_SEPARABLE_COMPILATION ON)
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -rdc=true")
     set(CMAKE_CUDA_ARCHITECTURES
         "70;80;89"
         CACHE STRING "List of CUDA compute capabilities to build cuDecomp for.")
@@ -85,7 +84,7 @@ else()
     # Add the executable
     execute_process(
       COMMAND "${Python_EXECUTABLE}" "-c"
-              "from jax.extend import ffi; print(ffi.include_dir())"
+              "from jax import ffi; print(ffi.include_dir())"
       OUTPUT_STRIP_TRAILING_WHITESPACE
       OUTPUT_VARIABLE XLA_DIR)
     message(STATUS "XLA include directory: ${XLA_DIR}")
diff --git a/lib/include/s2fft.h b/lib/include/s2fft.h
@@ -14,7 +14,8 @@
 #include "cufft.h"
 #include "cufftXt.h"
 #include "thrust/device_vector.h"
-#include "s2fft_callbacks.h"
+#include "s2fft_kernels.h"
+
 
 namespace s2fft {
 
@@ -168,11 +169,9 @@ class s2fftExec {
      * @param stream The CUDA stream to use for execution.
      * @param data Pointer to the input/output data on the device.
      * @param workspace Pointer to the workspace memory on the device.
-     * @param callback_params Pointer to device memory containing callback parameters.
      * @return HRESULT indicating success or failure.
      */
-    HRESULT Forward(const s2fftDescriptor &desc, cudaStream_t stream, Complex *data, Complex *workspace,
-                    int64 *callback_params);
+    HRESULT Forward(const s2fftDescriptor &desc, cudaStream_t stream, Complex *data, Complex *workspace);
 
     /**
      * @brief Executes the backward Spherical Harmonic Transform.
@@ -184,11 +183,9 @@ class s2fftExec {
      * @param stream The CUDA stream to use for execution.
      * @param data Pointer to the input/output data on the device.
      * @param workspace Pointer to the workspace memory on the device.
-     * @param callback_params Pointer to device memory containing callback parameters.
      * @return HRESULT indicating success or failure.
      */
-    HRESULT Backward(const s2fftDescriptor &desc, cudaStream_t stream, Complex *data, Complex *workspace,
-                     int64 *callback_params);
+    HRESULT Backward(const s2fftDescriptor &desc, cudaStream_t stream, Complex *data, Complex *workspace);
 
 public:
     // cuFFT handles for polar and equatorial FFT plans
diff --git a/lib/include/s2fft_kernels.h b/lib/include/s2fft_kernels.h
@@ -11,12 +11,29 @@ typedef long long int int64;
 
 namespace s2fftKernels {
 
+enum fft_norm {  // Normalization mode; s2fftExec::Forward/Backward read it from desc.norm and switch on it.
+  FORWARD = 1,  // Forward() launches the shift/normalize kernel with norm code 0; Backward() launches nothing
+  BACKWARD = 2,  // Forward() uses norm code 2 (same as NONE); Backward() uses norm code 0
+  ORTHO = 3,  // both Forward() and Backward() use norm code 1
+  NONE = 4  // Forward() uses norm code 2; Backward() launches nothing
+};
+
 template <typename complex>
 HRESULT launch_spectral_folding(complex* data, complex* output, const int& nside, const int& L,
                                 const bool& shift, cudaStream_t stream);
 template <typename complex>
 HRESULT launch_spectral_extension(complex* data, complex* output, const int& nside, const int& L,
                                   cudaStream_t stream);
+
+template <typename complex>
+HRESULT launch_shift_normalize_kernel(  // Called by s2fftExec::Forward/Backward after the ring FFTs have been issued.
+    cudaStream_t stream,  // callers pass the same stream they set on the cuFFT plans
+    complex* data,      // In-place data buffer
+    int nside,  // callers pass m_nside
+    bool apply_shift,  // Forward() passes desc.shift; Backward() always passes false
+    int norm  // NOTE(review): callers pass raw codes 0, 1 or 2 here, not fft_norm enumerator values; confirm the mapping against the kernel
+);
+
 }  // namespace s2fftKernels
 
 #endif  // _S2FFT_KERNELS_H
diff --git a/lib/src/extensions.cc b/lib/src/extensions.cc
@@ -111,7 +111,7 @@ ffi::Error healpix_forward(cudaStream_t stream, ffi::Buffer<T> input, ffi::Resul
                     reinterpret_cast<int64*>(callback_params->typed_data() + i * params_offset);
 
             // Step 2g: Launch the forward transform on this sub-stream.
-            executor->Forward(descriptor, sub_stream, data_c, workspace_c, callback_params_c);
+            executor->Forward(descriptor, sub_stream, data_c, workspace_c);
             // Step 2h: Launch spectral extension kernel.
             s2fftKernels::launch_spectral_extension(data_c, out_c, descriptor.nside,
                                                     descriptor.harmonic_band_limit, sub_stream);
@@ -131,7 +131,7 @@ ffi::Error healpix_forward(cudaStream_t stream, ffi::Buffer<T> input, ffi::Resul
         auto executor = std::make_shared<s2fftExec<fft_complex_type>>();
         PlanCache::GetInstance().GetS2FFTExec(descriptor, executor);
         // Step 2m: Launch the forward transform.
-        executor->Forward(descriptor, stream, data_c, workspace_c, callback_params_c);
+        executor->Forward(descriptor, stream, data_c, workspace_c);
         // Step 2n: Launch spectral extension kernel.
         s2fftKernels::launch_spectral_extension(data_c, out_c, descriptor.nside,
                                                 descriptor.harmonic_band_limit, stream);
@@ -205,7 +205,7 @@ ffi::Error healpix_backward(cudaStream_t stream, ffi::Buffer<T> input, ffi::Resu
                                                   descriptor.harmonic_band_limit, descriptor.shift,
                                                   sub_stream);
             // Step 2h: Launch the backward transform on this sub-stream.
-            executor->Backward(descriptor, sub_stream, out_c, workspace_c, callback_params_c);
+            executor->Backward(descriptor, sub_stream, out_c, workspace_c);
         }
         // Step 2i: Join all forked streams back to the main stream.
         handler.join(stream);
@@ -228,7 +228,7 @@ ffi::Error healpix_backward(cudaStream_t stream, ffi::Buffer<T> input, ffi::Resu
         s2fftKernels::launch_spectral_folding(data_c, out_c, descriptor.nside, descriptor.harmonic_band_limit,
                                               descriptor.shift, stream);
         // Step 2n: Launch the backward transform.
-        executor->Backward(descriptor, stream, out_c, workspace_c, callback_params_c);
+        executor->Backward(descriptor, stream, out_c, workspace_c);
         return ffi::Error::Success();
     }
 }
diff --git a/lib/src/s2fft.cu b/lib/src/s2fft.cu
@@ -12,7 +12,7 @@
 #include <numeric>
 
 #include <vector>
-#include "s2fft_callbacks.h"
+#include "s2fft_kernels.h"
 
 namespace s2fft {
 
@@ -81,15 +81,7 @@ HRESULT s2fftExec<Complex>::Initialize(const s2fftDescriptor &descriptor) {
         // Step 7e: Update overall maximum workspace size again.
         worksize = std::max(worksize, polar_worksize);
 
-        // Step 7f: Allocate device memory for callback parameters and copy host parameters.
-        int64 params[2];
-        int64 *params_dev;
-        params[0] = n[0];
-        params[1] = idist;
-        cudaMalloc(&params_dev, 2 * sizeof(int64));
-        cudaMemcpy(params_dev, params, 2 * sizeof(int64), cudaMemcpyHostToDevice);
-
-        // Step 7g: Store the created plans.
+        // Step 7f: Store the created plans.
         m_polar_plans.push_back(plan);
         m_inverse_polar_plans.push_back(inverse_plan);
     }
@@ -117,34 +109,21 @@ HRESULT s2fftExec<Complex>::Initialize(const s2fftDescriptor &descriptor) {
     return S_OK;
 }
 
+
 template <typename Complex>
 HRESULT s2fftExec<Complex>::Forward(const s2fftDescriptor &desc, cudaStream_t stream, Complex *data,
-                                    Complex *workspace, int64 *callback_params) {
+                                    Complex *workspace) {
     // Step 1: Determine the FFT direction (forward or inverse based on adjoint flag).
     const int DIRECTION = desc.adjoint ? CUFFT_INVERSE : CUFFT_FORWARD;
     // Step 2: Extract normalization, shift, and double precision flags from the descriptor.
     const s2fftKernels::fft_norm &norm = desc.norm;
     const bool &shift = desc.shift;
-    const bool &isDouble = desc.double_precision;
 
     // Step 3: Execute FFTs for polar rings.
     for (int i = 0; i < m_nside - 1; i++) {
         // Step 3a: Get upper and lower ring offsets.
         int upper_ring_offset = m_upper_ring_offsets[i];
-        int lower_ring_offset = m_lower_ring_offsets[i];
-
-        // Step 3b: Set parameters for the polar ring FFT callback.
-        int64 param_offset = 2 * i;  // Offset for the parameters in the callback
-        int64 params[2];
-        params[0] = 4 * ((int64)i + 1);  // Size of the ring
-        params[1] = lower_ring_offset - upper_ring_offset;
 
-        // Step 3c: Copy callback parameters to device memory asynchronously.
-        int64 *params_device = callback_params + param_offset;
-        cudaMemcpyAsync(params_device, params, 2 * sizeof(int64), cudaMemcpyHostToDevice, stream);
-
-        // Step 3d: Set the forward callback for the current polar plan.
-        s2fftKernels::setForwardCallback(m_polar_plans[i], params_device, shift, false, isDouble, norm);
         // Step 3e: Set the CUDA stream and work area for the cuFFT plan.
         CUFFT_CALL(cufftSetStream(m_polar_plans[i], stream));
         CUFFT_CALL(cufftSetWorkArea(m_polar_plans[i], workspace));
@@ -153,51 +132,49 @@ HRESULT s2fftExec<Complex>::Forward(const s2fftDescriptor &desc, cudaStream_t st
                 cufftXtExec(m_polar_plans[i], data + upper_ring_offset, data + upper_ring_offset, DIRECTION));
     }
     // Step 4: Execute FFT for the equatorial ring.
-    // Step 4a: Set equator parameters for the callback.
-    int64 equator_size = (4 * m_nside);
-    int64 equator_offset = (m_nside - 1) * 2;
-    int64 *equator_params_device = callback_params + equator_offset;
-    // Step 4b: Copy equator parameters to device memory asynchronously.
-    cudaMemcpyAsync(equator_params_device, &equator_size, sizeof(int64), cudaMemcpyHostToDevice, stream);
-    // Step 4c: Set the forward callback for the equatorial plan.
-    s2fftKernels::setForwardCallback(m_equator_plan, equator_params_device, shift, true, isDouble, norm);
     // Step 4d: Set the CUDA stream and work area for the equatorial cuFFT plan.
     CUFFT_CALL(cufftSetStream(m_equator_plan, stream));
     CUFFT_CALL(cufftSetWorkArea(m_equator_plan, workspace));
     // Step 4e: Execute the cuFFT transform for the equator.
     CUFFT_CALL(cufftXtExec(m_equator_plan, data + m_equatorial_offset_start, data + m_equatorial_offset_start,
                            DIRECTION));
 
+    // Step 5: Launch the custom kernel for normalization and shifting.
+    switch (norm) {
+        case s2fftKernels::fft_norm::NONE:
+        case s2fftKernels::fft_norm::BACKWARD:
+            // No normalization, only shift if required.
+            s2fftKernels::launch_shift_normalize_kernel(stream, data, m_nside, shift, 2);
+            break;
+        case s2fftKernels::fft_norm::FORWARD:
+            // Normalize by sqrt(Npix).
+            std::cout << "Applying forward normalization." << std::endl;
+            s2fftKernels::launch_shift_normalize_kernel(stream, data, m_nside, shift, 0);
+            break;
+        case s2fftKernels::fft_norm::ORTHO:
+            // Normalize by Npix.
+            s2fftKernels::launch_shift_normalize_kernel(stream, data, m_nside, shift, 1);
+            break;
+        default:
+            return E_INVALIDARG;  // Invalid normalization type.
+    }
+    
+
     return S_OK;
 }
 
 template <typename Complex>
 HRESULT s2fftExec<Complex>::Backward(const s2fftDescriptor &desc, cudaStream_t stream, Complex *data,
-                                     Complex *workspace, int64 *callback_params) {
+                                     Complex *workspace) {
     // Step 1: Determine the FFT direction (forward or inverse based on adjoint flag).
     const int DIRECTION = desc.adjoint ? CUFFT_FORWARD : CUFFT_INVERSE;
     // Step 2: Extract normalization, shift, and double precision flags from the descriptor.
     const s2fftKernels::fft_norm &norm = desc.norm;
-    const bool &shift = desc.shift;
-    const bool &isDouble = desc.double_precision;
 
     // Step 3: Execute inverse FFTs for polar rings.
     for (int i = 0; i < m_nside - 1; i++) {
         // Step 3a: Get upper and lower ring offsets.
         int upper_ring_offset = m_upper_ring_offsets[i];
-        int lower_ring_offset = m_lower_ring_offsets[i];
-        // Step 3b: Set parameters for the polar ring inverse FFT callback.
-        int64 param_offset = 2 * i;  // Offset for the parameters in the callback
-        int64 params[2];
-        params[0] = 4 * ((int64)i + 1);  // Size of the ring
-        params[1] = lower_ring_offset - upper_ring_offset;
-
-        // Step 3c: Copy callback parameters to device memory asynchronously.
-        int64 *params_device = callback_params + param_offset;
-        cudaMemcpyAsync(params_device, params, 2 * sizeof(int64), cudaMemcpyHostToDevice, stream);
-        // Step 3d: Set the backward callback for the current polar plan.
-        s2fftKernels::setBackwardCallback(m_inverse_polar_plans[i], params_device, shift, false, isDouble,
-                                          norm);
 
         // Step 3e: Set the CUDA stream and work area for the cuFFT plan.
         CUFFT_CALL(cufftSetStream(m_inverse_polar_plans[i], stream));
@@ -207,22 +184,31 @@ HRESULT s2fftExec<Complex>::Backward(const s2fftDescriptor &desc, cudaStream_t s
                                DIRECTION));
     }
     // Step 4: Execute inverse FFT for the equatorial ring.
-    // Step 4a: Set equator parameters for the callback.
-    int64 equator_size = (4 * m_nside);
-    int64 equator_offset = (m_nside - 1) * 2;
-    int64 *equator_params_device = callback_params + equator_offset;
-    // Step 4b: Copy equator parameters to device memory asynchronously.
-    cudaMemcpyAsync(equator_params_device, &equator_size, sizeof(int64), cudaMemcpyHostToDevice, stream);
-    // Step 4c: Set the backward callback for the equatorial plan.
-    s2fftKernels::setBackwardCallback(m_inverse_equator_plan, equator_params_device, shift, true, isDouble,
-                                      norm);
     // Step 4d: Set the CUDA stream and work area for the equatorial cuFFT plan.
     CUFFT_CALL(cufftSetStream(m_inverse_equator_plan, stream));
     CUFFT_CALL(cufftSetWorkArea(m_inverse_equator_plan, workspace));
     // Step 4e: Execute the cuFFT transform for the equator.
     CUFFT_CALL(cufftXtExec(m_inverse_equator_plan, data + m_equatorial_offset_start,
                            data + m_equatorial_offset_start, DIRECTION));
 
+    // Step 5: Launch the custom kernel for normalization and shifting.
+    switch (norm) {
+        case s2fftKernels::fft_norm::NONE:
+        case s2fftKernels::fft_norm::FORWARD:
+            // No normalization, do nothing.
+            break;
+        case s2fftKernels::fft_norm::BACKWARD:
+            // Normalize by sqrt(Npix).
+            s2fftKernels::launch_shift_normalize_kernel(stream, data, m_nside, false, 0);
+            break;
+        case s2fftKernels::fft_norm::ORTHO:
+            // Normalize by Npix.
+            s2fftKernels::launch_shift_normalize_kernel(stream, data, m_nside, false, 1);
+            break;
+        default:
+            return E_INVALIDARG;  // Invalid normalization type.
+    }
+
     return S_OK;
 }
 
diff --git a/lib/src/s2fft_kernels.cu b/lib/src/s2fft_kernels.cu