astro-informatics
diff --git a/‎lib/include/s2fft.h‎
Lines changed: 10 additions & 2 deletions b/‎lib/include/s2fft.h‎
Lines changed: 10 additions & 2 deletions
diff --git a/‎lib/include/s2fft_kernels.h‎
Lines changed: 8 additions & 5 deletions b/‎lib/include/s2fft_kernels.h‎
Lines changed: 8 additions & 5 deletions
diff --git a/‎lib/src/extensions.cc‎
Lines changed: 75 additions & 15 deletions b/‎lib/src/extensions.cc‎
Lines changed: 75 additions & 15 deletions
diff --git a/‎lib/src/s2fft.cu‎
Lines changed: 12 additions & 7 deletions b/‎lib/src/s2fft.cu‎
Lines changed: 12 additions & 7 deletions
@@ -168,9 +168,13 @@ class s2fftExec {
      * @param stream The CUDA stream to use for execution.
      * @param data Pointer to the input/output data on the device.
      * @param workspace Pointer to the workspace memory on the device.
+     * @param shift_scratch Pointer to scratch buffer for out-of-place shifting (can be nullptr for in-place).
+     * @param use_out_of_place If true, use out-of-place shifting with shift_scratch; if false, use in-place
+     * with cooperative kernel.
      * @return HRESULT indicating success or failure.
      */
-    HRESULT Forward(const s2fftDescriptor &desc, cudaStream_t stream, Complex *data, Complex *workspace);
+    HRESULT Forward(const s2fftDescriptor &desc, cudaStream_t stream, Complex *data, Complex *workspace,
+                    Complex *shift_scratch, bool use_out_of_place);
 
     /**
      * @brief Executes the backward Spherical Harmonic Transform.
@@ -182,9 +186,13 @@ class s2fftExec {
      * @param stream The CUDA stream to use for execution.
      * @param data Pointer to the input/output data on the device.
      * @param workspace Pointer to the workspace memory on the device.
+     * @param shift_scratch Pointer to scratch buffer for out-of-place shifting (can be nullptr for in-place).
+     * @param use_out_of_place If true, use out-of-place shifting with shift_scratch; if false, use in-place
+     * with cooperative kernel.
      * @return HRESULT indicating success or failure.
      */
-    HRESULT Backward(const s2fftDescriptor &desc, cudaStream_t stream, Complex *data, Complex *workspace);
+    HRESULT Backward(const s2fftDescriptor &desc, cudaStream_t stream, Complex *data, Complex *workspace,
+                     Complex *shift_scratch, bool use_out_of_place);
 
 public:
     // cuFFT handles for polar and equatorial FFT plans
 
@@ -70,20 +70,23 @@ HRESULT launch_spectral_extension(complex* data, complex* output, const int& nsi
  * This function configures and launches the shift_normalize_kernel with appropriate
  * grid and block dimensions. It handles both single and double precision complex
  * types and applies the requested normalization and shifting operations to HEALPix
- * pixel data on a per-ring basis.
+ * pixel data. Supports both in-place (with cooperative kernel) and out-of-place
+ * (with scratch buffer) modes to enable compatibility with JAX transforms.
  *
  * @tparam complex The complex type (cufftComplex or cufftDoubleComplex).
  * @param stream CUDA stream for kernel execution.
- * @param data Input/output array of HEALPix pixel data (in-place processing).
+ * @param data Input/output array of HEALPix pixel data.
+ * @param shift_buffer Scratch buffer for out-of-place shifting (can be nullptr for in-place).
  * @param nside The HEALPix Nside parameter.
  * @param apply_shift Flag indicating whether to apply FFT shifting.
  * @param norm Normalization type (0=by nphi, 1=by sqrt(nphi), 2=no normalization).
+ * @param use_out_of_place If true, use out-of-place shifting with shift_buffer; if false, use in-place with
+ * cooperative kernel.
  * @return HRESULT indicating success or failure.
  */
 template <typename complex>
-HRESULT launch_shift_normalize_kernel(cudaStream_t stream,
-                                      complex* data,  // In-place data buffer
-                                      int nside, bool apply_shift, int norm);
+HRESULT launch_shift_normalize_kernel(cudaStream_t stream, complex* data, complex* shift_buffer, int nside,
+                                      bool apply_shift, int norm, bool use_out_of_place);
 
 }  // namespace s2fftKernels
 
 
@@ -2,6 +2,7 @@
 #include <cstddef>
 #include <complex>
 #include <type_traits>
+#include <cstdlib>
 
 namespace nb = nanobind;
 
@@ -62,21 +63,44 @@ constexpr bool is_double_v = is_double<T>::value;
  *
  * @tparam T The XLA data type (F32, F64, etc).
  * @param stream CUDA stream to use.
+ * @param scratch ScratchAllocator for temporary device memory.
  * @param input Input buffer containing HEALPix pixel-space data.
  * @param output Output buffer to store the FTM result.
  * @param workspace Output buffer for temporary workspace memory.
  * @param descriptor Descriptor containing transform parameters.
  * @return ffi::Error indicating success or failure.
  */
 template <ffi::DataType T>
-ffi::Error healpix_forward(cudaStream_t stream, ffi::Buffer<T> input, ffi::Result<ffi::Buffer<T>> output,
-                           ffi::Result<ffi::Buffer<T>> workspace, s2fftDescriptor descriptor) {
+ffi::Error healpix_forward(cudaStream_t stream, ffi::ScratchAllocator& scratch, ffi::Buffer<T> input,
+                           ffi::Result<ffi::Buffer<T>> output, ffi::Result<ffi::Buffer<T>> workspace,
+                           s2fftDescriptor descriptor) {
     // Step 1: Determine the complex type based on the XLA data type.
     using fft_complex_type = fft_complex_t<T>;
     const auto& dim_in = input.dimensions();
 
+    // Step 1a: Parse environment variable for shift strategy (static for thread safety).
+    static const std::string shift_strategy = []() {
+        const char* env = std::getenv("S2FFT_CUDA_SHIFT_STRATEGY");
+        return env ? std::string(env) : "in_place";
+    }();
+    bool use_out_of_place = (shift_strategy == "out_of_place");
+    bool is_batched = (dim_in.size() == 2);
+
+    // Step 1b: Allocate scratch buffer if using out-of-place mode.
+    fft_complex_type* shift_scratch = nullptr;
+    if (use_out_of_place && descriptor.shift) {
+        int64_t Npix = descriptor.nside * descriptor.nside * 12;
+        int batch_count = is_batched ? dim_in[0] : 1;
+        size_t scratch_size = Npix * sizeof(fft_complex_type) * batch_count;
+        auto scratch_result = scratch.Allocate(scratch_size);
+        if (!scratch_result.has_value()) {
+            return ffi::Error::Internal("Failed to allocate scratch buffer for shift operation");
+        }
+        shift_scratch = reinterpret_cast<fft_complex_type*>(scratch_result.value());
+    }
+
     // Step 2: Handle batched and non-batched cases separately.
-    if (dim_in.size() == 2) {
+    if (is_batched) {
         // Step 2a: Batched case.
         int batch_count = dim_in[0];
         // Step 2b: Compute offsets for input and output for each batch.
@@ -104,7 +128,12 @@ ffi::Error healpix_forward(cudaStream_t stream, ffi::Buffer<T> input, ffi::Resul
                     reinterpret_cast<fft_complex_type*>(workspace->typed_data() + i * executor->m_work_size);
 
             // Step 2g: Launch the forward transform on this sub-stream.
-            executor->Forward(descriptor, sub_stream, data_c, workspace_c);
+            fft_complex_type* shift_scratch_batch =
+                    use_out_of_place && shift_scratch
+                            ? shift_scratch + i * (descriptor.nside * descriptor.nside * 12)
+                            : nullptr;
+            executor->Forward(descriptor, sub_stream, data_c, workspace_c, shift_scratch_batch,
+                              use_out_of_place);
             // Step 2h: Launch spectral extension kernel.
             s2fftKernels::launch_spectral_extension(data_c, out_c, descriptor.nside,
                                                     descriptor.harmonic_band_limit, sub_stream);
@@ -123,7 +152,7 @@ ffi::Error healpix_forward(cudaStream_t stream, ffi::Buffer<T> input, ffi::Resul
         auto executor = std::make_shared<s2fftExec<fft_complex_type>>();
         PlanCache::GetInstance().GetS2FFTExec(descriptor, executor);
         // Step 2m: Launch the forward transform.
-        executor->Forward(descriptor, stream, data_c, workspace_c);
+        executor->Forward(descriptor, stream, data_c, workspace_c, shift_scratch, use_out_of_place);
         // Step 2n: Launch spectral extension kernel.
         s2fftKernels::launch_spectral_extension(data_c, out_c, descriptor.nside,
                                                 descriptor.harmonic_band_limit, stream);
@@ -141,22 +170,45 @@ ffi::Error healpix_forward(cudaStream_t stream, ffi::Buffer<T> input, ffi::Resul
  *
  * @tparam T The XLA data type.
  * @param stream CUDA stream to use.
+ * @param scratch ScratchAllocator for temporary device memory.
  * @param input Input buffer containing FTM data.
  * @param output Output buffer to store HEALPix pixel-space data.
  * @param workspace Output buffer for temporary workspace memory.
  * @param descriptor Descriptor containing transform parameters.
  * @return ffi::Error indicating success or failure.
  */
 template <ffi::DataType T>
-ffi::Error healpix_backward(cudaStream_t stream, ffi::Buffer<T> input, ffi::Result<ffi::Buffer<T>> output,
-                            ffi::Result<ffi::Buffer<T>> workspace, s2fftDescriptor descriptor) {
+ffi::Error healpix_backward(cudaStream_t stream, ffi::ScratchAllocator& scratch, ffi::Buffer<T> input,
+                            ffi::Result<ffi::Buffer<T>> output, ffi::Result<ffi::Buffer<T>> workspace,
+                            s2fftDescriptor descriptor) {
     // Step 1: Determine the complex type based on the XLA data type.
     using fft_complex_type = fft_complex_t<T>;
     const auto& dim_in = input.dimensions();
     const auto& dim_out = output->dimensions();
 
+    // Step 1a: Parse environment variable for shift strategy (static for thread safety).
+    static const std::string shift_strategy = []() {
+        const char* env = std::getenv("S2FFT_CUDA_SHIFT_STRATEGY");
+        return env ? std::string(env) : "in_place";
+    }();
+    bool use_out_of_place = (shift_strategy == "out_of_place");
+    bool is_batched = (dim_in.size() == 3);
+
+    // Step 1b: Allocate scratch buffer if using out-of-place mode.
+    fft_complex_type* shift_scratch = nullptr;
+    if (use_out_of_place && descriptor.shift) {
+        int64_t Npix = descriptor.nside * descriptor.nside * 12;
+        int batch_count = is_batched ? dim_in[0] : 1;
+        size_t scratch_size = Npix * sizeof(fft_complex_type) * batch_count;
+        auto scratch_result = scratch.Allocate(scratch_size);
+        if (!scratch_result.has_value()) {
+            return ffi::Error::Internal("Failed to allocate scratch buffer for shift operation");
+        }
+        shift_scratch = reinterpret_cast<fft_complex_type*>(scratch_result.value());
+    }
+
     // Step 2: Handle batched and non-batched cases separately.
-    if (dim_in.size() == 3) {
+    if (is_batched) {
         // Step 2a: Batched case.
         // Assertions to ensure correct input/output dimensions for batched operations.
         assert(dim_out.size() == 2);
@@ -191,7 +243,12 @@ ffi::Error healpix_backward(cudaStream_t stream, ffi::Buffer<T> input, ffi::Resu
                                                   descriptor.harmonic_band_limit, descriptor.shift,
                                                   sub_stream);
             // Step 2h: Launch the backward transform on this sub-stream.
-            executor->Backward(descriptor, sub_stream, out_c, workspace_c);
+            fft_complex_type* shift_scratch_batch =
+                    use_out_of_place && shift_scratch
+                            ? shift_scratch + i * (descriptor.nside * descriptor.nside * 12)
+                            : nullptr;
+            executor->Backward(descriptor, sub_stream, out_c, workspace_c, shift_scratch_batch,
+                               use_out_of_place);
         }
         // Step 2i: Join all forked streams back to the main stream.
         handler.join(stream);
@@ -213,7 +270,7 @@ ffi::Error healpix_backward(cudaStream_t stream, ffi::Buffer<T> input, ffi::Resu
         s2fftKernels::launch_spectral_folding(data_c, out_c, descriptor.nside, descriptor.harmonic_band_limit,
                                               descriptor.shift, stream);
         // Step 2n: Launch the backward transform.
-        executor->Backward(descriptor, stream, out_c, workspace_c);
+        executor->Backward(descriptor, stream, out_c, workspace_c, shift_scratch, use_out_of_place);
         return ffi::Error::Success();
     }
 }
@@ -298,19 +355,20 @@ s2fftDescriptor build_descriptor(int64_t nside, int64_t harmonic_band_limit, boo
  * @return ffi::Error indicating success or failure.
  */
 template <ffi::DataType T>
-ffi::Error healpix_fft_cuda(cudaStream_t stream, int64_t nside, int64_t harmonic_band_limit, bool reality,
-                            bool forward, bool normalize, bool adjoint, ffi::Buffer<T> input,
-                            ffi::Result<ffi::Buffer<T>> output, ffi::Result<ffi::Buffer<T>> workspace) {
+ffi::Error healpix_fft_cuda(cudaStream_t stream, ffi::ScratchAllocator scratch, int64_t nside,
+                            int64_t harmonic_band_limit, bool reality, bool forward, bool normalize,
+                            bool adjoint, ffi::Buffer<T> input, ffi::Result<ffi::Buffer<T>> output,
+                            ffi::Result<ffi::Buffer<T>> workspace) {
     // Step 1: Build the s2fftDescriptor based on the input parameters.
     size_t work_size = 0;  // Variable to hold the workspace size
     s2fftDescriptor descriptor = build_descriptor<T>(nside, harmonic_band_limit, reality, forward, normalize,
                                                      adjoint, true, work_size);
 
     // Step 2: Dispatch to either forward or backward transform based on the 'forward' flag.
     if (forward) {
-        return healpix_forward<T>(stream, input, output, workspace, descriptor);
+        return healpix_forward<T>(stream, scratch, input, output, workspace, descriptor);
     } else {
-        return healpix_backward<T>(stream, input, output, workspace, descriptor);
+        return healpix_backward<T>(stream, scratch, input, output, workspace, descriptor);
     }
 }
 
@@ -323,6 +381,7 @@ ffi::Error healpix_fft_cuda(cudaStream_t stream, int64_t nside, int64_t harmonic
 XLA_FFI_DEFINE_HANDLER_SYMBOL(healpix_fft_cuda_C64, healpix_fft_cuda<ffi::DataType::C64>,
                               ffi::Ffi::Bind()
                                       .Ctx<ffi::PlatformStream<cudaStream_t>>()
+                                      .Ctx<ffi::ScratchAllocator>()
                                       .Attr<int64_t>("nside")
                                       .Attr<int64_t>("harmonic_band_limit")
                                       .Attr<bool>("reality")
@@ -336,6 +395,7 @@ XLA_FFI_DEFINE_HANDLER_SYMBOL(healpix_fft_cuda_C64, healpix_fft_cuda<ffi::DataTy
 XLA_FFI_DEFINE_HANDLER_SYMBOL(healpix_fft_cuda_C128, healpix_fft_cuda<ffi::DataType::C128>,
                               ffi::Ffi::Bind()
                                       .Ctx<ffi::PlatformStream<cudaStream_t>>()
+                                      .Ctx<ffi::ScratchAllocator>()
                                       .Attr<int64_t>("nside")
                                       .Attr<int64_t>("harmonic_band_limit")
                                       .Attr<bool>("reality")
 
@@ -111,7 +111,7 @@ HRESULT s2fftExec<Complex>::Initialize(const s2fftDescriptor &descriptor) {
 
 template <typename Complex>
 HRESULT s2fftExec<Complex>::Forward(const s2fftDescriptor &desc, cudaStream_t stream, Complex *data,
-                                    Complex *workspace) {
+                                    Complex *workspace, Complex *shift_scratch, bool use_out_of_place) {
     // Step 1: Determine the FFT direction (forward or inverse based on adjoint flag).
     const int DIRECTION = desc.adjoint ? CUFFT_INVERSE : CUFFT_FORWARD;
     // Step 2: Extract normalization, shift, and double precision flags from the descriptor.
@@ -143,15 +143,18 @@ HRESULT s2fftExec<Complex>::Forward(const s2fftDescriptor &desc, cudaStream_t st
         case s2fftKernels::fft_norm::NONE:
         case s2fftKernels::fft_norm::BACKWARD:
             // No normalization, only shift if required.
-            s2fftKernels::launch_shift_normalize_kernel(stream, data, m_nside, shift, 2);
+            s2fftKernels::launch_shift_normalize_kernel(stream, data, shift_scratch, m_nside, shift, 2,
+                                                        use_out_of_place);
             break;
         case s2fftKernels::fft_norm::FORWARD:
             // Normalize by sqrt(Npix).
-            s2fftKernels::launch_shift_normalize_kernel(stream, data, m_nside, shift, 0);
+            s2fftKernels::launch_shift_normalize_kernel(stream, data, shift_scratch, m_nside, shift, 0,
+                                                        use_out_of_place);
             break;
         case s2fftKernels::fft_norm::ORTHO:
             // Normalize by Npix.
-            s2fftKernels::launch_shift_normalize_kernel(stream, data, m_nside, shift, 1);
+            s2fftKernels::launch_shift_normalize_kernel(stream, data, shift_scratch, m_nside, shift, 1,
+                                                        use_out_of_place);
             break;
         default:
             return E_INVALIDARG;  // Invalid normalization type.
@@ -162,7 +165,7 @@ HRESULT s2fftExec<Complex>::Forward(const s2fftDescriptor &desc, cudaStream_t st
 
 template <typename Complex>
 HRESULT s2fftExec<Complex>::Backward(const s2fftDescriptor &desc, cudaStream_t stream, Complex *data,
-                                     Complex *workspace) {
+                                     Complex *workspace, Complex *shift_scratch, bool use_out_of_place) {
     // Step 1: Determine the FFT direction (forward or inverse based on adjoint flag).
     const int DIRECTION = desc.adjoint ? CUFFT_FORWARD : CUFFT_INVERSE;
     // Step 2: Extract normalization, shift, and double precision flags from the descriptor.
@@ -196,11 +199,13 @@ HRESULT s2fftExec<Complex>::Backward(const s2fftDescriptor &desc, cudaStream_t s
             break;
         case s2fftKernels::fft_norm::BACKWARD:
             // Normalize by sqrt(Npix).
-            s2fftKernels::launch_shift_normalize_kernel(stream, data, m_nside, false, 0);
+            s2fftKernels::launch_shift_normalize_kernel(stream, data, shift_scratch, m_nside, false, 0,
+                                                        use_out_of_place);
             break;
         case s2fftKernels::fft_norm::ORTHO:
             // Normalize by Npix.
-            s2fftKernels::launch_shift_normalize_kernel(stream, data, m_nside, false, 1);
+            s2fftKernels::launch_shift_normalize_kernel(stream, data, shift_scratch, m_nside, false, 1,
+                                                        use_out_of_place);
             break;
         default:
             return E_INVALIDARG;  // Invalid normalization type.