Update healpix_ffts to use new FFI lowered cuda healpix ffts

ASKabalan · ASKabalan · commit 8fe86c24740b · 2025-03-26T17:19:16.000+01:00
diff --git a/s2fft/transforms/spherical.py b/s2fft/transforms/spherical.py
@@ -82,14 +82,25 @@ def inverse(
         recover acceleration by the number of devices.
 
     """
-    if spin >= 8 and method in ["numpy", "jax"]:
+    if spin >= 8 and method in ["numpy", "jax", "cuda"]:
         raise Warning("Recursive transform may provide lower precision beyond spin ~ 8")
 
     if method == "numpy":
         return inverse_numpy(flm, L, spin, nside, sampling, reality, precomps, L_lower)
-    elif method == "jax":
+    elif method in ["jax", "cuda"]:
+        use_healpix_custom_primitive = method == "cuda"
+        method = "jax"
         return inverse_jax(
-            flm, L, spin, nside, sampling, reality, precomps, spmd, L_lower
+            flm,
+            L,
+            spin,
+            nside,
+            sampling,
+            reality,
+            precomps,
+            spmd,
+            L_lower,
+            use_healpix_custom_primitive,
         )
     elif method == "jax_ssht":
         if sampling.lower() == "healpix":
@@ -205,7 +216,7 @@ def inverse_numpy(
             return np.fft.ifft(np.fft.ifftshift(ftm, axes=1), axis=1, norm="forward")
 
 
-@partial(jit, static_argnums=(1, 3, 4, 5, 7, 8))
+@partial(jit, static_argnums=(1, 3, 4, 5, 7, 8, 9))
 def inverse_jax(
     flm: jnp.ndarray,
     L: int,
@@ -216,6 +227,7 @@ def inverse_jax(
     precomps: List = None,
     spmd: bool = False,
     L_lower: int = 0,
+    use_healpix_custom_primitive: bool = False,
 ) -> jnp.ndarray:
     r"""
     Compute the inverse spin-spherical harmonic transform (JAX).
@@ -251,6 +263,12 @@ def inverse_jax(
         L_lower (int, optional): Harmonic lower-bound. Transform will only be computed
             for :math:`\texttt{L_lower} \leq \ell < \texttt{L}`. Defaults to 0.
 
+        use_healpix_custom_primitive (bool, optional): Whether to use a custom CUDA
+            primitive for computing the HEALPix fast Fourier transform when
+            `sampling = "healpix"` and running on a CUDA-compatible GPU device. Using
+            a custom primitive reduces long compilation times when JIT compiling.
+            Defaults to `False`.
+
     Returns:
         jnp.ndarray: Signal on the sphere.
 
@@ -326,7 +344,10 @@ def f_bwd(res, gtm):
             jnp.flip(jnp.conj(ftm[:, L - 1 + m_offset + 1 :]), axis=-1)
         )
     if sampling.lower() == "healpix":
-        return hp.healpix_ifft(ftm, L, nside, "jax")
+        if use_healpix_custom_primitive:
+            return hp.healpix_ifft(ftm, L, nside, "cuda")
+        else:
+            return hp.healpix_ifft(ftm, L, nside, "jax")
     else:
         ftm = jnp.conj(jnp.fft.ifftshift(ftm, axes=1))
         f = jnp.conj(jnp.fft.fft(ftm, axis=1, norm="backward"))
@@ -406,7 +427,7 @@ def forward(
         recover acceleration by the number of devices.
 
     """
-    if spin >= 8 and method in ["numpy", "jax"]:
+    if spin >= 8 and method in ["numpy", "jax", "cuda"]:
         raise Warning("Recursive transform may provide lower precision beyond spin ~ 8")
 
     if iter is None:
diff --git a/s2fft/transforms/wigner.py b/s2fft/transforms/wigner.py
@@ -86,8 +86,20 @@ def inverse(
 
     if method == "numpy":
         return inverse_numpy(flmn, L, N, nside, sampling, reality, precomps, L_lower)
-    elif method == "jax":
-        return inverse_jax(flmn, L, N, nside, sampling, reality, precomps, L_lower)
+    elif method in ["jax", "cuda"]:
+        use_healpix_custom_primitive = method == "cuda"
+        method = "jax"
+        return inverse_jax(
+            flmn,
+            L,
+            N,
+            nside,
+            sampling,
+            reality,
+            precomps,
+            L_lower,
+            use_healpix_custom_primitive,
+        )
     elif method == "jax_ssht":
         if sampling.lower() == "healpix":
             raise ValueError("SSHT does not support healpix sampling.")
diff --git a/s2fft/utils/healpix_ffts.py b/s2fft/utils/healpix_ffts.py
@@ -1,5 +1,6 @@
 from functools import partial
 
+import jax
 import jax.numpy as jnp
 import jaxlib.mlir.ir as ir
 import numpy as np
@@ -8,8 +9,6 @@
 
 # did not find promote_dtypes_complex outside _src
 from jax._src.numpy.util import promote_dtypes_complex
-from jax.lib import xla_client
-from jaxlib.hlo_helpers import custom_call
 from s2fft_lib import _s2fft
 
 from s2fft.sampling import s2_samples as samples
@@ -703,16 +702,18 @@ def _healpix_fft_cuda_abstract(f, L, nside, reality, fft_type, norm):
         assert f.shape == healpix_size
         return f.update(shape=ftm_size, dtype=f.dtype)
     elif fft_type == "backward":
-        print(f"f.shape {f.shape}")
         assert f.shape == ftm_size
         return f.update(shape=healpix_size, dtype=f.dtype)
     else:
         raise ValueError(f"fft_type {fft_type} not recognised.")
 
 
 def _healpix_fft_cuda_lowering(ctx, f, *, L, nside, reality, fft_type, norm):
+    assert _s2fft.COMPILED_WITH_CUDA, """
+    S2FFT was compiled without CUDA support. Cuda functions are not supported.
+    Please make sure that nvcc is in your path and $CUDA_HOME is set then reinstall s2fft using pip.
+    """
     (aval_out,) = ctx.avals_out
-    a_type = ir.RankedTensorType(f.type)
 
     out_dtype = aval_out.dtype
     if out_dtype == np.complex64:
@@ -734,34 +735,53 @@ def _healpix_fft_cuda_lowering(ctx, f, *, L, nside, reality, fft_type, norm):
     else:
         raise ValueError(f"Unknown norm {norm}")
 
-    descriptor = _s2fft.build_healpix_fft_descriptor(
-        nside, L, reality, forward, normalize, is_double
+    if is_double:
+        ffi_lowered = jax.ffi.ffi_lowering("healpix_fft_cuda_c128")
+    else:
+        ffi_lowered = jax.ffi.ffi_lowering("healpix_fft_cuda_c64")
+
+    return ffi_lowered(
+        ctx,
+        f,
+        nside=nside,
+        harmonic_band_limit=L,
+        reality=reality,
+        normalize=normalize,
+        forward=forward,
     )
 
-    layout = tuple(range(len(a_type.shape) - 1, -1, -1))
-    out_layout = tuple(range(len(out_type.shape) - 1, -1, -1))
-
-    result = custom_call(
-        "healpix_fft_cuda",
-        result_types=[out_type],
-        operands=[f],
-        operand_layouts=[layout],
-        result_layouts=[out_layout],
-        has_side_effect=True,
-        backend_config=descriptor,
+
+def _healpix_fft_cuda_transpose(
+    df: jnp.ndarray, L: int, nside: int, reality: bool, fft_type: str, norm: str
+) -> jnp.ndarray:
+    scale_factors = (
+        jnp.concatenate((jnp.ones(L), 2 * jnp.ones(L * (L - 1) // 2)))
+        * (3 * nside**2)
+        / jnp.pi
     )
-    return result.results
+    if fft_type == "forward":
+        return (
+            scale_factors
+            * jnp.conj(healpix_ifft_cuda(jnp.conj(df), L, nside, reality, norm)),
+        )
+    elif fft_type == "backward":
+        return (
+            scale_factors
+            * jnp.conj(healpix_fft_cuda(jnp.conj(df), L, nside, reality, norm)),
+        )
 
 
 # Register healpfix_fft_cuda custom call target
 for name, fn in _s2fft.registration().items():
-    xla_client.register_custom_call_target(name, fn, platform="gpu")
+    jax.ffi.register_ffi_target(name, fn, platform="CUDA")
 
 _healpix_fft_cuda_primitive = register_primitive(
     "healpix_fft_cuda",
     multiple_results=False,
     abstract_evaluation=_healpix_fft_cuda_abstract,
     lowering_per_platform={None: _healpix_fft_cuda_lowering},
+    transpose=_healpix_fft_cuda_transpose,
+    is_linear=True,
 )