feat: conditionally lower elem_apply to a for loop (#1816)

avik-pal · web-flow · commit 0dac900cb87d · 2025-11-10T10:25:34.000-05:00
diff --git a/src/Overlay.jl b/src/Overlay.jl
@@ -187,7 +187,11 @@ end
 end
 
 @reactant_overlay @noinline function Base.map(f, x::AbstractArray, ys::AbstractArray...)
-    if use_overlayed_version(x) || looped_any(use_overlayed_version, ys)
+    if (
+        use_overlayed_version(x) ||
+        use_overlayed_version(f) ||
+        looped_any(use_overlayed_version, ys)
+    )
         return TracedRArrayOverrides.overloaded_map(f, x, ys...)
     else
         return Base.inferencebarrier(Base.map)(CallWithReactant(f), x, ys...)
@@ -200,6 +204,7 @@ end
     if (
         use_overlayed_version(y) ||
         use_overlayed_version(x) ||
+        use_overlayed_version(f) ||
         looped_any(use_overlayed_version, xs)
     )
         return TracedRArrayOverrides.overloaded_map!(f, y, x, xs...)
@@ -209,15 +214,15 @@ end
 end
 
 @reactant_overlay @noinline function Base._all(f, x::AbstractArray, dims)
-    if use_overlayed_version(x)
+    if use_overlayed_version(x) || use_overlayed_version(f)
         return TracedRArrayOverrides.overloaded_mapreduce(f, &, x; dims)
     else
         return Base.inferencebarrier(Base._all)(CallWithReactant(f), x, dims)
     end
 end
 
 @reactant_overlay @noinline function Base._any(f, x::AbstractArray, dims)
-    if use_overlayed_version(x)
+    if use_overlayed_version(x) || use_overlayed_version(f)
         return TracedRArrayOverrides.overloaded_mapreduce(f, |, x; dims)
     else
         return Base.inferencebarrier(Base._any)(CallWithReactant(f), x, dims)
diff --git a/src/TracedUtils.jl b/src/TracedUtils.jl
@@ -15,6 +15,7 @@ using ..Reactant:
     promote_to, # keep this to avoid breaking external code
     broadcast_to_size # keep this to avoid breaking external code
 using ..Ops: @opcall
+using GPUArraysCore: @allowscalar
 using ReactantCore: ReactantCore
 using ReactantCore: MissingTracedValue, is_traced, materialize_traced_array
 
@@ -1086,6 +1087,49 @@ function set!(x, path, tostore; emptypath=false)
     return emptypath && set_paths!(x, ())
 end
 
+function __elem_apply_loop_condition(idx_ref, fn_ref::F, res_ref, args_ref, L_ref) where {F}
+    return idx_ref[] < L_ref[]  # keep looping while the counter is below the limit
+end
+
+function __elem_apply_loop_body(idx_ref, fn_ref::F, res_ref, args_ref, L_ref) where {F}
+    args = args_ref[]  # unpack the Ref-held loop state (L_ref is not read here)
+    fn = fn_ref[]
+    res = res_ref[]
+    idx = idx_ref[] + 1  # element handled this iteration: one past the stored counter
+    # Gather element `idx` of every argument, then store fn's result in the same slot.
+    scalar_args = [@allowscalar(arg[idx]) for arg in args]
+    @allowscalar res[idx] = fn(scalar_args...)
+    # Write the advanced counter and the updated result back into the loop state.
+    idx_ref[] = idx
+    res_ref[] = res
+    return nothing
+end
+
+function elem_apply_via_while_loop(f, args::Vararg{Any,Nargs}) where {Nargs}
+    @assert allequal(size.(args)) "All args must have the same size"
+    L = length(first(args))  # one loop iteration per element
+    # Flattening the tensors makes the auto-batching pass work more nicely.
+    flat_args = [ReactantCore.materialize_traced_array(vec(arg)) for arg in args]
+    # NOTE(review): the probe below reads element 1; presumably needs L >= 1 (confirm).
+    # `f` won't be a mutating function, so one up-front call to learn its eltype is safe.
+    res_tmp = @allowscalar(f([@allowscalar(arg[1]) for arg in flat_args]...))
+    result = similar(first(flat_args), Reactant.unwrapped_eltype(res_tmp), L)
+    # Loop state is held in `Ref`s, listed in the callbacks' parameter order.
+    ind_var = Ref(0)
+    f_ref = Ref(f)
+    result_ref = Ref(result)
+    args_ref = Ref(flat_args)
+    limit_ref = Ref(L)
+    # The five-element state tuple lines up with the callbacks' five parameters.
+    ReactantCore.traced_while(
+        __elem_apply_loop_condition,
+        __elem_apply_loop_body,
+        (ind_var, f_ref, result_ref, args_ref, limit_ref),
+    )
+    # Undo the flattening: return the result in the shape of the first argument.
+    return ReactantCore.materialize_traced_array(reshape(result, size(first(args))))
+end
+
 function elem_apply(f, args::Vararg{Any,Nargs}) where {Nargs}
     if all(iszero ∘ ndims, args)
         scalar_args = map(args) do arg
@@ -1094,6 +1138,13 @@ function elem_apply(f, args::Vararg{Any,Nargs}) where {Nargs}
         return Reactant.call_with_reactant(f, scalar_args...)
     end
 
+    # we can expand the scope of this later to support cases where the output
+    # doesn't align with `Ops.batch`. For now we just handle cases that would
+    # obviously fail with scalarizing the inputs.
+    if Reactant.use_overlayed_version(f)
+        return elem_apply_via_while_loop(f, args...)
+    end
+
     argprefix::Symbol = gensym("broadcastarg")
     resprefix::Symbol = gensym("broadcastresult")
     resargprefix::Symbol = gensym("broadcastresarg")
diff --git a/test/batching.jl b/test/batching.jl
@@ -100,3 +100,27 @@ end
 
     @test @jit(batch_with_closure(x_ra, y_ra)) ≈ batch_with_closure(x, y)
 end
+
+function map_with_scalar_indexing(i, x, y)
+    xi, yi = x[i], y[i]  # read each element once and reuse it
+    return xi + yi + max(xi, yi)
+end
+
+function mctr(f, range, x, y)
+    # Close over `x` and `y` so that `map` only has to supply the index.
+    return map(i -> f(i, x, y), range)
+end
+
+@testset "map with scalar indexing" begin
+    input1 = Reactant.to_rarray(Reactant.TestUtils.construct_test_array(Float32, 10))
+    input2 = Reactant.to_rarray(Reactant.TestUtils.construct_test_array(Float32, 10))
+    # Unoptimized HLO must contain a `stablehlo.while`; optimized HLO must not.
+    hlo = @code_hlo optimize = false mctr(map_with_scalar_indexing, 1:8, input1, input2)
+    @test contains(repr(hlo), "stablehlo.while")
+    hlo = @code_hlo optimize = true mctr(map_with_scalar_indexing, 1:8, input1, input2)
+    @test !contains(repr(hlo), "stablehlo.while")
+    # The compiled result must agree with the same call on plain `Array` inputs.
+    res_ra = @jit mctr(map_with_scalar_indexing, 1:8, input1, input2)
+    res = mctr(map_with_scalar_indexing, 1:8, Array(input1), Array(input2))
+    @test res_ra ≈ res
+end