@@ -76,13 +76,52 @@ wait(ev::CudaEvent, progress=yield) = wait(CPU(), ev, progress)
 function wait(::CPU, ev::CudaEvent, progress=nothing)
     isdone(ev) && return nothing
 
-    event = Base.Threads.Event()
+    # minimize latency of short operations by busy-waiting,
+    # initially without even yielding to other tasks
+    spins = 0
+    while spins < 256
+        if spins < 32
+            ccall(:jl_cpu_pause, Cvoid, ())
+            # Temporary solution before we have gc transition support in codegen.
+            ccall(:jl_gc_safepoint, Cvoid, ())
+        else
+            yield()
+        end
+        isdone(ev) && return
+        spins += 1
+    end
+
+    event = Base.Event()
     stream = next_stream()
     wait(CUDADevice(), ev, nothing, stream)
     CUDA.launch(;stream) do
         notify(event)
     end
-    wait(event)
+    dev = CUDA.device()
+    # if an error occurs, the callback may never fire, so use a timer to detect such cases
+    timer = Timer(0; interval=1)
+    Base.@sync begin
+        Threads.@spawn try
+            CUDA.device!(dev)
+            while true
+                try
+                    Base.wait(timer)
+                catch err
+                    err isa EOFError && break
+                    rethrow()
+                end
+                if CUDA.unsafe_cuEventQuery(ev.event) != CUDA.ERROR_NOT_READY
+                    break
+                end
+            end
+        finally
+            notify(event)
+        end
+        Threads.@spawn begin
+            Base.wait(event)
+            close(timer)
+        end
+    end
 end
 
 # Use this to synchronize between computation using the task local stream
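The spin loop added above follows a common latency-hiding pattern: pause the CPU for the first few iterations without yielding, then yield to the Julia scheduler, and only fall back to a blocking wait once the spin budget is exhausted. A minimal sketch of that pattern, with a hypothetical `poll` predicate and keyword arguments standing in for `isdone(ev)` and the hard-coded 32/256 thresholds:

```julia
# Sketch of the spin-then-yield busy-wait used in `wait(::CPU, ::CudaEvent)` above.
# `poll` is a hypothetical zero-argument predicate standing in for `isdone(ev)`.
function spinwait(poll; pause_spins=32, total_spins=256)
    spins = 0
    while spins < total_spins
        if spins < pause_spins
            ccall(:jl_cpu_pause, Cvoid, ())    # CPU pause hint; cheapest form of waiting
            ccall(:jl_gc_safepoint, Cvoid, ()) # keep GC responsive while not yielding
        else
            yield()                            # let other Julia tasks make progress
        end
        poll() && return true
        spins += 1
    end
    return false  # spin budget exhausted; caller falls back to a blocking wait
end
```

A caller would then do something like `spinwait(() -> isdone(ev)) || <blocking path>`, which is the shape of the change above: short operations return from the spin loop, long ones pay the cost of the stream callback.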
@@ -226,7 +265,7 @@
 
 import CUDA: @device_override
 
-import KernelAbstractions: CompilerMetadata, CompilerPass, DynamicCheck, LinearIndices
+import KernelAbstractions: CompilerMetadata, DynamicCheck, LinearIndices
 import KernelAbstractions: __index_Local_Linear, __index_Group_Linear, __index_Global_Linear, __index_Local_Cartesian, __index_Group_Cartesian, __index_Global_Cartesian, __validindex, __print
 import KernelAbstractions: mkcontext, expand, __iterspace, __ndrange, __dynamic_checkbounds
 
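The second half of the `wait(::CPU, ::CudaEvent)` change pairs the stream callback with a watchdog timer so a lost or failed callback cannot block the host forever. A stripped-down sketch of that pattern, assuming a hypothetical `poll` predicate in place of the `CUDA.unsafe_cuEventQuery(ev.event) != CUDA.ERROR_NOT_READY` check:

```julia
# Sketch of the timer-based fallback from the change above. One task polls `poll`
# once per `interval` seconds and notifies `event` if it turns true (covering the
# case where the stream callback never fires); the other task waits on `event` and
# closes the timer, which ends the polling task via EOFError.
function wait_with_watchdog(event::Base.Event, poll; interval=1)
    timer = Timer(0; interval=interval)
    Base.@sync begin
        Threads.@spawn try
            while true
                try
                    Base.wait(timer)
                catch err
                    err isa EOFError && break  # timer closed: the event already fired
                    rethrow()
                end
                poll() && break                # work finished, but the callback was lost
            end
        finally
            notify(event)
        end
        Threads.@spawn begin
            Base.wait(event)
            close(timer)
        end
    end
end
```

In the actual change the event is notified by the `CUDA.launch` callback on the stream, and the polling task queries the CUDA event directly.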