Remove pre-allocated streams (#561)

sunqm · web-flow · commit ca3c85c11914 · 2025-11-06T17:49:06.000-08:00
* Remove pre-allocated streams on devices. These streams complicate device synchronization. (Fix #548)
diff --git a/gpu4pyscf/__config__.py b/gpu4pyscf/__config__.py
@@ -16,12 +16,6 @@
 
 num_devices = cupy.cuda.runtime.getDeviceCount()
 
-# TODO: switch to non_blocking stream (currently blocked by libxc)
-_streams = [None] * num_devices
-for device_id in range(num_devices):
-    with cupy.cuda.Device(device_id):
-        _streams[device_id] = cupy.cuda.stream.Stream(non_blocking=False)
-
 props = cupy.cuda.runtime.getDeviceProperties(0)
 GB = 1024*1024*1024
 min_ao_blksize = 256        # maxisum batch size of AOs
diff --git a/gpu4pyscf/df/df.py b/gpu4pyscf/df/df.py
@@ -28,7 +28,7 @@
 from gpu4pyscf.lib import logger
 from gpu4pyscf.lib import utils
 from gpu4pyscf import __config__
-from gpu4pyscf.__config__ import _streams, num_devices
+from gpu4pyscf.__config__ import num_devices
 
 MIN_BLK_SIZE = getattr(__config__, 'min_ao_blksize', 128)
 ALIGNED = getattr(__config__, 'ao_aligned', 32)
@@ -250,7 +250,7 @@ def cholesky_eri_gpu(intopt, mol, auxmol, cd_low,
         p1 = min(aux_blksize*(device_id+1), naux)
         #for device_id, (p0,p1) in enumerate(lib.prange(0, naux, aux_blksize)):
         if use_gpu_memory:
-            with cupy.cuda.Device(device_id), _streams[device_id]:
+            with cupy.cuda.Device(device_id):
                 _cderi[device_id] = cupy.empty([p1-p0, npairs])
             log.debug(f"CDERI size {_cderi[device_id].nbytes/GB:.3f} GB on Device {device_id}")
         else:
@@ -296,7 +296,7 @@ def _cderi_task(intopt, cd_low, task_list, _cderi, aux_blksize,
     naoaux = cd_low.shape[0]
     npairs = [len(intopt.ao_pairs_row[cp_ij]) for cp_ij in range(len(intopt.log_qs))]
     pairs_loc = np.append(0, np.cumsum(npairs))
-    with cupy.cuda.Device(device_id), _streams[device_id]:
+    with cupy.cuda.Device(device_id):
         assert isinstance(mol.verbose, int)
         log = logger.new_logger(mol, mol.verbose)
         t1 = log.init_timer()
diff --git a/gpu4pyscf/df/df_jk.py b/gpu4pyscf/df/df_jk.py
@@ -27,7 +27,7 @@
 from gpu4pyscf.dft import rks, uks, numint
 from gpu4pyscf.scf import hf, uhf, rohf
 from gpu4pyscf.df import df, int3c2e
-from gpu4pyscf.__config__ import _streams, num_devices
+from gpu4pyscf.__config__ import num_devices
 
 def _pin_memory(array):
     mem = cupy.cuda.alloc_pinned_memory(array.nbytes)
@@ -296,7 +296,7 @@ def _jk_task_with_mo(dfobj, dms, mo_coeff, mo_occ,
                      with_j=True, with_k=True, hermi=0, device_id=0):
     ''' Calculate J and K matrices on single GPU
     '''
-    with cupy.cuda.Device(device_id), _streams[device_id]:
+    with cupy.cuda.Device(device_id):
         assert isinstance(dfobj.verbose, int)
         log = logger.new_logger(dfobj.mol, dfobj.verbose)
         t0 = log.init_timer()
@@ -361,7 +361,7 @@ def _jk_task_with_mo1(dfobj, dms, mo1s, occ_coeffs,
         For CP-HF or TDDFT
     '''
     vj = vk = None
-    with cupy.cuda.Device(device_id), _streams[device_id]:
+    with cupy.cuda.Device(device_id):
         assert isinstance(dfobj.verbose, int)
         log = logger.new_logger(dfobj.mol, dfobj.verbose)
         t0 = log.init_timer()
@@ -422,7 +422,7 @@ def _jk_task_with_mo1(dfobj, dms, mo1s, occ_coeffs,
 def _jk_task_with_dm(dfobj, dms, with_j=True, with_k=True, hermi=0, device_id=0):
     ''' Calculate J and K matrices with density matrix
     '''
-    with cupy.cuda.Device(device_id), _streams[device_id]:
+    with cupy.cuda.Device(device_id):
         assert isinstance(dfobj.verbose, int)
         log = logger.new_logger(dfobj.mol, dfobj.verbose)
         t0 = log.init_timer()
diff --git a/gpu4pyscf/df/grad/jk.py b/gpu4pyscf/df/grad/jk.py
@@ -18,13 +18,13 @@
 from gpu4pyscf.df.int3c2e import get_int3c2e_ip_jk, VHFOpt, _split_tasks
 from gpu4pyscf.lib.cupy_helper import contract, concatenate, reduce_to_device
 from gpu4pyscf.lib import logger
-from gpu4pyscf.__config__ import _streams, num_devices
+from gpu4pyscf.__config__ import num_devices
 
 def _jk_task(with_df, dm, orbo, with_j=True, with_k=True, device_id=0):
     '''  # (L|ij) -> rhoj: (L), rhok: (L|oo)
     '''
     rhoj = rhok = None
-    with cupy.cuda.Device(device_id), _streams[device_id]:
+    with cupy.cuda.Device(device_id):
         log = logger.new_logger(with_df.mol, with_df.verbose)
         assert isinstance(with_df.verbose, int)
         t0 = log.init_timer()
@@ -87,7 +87,7 @@ def _jk_ip_task(intopt, rhoj_cart, dm_cart, rhok_cart, orbo_cart, task_list,
                 with_j=True, with_k=True, device_id=0, omega=None):
     mol = intopt.mol
     natm = mol.natm
-    with cupy.cuda.Device(device_id), _streams[device_id]:
+    with cupy.cuda.Device(device_id):
         log = logger.new_logger(mol, mol.verbose)
         t0 = (logger.process_clock(), logger.perf_counter())
 
@@ -197,7 +197,7 @@ def _jk_task_td(with_df, dm, orbol, orbor, with_j=True, with_k=True, device_id=0
     (L|ij) -> rhoj: (L), rhok: (L|lr), for dm0 from scf, rhok is (L|oo) 
     '''
     rhoj = rhok = None
-    with cupy.cuda.Device(device_id), _streams[device_id]:
+    with cupy.cuda.Device(device_id):
         log = logger.new_logger(with_df.mol, with_df.verbose)
         assert isinstance(with_df.verbose, int)
         t0 = log.init_timer()
@@ -264,7 +264,7 @@ def _jk_ip_task_td(intopt, rhoj_cart, dm_cart, rhok_cart, orbol_cart, orbor_cart
                 with_j=True, with_k=True, device_id=0, omega=None):
     mol = intopt.mol
     natm = mol.natm
-    with cupy.cuda.Device(device_id), _streams[device_id]:
+    with cupy.cuda.Device(device_id):
         log = logger.new_logger(mol, mol.verbose)
         t0 = (logger.process_clock(), logger.perf_counter())
 
diff --git a/gpu4pyscf/df/hessian/jk.py b/gpu4pyscf/df/hessian/jk.py
@@ -23,7 +23,7 @@
 from gpu4pyscf.hessian.rhf import _ao2mo
 from gpu4pyscf.lib import logger
 from gpu4pyscf.lib.cupy_helper import contract, cart2sph, reduce_to_device
-from gpu4pyscf.__config__ import _streams, num_devices
+from gpu4pyscf.__config__ import num_devices
 
 NROOT_ON_GPU = 7
 
@@ -33,7 +33,7 @@ def _jk_task_with_mo1(dfobj, dms, mo_coeff, mo1s, occ_coeffs,
         For CP-HF
     '''
     assert hermi == 1
-    with cupy.cuda.Device(device_id), _streams[device_id]:
+    with cupy.cuda.Device(device_id):
         assert isinstance(dfobj.verbose, int)
         log = logger.new_logger(dfobj.mol, dfobj.verbose)
         t0 = log.init_timer()
@@ -278,7 +278,7 @@ def _int3c2e_ipip_tasks(intopt, task_list, rhoj, rhok, dm0, orbo,
     assert with_j or with_k
     ao_loc = intopt.ao_loc
     aux_ao_loc = intopt.aux_ao_loc
-    with cupy.cuda.Device(device_id), _streams[device_id]:
+    with cupy.cuda.Device(device_id):
         log = logger.new_logger(intopt.mol, intopt.mol.verbose)
         t0 = log.init_timer()
         orbo = cupy.asarray(orbo)
diff --git a/gpu4pyscf/df/int3c2e.py b/gpu4pyscf/df/int3c2e.py
@@ -24,7 +24,7 @@
                                        reduce_to_device, copy_array, transpose_sum)
 from gpu4pyscf.lib import logger
 from gpu4pyscf.gto.mole import basis_seg_contraction
-from gpu4pyscf.__config__ import num_devices, _streams
+from gpu4pyscf.__config__ import num_devices
 
 LMAX_ON_GPU = 8
 FREE_CUPY_CACHE = True
@@ -253,7 +253,7 @@ def build(self, cutoff=1e-14, group_size=None, group_size_aux=None,
     def bpcache(self):
         device_id = cupy.cuda.Device().id
         if device_id not in self._bpcache:
-            with cupy.cuda.Device(device_id), _streams[device_id]:
+            with cupy.cuda.Device(device_id):
                 log = logger.new_logger(self.mol, self.mol.verbose)
                 cput0 = log.init_timer()
                 bpcache = ctypes.POINTER(BasisProdCache)()
@@ -777,7 +777,7 @@ def get_j_int3c2e_pass2(intopt, rhoj, stream=None):
     return vj
 
 def _int3c2e_jk_task(intopt, task_k_list, dm0, mocc, device_id=0, omega=None):
-    with cupy.cuda.Device(device_id), _streams[device_id]:
+    with cupy.cuda.Device(device_id):
         log = logger.new_logger(intopt.mol, intopt.mol.verbose)
         t0 = log.init_timer()
         mocc = cupy.asarray(mocc)
@@ -874,7 +874,7 @@ def _int3c2e_ip1_vjk_task(intopt, task_k_list, rhoj, rhok, dm0, orbo, device_id=
     aoslices = intopt.mol.aoslice_by_atom()
     vj1_buf = vk1_buf = vj1 = vk1 = None
 
-    with cupy.cuda.Device(device_id), _streams[device_id]:
+    with cupy.cuda.Device(device_id):
         log = logger.new_logger(intopt.mol, intopt.mol.verbose)
         t0 = log.init_timer()
         ao2atom = get_ao2atom(intopt, aoslices)
@@ -978,7 +978,7 @@ def _int3c2e_ip2_vjk_task(intopt, task_k_list, rhoj, rhok, dm0, orbo,
     nao = intopt.mol.nao
     auxslices = intopt.auxmol.aoslice_by_atom()
     vj1 = vk1 = None
-    with cupy.cuda.Device(device_id), _streams[device_id]:
+    with cupy.cuda.Device(device_id):
         log = logger.new_logger(intopt.mol, intopt.mol.verbose)
         t0 = log.init_timer()
         aux2atom = get_aux2atom(intopt, auxslices)
@@ -1067,7 +1067,7 @@ def _int3c2e_ip1_wjk_task(intopt, task_k_list, dm0, orbo, wk, device_id=0, with_
     nao = intopt.mol.nao
     naux = intopt.auxmol.nao
     aux_ao_loc = intopt.aux_ao_loc
-    with cupy.cuda.Device(device_id), _streams[device_id]:
+    with cupy.cuda.Device(device_id):
         log = logger.new_logger(intopt.mol, intopt.mol.verbose)
         t0 = log.init_timer()
         ncp_ij = len(intopt.log_qs)
@@ -1127,7 +1127,7 @@ def get_int3c2e_ip1_wjk(intopt, dm0_tag, with_k=True, omega=None):
 
 def _int3c2e_ip2_wjk(intopt, task_list, dm0, orbo, with_k=True, omega=None, device_id=0):
     aux_ao_loc = intopt.aux_ao_loc
-    with cupy.cuda.Device(device_id), _streams[device_id]:
+    with cupy.cuda.Device(device_id):
         cupy.get_default_memory_pool().free_all_blocks()
         log = logger.new_logger(intopt.mol, intopt.mol.verbose)
         t0 = log.init_timer()
diff --git a/gpu4pyscf/dft/numint.py b/gpu4pyscf/dft/numint.py
@@ -29,7 +29,7 @@
 from gpu4pyscf.lib import logger
 from gpu4pyscf.lib.multi_gpu import lru_cache
 from gpu4pyscf import __config__
-from gpu4pyscf.__config__ import _streams, num_devices
+from gpu4pyscf.__config__ import num_devices
 
 LMAX_ON_GPU = 8
 BAS_ALIGNED = 1
@@ -448,7 +448,7 @@ def _nr_rks_task(ni, mol, grids, xc_code, dm, mo_coeff, mo_occ,
                  verbose=None, with_lapl=False, device_id=0, hermi=1):
     ''' nr_rks task on given device
     '''
-    with cupy.cuda.Device(device_id), _streams[device_id]:
+    with cupy.cuda.Device(device_id):
         if isinstance(dm, cupy.ndarray):
             assert dm.ndim == 2
             # Ensure dm allocated on each device
@@ -858,7 +858,7 @@ def _nr_uks_task(ni, mol, grids, xc_code, dms, mo_coeff, mo_occ,
                 verbose=None, with_lapl=False, device_id=0, hermi=1):
     ''' nr_uks task on one device
     '''
-    with cupy.cuda.Device(device_id), _streams[device_id]:
+    with cupy.cuda.Device(device_id):
         if dms is not None:
             dma, dmb = dms
             dma = cupy.asarray(dma)
@@ -1117,7 +1117,7 @@ def get_rho(ni, mol, dm, grids, max_memory=2000, verbose=None):
 
 def _nr_rks_fxc_task(ni, mol, grids, xc_code, fxc, dms, mo1, occ_coeff,
                      verbose=None, hermi=1, device_id=0):
-    with cupy.cuda.Device(device_id), _streams[device_id]:
+    with cupy.cuda.Device(device_id):
         if dms is not None: dms = cupy.asarray(dms)
         if mo1 is not None: mo1 = cupy.asarray(mo1)
         if occ_coeff is not None: occ_coeff = cupy.asarray(occ_coeff)
@@ -1281,7 +1281,7 @@ def nr_rks_fxc_st(ni, mol, grids, xc_code, dm0=None, dms_alpha=None,
 
 def _nr_uks_fxc_task(ni, mol, grids, xc_code, fxc, dms, mo1, occ_coeff,
                      verbose=None, hermi=1, device_id=0):
-    with cupy.cuda.Device(device_id), _streams[device_id]:
+    with cupy.cuda.Device(device_id):
         if dms is not None:
             dma, dmb = dms
             dma = cupy.asarray(dma)
diff --git a/gpu4pyscf/grad/rhf.py b/gpu4pyscf/grad/rhf.py
@@ -30,7 +30,6 @@
 from gpu4pyscf.lib.cupy_helper import (
     tag_array, contract, condense, reduce_to_device, transpose_sum, ensure_numpy)
 from gpu4pyscf.__config__ import props as gpu_specs
-from gpu4pyscf.__config__ import _streams, num_devices
 from gpu4pyscf.df import int3c2e      #TODO: move int3c2e to out of df
 from gpu4pyscf.lib import logger
 from gpu4pyscf.lib import multi_gpu
diff --git a/gpu4pyscf/grad/rks.py b/gpu4pyscf/grad/rks.py
@@ -30,7 +30,7 @@
     contract, get_avail_mem, add_sparse, tag_array, sandwich_dot,
     reduce_to_device, take_last2d, ndarray)
 from gpu4pyscf.lib import logger
-from gpu4pyscf.__config__ import _streams, num_devices
+from gpu4pyscf.__config__ import num_devices
 from gpu4pyscf.dft.numint import NLC_REMOVE_ZERO_RHO_GRID_THRESHOLD
 
 from pyscf import __config__
@@ -157,7 +157,7 @@ def _get_exc_task(ni, mol, grids, xc_code, dms, mo_coeff, mo_occ,
                   verbose=None, with_lapl=False, device_id=0):
     ''' Calculate the gradient of vxc on given device
     '''
-    with cupy.cuda.Device(device_id), _streams[device_id]:
+    with cupy.cuda.Device(device_id):
         if dms is not None: dms = cupy.asarray(dms)
         if mo_coeff is not None: mo_coeff = cupy.asarray(mo_coeff)
         if mo_occ is not None: mo_occ = cupy.asarray(mo_occ)
diff --git a/gpu4pyscf/grad/uks.py b/gpu4pyscf/grad/uks.py
@@ -29,7 +29,7 @@
 from gpu4pyscf.lib.cupy_helper import (
     contract, get_avail_mem, add_sparse, tag_array, reduce_to_device, take_last2d)
 from gpu4pyscf.lib import logger
-from gpu4pyscf.__config__ import _streams, num_devices
+from gpu4pyscf.__config__ import num_devices
 from gpu4pyscf import __config__
 
 MIN_BLK_SIZE = getattr(__config__, 'min_grid_blksize', 128*128)
@@ -132,7 +132,7 @@ def _get_exc_task(ni, mol, grids, xc_code, dms, mo_coeff, mo_occ,
                   verbose=None, with_lapl=False, grid_range=(), device_id=0):
     ''' Calculate the gradient of vxc on given device
     '''
-    with cupy.cuda.Device(device_id), _streams[device_id]:
+    with cupy.cuda.Device(device_id):
         if dms is not None: dms = cupy.asarray(dms)
         if mo_coeff is not None: mo_coeff = cupy.asarray(mo_coeff)
         if mo_occ is not None: mo_occ = cupy.asarray(mo_occ)
diff --git a/gpu4pyscf/gto/int3c1e.py b/gpu4pyscf/gto/int3c1e.py
@@ -24,7 +24,7 @@
 from gpu4pyscf.scf.int4c2e import BasisProdCache
 from gpu4pyscf.df.int3c2e import sort_mol, _split_l_ctr_groups, get_pairing
 from gpu4pyscf.gto.mole import basis_seg_contraction
-from gpu4pyscf.__config__ import num_devices, _streams
+from gpu4pyscf.__config__ import num_devices
 
 GPU_AO_LMAX = 4
 BLKSIZE = 128
@@ -136,7 +136,7 @@ def get_n_hermite_density_of_angular_pair(l):
 
         self._bpcache = {}
         for n in range(num_devices):
-            with cp.cuda.Device(n), _streams[n]:
+            with cp.cuda.Device(n):
                 bpcache = ctypes.POINTER(BasisProdCache)()
                 scale_shellpair_diag = 1.0
                 libgint.GINTinit_basis_prod(
diff --git a/gpu4pyscf/hessian/rhf.py b/gpu4pyscf/hessian/rhf.py
@@ -33,7 +33,7 @@
     contract, tag_array, transpose_sum, get_avail_mem, condense,
     krylov)
 from gpu4pyscf.__config__ import props as gpu_specs
-from gpu4pyscf.__config__ import _streams, num_devices
+from gpu4pyscf.__config__ import num_devices
 from gpu4pyscf.lib import logger
 from gpu4pyscf.lib import multi_gpu
 from gpu4pyscf.lib import utils
diff --git a/gpu4pyscf/hessian/rks.py b/gpu4pyscf/hessian/rks.py
@@ -31,7 +31,7 @@
 from gpu4pyscf.lib.cupy_helper import (contract, add_sparse, get_avail_mem,
                                        reduce_to_device, transpose_sum, take_last2d)
 from gpu4pyscf.lib import logger
-from gpu4pyscf.__config__ import _streams, num_devices, min_grid_blksize
+from gpu4pyscf.__config__ import num_devices, min_grid_blksize
 from gpu4pyscf.dft.numint import NLC_REMOVE_ZERO_RHO_GRID_THRESHOLD, _contract_rho1_fxc
 import ctypes
 from pyscf import __config__
@@ -406,7 +406,7 @@ def _get_vxc_deriv2_task(hessobj, grids, mo_coeff, mo_occ, max_memory, device_id
     ngrids_glob = grids.coords.shape[0]
     grid_start, grid_end = numint.gen_grid_range(ngrids_glob, device_id)
 
-    with cupy.cuda.Device(device_id), _streams[device_id]:
+    with cupy.cuda.Device(device_id):
         log = logger.new_logger(mol, verbose)
         t1 = t0 = log.init_timer()
         mo_occ = cupy.asarray(mo_occ)
@@ -1558,7 +1558,7 @@ def _get_vxc_deriv1_task(hessobj, grids, mo_coeff, mo_occ, max_memory, device_id
 
     ngrids_glob = grids.coords.shape[0]
     grid_start, grid_end = numint.gen_grid_range(ngrids_glob, device_id)
-    with cupy.cuda.Device(device_id), _streams[device_id]:
+    with cupy.cuda.Device(device_id):
         mo_occ = cupy.asarray(mo_occ)
         mo_coeff = cupy.asarray(mo_coeff)
         coeff = cupy.asarray(opt.coeff)
@@ -3805,7 +3805,7 @@ def _get_exc_deriv2_grid_response(hessobj, mo_coeff, mo_occ, max_memory):
 
 def _nr_rks_fxc_mo_task(ni, mol, grids, xc_code, fxc, mo_coeff, mo1, mocc,
                         verbose=None, hermi=1, device_id=0):
-    with cupy.cuda.Device(device_id), _streams[device_id]:
+    with cupy.cuda.Device(device_id):
         if mo_coeff is not None: mo_coeff = cupy.asarray(mo_coeff)
         if mo1 is not None: mo1 = cupy.asarray(mo1)
         if mocc is not None: mocc = cupy.asarray(mocc)
diff --git a/gpu4pyscf/lib/cupy_helper.py b/gpu4pyscf/lib/cupy_helper.py
diff --git a/gpu4pyscf/lib/multi_gpu.py b/gpu4pyscf/lib/multi_gpu.py
diff --git a/gpu4pyscf/mp/dfmp2.py b/gpu4pyscf/mp/dfmp2.py
diff --git a/gpu4pyscf/scf/jk.py b/gpu4pyscf/scf/jk.py