Skip to content

Commit ca3c85c

Browse files
authored
Remove pre-allocated streams (#561)
* Remove pre-allocated streams on devices. These streams complicate device synchronization. (Fix #548)
1 parent 7bd1c9b commit ca3c85c

File tree

17 files changed

+97
-138
lines changed

17 files changed

+97
-138
lines changed

gpu4pyscf/__config__.py

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -16,12 +16,6 @@
1616

1717
num_devices = cupy.cuda.runtime.getDeviceCount()
1818

19-
# TODO: switch to non_blocking stream (currently blocked by libxc)
20-
_streams = [None] * num_devices
21-
for device_id in range(num_devices):
22-
with cupy.cuda.Device(device_id):
23-
_streams[device_id] = cupy.cuda.stream.Stream(non_blocking=False)
24-
2519
props = cupy.cuda.runtime.getDeviceProperties(0)
2620
GB = 1024*1024*1024
2721
min_ao_blksize = 256 # maximum batch size of AOs

gpu4pyscf/df/df.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@
2828
from gpu4pyscf.lib import logger
2929
from gpu4pyscf.lib import utils
3030
from gpu4pyscf import __config__
31-
from gpu4pyscf.__config__ import _streams, num_devices
31+
from gpu4pyscf.__config__ import num_devices
3232

3333
MIN_BLK_SIZE = getattr(__config__, 'min_ao_blksize', 128)
3434
ALIGNED = getattr(__config__, 'ao_aligned', 32)
@@ -250,7 +250,7 @@ def cholesky_eri_gpu(intopt, mol, auxmol, cd_low,
250250
p1 = min(aux_blksize*(device_id+1), naux)
251251
#for device_id, (p0,p1) in enumerate(lib.prange(0, naux, aux_blksize)):
252252
if use_gpu_memory:
253-
with cupy.cuda.Device(device_id), _streams[device_id]:
253+
with cupy.cuda.Device(device_id):
254254
_cderi[device_id] = cupy.empty([p1-p0, npairs])
255255
log.debug(f"CDERI size {_cderi[device_id].nbytes/GB:.3f} GB on Device {device_id}")
256256
else:
@@ -296,7 +296,7 @@ def _cderi_task(intopt, cd_low, task_list, _cderi, aux_blksize,
296296
naoaux = cd_low.shape[0]
297297
npairs = [len(intopt.ao_pairs_row[cp_ij]) for cp_ij in range(len(intopt.log_qs))]
298298
pairs_loc = np.append(0, np.cumsum(npairs))
299-
with cupy.cuda.Device(device_id), _streams[device_id]:
299+
with cupy.cuda.Device(device_id):
300300
assert isinstance(mol.verbose, int)
301301
log = logger.new_logger(mol, mol.verbose)
302302
t1 = log.init_timer()

gpu4pyscf/df/df_jk.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@
2727
from gpu4pyscf.dft import rks, uks, numint
2828
from gpu4pyscf.scf import hf, uhf, rohf
2929
from gpu4pyscf.df import df, int3c2e
30-
from gpu4pyscf.__config__ import _streams, num_devices
30+
from gpu4pyscf.__config__ import num_devices
3131

3232
def _pin_memory(array):
3333
mem = cupy.cuda.alloc_pinned_memory(array.nbytes)
@@ -296,7 +296,7 @@ def _jk_task_with_mo(dfobj, dms, mo_coeff, mo_occ,
296296
with_j=True, with_k=True, hermi=0, device_id=0):
297297
''' Calculate J and K matrices on single GPU
298298
'''
299-
with cupy.cuda.Device(device_id), _streams[device_id]:
299+
with cupy.cuda.Device(device_id):
300300
assert isinstance(dfobj.verbose, int)
301301
log = logger.new_logger(dfobj.mol, dfobj.verbose)
302302
t0 = log.init_timer()
@@ -361,7 +361,7 @@ def _jk_task_with_mo1(dfobj, dms, mo1s, occ_coeffs,
361361
For CP-HF or TDDFT
362362
'''
363363
vj = vk = None
364-
with cupy.cuda.Device(device_id), _streams[device_id]:
364+
with cupy.cuda.Device(device_id):
365365
assert isinstance(dfobj.verbose, int)
366366
log = logger.new_logger(dfobj.mol, dfobj.verbose)
367367
t0 = log.init_timer()
@@ -422,7 +422,7 @@ def _jk_task_with_mo1(dfobj, dms, mo1s, occ_coeffs,
422422
def _jk_task_with_dm(dfobj, dms, with_j=True, with_k=True, hermi=0, device_id=0):
423423
''' Calculate J and K matrices with density matrix
424424
'''
425-
with cupy.cuda.Device(device_id), _streams[device_id]:
425+
with cupy.cuda.Device(device_id):
426426
assert isinstance(dfobj.verbose, int)
427427
log = logger.new_logger(dfobj.mol, dfobj.verbose)
428428
t0 = log.init_timer()

gpu4pyscf/df/grad/jk.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -18,13 +18,13 @@
1818
from gpu4pyscf.df.int3c2e import get_int3c2e_ip_jk, VHFOpt, _split_tasks
1919
from gpu4pyscf.lib.cupy_helper import contract, concatenate, reduce_to_device
2020
from gpu4pyscf.lib import logger
21-
from gpu4pyscf.__config__ import _streams, num_devices
21+
from gpu4pyscf.__config__ import num_devices
2222

2323
def _jk_task(with_df, dm, orbo, with_j=True, with_k=True, device_id=0):
2424
''' # (L|ij) -> rhoj: (L), rhok: (L|oo)
2525
'''
2626
rhoj = rhok = None
27-
with cupy.cuda.Device(device_id), _streams[device_id]:
27+
with cupy.cuda.Device(device_id):
2828
log = logger.new_logger(with_df.mol, with_df.verbose)
2929
assert isinstance(with_df.verbose, int)
3030
t0 = log.init_timer()
@@ -87,7 +87,7 @@ def _jk_ip_task(intopt, rhoj_cart, dm_cart, rhok_cart, orbo_cart, task_list,
8787
with_j=True, with_k=True, device_id=0, omega=None):
8888
mol = intopt.mol
8989
natm = mol.natm
90-
with cupy.cuda.Device(device_id), _streams[device_id]:
90+
with cupy.cuda.Device(device_id):
9191
log = logger.new_logger(mol, mol.verbose)
9292
t0 = (logger.process_clock(), logger.perf_counter())
9393

@@ -197,7 +197,7 @@ def _jk_task_td(with_df, dm, orbol, orbor, with_j=True, with_k=True, device_id=0
197197
(L|ij) -> rhoj: (L), rhok: (L|lr), for dm0 from scf, rhok is (L|oo)
198198
'''
199199
rhoj = rhok = None
200-
with cupy.cuda.Device(device_id), _streams[device_id]:
200+
with cupy.cuda.Device(device_id):
201201
log = logger.new_logger(with_df.mol, with_df.verbose)
202202
assert isinstance(with_df.verbose, int)
203203
t0 = log.init_timer()
@@ -264,7 +264,7 @@ def _jk_ip_task_td(intopt, rhoj_cart, dm_cart, rhok_cart, orbol_cart, orbor_cart
264264
with_j=True, with_k=True, device_id=0, omega=None):
265265
mol = intopt.mol
266266
natm = mol.natm
267-
with cupy.cuda.Device(device_id), _streams[device_id]:
267+
with cupy.cuda.Device(device_id):
268268
log = logger.new_logger(mol, mol.verbose)
269269
t0 = (logger.process_clock(), logger.perf_counter())
270270

gpu4pyscf/df/hessian/jk.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
from gpu4pyscf.hessian.rhf import _ao2mo
2424
from gpu4pyscf.lib import logger
2525
from gpu4pyscf.lib.cupy_helper import contract, cart2sph, reduce_to_device
26-
from gpu4pyscf.__config__ import _streams, num_devices
26+
from gpu4pyscf.__config__ import num_devices
2727

2828
NROOT_ON_GPU = 7
2929

@@ -33,7 +33,7 @@ def _jk_task_with_mo1(dfobj, dms, mo_coeff, mo1s, occ_coeffs,
3333
For CP-HF
3434
'''
3535
assert hermi == 1
36-
with cupy.cuda.Device(device_id), _streams[device_id]:
36+
with cupy.cuda.Device(device_id):
3737
assert isinstance(dfobj.verbose, int)
3838
log = logger.new_logger(dfobj.mol, dfobj.verbose)
3939
t0 = log.init_timer()
@@ -278,7 +278,7 @@ def _int3c2e_ipip_tasks(intopt, task_list, rhoj, rhok, dm0, orbo,
278278
assert with_j or with_k
279279
ao_loc = intopt.ao_loc
280280
aux_ao_loc = intopt.aux_ao_loc
281-
with cupy.cuda.Device(device_id), _streams[device_id]:
281+
with cupy.cuda.Device(device_id):
282282
log = logger.new_logger(intopt.mol, intopt.mol.verbose)
283283
t0 = log.init_timer()
284284
orbo = cupy.asarray(orbo)

gpu4pyscf/df/int3c2e.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
reduce_to_device, copy_array, transpose_sum)
2525
from gpu4pyscf.lib import logger
2626
from gpu4pyscf.gto.mole import basis_seg_contraction
27-
from gpu4pyscf.__config__ import num_devices, _streams
27+
from gpu4pyscf.__config__ import num_devices
2828

2929
LMAX_ON_GPU = 8
3030
FREE_CUPY_CACHE = True
@@ -253,7 +253,7 @@ def build(self, cutoff=1e-14, group_size=None, group_size_aux=None,
253253
def bpcache(self):
254254
device_id = cupy.cuda.Device().id
255255
if device_id not in self._bpcache:
256-
with cupy.cuda.Device(device_id), _streams[device_id]:
256+
with cupy.cuda.Device(device_id):
257257
log = logger.new_logger(self.mol, self.mol.verbose)
258258
cput0 = log.init_timer()
259259
bpcache = ctypes.POINTER(BasisProdCache)()
@@ -777,7 +777,7 @@ def get_j_int3c2e_pass2(intopt, rhoj, stream=None):
777777
return vj
778778

779779
def _int3c2e_jk_task(intopt, task_k_list, dm0, mocc, device_id=0, omega=None):
780-
with cupy.cuda.Device(device_id), _streams[device_id]:
780+
with cupy.cuda.Device(device_id):
781781
log = logger.new_logger(intopt.mol, intopt.mol.verbose)
782782
t0 = log.init_timer()
783783
mocc = cupy.asarray(mocc)
@@ -874,7 +874,7 @@ def _int3c2e_ip1_vjk_task(intopt, task_k_list, rhoj, rhok, dm0, orbo, device_id=
874874
aoslices = intopt.mol.aoslice_by_atom()
875875
vj1_buf = vk1_buf = vj1 = vk1 = None
876876

877-
with cupy.cuda.Device(device_id), _streams[device_id]:
877+
with cupy.cuda.Device(device_id):
878878
log = logger.new_logger(intopt.mol, intopt.mol.verbose)
879879
t0 = log.init_timer()
880880
ao2atom = get_ao2atom(intopt, aoslices)
@@ -978,7 +978,7 @@ def _int3c2e_ip2_vjk_task(intopt, task_k_list, rhoj, rhok, dm0, orbo,
978978
nao = intopt.mol.nao
979979
auxslices = intopt.auxmol.aoslice_by_atom()
980980
vj1 = vk1 = None
981-
with cupy.cuda.Device(device_id), _streams[device_id]:
981+
with cupy.cuda.Device(device_id):
982982
log = logger.new_logger(intopt.mol, intopt.mol.verbose)
983983
t0 = log.init_timer()
984984
aux2atom = get_aux2atom(intopt, auxslices)
@@ -1067,7 +1067,7 @@ def _int3c2e_ip1_wjk_task(intopt, task_k_list, dm0, orbo, wk, device_id=0, with_
10671067
nao = intopt.mol.nao
10681068
naux = intopt.auxmol.nao
10691069
aux_ao_loc = intopt.aux_ao_loc
1070-
with cupy.cuda.Device(device_id), _streams[device_id]:
1070+
with cupy.cuda.Device(device_id):
10711071
log = logger.new_logger(intopt.mol, intopt.mol.verbose)
10721072
t0 = log.init_timer()
10731073
ncp_ij = len(intopt.log_qs)
@@ -1127,7 +1127,7 @@ def get_int3c2e_ip1_wjk(intopt, dm0_tag, with_k=True, omega=None):
11271127

11281128
def _int3c2e_ip2_wjk(intopt, task_list, dm0, orbo, with_k=True, omega=None, device_id=0):
11291129
aux_ao_loc = intopt.aux_ao_loc
1130-
with cupy.cuda.Device(device_id), _streams[device_id]:
1130+
with cupy.cuda.Device(device_id):
11311131
cupy.get_default_memory_pool().free_all_blocks()
11321132
log = logger.new_logger(intopt.mol, intopt.mol.verbose)
11331133
t0 = log.init_timer()

gpu4pyscf/dft/numint.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@
2929
from gpu4pyscf.lib import logger
3030
from gpu4pyscf.lib.multi_gpu import lru_cache
3131
from gpu4pyscf import __config__
32-
from gpu4pyscf.__config__ import _streams, num_devices
32+
from gpu4pyscf.__config__ import num_devices
3333

3434
LMAX_ON_GPU = 8
3535
BAS_ALIGNED = 1
@@ -448,7 +448,7 @@ def _nr_rks_task(ni, mol, grids, xc_code, dm, mo_coeff, mo_occ,
448448
verbose=None, with_lapl=False, device_id=0, hermi=1):
449449
''' nr_rks task on given device
450450
'''
451-
with cupy.cuda.Device(device_id), _streams[device_id]:
451+
with cupy.cuda.Device(device_id):
452452
if isinstance(dm, cupy.ndarray):
453453
assert dm.ndim == 2
454454
# Ensure dm allocated on each device
@@ -858,7 +858,7 @@ def _nr_uks_task(ni, mol, grids, xc_code, dms, mo_coeff, mo_occ,
858858
verbose=None, with_lapl=False, device_id=0, hermi=1):
859859
''' nr_uks task on one device
860860
'''
861-
with cupy.cuda.Device(device_id), _streams[device_id]:
861+
with cupy.cuda.Device(device_id):
862862
if dms is not None:
863863
dma, dmb = dms
864864
dma = cupy.asarray(dma)
@@ -1117,7 +1117,7 @@ def get_rho(ni, mol, dm, grids, max_memory=2000, verbose=None):
11171117

11181118
def _nr_rks_fxc_task(ni, mol, grids, xc_code, fxc, dms, mo1, occ_coeff,
11191119
verbose=None, hermi=1, device_id=0):
1120-
with cupy.cuda.Device(device_id), _streams[device_id]:
1120+
with cupy.cuda.Device(device_id):
11211121
if dms is not None: dms = cupy.asarray(dms)
11221122
if mo1 is not None: mo1 = cupy.asarray(mo1)
11231123
if occ_coeff is not None: occ_coeff = cupy.asarray(occ_coeff)
@@ -1281,7 +1281,7 @@ def nr_rks_fxc_st(ni, mol, grids, xc_code, dm0=None, dms_alpha=None,
12811281

12821282
def _nr_uks_fxc_task(ni, mol, grids, xc_code, fxc, dms, mo1, occ_coeff,
12831283
verbose=None, hermi=1, device_id=0):
1284-
with cupy.cuda.Device(device_id), _streams[device_id]:
1284+
with cupy.cuda.Device(device_id):
12851285
if dms is not None:
12861286
dma, dmb = dms
12871287
dma = cupy.asarray(dma)

gpu4pyscf/grad/rhf.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,6 @@
3030
from gpu4pyscf.lib.cupy_helper import (
3131
tag_array, contract, condense, reduce_to_device, transpose_sum, ensure_numpy)
3232
from gpu4pyscf.__config__ import props as gpu_specs
33-
from gpu4pyscf.__config__ import _streams, num_devices
3433
from gpu4pyscf.df import int3c2e #TODO: move int3c2e to out of df
3534
from gpu4pyscf.lib import logger
3635
from gpu4pyscf.lib import multi_gpu

gpu4pyscf/grad/rks.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@
3030
contract, get_avail_mem, add_sparse, tag_array, sandwich_dot,
3131
reduce_to_device, take_last2d, ndarray)
3232
from gpu4pyscf.lib import logger
33-
from gpu4pyscf.__config__ import _streams, num_devices
33+
from gpu4pyscf.__config__ import num_devices
3434
from gpu4pyscf.dft.numint import NLC_REMOVE_ZERO_RHO_GRID_THRESHOLD
3535

3636
from pyscf import __config__
@@ -157,7 +157,7 @@ def _get_exc_task(ni, mol, grids, xc_code, dms, mo_coeff, mo_occ,
157157
verbose=None, with_lapl=False, device_id=0):
158158
''' Calculate the gradient of vxc on given device
159159
'''
160-
with cupy.cuda.Device(device_id), _streams[device_id]:
160+
with cupy.cuda.Device(device_id):
161161
if dms is not None: dms = cupy.asarray(dms)
162162
if mo_coeff is not None: mo_coeff = cupy.asarray(mo_coeff)
163163
if mo_occ is not None: mo_occ = cupy.asarray(mo_occ)

gpu4pyscf/grad/uks.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@
2929
from gpu4pyscf.lib.cupy_helper import (
3030
contract, get_avail_mem, add_sparse, tag_array, reduce_to_device, take_last2d)
3131
from gpu4pyscf.lib import logger
32-
from gpu4pyscf.__config__ import _streams, num_devices
32+
from gpu4pyscf.__config__ import num_devices
3333
from gpu4pyscf import __config__
3434

3535
MIN_BLK_SIZE = getattr(__config__, 'min_grid_blksize', 128*128)
@@ -132,7 +132,7 @@ def _get_exc_task(ni, mol, grids, xc_code, dms, mo_coeff, mo_occ,
132132
verbose=None, with_lapl=False, grid_range=(), device_id=0):
133133
''' Calculate the gradient of vxc on given device
134134
'''
135-
with cupy.cuda.Device(device_id), _streams[device_id]:
135+
with cupy.cuda.Device(device_id):
136136
if dms is not None: dms = cupy.asarray(dms)
137137
if mo_coeff is not None: mo_coeff = cupy.asarray(mo_coeff)
138138
if mo_occ is not None: mo_occ = cupy.asarray(mo_occ)

0 commit comments

Comments
 (0)