Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions .github/workflows/unittest.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ permissions:
jobs:
single-gpu:
runs-on: [self-hosted, Linux, X64, v100]
timeout-minutes: 360
timeout-minutes: 180
steps:
- uses: actions/checkout@v3

Expand All @@ -33,11 +33,11 @@ jobs:
-v $GITHUB_WORKSPACE:/workspace \
-v ~/.cache/pip:/root/.cache/pip \
pyscf/gpu4pyscf-devel:pyscf-2.8 \
/bin/bash -c "cd /workspace && pip3 install -r requirements.txt && source build.sh && pytest -m 'not slow and not benchmark' --cov=/workspace --durations=50 && rm -rf .pytest_cache"
/bin/bash -c "cd /workspace && pip3 install -r requirements.txt && source build.sh && pytest -s -v -m 'not slow and not benchmark' --cov=/workspace --durations=50 && rm -rf .pytest_cache"

multi-gpu:
runs-on: [self-hosted, Linux, X64, 2T4]
timeout-minutes: 360
timeout-minutes: 180
steps:
- uses: actions/checkout@v3

Expand All @@ -55,4 +55,4 @@ jobs:
-v $GITHUB_WORKSPACE:/workspace \
-v ~/.cache/pip:/root/.cache/pip \
pyscf/gpu4pyscf-devel:pyscf-2.8 \
/bin/bash -c "cd /workspace && pip3 install -r requirements.txt && source build.sh && pytest -m 'not slow and not benchmark' --cov=/workspace --durations=50 && rm -rf .pytest_cache"
/bin/bash -c "cd /workspace && pip3 install -r requirements.txt && source build.sh && pytest -s -v -m 'not slow and not benchmark' --cov=/workspace --durations=50 && rm -rf .pytest_cache"
4 changes: 3 additions & 1 deletion gpu4pyscf/gto/mole.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,9 @@ def group_basis(mol, tile=1, group_size=None, return_bas_mapping=False,
sparse_coeff (bool):
One-to-one mapping between the sorted_mol and mol is assumed.
The array of mapping indices instead of a single transformation
matrix is returned if this option is specified.
matrix is returned if this option is specified. The mapping indices
can transform the AOs from mol to sorted_mol:
sorted_mol.ao_labels() == mol.ao_labels()[idx]
'''
from gpu4pyscf.lib import logger
original_mol = mol
Expand Down
1 change: 1 addition & 0 deletions gpu4pyscf/pbc/df/int3c2e.py
Original file line number Diff line number Diff line change
Expand Up @@ -1041,6 +1041,7 @@ def evaluate_j3c(li, lj, k):
_bas_cpu.ctypes, ctypes.c_int(bvkcell.nbas), _env_cpu.ctypes)
if err != 0:
raise RuntimeError(f'fill_int3c2e kernel for {lll} failed')
cp.cuda.get_current_stream().synchronize()
return c_pair_idx, eri3c
return evaluate_j3c

Expand Down
30 changes: 29 additions & 1 deletion gpu4pyscf/pbc/df/rsdf_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -780,6 +780,9 @@ def _lr_int3c2e_gamma_point(ft_opt, bas_ij_cache, cd_j2c, auxcell, omega):
_buf = buf[:j3c_tmp.size].reshape(j3c_tmp.shape)
cderi_compressed[:,pair0:pair1] = j3c_tmp.get(out=_buf)
j3c_tmp = None
# Synchronize the host with the device before the local variables are
# released, because the mapped memory may still be in use by the device.
multi_gpu.synchronize()
return cderi_compressed

# The long-range part of the cderi for k points. The 3-index cderi tensor is compressed.
Expand Down Expand Up @@ -935,9 +938,11 @@ def proc():
pair0, pair1 = ao_pair_offsets[i, j]
cderi_compressed[kp][:,pair0:pair1] = j3c_tmp.get(out=_buf)
#t1 = log.timer_debug2(f'processing {ll_pattern}', *t1)
# Synchronize the current CUDA stream before the local variables are
# released, because the mapped memory may still be in use by the device.
cp.cuda.get_current_stream().synchronize()

multi_gpu.run(proc, non_blocking=True)

return cderi_compressed

def compressed_cderi_gamma_point(cell, auxcell, omega=OMEGA_MIN, with_long_range=True,
Expand Down Expand Up @@ -1035,12 +1040,14 @@ def proc():
ish, jsh = divmod(c_pair_idx, nctrj)
ish += i0
jsh += j0
print(buflen, j3c_tmp.shape)
if with_long_range:
ft_idx = aopair_offsets_lookup[ish,0,jsh]
ij = np.arange(nfi*nfj, dtype=np.int32)
idx = ij + ft_idx[:,None]
#:cderi[:,idx.ravel()] += j3c_tmp.get()
_buf = j3c_tmp.get(out=buf[:j3c_tmp.size].reshape(j3c_tmp.shape))
print(idx.min(), idx.max())
idx = np.asarray(idx.ravel(), dtype=np.int32)
libpbc.take2d_add( # this copy back operation is very slow
cderi.ctypes, _buf.ctypes, idx.ctypes,
Expand All @@ -1050,6 +1057,7 @@ def proc():
p0, p1 = ao_pair_offsets[li, lj]
cderi[:,p0:p1] = j3c_tmp.get(out=buf[:j3c_tmp.size].reshape(j3c_tmp.shape))
j3c_tmp = ish = jsh = c_pair_idx = None
cp.cuda.get_current_stream().synchronize()

multi_gpu.run(proc, non_blocking=True)

Expand Down Expand Up @@ -1094,6 +1102,7 @@ def compressed_cderi_j_only(cell, auxcell, kpts, kmesh=None, omega=OMEGA_MIN,
nf = uniq_l * 2 + 1

img_idx_cache = int3c2e_opt.make_img_idx_cache()
print('compressed_cderi_j_only')

if with_long_range:
# LR int3c2e generally creates more non-negligible Coulomb integrals.
Expand All @@ -1114,6 +1123,7 @@ def compressed_cderi_j_only(cell, auxcell, kpts, kmesh=None, omega=OMEGA_MIN,
int3c2e_opt, img_idx_cache)
nao_pairs = len(ao_pair_mapping)
cderi = empty_mapped((naux, nao_pairs))
print('compressed_cderi_j_only SR')

log.debug('Avail GPU mem = %s B', get_avail_mem())
aux_loc = int3c2e_opt.sorted_auxcell.ao_loc_nr(cart=True)
Expand All @@ -1126,6 +1136,9 @@ def compressed_cderi_j_only(cell, auxcell, kpts, kmesh=None, omega=OMEGA_MIN,
p0, p1 = p1, p1 + npairs
ao_pair_offsets[li, lj] = p0, p1

import gc
cp.get_default_memory_pool().free_all_blocks()
gc.collect()
tasks = iter(img_idx_cache)
def proc():
if not cell.cart:
Expand Down Expand Up @@ -1166,6 +1179,7 @@ def proc():
j3c_block = cp.empty((naux_cart,nji))
for k in range(len(int3c2e_opt.uniq_l_ctr_aux)):
j3c_tmp = evaluate(li, lj, k)[1]
cp.cuda.get_current_stream().synchronize()
if j3c_tmp.size == 0:
continue
# It is possible to optimize the j-only case by performing the
Expand All @@ -1177,10 +1191,13 @@ def proc():
j3c_tmp = j3c_tmp.transpose(4,0,3,1,2)
k0, k1 = aux_loc[int3c2e_opt.l_ctr_aux_offsets[k:k+2]]
j3c_block[k0:k1] = j3c_tmp.reshape(-1,nji)
cp.cuda.get_current_stream().synchronize()

j3c_block = contract('uv,up->vp', aux_coeff, j3c_block)
_buf = buf[:j3c_block.size].reshape(j3c_block.shape)
print(buflen, _buf.shape)
if with_long_range:
print(idx.min(), idx.max())
_buf = j3c_block.get(out=_buf)
libpbc.take2d_add( # this copy back operation is very slow
cderi.ctypes, _buf.ctypes, idx.ctypes,
Expand All @@ -1191,6 +1208,7 @@ def proc():
p0, p1 = ao_pair_offsets[li, lj]
cderi[:,p0:p1] = j3c_block.get(out=_buf)
j3c_tmp = j3c_block = None
cp.cuda.get_current_stream().synchronize()

multi_gpu.run(proc, non_blocking=True)

Expand Down Expand Up @@ -1237,6 +1255,7 @@ def compressed_cderi_kk(cell, auxcell, kpts, kmesh=None, omega=OMEGA_MIN,
nf = uniq_l * 2 + 1

img_idx_cache = int3c2e_opt.make_img_idx_cache()
print('compressed_cderi_kk')

if with_long_range:
# LR int3c2e generally creates more non-negligible Coulomb integrals.
Expand All @@ -1259,6 +1278,7 @@ def compressed_cderi_kk(cell, auxcell, kpts, kmesh=None, omega=OMEGA_MIN,
for j2c_idx, (kp, kp_conj, ki_idx, kj_idx) in enumerate(kpt_iters):
naux = cd_j2c_cache[j2c_idx].shape[1]
cderi[kp] = empty_mapped((naux,nao_pairs), dtype=np.complex128)
print('compressed_cderi_kk SR')

log.debug('Avail GPU mem = %s B', get_avail_mem())
aux_loc = int3c2e_opt.sorted_auxcell.ao_loc_nr(cart=True)
Expand All @@ -1273,6 +1293,9 @@ def compressed_cderi_kk(cell, auxcell, kpts, kmesh=None, omega=OMEGA_MIN,
ao_pair_offsets[li, lj] = p0, p1

tasks = iter(img_idx_cache)
import gc
cp.get_default_memory_pool().free_all_blocks()
gc.collect()
def proc():
if not cell.cart:
c2s = [cart2sph_by_l(l) for l in range(lmax+1)]
Expand Down Expand Up @@ -1328,10 +1351,14 @@ def proc():
k0, k1 = aux_loc[int3c2e_opt.l_ctr_aux_offsets[k:k+2]]
j3c_block[:,k0:k1] = j3c_tmp

if with_long_range:
print(idx.min(), idx.max(), len(idx))
cp.cuda.get_current_stream().synchronize()
for j2c_idx, (kp, kp_conj, ki_idx, kj_idx) in enumerate(kpt_iters):
aux_coeff = _cd_j2c_cache[j2c_idx] # at -(kj-ki)
cderi_k = contract('uv,up->vp', aux_coeff, j3c_block[j2c_idx])
_buf = buf[:cderi_k.size].reshape(cderi_k.shape)
print(kp, cderi[kp].shape, _buf.shape)
if with_long_range:
_buf = cderi_k.get(out=_buf)
nao_pairs = cderi[kp].shape[1] * 2 # *2 to view complex as doubles
Expand All @@ -1344,6 +1371,7 @@ def proc():
p0, p1 = ao_pair_offsets[li, lj]
cderi[kp][:,p0:p1] = cderi_k.get(out=_buf)
j3c_tmp = j3c_block = None
cp.cuda.get_current_stream().synchronize()

multi_gpu.run(proc, non_blocking=True)

Expand Down
Loading