From 9ec757d3b7eeb1e858974e6fbd625b2c86dadc7f Mon Sep 17 00:00:00 2001 From: Qiming Sun Date: Wed, 10 Sep 2025 12:51:21 +0800 Subject: [PATCH 01/11] Pinned memory may be accidentally released by the host while still being accessed by devices. --- gpu4pyscf/pbc/df/rsdf_builder.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/gpu4pyscf/pbc/df/rsdf_builder.py b/gpu4pyscf/pbc/df/rsdf_builder.py index 0168587cc..c72c74874 100644 --- a/gpu4pyscf/pbc/df/rsdf_builder.py +++ b/gpu4pyscf/pbc/df/rsdf_builder.py @@ -780,6 +780,9 @@ def _lr_int3c2e_gamma_point(ft_opt, bas_ij_cache, cd_j2c, auxcell, omega): _buf = buf[:j3c_tmp.size].reshape(j3c_tmp.shape) cderi_compressed[:,pair0:pair1] = j3c_tmp.get(out=_buf) j3c_tmp = None + # It's important to synchronize the host and CUDA kernel before releasing + # local variables, as the mapped memory may still be in use by the device. + multi_gpu.synchronize() return cderi_compressed # The long-range part of the cderi for k points. The 3-index cderi tensor is compressed. 
@@ -935,6 +938,7 @@ def proc(): pair0, pair1 = ao_pair_offsets[i, j] cderi_compressed[kp][:,pair0:pair1] = j3c_tmp.get(out=_buf) #t1 = log.timer_debug2(f'processing {ll_pattern}', *t1) + cp.cuda.get_current_stream().synchronize() multi_gpu.run(proc, non_blocking=True) @@ -1050,6 +1054,7 @@ def proc(): p0, p1 = ao_pair_offsets[li, lj] cderi[:,p0:p1] = j3c_tmp.get(out=buf[:j3c_tmp.size].reshape(j3c_tmp.shape)) j3c_tmp = ish = jsh = c_pair_idx = None + cp.cuda.get_current_stream().synchronize() multi_gpu.run(proc, non_blocking=True) @@ -1191,6 +1196,7 @@ def proc(): p0, p1 = ao_pair_offsets[li, lj] cderi[:,p0:p1] = j3c_block.get(out=_buf) j3c_tmp = j3c_block = None + cp.cuda.get_current_stream().synchronize() multi_gpu.run(proc, non_blocking=True) @@ -1344,6 +1350,7 @@ def proc(): p0, p1 = ao_pair_offsets[li, lj] cderi[kp][:,p0:p1] = cderi_k.get(out=_buf) j3c_tmp = j3c_block = None + cp.cuda.get_current_stream().synchronize() multi_gpu.run(proc, non_blocking=True) From 4243e792de14da448d7340bfd5aa7d6448ba857f Mon Sep 17 00:00:00 2001 From: Qiming Sun Date: Mon, 22 Sep 2025 09:50:28 -0700 Subject: [PATCH 02/11] Adjust CI timeout --- .github/workflows/unittest.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/unittest.yml b/.github/workflows/unittest.yml index 6abeb5728..4bb1fb38d 100644 --- a/.github/workflows/unittest.yml +++ b/.github/workflows/unittest.yml @@ -15,7 +15,7 @@ permissions: jobs: single-gpu: runs-on: [self-hosted, Linux, X64, v100] - timeout-minutes: 360 + timeout-minutes: 180 steps: - uses: actions/checkout@v3 @@ -36,7 +36,7 @@ jobs: multi-gpu: runs-on: [self-hosted, Linux, X64, 2T4] - timeout-minutes: 360 + timeout-minutes: 180 steps: - uses: actions/checkout@v3 From 7479aa1a0f62b62cea6fccecbf8d33b6b3abedce Mon Sep 17 00:00:00 2001 From: Qiming Sun Date: Tue, 23 Sep 2025 04:10:20 +0800 Subject: [PATCH 03/11] Debugging memory_mapped --- gpu4pyscf/pbc/df/rsdf_builder.py | 3 +++ 1 file changed, 3 insertions(+) 
diff --git a/gpu4pyscf/pbc/df/rsdf_builder.py b/gpu4pyscf/pbc/df/rsdf_builder.py index c72c74874..ce4163183 100644 --- a/gpu4pyscf/pbc/df/rsdf_builder.py +++ b/gpu4pyscf/pbc/df/rsdf_builder.py @@ -1255,6 +1255,7 @@ def compressed_cderi_kk(cell, auxcell, kpts, kmesh=None, omega=OMEGA_MIN, t1 = log.timer_debug2('generating bas_ij indices', *t1) cderi = _lr_int3c2e_kk(ft_opt, bas_ij_cache, cd_j2c_cache, int3c2e_opt.sorted_auxcell, omega, kpts, kpt_iters) + print([x.sum() for x in cderi.values()]) # LR int3c2e would generate more nao_pairs than the SR int3c2e! t1 = log.timer_debug1('LR int3c2e', *t1) else: @@ -1334,12 +1335,14 @@ def proc(): k0, k1 = aux_loc[int3c2e_opt.l_ctr_aux_offsets[k:k+2]] j3c_block[:,k0:k1] = j3c_tmp + print(idx.min(), idx.max(), len(idx)) for j2c_idx, (kp, kp_conj, ki_idx, kj_idx) in enumerate(kpt_iters): aux_coeff = _cd_j2c_cache[j2c_idx] # at -(kj-ki) cderi_k = contract('uv,up->vp', aux_coeff, j3c_block[j2c_idx]) _buf = buf[:cderi_k.size].reshape(cderi_k.shape) if with_long_range: _buf = cderi_k.get(out=_buf) + print(kp, cderi[kp].shape, _buf.shape) nao_pairs = cderi[kp].shape[1] * 2 # *2 to view complex as doubles libpbc.take2d_add( # this copy back operation is very slow cderi[kp].ctypes, _buf.ctypes, idx.ctypes, From 760aca69720fbf1b3b60475e0e9c7673851d6634 Mon Sep 17 00:00:00 2001 From: Qiming Sun Date: Tue, 23 Sep 2025 05:17:46 +0800 Subject: [PATCH 04/11] Debugging pinned memory --- gpu4pyscf/pbc/df/rsdf_builder.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/gpu4pyscf/pbc/df/rsdf_builder.py b/gpu4pyscf/pbc/df/rsdf_builder.py index ce4163183..ad2178b02 100644 --- a/gpu4pyscf/pbc/df/rsdf_builder.py +++ b/gpu4pyscf/pbc/df/rsdf_builder.py @@ -1335,7 +1335,8 @@ def proc(): k0, k1 = aux_loc[int3c2e_opt.l_ctr_aux_offsets[k:k+2]] j3c_block[:,k0:k1] = j3c_tmp - print(idx.min(), idx.max(), len(idx)) + if with_long_range: + print(idx.min(), idx.max(), len(idx)) for j2c_idx, (kp, kp_conj, ki_idx, kj_idx) in 
enumerate(kpt_iters): aux_coeff = _cd_j2c_cache[j2c_idx] # at -(kj-ki) cderi_k = contract('uv,up->vp', aux_coeff, j3c_block[j2c_idx]) From 54465ba60ab6c0ffefb72a59fa3d7f5ff882de92 Mon Sep 17 00:00:00 2001 From: Qiming Sun Date: Tue, 23 Sep 2025 07:38:08 +0800 Subject: [PATCH 05/11] More debugging info --- gpu4pyscf/pbc/df/rsdf_builder.py | 1 + 1 file changed, 1 insertion(+) diff --git a/gpu4pyscf/pbc/df/rsdf_builder.py b/gpu4pyscf/pbc/df/rsdf_builder.py index ad2178b02..18d76d486 100644 --- a/gpu4pyscf/pbc/df/rsdf_builder.py +++ b/gpu4pyscf/pbc/df/rsdf_builder.py @@ -1340,6 +1340,7 @@ def proc(): for j2c_idx, (kp, kp_conj, ki_idx, kj_idx) in enumerate(kpt_iters): aux_coeff = _cd_j2c_cache[j2c_idx] # at -(kj-ki) cderi_k = contract('uv,up->vp', aux_coeff, j3c_block[j2c_idx]) + print('cderi_k.shape', kp, cderi_k.shape, buf.shape) _buf = buf[:cderi_k.size].reshape(cderi_k.shape) if with_long_range: _buf = cderi_k.get(out=_buf) From 400527aa8eb037c80457bf3a756e17bcbba58095 Mon Sep 17 00:00:00 2001 From: Qiming Sun Date: Thu, 2 Oct 2025 14:32:50 -0700 Subject: [PATCH 06/11] Remove redundant print statements --- gpu4pyscf/gto/mole.py | 4 +++- gpu4pyscf/pbc/df/rsdf_builder.py | 5 ----- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/gpu4pyscf/gto/mole.py b/gpu4pyscf/gto/mole.py index 56234c647..f6135e8d0 100644 --- a/gpu4pyscf/gto/mole.py +++ b/gpu4pyscf/gto/mole.py @@ -193,7 +193,9 @@ def group_basis(mol, tile=1, group_size=None, return_bas_mapping=False, sparse_coeff (bool): One-to-one mapping between the sorted_mol and mol is assumed. The array of mapping indices instead of a single transformation - matrix is returned if this option is specified. + matrix is returned if this option is specified. 
The mapping indices + can transform the AOs from mol to sorted_mol: + sorted_mol.ao_labels() == mol.ao_labels()[idx] ''' from gpu4pyscf.lib import logger original_mol = mol diff --git a/gpu4pyscf/pbc/df/rsdf_builder.py b/gpu4pyscf/pbc/df/rsdf_builder.py index 18d76d486..c72c74874 100644 --- a/gpu4pyscf/pbc/df/rsdf_builder.py +++ b/gpu4pyscf/pbc/df/rsdf_builder.py @@ -1255,7 +1255,6 @@ def compressed_cderi_kk(cell, auxcell, kpts, kmesh=None, omega=OMEGA_MIN, t1 = log.timer_debug2('generating bas_ij indices', *t1) cderi = _lr_int3c2e_kk(ft_opt, bas_ij_cache, cd_j2c_cache, int3c2e_opt.sorted_auxcell, omega, kpts, kpt_iters) - print([x.sum() for x in cderi.values()]) # LR int3c2e would generate more nao_pairs than the SR int3c2e! t1 = log.timer_debug1('LR int3c2e', *t1) else: @@ -1335,16 +1334,12 @@ def proc(): k0, k1 = aux_loc[int3c2e_opt.l_ctr_aux_offsets[k:k+2]] j3c_block[:,k0:k1] = j3c_tmp - if with_long_range: - print(idx.min(), idx.max(), len(idx)) for j2c_idx, (kp, kp_conj, ki_idx, kj_idx) in enumerate(kpt_iters): aux_coeff = _cd_j2c_cache[j2c_idx] # at -(kj-ki) cderi_k = contract('uv,up->vp', aux_coeff, j3c_block[j2c_idx]) - print('cderi_k.shape', kp, cderi_k.shape, buf.shape) _buf = buf[:cderi_k.size].reshape(cderi_k.shape) if with_long_range: _buf = cderi_k.get(out=_buf) - print(kp, cderi[kp].shape, _buf.shape) nao_pairs = cderi[kp].shape[1] * 2 # *2 to view complex as doubles libpbc.take2d_add( # this copy back operation is very slow cderi[kp].ctypes, _buf.ctypes, idx.ctypes, From 9615aabdedc8467255a7693498e0d2cf0f74296f Mon Sep 17 00:00:00 2001 From: Qiming Sun Date: Mon, 6 Oct 2025 09:45:40 -0700 Subject: [PATCH 07/11] Debugging rsdf_builder --- gpu4pyscf/pbc/df/rsdf_builder.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/gpu4pyscf/pbc/df/rsdf_builder.py b/gpu4pyscf/pbc/df/rsdf_builder.py index c72c74874..d20f44d2f 100644 --- a/gpu4pyscf/pbc/df/rsdf_builder.py +++ b/gpu4pyscf/pbc/df/rsdf_builder.py @@ -1255,6 +1255,7 @@ def 
compressed_cderi_kk(cell, auxcell, kpts, kmesh=None, omega=OMEGA_MIN, t1 = log.timer_debug2('generating bas_ij indices', *t1) cderi = _lr_int3c2e_kk(ft_opt, bas_ij_cache, cd_j2c_cache, int3c2e_opt.sorted_auxcell, omega, kpts, kpt_iters) + print([x.sum() for x in cderi.values()]) # LR int3c2e would generate more nao_pairs than the SR int3c2e! t1 = log.timer_debug1('LR int3c2e', *t1) else: @@ -1334,11 +1335,14 @@ def proc(): k0, k1 = aux_loc[int3c2e_opt.l_ctr_aux_offsets[k:k+2]] j3c_block[:,k0:k1] = j3c_tmp + if with_long_range: + print(idx.min(), idx.max(), len(idx)) for j2c_idx, (kp, kp_conj, ki_idx, kj_idx) in enumerate(kpt_iters): aux_coeff = _cd_j2c_cache[j2c_idx] # at -(kj-ki) cderi_k = contract('uv,up->vp', aux_coeff, j3c_block[j2c_idx]) _buf = buf[:cderi_k.size].reshape(cderi_k.shape) if with_long_range: + print(kp, cderi[kp].shape, _buf.shape) _buf = cderi_k.get(out=_buf) nao_pairs = cderi[kp].shape[1] * 2 # *2 to view complex as doubles libpbc.take2d_add( # this copy back operation is very slow From 71d57140b35781a45d503d9bcf59c984f8b50f50 Mon Sep 17 00:00:00 2001 From: Qiming Sun Date: Wed, 8 Oct 2025 14:35:32 -0700 Subject: [PATCH 08/11] More debugging msgs --- .github/workflows/unittest.yml | 4 ++-- gpu4pyscf/pbc/df/rsdf_builder.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/unittest.yml b/.github/workflows/unittest.yml index 4bb1fb38d..366550414 100644 --- a/.github/workflows/unittest.yml +++ b/.github/workflows/unittest.yml @@ -32,7 +32,7 @@ jobs: -e HTTP_PROXY=$HTTP_PROXY \ -e HTTPS_PROXY=$HTTPS_PROXY \ -v $GITHUB_WORKSPACE:/workspace pyscf/gpu4pyscf-devel:latest \ - /bin/bash -c "cd /workspace && pip3 install --no-cache-dir --target=/tmp/deps -r requirements.txt && export PYTHONPATH=/tmp/deps:$PYTHONPATH && source build.sh && pytest -m 'not slow and not benchmark' --cov=/workspace --durations=50 && rm -rf .pytest_cache" + /bin/bash -c "cd /workspace && pip3 install --no-cache-dir --target=/tmp/deps -r 
requirements.txt && export PYTHONPATH=/tmp/deps:$PYTHONPATH && source build.sh && pytest -s -m 'not slow and not benchmark' --cov=/workspace --durations=50 && rm -rf .pytest_cache" multi-gpu: runs-on: [self-hosted, Linux, X64, 2T4] @@ -53,4 +53,4 @@ jobs: -e HTTP_PROXY=$HTTP_PROXY \ -e HTTPS_PROXY=$HTTPS_PROXY \ -v $GITHUB_WORKSPACE:/workspace pyscf/gpu4pyscf-devel:latest \ - /bin/bash -c "cd /workspace && pip3 install --no-cache-dir --target=/tmp/deps -r requirements.txt && export PYTHONPATH=/tmp/deps:$PYTHONPATH && source build.sh && pytest -m 'not slow and not benchmark' --cov=/workspace --durations=50 && rm -rf .pytest_cache" + /bin/bash -c "cd /workspace && pip3 install --no-cache-dir --target=/tmp/deps -r requirements.txt && export PYTHONPATH=/tmp/deps:$PYTHONPATH && source build.sh && pytest -s -m 'not slow and not benchmark' --cov=/workspace --durations=50 && rm -rf .pytest_cache" diff --git a/gpu4pyscf/pbc/df/rsdf_builder.py b/gpu4pyscf/pbc/df/rsdf_builder.py index d20f44d2f..553555303 100644 --- a/gpu4pyscf/pbc/df/rsdf_builder.py +++ b/gpu4pyscf/pbc/df/rsdf_builder.py @@ -938,10 +938,11 @@ def proc(): pair0, pair1 = ao_pair_offsets[i, j] cderi_compressed[kp][:,pair0:pair1] = j3c_tmp.get(out=_buf) #t1 = log.timer_debug2(f'processing {ll_pattern}', *t1) + # It's important to synchronize the host and CUDA kernel before releasing + # local variables, as the mapped memory may still be in use by the device. cp.cuda.get_current_stream().synchronize() multi_gpu.run(proc, non_blocking=True) - return cderi_compressed def compressed_cderi_gamma_point(cell, auxcell, omega=OMEGA_MIN, with_long_range=True, @@ -1255,7 +1256,6 @@ def compressed_cderi_kk(cell, auxcell, kpts, kmesh=None, omega=OMEGA_MIN, t1 = log.timer_debug2('generating bas_ij indices', *t1) cderi = _lr_int3c2e_kk(ft_opt, bas_ij_cache, cd_j2c_cache, int3c2e_opt.sorted_auxcell, omega, kpts, kpt_iters) - print([x.sum() for x in cderi.values()]) # LR int3c2e would generate more nao_pairs than the SR int3c2e! 
t1 = log.timer_debug1('LR int3c2e', *t1) else: From cc0ddc50fd19f77d76cc8f423cd360766cd2ca70 Mon Sep 17 00:00:00 2001 From: Qiming Sun Date: Tue, 14 Oct 2025 07:46:05 -0700 Subject: [PATCH 09/11] Debugging --- .github/workflows/unittest.yml | 4 ++-- gpu4pyscf/pbc/df/rsdf_builder.py | 6 +++++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/.github/workflows/unittest.yml b/.github/workflows/unittest.yml index 792da142c..36141b41d 100644 --- a/.github/workflows/unittest.yml +++ b/.github/workflows/unittest.yml @@ -33,7 +33,7 @@ jobs: -v $GITHUB_WORKSPACE:/workspace \ -v ~/.cache/pip:/root/.cache/pip \ pyscf/gpu4pyscf-devel:pyscf-2.8 \ - /bin/bash -c "cd /workspace && pip3 install -r requirements.txt && source build.sh && pytest -s -m 'not slow and not benchmark' --cov=/workspace --durations=50 && rm -rf .pytest_cache" + /bin/bash -c "cd /workspace && pip3 install -r requirements.txt && source build.sh && pytest -s -v -m 'not slow and not benchmark' --cov=/workspace --durations=50 && rm -rf .pytest_cache" multi-gpu: runs-on: [self-hosted, Linux, X64, 2T4] @@ -55,4 +55,4 @@ jobs: -v $GITHUB_WORKSPACE:/workspace \ -v ~/.cache/pip:/root/.cache/pip \ pyscf/gpu4pyscf-devel:pyscf-2.8 \ - /bin/bash -c "cd /workspace && pip3 install -r requirements.txt && source build.sh && pytest -s -m 'not slow and not benchmark' --cov=/workspace --durations=50 && rm -rf .pytest_cache" + /bin/bash -c "cd /workspace && pip3 install -r requirements.txt && source build.sh && pytest -s -v -m 'not slow and not benchmark' --cov=/workspace --durations=50 && rm -rf .pytest_cache" diff --git a/gpu4pyscf/pbc/df/rsdf_builder.py b/gpu4pyscf/pbc/df/rsdf_builder.py index 553555303..7aa196e21 100644 --- a/gpu4pyscf/pbc/df/rsdf_builder.py +++ b/gpu4pyscf/pbc/df/rsdf_builder.py @@ -1040,12 +1040,14 @@ def proc(): ish, jsh = divmod(c_pair_idx, nctrj) ish += i0 jsh += j0 + print(buflen, j3c_tmp.shape) if with_long_range: ft_idx = aopair_offsets_lookup[ish,0,jsh] ij = np.arange(nfi*nfj, 
dtype=np.int32) idx = ij + ft_idx[:,None] #:cderi[:,idx.ravel()] += j3c_tmp.get() _buf = j3c_tmp.get(out=buf[:j3c_tmp.size].reshape(j3c_tmp.shape)) + print(idx.min(), idx.max()) idx = np.asarray(idx.ravel(), dtype=np.int32) libpbc.take2d_add( # this copy back operation is very slow cderi.ctypes, _buf.ctypes, idx.ctypes, @@ -1186,7 +1188,9 @@ def proc(): j3c_block = contract('uv,up->vp', aux_coeff, j3c_block) _buf = buf[:j3c_block.size].reshape(j3c_block.shape) + print(buflen, _buf.shape) if with_long_range: + print(idx.min(), idx.max()) _buf = j3c_block.get(out=_buf) libpbc.take2d_add( # this copy back operation is very slow cderi.ctypes, _buf.ctypes, idx.ctypes, @@ -1341,8 +1345,8 @@ def proc(): aux_coeff = _cd_j2c_cache[j2c_idx] # at -(kj-ki) cderi_k = contract('uv,up->vp', aux_coeff, j3c_block[j2c_idx]) _buf = buf[:cderi_k.size].reshape(cderi_k.shape) + print(kp, cderi[kp].shape, _buf.shape) if with_long_range: - print(kp, cderi[kp].shape, _buf.shape) _buf = cderi_k.get(out=_buf) nao_pairs = cderi[kp].shape[1] * 2 # *2 to view complex as doubles libpbc.take2d_add( # this copy back operation is very slow From 0940311ced74c02c44ee56187379f47fc9f381b3 Mon Sep 17 00:00:00 2001 From: Qiming Sun Date: Wed, 15 Oct 2025 09:37:12 -0700 Subject: [PATCH 10/11] Debugging --- gpu4pyscf/pbc/df/int3c2e.py | 1 + gpu4pyscf/pbc/df/rsdf_builder.py | 9 +++++++++ 2 files changed, 10 insertions(+) diff --git a/gpu4pyscf/pbc/df/int3c2e.py b/gpu4pyscf/pbc/df/int3c2e.py index 782722946..8f86bfbdf 100644 --- a/gpu4pyscf/pbc/df/int3c2e.py +++ b/gpu4pyscf/pbc/df/int3c2e.py @@ -1041,6 +1041,7 @@ def evaluate_j3c(li, lj, k): _bas_cpu.ctypes, ctypes.c_int(bvkcell.nbas), _env_cpu.ctypes) if err != 0: raise RuntimeError(f'fill_int3c2e kernel for {lll} failed') + cp.cuda.get_current_stream().synchronize() return c_pair_idx, eri3c return evaluate_j3c diff --git a/gpu4pyscf/pbc/df/rsdf_builder.py b/gpu4pyscf/pbc/df/rsdf_builder.py index 7aa196e21..339f4a13f 100644 --- 
a/gpu4pyscf/pbc/df/rsdf_builder.py +++ b/gpu4pyscf/pbc/df/rsdf_builder.py @@ -1134,6 +1134,9 @@ def compressed_cderi_j_only(cell, auxcell, kpts, kmesh=None, omega=OMEGA_MIN, p0, p1 = p1, p1 + npairs ao_pair_offsets[li, lj] = p0, p1 + import gc + cp.get_default_memory_pool().free_all_blocks() + gc.collect() tasks = iter(img_idx_cache) def proc(): if not cell.cart: @@ -1174,6 +1177,7 @@ def proc(): j3c_block = cp.empty((naux_cart,nji)) for k in range(len(int3c2e_opt.uniq_l_ctr_aux)): j3c_tmp = evaluate(li, lj, k)[1] + cp.cuda.get_current_stream().synchronize() if j3c_tmp.size == 0: continue # It is possible to optimize the j-only case by performing the @@ -1185,6 +1189,7 @@ def proc(): j3c_tmp = j3c_tmp.transpose(4,0,3,1,2) k0, k1 = aux_loc[int3c2e_opt.l_ctr_aux_offsets[k:k+2]] j3c_block[k0:k1] = j3c_tmp.reshape(-1,nji) + cp.cuda.get_current_stream().synchronize() j3c_block = contract('uv,up->vp', aux_coeff, j3c_block) _buf = buf[:j3c_block.size].reshape(j3c_block.shape) @@ -1284,6 +1289,9 @@ def compressed_cderi_kk(cell, auxcell, kpts, kmesh=None, omega=OMEGA_MIN, ao_pair_offsets[li, lj] = p0, p1 tasks = iter(img_idx_cache) + import gc + cp.get_default_memory_pool().free_all_blocks() + gc.collect() def proc(): if not cell.cart: c2s = [cart2sph_by_l(l) for l in range(lmax+1)] @@ -1341,6 +1349,7 @@ def proc(): if with_long_range: print(idx.min(), idx.max(), len(idx)) + cp.cuda.get_current_stream().synchronize() for j2c_idx, (kp, kp_conj, ki_idx, kj_idx) in enumerate(kpt_iters): aux_coeff = _cd_j2c_cache[j2c_idx] # at -(kj-ki) cderi_k = contract('uv,up->vp', aux_coeff, j3c_block[j2c_idx]) From 2d88216a2401e1b6780a3184a083a5292a9a95fe Mon Sep 17 00:00:00 2001 From: Qiming Sun Date: Fri, 17 Oct 2025 15:12:58 -0700 Subject: [PATCH 11/11] Debugging msgs --- gpu4pyscf/pbc/df/rsdf_builder.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/gpu4pyscf/pbc/df/rsdf_builder.py b/gpu4pyscf/pbc/df/rsdf_builder.py index 339f4a13f..8a5cc70bb 100644 --- 
a/gpu4pyscf/pbc/df/rsdf_builder.py +++ b/gpu4pyscf/pbc/df/rsdf_builder.py @@ -1102,6 +1102,7 @@ def compressed_cderi_j_only(cell, auxcell, kpts, kmesh=None, omega=OMEGA_MIN, nf = uniq_l * 2 + 1 img_idx_cache = int3c2e_opt.make_img_idx_cache() + print('compressed_cderi_j_only') if with_long_range: # LR int3c2e generally creates more non-negligible Coulomb integrals. @@ -1122,6 +1123,7 @@ def compressed_cderi_j_only(cell, auxcell, kpts, kmesh=None, omega=OMEGA_MIN, int3c2e_opt, img_idx_cache) nao_pairs = len(ao_pair_mapping) cderi = empty_mapped((naux, nao_pairs)) + print('compressed_cderi_j_only SR') log.debug('Avail GPU mem = %s B', get_avail_mem()) aux_loc = int3c2e_opt.sorted_auxcell.ao_loc_nr(cart=True) @@ -1253,6 +1255,7 @@ def compressed_cderi_kk(cell, auxcell, kpts, kmesh=None, omega=OMEGA_MIN, nf = uniq_l * 2 + 1 img_idx_cache = int3c2e_opt.make_img_idx_cache() + print('compressed_cderi_kk') if with_long_range: # LR int3c2e generally creates more non-negligible Coulomb integrals. @@ -1275,6 +1278,7 @@ def compressed_cderi_kk(cell, auxcell, kpts, kmesh=None, omega=OMEGA_MIN, for j2c_idx, (kp, kp_conj, ki_idx, kj_idx) in enumerate(kpt_iters): naux = cd_j2c_cache[j2c_idx].shape[1] cderi[kp] = empty_mapped((naux,nao_pairs), dtype=np.complex128) + print('compressed_cderi_kk SR') log.debug('Avail GPU mem = %s B', get_avail_mem()) aux_loc = int3c2e_opt.sorted_auxcell.ao_loc_nr(cart=True)