diff --git a/.github/workflows/unittest.yml b/.github/workflows/unittest.yml
index bae786e4b..36141b41d 100644
--- a/.github/workflows/unittest.yml
+++ b/.github/workflows/unittest.yml
@@ -15,7 +15,7 @@ permissions:
 jobs:
   single-gpu:
     runs-on: [self-hosted, Linux, X64, v100]
-    timeout-minutes: 360
+    timeout-minutes: 180
     steps:
       - uses: actions/checkout@v3
@@ -33,11 +33,11 @@ jobs:
           -v $GITHUB_WORKSPACE:/workspace \
           -v ~/.cache/pip:/root/.cache/pip \
           pyscf/gpu4pyscf-devel:pyscf-2.8 \
-          /bin/bash -c "cd /workspace && pip3 install -r requirements.txt && source build.sh && pytest -m 'not slow and not benchmark' --cov=/workspace --durations=50 && rm -rf .pytest_cache"
+          /bin/bash -c "cd /workspace && pip3 install -r requirements.txt && source build.sh && pytest -s -v -m 'not slow and not benchmark' --cov=/workspace --durations=50 && rm -rf .pytest_cache"

   multi-gpu:
     runs-on: [self-hosted, Linux, X64, 2T4]
-    timeout-minutes: 360
+    timeout-minutes: 180
     steps:
       - uses: actions/checkout@v3
@@ -55,4 +55,4 @@ jobs:
           -v $GITHUB_WORKSPACE:/workspace \
           -v ~/.cache/pip:/root/.cache/pip \
           pyscf/gpu4pyscf-devel:pyscf-2.8 \
-          /bin/bash -c "cd /workspace && pip3 install -r requirements.txt && source build.sh && pytest -m 'not slow and not benchmark' --cov=/workspace --durations=50 && rm -rf .pytest_cache"
+          /bin/bash -c "cd /workspace && pip3 install -r requirements.txt && source build.sh && pytest -s -v -m 'not slow and not benchmark' --cov=/workspace --durations=50 && rm -rf .pytest_cache"
diff --git a/gpu4pyscf/gto/mole.py b/gpu4pyscf/gto/mole.py
index 56234c647..f6135e8d0 100644
--- a/gpu4pyscf/gto/mole.py
+++ b/gpu4pyscf/gto/mole.py
@@ -193,7 +193,9 @@ def group_basis(mol, tile=1, group_size=None, return_bas_mapping=False,
         sparse_coeff (bool):
             One-to-one mapping between the sorted_mol and mol is assumed. The
             array of mapping indices instead of a single transformation
-            matrix is returned if this option is specified.
+            matrix is returned if this option is specified. The mapping indices
+            can transform the AOs from mol to sorted_mol:
+            sorted_mol.ao_labels() == mol.ao_labels()[idx]
     '''
     from gpu4pyscf.lib import logger
     original_mol = mol
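Note on the `group_basis` docstring change above: the documented contract, `sorted_mol.ao_labels() == mol.ao_labels()[idx]`, is plain NumPy fancy indexing, so it can be illustrated without gpu4pyscf. In the sketch below, `labels` and `idx` are made-up stand-ins for `mol.ao_labels()` and the index array returned with `sparse_coeff=True`, not real `group_basis` output:

```python
import numpy as np

labels = np.array(['O 1s', 'O 2s', 'O 2px', 'O 2py', 'O 2pz'])  # stand-in for mol.ao_labels()
idx = np.array([0, 1, 4, 2, 3])       # stand-in for the returned mapping indices
sorted_labels = labels[idx]           # plays the role of sorted_mol.ao_labels()

# The inverse permutation pushes quantities computed in the sorted AO order
# back to the original order:
inv = np.empty_like(idx)
inv[idx] = np.arange(idx.size)
assert (sorted_labels[inv] == labels).all()
```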
diff --git a/gpu4pyscf/pbc/df/int3c2e.py b/gpu4pyscf/pbc/df/int3c2e.py
index 782722946..8f86bfbdf 100644
--- a/gpu4pyscf/pbc/df/int3c2e.py
+++ b/gpu4pyscf/pbc/df/int3c2e.py
@@ -1041,6 +1041,7 @@ def evaluate_j3c(li, lj, k):
                 _bas_cpu.ctypes, ctypes.c_int(bvkcell.nbas), _env_cpu.ctypes)
             if err != 0:
                 raise RuntimeError(f'fill_int3c2e kernel for {lll} failed')
+        cp.cuda.get_current_stream().synchronize()
         return c_pair_idx, eri3c
     return evaluate_j3c
diff --git a/gpu4pyscf/pbc/df/rsdf_builder.py b/gpu4pyscf/pbc/df/rsdf_builder.py
index 0168587cc..8a5cc70bb 100644
--- a/gpu4pyscf/pbc/df/rsdf_builder.py
+++ b/gpu4pyscf/pbc/df/rsdf_builder.py
@@ -780,6 +780,9 @@ def _lr_int3c2e_gamma_point(ft_opt, bas_ij_cache, cd_j2c, auxcell, omega):
             _buf = buf[:j3c_tmp.size].reshape(j3c_tmp.shape)
             cderi_compressed[:,pair0:pair1] = j3c_tmp.get(out=_buf)
             j3c_tmp = None
+    # It's important to synchronize the host and CUDA kernel before releasing
+    # local variables, as the mapped memory may still be in use by the device.
+    multi_gpu.synchronize()
     return cderi_compressed

 # The long-range part of the cderi for k points. The 3-index cderi tensor is compressed.
@@ -935,9 +938,11 @@ def proc():
                 pair0, pair1 = ao_pair_offsets[i, j]
                 cderi_compressed[kp][:,pair0:pair1] = j3c_tmp.get(out=_buf)
         #t1 = log.timer_debug2(f'processing {ll_pattern}', *t1)
+        # It's important to synchronize the host and CUDA kernel before releasing
+        # local variables, as the mapped memory may still be in use by the device.
+        cp.cuda.get_current_stream().synchronize()

     multi_gpu.run(proc, non_blocking=True)
-
     return cderi_compressed

 def compressed_cderi_gamma_point(cell, auxcell, omega=OMEGA_MIN, with_long_range=True,
@@ -1035,12 +1040,14 @@ def proc():
                 ish, jsh = divmod(c_pair_idx, nctrj)
                 ish += i0
                 jsh += j0
+                print(buflen, j3c_tmp.shape)
                 if with_long_range:
                     ft_idx = aopair_offsets_lookup[ish,0,jsh]
                     ij = np.arange(nfi*nfj, dtype=np.int32)
                     idx = ij + ft_idx[:,None]
                     #:cderi[:,idx.ravel()] += j3c_tmp.get()
                     _buf = j3c_tmp.get(out=buf[:j3c_tmp.size].reshape(j3c_tmp.shape))
+                    print(idx.min(), idx.max())
                     idx = np.asarray(idx.ravel(), dtype=np.int32)
                     libpbc.take2d_add( # this copy back operation is very slow
                         cderi.ctypes, _buf.ctypes, idx.ctypes,
@@ -1050,6 +1057,7 @@ def proc():
                     p0, p1 = ao_pair_offsets[li, lj]
                     cderi[:,p0:p1] = j3c_tmp.get(out=buf[:j3c_tmp.size].reshape(j3c_tmp.shape))
                 j3c_tmp = ish = jsh = c_pair_idx = None
+        cp.cuda.get_current_stream().synchronize()

     multi_gpu.run(proc, non_blocking=True)
@@ -1094,6 +1102,7 @@ def compressed_cderi_j_only(cell, auxcell, kpts, kmesh=None, omega=OMEGA_MIN,
     nf = uniq_l * 2 + 1
     img_idx_cache = int3c2e_opt.make_img_idx_cache()
+    print('compressed_cderi_j_only')

     if with_long_range:
         # LR int3c2e generally creates more non-negligible Coulomb integrals.
@@ -1114,6 +1123,7 @@ def compressed_cderi_j_only(cell, auxcell, kpts, kmesh=None, omega=OMEGA_MIN,
                                                  int3c2e_opt, img_idx_cache)
         nao_pairs = len(ao_pair_mapping)
     cderi = empty_mapped((naux, nao_pairs))
+    print('compressed_cderi_j_only SR')
     log.debug('Avail GPU mem = %s B', get_avail_mem())

     aux_loc = int3c2e_opt.sorted_auxcell.ao_loc_nr(cart=True)
@@ -1126,6 +1136,9 @@ def compressed_cderi_j_only(cell, auxcell, kpts, kmesh=None, omega=OMEGA_MIN,
             p0, p1 = p1, p1 + npairs
             ao_pair_offsets[li, lj] = p0, p1

+    import gc
+    cp.get_default_memory_pool().free_all_blocks()
+    gc.collect()
     tasks = iter(img_idx_cache)
     def proc():
         if not cell.cart:
@@ -1166,6 +1179,7 @@ def proc():
             j3c_block = cp.empty((naux_cart,nji))
             for k in range(len(int3c2e_opt.uniq_l_ctr_aux)):
                 j3c_tmp = evaluate(li, lj, k)[1]
+                cp.cuda.get_current_stream().synchronize()
                 if j3c_tmp.size == 0:
                     continue
                 # It is possible to optimize the j-only case by performing the
@@ -1177,10 +1191,13 @@ def proc():
                 j3c_tmp = j3c_tmp.transpose(4,0,3,1,2)
                 k0, k1 = aux_loc[int3c2e_opt.l_ctr_aux_offsets[k:k+2]]
                 j3c_block[k0:k1] = j3c_tmp.reshape(-1,nji)
+            cp.cuda.get_current_stream().synchronize()

             j3c_block = contract('uv,up->vp', aux_coeff, j3c_block)
             _buf = buf[:j3c_block.size].reshape(j3c_block.shape)
+            print(buflen, _buf.shape)
             if with_long_range:
+                print(idx.min(), idx.max())
                 _buf = j3c_block.get(out=_buf)
                 libpbc.take2d_add( # this copy back operation is very slow
                     cderi.ctypes, _buf.ctypes, idx.ctypes,
@@ -1191,6 +1208,7 @@ def proc():
                 p0, p1 = ao_pair_offsets[li, lj]
                 cderi[:,p0:p1] = j3c_block.get(out=_buf)
             j3c_tmp = j3c_block = None
+        cp.cuda.get_current_stream().synchronize()

     multi_gpu.run(proc, non_blocking=True)
@@ -1237,6 +1255,7 @@ def compressed_cderi_kk(cell, auxcell, kpts, kmesh=None, omega=OMEGA_MIN,
     nf = uniq_l * 2 + 1
     img_idx_cache = int3c2e_opt.make_img_idx_cache()
+    print('compressed_cderi_kk')

     if with_long_range:
         # LR int3c2e generally creates more non-negligible Coulomb integrals.
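Note on the synchronization hunks above and below: kernel launches and `ndarray.get(out=...)` into pinned/mapped host memory are asynchronous with respect to the host, so releasing the Python references before the stream has drained can let the buffer be recycled while the device is still writing to it. A minimal, self-contained sketch of the pattern; `copy_out_with_sync` and the pinned-buffer setup are hypothetical illustrations, not gpu4pyscf API:

```python
import cupy as cp
import numpy as np

def copy_out_with_sync(dev_arr, host_buf):
    # Async device->host copy into a pinned buffer, then drain the stream so
    # the copy is guaranteed complete before callers drop dev_arr or reuse
    # host_buf for the next block.
    out = dev_arr.get(out=host_buf[:dev_arr.size].reshape(dev_arr.shape))
    cp.cuda.get_current_stream().synchronize()
    return out

a = cp.arange(12.0).reshape(3, 4)
pinned = cp.cuda.alloc_pinned_memory(a.nbytes)
buf = np.frombuffer(pinned, dtype=a.dtype, count=a.size)
print(copy_out_with_sync(a, buf))
```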
@@ -1259,6 +1278,7 @@ def compressed_cderi_kk(cell, auxcell, kpts, kmesh=None, omega=OMEGA_MIN,
     for j2c_idx, (kp, kp_conj, ki_idx, kj_idx) in enumerate(kpt_iters):
         naux = cd_j2c_cache[j2c_idx].shape[1]
         cderi[kp] = empty_mapped((naux,nao_pairs), dtype=np.complex128)
+    print('compressed_cderi_kk SR')
     log.debug('Avail GPU mem = %s B', get_avail_mem())

     aux_loc = int3c2e_opt.sorted_auxcell.ao_loc_nr(cart=True)
@@ -1273,6 +1293,9 @@ def compressed_cderi_kk(cell, auxcell, kpts, kmesh=None, omega=OMEGA_MIN,
             ao_pair_offsets[li, lj] = p0, p1

     tasks = iter(img_idx_cache)
+    import gc
+    cp.get_default_memory_pool().free_all_blocks()
+    gc.collect()
     def proc():
         if not cell.cart:
             c2s = [cart2sph_by_l(l) for l in range(lmax+1)]
@@ -1328,10 +1351,14 @@ def proc():
                     k0, k1 = aux_loc[int3c2e_opt.l_ctr_aux_offsets[k:k+2]]
                     j3c_block[:,k0:k1] = j3c_tmp

+            if with_long_range:
+                print(idx.min(), idx.max(), len(idx))
+            cp.cuda.get_current_stream().synchronize()
             for j2c_idx, (kp, kp_conj, ki_idx, kj_idx) in enumerate(kpt_iters):
                 aux_coeff = _cd_j2c_cache[j2c_idx] # at -(kj-ki)
                 cderi_k = contract('uv,up->vp', aux_coeff, j3c_block[j2c_idx])
                 _buf = buf[:cderi_k.size].reshape(cderi_k.shape)
+                print(kp, cderi[kp].shape, _buf.shape)
                 if with_long_range:
                     _buf = cderi_k.get(out=_buf)
                     nao_pairs = cderi[kp].shape[1] * 2 # *2 to view complex as doubles
@@ -1344,6 +1371,7 @@ def proc():
                     p0, p1 = ao_pair_offsets[li, lj]
                     cderi[kp][:,p0:p1] = cderi_k.get(out=_buf)
             j3c_tmp = j3c_block = None
+        cp.cuda.get_current_stream().synchronize()

     multi_gpu.run(proc, non_blocking=True)
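Note on the `free_all_blocks()` hunks: CuPy's caching allocator can only hand blocks back to the driver once the arrays owning them have been deallocated, which is why the calls are paired with `gc.collect()`. A sketch of the idiom as a reusable helper (`release_cached_device_memory` is a hypothetical name, not part of gpu4pyscf); this variant collects before freeing so that freshly unreachable arrays are also returned:

```python
import gc
import cupy as cp

def release_cached_device_memory():
    gc.collect()  # drop unreachable arrays so their blocks return to the pool
    cp.get_default_memory_pool().free_all_blocks()         # device pool -> driver
    cp.get_default_pinned_memory_pool().free_all_blocks()  # pinned host pool -> OS

release_cached_device_memory()
free_b, total_b = cp.cuda.runtime.memGetInfo()
print(f'device free/total: {free_b}/{total_b} bytes')
```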