pyscf
diff --git a/‎.github/workflows/lint.yml‎
Lines changed: 1 addition & 1 deletion b/‎.github/workflows/lint.yml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎gpu4pyscf/dft/rks.py‎
Lines changed: 2 additions & 8 deletions b/‎gpu4pyscf/dft/rks.py‎
Lines changed: 2 additions & 8 deletions
diff --git a/‎gpu4pyscf/grad/rhf.py‎
Lines changed: 2 additions & 2 deletions b/‎gpu4pyscf/grad/rhf.py‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎gpu4pyscf/lib/gvhf-md/md_contract_j.cu‎
Lines changed: 0 additions & 13 deletions b/‎gpu4pyscf/lib/gvhf-md/md_contract_j.cu‎
Lines changed: 0 additions & 13 deletions
diff --git a/‎gpu4pyscf/lib/gvhf-md/md_j_driver.cu‎
Lines changed: 29 additions & 0 deletions b/‎gpu4pyscf/lib/gvhf-md/md_j_driver.cu‎
Lines changed: 29 additions & 0 deletions
diff --git a/‎gpu4pyscf/lib/gvhf-md/md_pairdata.c‎
Lines changed: 3 additions & 2 deletions b/‎gpu4pyscf/lib/gvhf-md/md_pairdata.c‎
Lines changed: 3 additions & 2 deletions
diff --git a/‎gpu4pyscf/lib/gvhf-md/pbc_md_contract_j.cu‎
Lines changed: 12 additions & 21 deletions b/‎gpu4pyscf/lib/gvhf-md/pbc_md_contract_j.cu‎
Lines changed: 12 additions & 21 deletions
@@ -11,7 +11,7 @@ jobs:
     - name: Install ruff
       run: pip install ruff
     - name: Check style
-      run: ruff check --config .ruff.toml gpu4pyscf
+      run: ruff check --config .ruff.toml --unsafe-fixes gpu4pyscf
     - name: Check NumPy
       run: ruff check --select NPY --ignore NPY002 gpu4pyscf
   flake:
 
@@ -92,8 +92,6 @@ def get_veff(ks, mol=None, dm=None, dm_last=0, vhf_last=0, hermi=1):
     t0 = logger.init_timer(ks)
     initialize_grids(ks, mol, dm)
 
-    ground_state = getattr(dm, 'ndim', 0) == 2
-
     ni = ks._numint
     if hermi == 2:  # because rho = 0
         n, exc, vxc = 0, 0, 0
@@ -120,10 +118,7 @@ def get_veff(ks, mol=None, dm=None, dm_last=0, vhf_last=0, hermi=1):
     if vj_last is not None:
         vj += asarray(vj_last)
     vxc += vj
-    if ground_state:
-        ecoul = float(cupy.einsum('ij,ij', dm_orig, vj).real) * .5
-    else:
-        ecoul = None
+    ecoul = float(cupy.einsum('ij,ij', dm_orig, vj).real) * .5
 
     vk = None
     if ni.libxc.is_hybrid_xc(ks.xc):
@@ -147,8 +142,7 @@ def get_veff(ks, mol=None, dm=None, dm_last=0, vhf_last=0, hermi=1):
         if vj_last is not None:
             vk += asarray(vhf_last.vk)
         vxc -= vk
-        if ground_state:
-            exc -= float(cupy.einsum('ij,ij', dm_orig, vk).real) * .5
+        exc -= float(cupy.einsum('ij,ij', dm_orig, vk).real) * .5
     t0 = logger.timer(ks, 'veff', *t0)
     vxc = tag_array(vxc, ecoul=ecoul, exc=exc, vj=vj, vk=vk)
     return vxc
 
@@ -195,10 +195,10 @@ def _ejk_quartets_scheme(mol, l_ctr_pattern, shm_size=SHM_SIZE):
     nps = l_ctr_pattern[:,1]
     ij_prims = nps[0] * nps[1]
     nroots = (order + 1) // 2 + 1
-    unit = nroots*2 + g_size*3 + ij_prims + 9
+    unit = nroots*2 + g_size*3 + 6
     if mol.omega < 0: # SR
         unit += nroots * 2
-    counts = shm_size // (unit*8)
+    counts = (shm_size - ij_prims*8) // (unit*8)
     n = min(THREADS, _nearest_power2(counts))
     gout_stride = THREADS // n
     return n, gout_stride
 
@@ -963,17 +963,4 @@ int MD_build_j(double *vj, double *dm, int n_dm, int dm_size,
     }
     return 0;
 }
-
-int init_mdj_constant(int shm_size)
-{
-    cudaFuncSetAttribute(md_j_1dm_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, shm_size);
-    cudaFuncSetAttribute(md_j_4dm_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, shm_size);
-    cudaError_t err = cudaGetLastError();
-    if (err != cudaSuccess) {
-        fprintf(stderr, "Failed to set CUDA shm size %d: %s\n", shm_size,
-                cudaGetErrorString(err));
-        return 1;
-    }
-    return 0;
-}
 }
@@ -51,3 +51,32 @@ int qd_offset_for_threads(int npairs, int threads)
     }
     return address;
 }
+
+extern __global__  // declaration only: no body for this kernel appears in this hunk
+void md_j_1dm_kernel(RysIntEnvVars envs, JKMatrix jk, MDBoundsInfo bounds,
+                     int threadsx, int threadsy, int tilex, int tiley,
+                     uint16_t *pRt2_kl_ij, int8_t *efg_phase);
+extern __global__  // same parameter list as md_j_1dm_kernel plus an extra `int dm_size`
+void md_j_4dm_kernel(RysIntEnvVars envs, JKMatrix jk, MDBoundsInfo bounds,
+                     int threadsx, int threadsy, int tilex, int tiley, int dm_size,
+                     uint16_t *pRt2_kl_ij, int8_t *efg_phase);
+extern __global__  // the definition loses its `static` qualifier in the pbc_md_contract_j.cu hunk of this diff
+void pbc_md_j_kernel(RysIntEnvVars envs, JKMatrix jmat, MDBoundsInfo bounds,
+                  int threadsx, int threadsy, int tilex, int tiley,
+                  uint16_t *pRt2_kl_ij, int8_t *efg_phase);
+
+extern "C" {  // C linkage for the entry point below
+int init_mdj_constant(int shm_size)  // Sets the max dynamic shared-memory size of the three MD J kernels to shm_size; returns 0 on success, 1 on error.
+{
+    cudaFuncSetAttribute(md_j_1dm_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, shm_size);
+    cudaFuncSetAttribute(md_j_4dm_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, shm_size);
+    cudaFuncSetAttribute(pbc_md_j_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, shm_size);  // takes over from PBC_build_j_init(), which this diff removes
+    cudaError_t err = cudaGetLastError();  // the three return codes above are not inspected; the last recorded runtime error is read (and cleared) here
+    if (err != cudaSuccess) {
+        fprintf(stderr, "Failed to set CUDA shm size %d: %s\n", shm_size,
+                cudaGetErrorString(err));
+        return 1;  // failure: the message above has been written to stderr
+    }
+    return 0;  // no error was recorded
+}
+}  // extern "C"
@@ -283,8 +283,9 @@ void PBC_Et_dot_dm(double *Et_dm, double *dm, int n_dm, int Et_dm_size,
                 int nfj = (lj + 1) * (lj + 2) / 2;
                 int Et_len = (lij + 1) * (lij + 2) * (lij + 3) / 6;
                 double cc = ci * cj;
+                // Be careful with the transpose of dm. Here, dm is not symmetric.
                 double *Et_dm_ij = Et_dm + pair_loc[bas_ij];
-                double *dm_ij = dm + ao_loc[ctr_ish] * nao + ao_loc[ctr_jsh];
+                double *dm_ij = dm + ao_loc[ctr_jsh] * nao + ao_loc[ctr_ish];
                 for (int img = 0; img < nimgs_uniq_pair; img++) {
                         double cc_with_img = cc;
                         // The diagonal elements of the AO-pairs within the
@@ -306,7 +307,7 @@ void PBC_Et_dot_dm(double *Et_dm, double *dm, int n_dm, int Et_dm_size,
                                         double rho_t = 0.;
                                         for (int i = 0; i < nfi; i++) {
                                         for (int j = 0; j < nfj; j++, n++) {
-                                                rho_t += Et[n] * cc_with_img * pdm[i*nao+j];
+                                                rho_t += Et[n] * cc_with_img * pdm[j*nao+i];
                                         } }
                                         rho[t] = rho_t;
                                 }
 
@@ -74,7 +74,7 @@ inline void iter_Rt_n(double *Rt, double rx, double ry, double rz, int l,
 }
 
 // gout_pattern = ((li == 0) >> 3) | ((lj == 0) >> 2) | ((lk == 0) >> 1) | (ll == 0);
-__global__ static
+__global__
 void pbc_md_j_kernel(RysIntEnvVars envs, JKMatrix jmat, MDBoundsInfo bounds,
                   int threadsx, int threadsy, int tilex, int tiley,
                   uint16_t *pRt2_kl_ij, int8_t *efg_phase)
@@ -332,6 +332,7 @@ void pbc_md_j_kernel(RysIntEnvVars envs, JKMatrix jmat, MDBoundsInfo bounds,
 
 extern "C" {
 int PBC_build_j(double *vj, double *dm, int n_dm,
+                int dm_xyz_size, int nimgs_uniq_pair,
                 RysIntEnvVars envs, int *scheme, int *shls_slice,
                 int npairs_ij, int npairs_kl,
                 int *pair_ij_mapping, int *pair_kl_mapping,
@@ -363,8 +364,6 @@ int PBC_build_j(double *vj, double *dm, int n_dm,
         q_cond, cutoff};
 
     double omega = env[PTR_RANGE_OMEGA];
-    JKMatrix jmat = {vj, NULL, dm, n_dm, 0, omega};
-
     int threads_ij = scheme[0];
     int threads_kl = scheme[1];
     int gout_stride = scheme[2];
@@ -384,12 +383,16 @@ int PBC_build_j(double *vj, double *dm, int n_dm,
     cudaGetSymbolAddress((void**)&efg_phase, c_Rt2_efg_phase);
     pRt2_kl_ij += offset_for_Rt2_idx(lij, lkl);
     efg_phase += offset_for_Rt2_idx(0, lkl);
-    if (1){//!pbc_md_j_unrolled(&envs, &jmat, &bounds, omega)) {
-        bounds.qd_ij_max = qd_ij_max + qd_offset_for_threads(npairs_ij, threads_ij);
-        bounds.qd_kl_max = qd_kl_max + qd_offset_for_threads(npairs_kl, threads_kl);
-        pbc_md_j_kernel<<<blocks, threads, buflen>>>(
-            envs, jmat, bounds, threads_ij, threads_kl, tilex, tiley,
-            pRt2_kl_ij, efg_phase);
+    int dm_size = dm_xyz_size * nimgs_uniq_pair;
+    for (int i_dm = 0; i_dm < n_dm; ++i_dm) {
+        JKMatrix jmat = {vj+i_dm*dm_size, NULL, dm+i_dm*dm_size, n_dm, 0, omega};
+        if (1){//!pbc_md_j_unrolled(&envs, &jmat, &bounds, omega)) {
+            bounds.qd_ij_max = qd_ij_max + qd_offset_for_threads(npairs_ij, threads_ij);
+            bounds.qd_kl_max = qd_kl_max + qd_offset_for_threads(npairs_kl, threads_kl);
+            pbc_md_j_kernel<<<blocks, threads, buflen>>>(
+                envs, jmat, bounds, threads_ij, threads_kl, tilex, tiley,
+                pRt2_kl_ij, efg_phase);
+        }
     }
     cudaError_t err = cudaGetLastError();
     if (err != cudaSuccess) {
@@ -398,16 +401,4 @@ int PBC_build_j(double *vj, double *dm, int n_dm,
     }
     return 0;
 }
-
-int PBC_build_j_init(int shm_size)
-{
-    cudaFuncSetAttribute(pbc_md_j_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, shm_size);
-    cudaError_t err = cudaGetLastError();
-    if (err != cudaSuccess) {
-        fprintf(stderr, "Failed to set CUDA shm size %d: %s\n", shm_size,
-                cudaGetErrorString(err));
-        return 1;
-    }
-    return 0;
-}
 }
Original file line number	Diff line number	Diff line change
`@@ -963,17 +963,4 @@ int MD_build_j(double vj, double dm, int n_dm, int dm_size,`
`963`	`963`	`}`
`964`	`964`	`return 0;`
`965`	`965`	`}`
`966`		`-`
`967`		`-int init_mdj_constant(int shm_size)`
`968`		`-{`
`969`		`- cudaFuncSetAttribute(md_j_1dm_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, shm_size);`
`970`		`- cudaFuncSetAttribute(md_j_4dm_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, shm_size);`
`971`		`- cudaError_t err = cudaGetLastError();`
`972`		`- if (err != cudaSuccess) {`
`973`		`- fprintf(stderr, "Failed to set CUDA shm size %d: %s\n", shm_size,`
`974`		`- cudaGetErrorString(err));`
`975`		`- return 1;`
`976`		`- }`
`977`		`- return 0;`
`978`		`-}`
`979`	`966`	`}`