diff --git a/gpu4pyscf/lib/gvhf-md/md_contract_j.cu b/gpu4pyscf/lib/gvhf-md/md_contract_j.cu
index ed39e07e5..bba773880 100644
--- a/gpu4pyscf/lib/gvhf-md/md_contract_j.cu
+++ b/gpu4pyscf/lib/gvhf-md/md_contract_j.cu
@@ -882,7 +882,7 @@ int md_j_4dm_unrolled(RysIntEnvVars *envs, JKMatrix *jk, MDBoundsInfo *bounds, d
 
 extern "C" {
 int MD_build_j(double *vj, double *dm, int n_dm, int dm_size,
-                RysIntEnvVars envs, int *scheme, int *shls_slice,
+                RysIntEnvVars *envs, int *scheme, int *shls_slice,
                 int npairs_ij, int npairs_kl,
                 int *pair_ij_mapping, int *pair_kl_mapping,
                 int *pair_ij_loc, int *pair_kl_loc,
@@ -935,15 +935,15 @@ int MD_build_j(double *vj, double *dm, int n_dm, int dm_size,
     pRt2_kl_ij += offset_for_Rt2_idx(lij, lkl);
     efg_phase += offset_for_Rt2_idx(0, lkl);
     if (n_dm == 1) {
-        if (!md_j_unrolled(&envs, &jk, &bounds, omega)) {
+        if (!md_j_unrolled(envs, &jk, &bounds, omega)) {
             bounds.qd_ij_max = qd_ij_max + qd_offset_for_threads(npairs_ij, threads_ij);
             bounds.qd_kl_max = qd_kl_max + qd_offset_for_threads(npairs_kl, threads_kl);
             md_j_1dm_kernel<<<blocks, threads, buflen>>>(
-                envs, jk, bounds, threads_ij, threads_kl, tilex, tiley,
+                *envs, jk, bounds, threads_ij, threads_kl, tilex, tiley,
                 pRt2_kl_ij, efg_phase);
         }
     } else {
-        if (!md_j_4dm_unrolled(&envs, &jk, &bounds, omega, dm_size)) {
+        if (!md_j_4dm_unrolled(envs, &jk, &bounds, omega, dm_size)) {
             bounds.qd_ij_max = qd_ij_max + qd_offset_for_threads(npairs_ij, threads_ij);
             bounds.qd_kl_max = qd_kl_max + qd_offset_for_threads(npairs_kl, threads_kl);
             for (int dm_offset = 0; dm_offset < n_dm; dm_offset+=4) {
@@ -951,7 +951,7 @@ int MD_build_j(double *vj, double *dm, int n_dm, int dm_size,
                 jk.dm = dm + dm_offset * dm_size;
                 jk.n_dm = n_dm - dm_offset;
                 md_j_4dm_kernel<<<blocks, threads, buflen>>>(
-                    envs, jk, bounds, threads_ij, threads_kl, tilex, tiley, dm_size,
+                    *envs, jk, bounds, threads_ij, threads_kl, tilex, tiley, dm_size,
                     pRt2_kl_ij, efg_phase);
             }
         }
diff --git a/gpu4pyscf/lib/gvhf-md/pbc_md_contract_j.cu b/gpu4pyscf/lib/gvhf-md/pbc_md_contract_j.cu
index d82eb5600..64eddba8b 100644
--- a/gpu4pyscf/lib/gvhf-md/pbc_md_contract_j.cu
+++ b/gpu4pyscf/lib/gvhf-md/pbc_md_contract_j.cu
@@ -333,7 +333,7 @@ void pbc_md_j_kernel(RysIntEnvVars envs, JKMatrix jmat, MDBoundsInfo bounds,
 extern "C" {
 int PBC_build_j(double *vj, double *dm, int n_dm,
                 int dm_xyz_size, int nimgs_uniq_pair,
-                RysIntEnvVars envs, int *scheme, int *shls_slice,
+                RysIntEnvVars *envs, int *scheme, int *shls_slice,
                 int npairs_ij, int npairs_kl,
                 int *pair_ij_mapping, int *pair_kl_mapping,
                 int *pair_ij_loc, int *pair_kl_loc,
@@ -386,11 +386,11 @@ int PBC_build_j(double *vj, double *dm, int n_dm,
     int dm_size = dm_xyz_size * nimgs_uniq_pair;
     for (int i_dm = 0; i_dm < n_dm; ++i_dm) {
         JKMatrix jmat = {vj+i_dm*dm_size, NULL, dm+i_dm*dm_size, n_dm, 0, omega};
-        if (1){//!pbc_md_j_unrolled(&envs, &jmat, &bounds, omega)) {
+        if (1){//!pbc_md_j_unrolled(envs, &jmat, &bounds, omega)) {
             bounds.qd_ij_max = qd_ij_max + qd_offset_for_threads(npairs_ij, threads_ij);
             bounds.qd_kl_max = qd_kl_max + qd_offset_for_threads(npairs_kl, threads_kl);
             pbc_md_j_kernel<<<blocks, threads, buflen>>>(
-                envs, jmat, bounds, threads_ij, threads_kl, tilex, tiley,
+                *envs, jmat, bounds, threads_ij, threads_kl, tilex, tiley,
                 pRt2_kl_ij, efg_phase);
         }
     }
diff --git a/gpu4pyscf/lib/gvhf-rys/rys_contract_j.cu b/gpu4pyscf/lib/gvhf-rys/rys_contract_j.cu
index 31da55ddf..921a4cff2 100644
--- a/gpu4pyscf/lib/gvhf-rys/rys_contract_j.cu
+++ b/gpu4pyscf/lib/gvhf-rys/rys_contract_j.cu
@@ -793,7 +793,7 @@ extern int rys_j_unrolled(RysIntEnvVars *envs, JKMatrix *jk, BoundsInfo *bounds,
 
 extern "C" {
 int RYS_build_j(double *vj, double *dm, int n_dm, int nao,
-                RysIntEnvVars envs, int *scheme, int *shls_slice,
+                RysIntEnvVars *envs, int *scheme, int *shls_slice,
                 int npairs_ij, int npairs_kl, int *pair_ij_mapping, int *pair_kl_mapping,
                 float *q_cond, float *s_estimator, float *dm_cond, float cutoff,
                 int *pool, int *atm, int natm, int *bas, int nbas, double *env)
@@ -834,7 +834,7 @@ int RYS_build_j(double *vj, double *dm, int n_dm, int nao,
 
     JKMatrix jk = {vj, NULL, dm, n_dm, 0, omega};
 
-    if (!rys_j_unrolled(&envs, &jk, &bounds, pool)) {
+    if (!rys_j_unrolled(envs, &jk, &bounds, pool)) {
         int quartets_per_block = scheme[0];
         int gout_stride = scheme[1];
         int with_gout = scheme[2];
@@ -846,11 +846,11 @@ int RYS_build_j(double *vj, double *dm, int n_dm, int nao,
         int buflen = (nroots*2 + g_size*3 + 6) * quartets_per_block + iprim*jprim;
         if (with_gout) {
             buflen += nf3_ij*nf3_kl * quartets_per_block;
-            rys_j_with_gout_kernel<<<npairs_ij, threads, buflen*sizeof(double)>>>(envs, jk, bounds, pool);
+            rys_j_with_gout_kernel<<<npairs_ij, threads, buflen*sizeof(double)>>>(*envs, jk, bounds, pool);
         } else {
             buflen += (nf3_ij+nf3_kl*2+(lij+1)*(lkl+1)*(nmax+2)) * quartets_per_block;
             buflen += nf3_ij; // dm_ij_cache
-            rys_j_kernel<<<npairs_ij, threads, buflen*sizeof(double)>>>(envs, jk, bounds, pool);
+            rys_j_kernel<<<npairs_ij, threads, buflen*sizeof(double)>>>(*envs, jk, bounds, pool);
         }
     }
     cudaError_t err = cudaGetLastError();
diff --git a/gpu4pyscf/lib/gvhf-rys/rys_contract_jk.cu b/gpu4pyscf/lib/gvhf-rys/rys_contract_jk.cu
index 1bee928ab..569d71e7f 100644
--- a/gpu4pyscf/lib/gvhf-rys/rys_contract_jk.cu
+++ b/gpu4pyscf/lib/gvhf-rys/rys_contract_jk.cu
@@ -523,7 +523,7 @@ extern int rys_jk_unrolled(RysIntEnvVars *envs, JKMatrix *jk, BoundsInfo *bounds
 
 extern "C" {
 int RYS_build_jk(double *vj, double *vk, double *dm, int n_dm, int nao,
-                 RysIntEnvVars envs, int *shls_slice, int shm_size,
+                 RysIntEnvVars *envs, int *shls_slice, int shm_size,
                  int npairs_ij, int npairs_kl,
                  uint32_t *pair_ij_mapping, uint32_t *pair_kl_mapping,
                  float *q_cond, float *s_estimator, float *dm_cond, float cutoff,
@@ -567,7 +567,7 @@ int RYS_build_jk(double *vj, double *vk, double *dm, int n_dm, int nao,
         ntiles_i, ntiles_j, ntiles_k, ntiles_l};
 
     JKMatrix jk = {vj, vk, dm, n_dm, 0, omega};
-    if (!rys_jk_unrolled(&envs, &jk, &bounds, pool)) {
+    if (!rys_jk_unrolled(envs, &jk, &bounds, pool)) {
         GXYZOffset* p_gxyz_offset = RYS_make_gxyz_offset(bounds);
         int gout_pattern = (((li == 0) >> 3) |
                             ((lj == 0) >> 2) |
@@ -579,7 +579,7 @@ int RYS_build_jk(double *vj, double *vk, double *dm, int n_dm, int nao,
         int reserved_shm_size = (buflen - cart_idx_size*4)/8;
 
         rys_jk_kernel<<<npairs_ij, threads, buflen>>>(
-            envs, jk, bounds, pool, p_gxyz_offset,
+            *envs, jk, bounds, pool, p_gxyz_offset,
             gout_pattern, reserved_shm_size);
 
         int n_tiles = ntiles_i * ntiles_j * ntiles_k * ntiles_l;
@@ -588,7 +588,7 @@ int RYS_build_jk(double *vj, double *vk, double *dm, int n_dm, int nao,
                                            min(256, n_tiles-256));
         int reserved_shm_size = (buflen - cart_idx_size*4)/8;
             rys_jk_kernel<<<npairs_ij, threads, buflen>>>(
-                envs, jk, bounds, pool, p_gxyz_offset+256,
+                *envs, jk, bounds, pool, p_gxyz_offset+256,
                 gout_pattern, reserved_shm_size);
         }
 
@@ -597,7 +597,7 @@ int RYS_build_jk(double *vj, double *vk, double *dm, int n_dm, int nao,
                                            min(256, n_tiles-512));
         int reserved_shm_size = (buflen - cart_idx_size*4)/8;
             rys_jk_kernel<<<npairs_ij, threads, buflen>>>(
-                envs, jk, bounds, pool, p_gxyz_offset+512,
+                *envs, jk, bounds, pool, p_gxyz_offset+512,
                 gout_pattern, reserved_shm_size);
         }
     }
diff --git a/gpu4pyscf/lib/gvhf-rys/rys_contract_k.cu b/gpu4pyscf/lib/gvhf-rys/rys_contract_k.cu
index e5434f491..119dac73a 100644
--- a/gpu4pyscf/lib/gvhf-rys/rys_contract_k.cu
+++ b/gpu4pyscf/lib/gvhf-rys/rys_contract_k.cu
@@ -612,7 +612,7 @@ extern int rys_k_unrolled(RysIntEnvVars *envs, JKMatrix *kmat, BoundsInfo *bound
 
 extern "C" {
 int RYS_build_k(double *vk, double *dm, int n_dm, int nao,
-                RysIntEnvVars envs, int *shls_slice, int shm_size,
+                RysIntEnvVars *envs, int *shls_slice, int shm_size,
                 int npairs_ij, int npairs_kl,
                 uint32_t *pair_ij_mapping, uint32_t *pair_kl_mapping,
                 float *q_cond, float *s_estimator, float *dm_cond, float cutoff,
@@ -664,7 +664,7 @@ int RYS_build_k(double *vk, double *dm, int n_dm, int nao,
         kmat.sr_factor = 1;
     }
 
-    if (!rys_k_unrolled(&envs, &kmat, &bounds, pool)) {
+    if (!rys_k_unrolled(envs, &kmat, &bounds, pool)) {
         GXYZOffset* p_gxyz_offset = RYS_make_gxyz_offset(bounds);
         int gout_pattern = (((li == 0) >> 3) |
                             ((lj == 0) >> 2) |
@@ -676,7 +676,7 @@ int RYS_build_k(double *vk, double *dm, int n_dm, int nao,
         int reserved_shm_size = (buflen - cart_idx_size*4)/8;
 
         rys_k_kernel<<<npairs_ij, threads, buflen>>>(
-            envs, kmat, bounds, pool, p_gxyz_offset,
+            *envs, kmat, bounds, pool, p_gxyz_offset,
             gout_pattern, reserved_shm_size);
 
         int n_tiles = ntiles_i * ntiles_j * ntiles_k * ntiles_l;
@@ -685,7 +685,7 @@ int RYS_build_k(double *vk, double *dm, int n_dm, int nao,
                                           min(256, n_tiles-256));
         int reserved_shm_size = (buflen - cart_idx_size*4)/8;
             rys_k_kernel<<<npairs_ij, threads, buflen>>>(
-                envs, kmat, bounds, pool, p_gxyz_offset+256,
+                *envs, kmat, bounds, pool, p_gxyz_offset+256,
                 gout_pattern, reserved_shm_size);
         }
 
@@ -694,7 +694,7 @@ int RYS_build_k(double *vk, double *dm, int n_dm, int nao,
                                           min(256, n_tiles-512));
         int reserved_shm_size = (buflen - cart_idx_size*4)/8;
             rys_k_kernel<<<npairs_ij, threads, buflen>>>(
-                envs, kmat, bounds, pool, p_gxyz_offset+512,
+                *envs, kmat, bounds, pool, p_gxyz_offset+512,
                 gout_pattern, reserved_shm_size);
         }
     }
diff --git a/gpu4pyscf/lib/pbc/ft_ao_ip1.cu b/gpu4pyscf/lib/pbc/ft_ao_ip1.cu
index 66246464e..2e7aa6f24 100644
--- a/gpu4pyscf/lib/pbc/ft_ao_ip1.cu
+++ b/gpu4pyscf/lib/pbc/ft_ao_ip1.cu
@@ -342,6 +342,377 @@ void ft_aopair_ejk_ip1_kernel(double *out, double *dm, double *vG, double *Gv,
     }
 }
 
+__global__
+void ft_aopair_strain_deriv_kernel(double *out, double *sigma,
+                              double *dm, double *vG, double *Gv,
+                              PBCIntEnvVars envs, int nGv, int shm_size,
+                              int *bas_ij_idx, int *bas_ij_img_idx,
+                              int *shl_pair_offsets)
+{
+    constexpr int nGv_per_block = NGV_PER_BLOCK;
+    constexpr int threads = NGV_PER_BLOCK * NSP_PER_BLOCK;
+    int sp_block_id = blockIdx.x;
+    int Gv_block_id = blockIdx.y;
+    int Gv_id_in_block = threadIdx.x;
+
+    int thread_id = threadIdx.x + threadIdx.y * blockDim.x;
+    int shl_pair0 = shl_pair_offsets[sp_block_id];
+    int shl_pair1 = shl_pair_offsets[sp_block_id+1];
+    int bas_ij0 = bas_ij_idx[shl_pair0];
+    int nbas = envs.cell0_nbas * envs.bvk_ncells;
+    int ish0 = bas_ij0 / nbas;
+    int jsh0 = bas_ij0 % nbas;
+
+    int *bas = envs.bas;
+    double *env = envs.env;
+    double *img_coords = envs.img_coords;
+    int li = bas[ish0*BAS_SLOTS+ANG_OF];
+    int lj = bas[jsh0*BAS_SLOTS+ANG_OF];
+    int stride_j = li + 2;
+    int g_size = stride_j * (lj + 1);
+    int gx_len = g_size * nGv_per_block * NSP_PER_BLOCK;
+    int gout_stride = 1;
+    while (8*6*gx_len > shm_size) {
+        gx_len /= 2;
+        gout_stride *= 2;
+    }
+    int nsp_per_block = NSP_PER_BLOCK / gout_stride;
+    int gout_id = threadIdx.y % gout_stride;
+    int sp_id = threadIdx.y / gout_stride;
+    int Gv_gout_id = Gv_id_in_block + nGv_per_block * gout_id;
+    int nGv_gout = nGv_per_block * gout_stride;
+    int lij = li + lj + 1;
+    int nfi = (li + 1) * (li + 2) / 2;
+    int nfj = (lj + 1) * (lj + 2) / 2;
+    int nfij = nfi * nfj;
+    int iprim = bas[ish0*BAS_SLOTS+NPRIM_OF];
+    int jprim = bas[jsh0*BAS_SLOTS+NPRIM_OF];
+    int ijprim = iprim * jprim;
+    int i_1 =          nGv_per_block;
+    int j_1 = stride_j*nGv_per_block;
+    int *ao_loc = envs.ao_loc;
+    int nao = ao_loc[envs.cell0_nbas];
+
+    int Gv_id = Gv_block_id * nGv_per_block + Gv_id_in_block;
+    Gv += Gv_id;
+    double kx = Gv[0];
+    double ky = Gv[nGv];
+    double kz = Gv[nGv * 2];
+    double kk = kx * kx + ky * ky + kz * kz;
+
+    extern __shared__ double shared_memory[];
+    double *gxR = shared_memory + g_size * nGv_per_block * sp_id + Gv_id_in_block;
+    double *gxI = gxR + gx_len*1;
+    double *gyR = gxR + gx_len*2;
+    double *gyI = gxR + gx_len*3;
+    double *gzR = gxR + gx_len*4;
+    double *gzI = gxR + gx_len*5;
+    double *rjri = shared_memory + gx_len * 6 + sp_id;
+    int *idx_i = _c_cartesian_lexical_xyz + lex_xyz_offset(li);
+    int *idx_j = _c_cartesian_lexical_xyz + lex_xyz_offset(lj);
+
+    double sigma_xx = 0;
+    double sigma_xy = 0;
+    double sigma_xz = 0;
+    double sigma_yx = 0;
+    double sigma_yy = 0;
+    double sigma_yz = 0;
+    double sigma_zx = 0;
+    double sigma_zy = 0;
+    double sigma_zz = 0;
+
+    for (int pair_ij = shl_pair0+sp_id; pair_ij < shl_pair1+sp_id; pair_ij += nsp_per_block) {
+        __syncthreads();
+        int bas_ij, jL;
+        if (pair_ij < shl_pair1) {
+            bas_ij = bas_ij_idx[pair_ij];
+            jL = bas_ij_img_idx[pair_ij];
+        } else {
+            bas_ij = bas_ij_idx[shl_pair0];
+            jL = bas_ij_img_idx[shl_pair0];
+        }
+        int ish = bas_ij / nbas;
+        int jsh = bas_ij % nbas;
+        int ish_cell0 = ish;
+        int jsh_cell0 = jsh % envs.cell0_nbas;
+        double *expi = env + bas[ish*BAS_SLOTS+PTR_EXP];
+        double *expj = env + bas[jsh*BAS_SLOTS+PTR_EXP];
+        double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF];
+        double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF];
+        double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD];
+        double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD];
+        double xi = ri[0];
+        double yi = ri[1];
+        double zi = ri[2];
+        double xj = rj[0] + img_coords[jL*3+0];
+        double yj = rj[1] + img_coords[jL*3+1];
+        double zj = rj[2] + img_coords[jL*3+2];
+        if (Gv_gout_id == 0) {
+            double xjxi = xj - xi;
+            double yjyi = yj - yi;
+            double zjzi = zj - zi;
+            rjri[0*nsp_per_block] = xjxi;
+            rjri[1*nsp_per_block] = yjyi;
+            rjri[2*nsp_per_block] = zjzi;
+        }
+        int i0 = ao_loc[ish];
+        int j0 = ao_loc[jsh];
+        // Note the density matrix is assumed to be real in get_ej_ip1 function
+        double *dm_ij;
+        if (vG == NULL) {
+            dm_ij = dm + (Gv_id + (j0*nao+i0) * nGv) * OF_COMPLEX;
+        } else {
+            dm_ij = dm + (j0*nao+i0);
+        }
+
+        double v_ix = 0;
+        double v_iy = 0;
+        double v_iz = 0;
+        double v_jx = 0;
+        double v_jy = 0;
+        double v_jz = 0;
+        double s0xR, s1xR, s2xR;
+        double s0xI, s1xI, s2xI;
+        double goutx, gouty, goutz;
+
+        for (int ijp = 0; ijp < ijprim; ++ijp) {
+            __syncthreads();
+            int ip = ijp / jprim;
+            int jp = ijp % jprim;
+            double ai = expi[ip];
+            double aj = expj[jp];
+            double ai2 = ai * 2;
+            double aj2 = aj * 2;
+            double aij = ai + aj;
+            double aj_aij = aj / aij;
+            double a2 = .5 / aij;
+            if (gout_id == 0) {
+                double theta_ij = ai * aj_aij;
+                double fac = OVERLAP_FAC * ci[ip] * cj[jp] / (aij * sqrt(aij));
+                if (ish_cell0 == jsh_cell0) {
+                    fac *= .5;
+                }
+                if (Gv_id >= nGv) {
+                    fac = 0;
+                }
+                double xjxi = rjri[0*nsp_per_block];
+                double yjyi = rjri[1*nsp_per_block];
+                double zjzi = rjri[2*nsp_per_block];
+                double xij = xjxi * aj_aij + ri[0];
+                double yij = yjyi * aj_aij + ri[1];
+                double zij = zjzi * aj_aij + ri[2];
+                double kR = kx * xij + ky * yij + kz * zij;
+                sincos(-kR, gzI, gzR);
+                double rr = xjxi*xjxi + yjyi*yjyi + zjzi*zjzi;
+                double theta_rr = theta_ij*rr + .5*a2*kk;
+                double Kab = exp(-theta_rr);
+                gxR[0] = fac;
+                gxI[0] = 0.;
+                gyR[0] = 1.;
+                gyI[0] = 0.;
+                // exp(-theta_rr-kR*1j)
+                gzR[0] *= Kab;
+                gzI[0] *= Kab;
+            }
+
+            // gx[i+1] = ia2 * gx[i-1] + (rijrx[0] - kx*a2*_Complex_I) * gx[i];
+            __syncthreads();
+            for (int n = gout_id; n < 3; n += gout_stride) {
+                double *_gxR = gxR + n * gx_len * OF_COMPLEX;
+                double *_gxI = _gxR + gx_len;
+                double RpaR = rjri[n*nsp_per_block] * aj_aij; // Rp - Ra
+                double RpaI = -a2 * Gv[nGv*n];
+                s0xR = _gxR[0];
+                s0xI = _gxI[0];
+                multiply(RpaR, RpaI, s0xR, s0xI, s1xR, s1xI);
+                _gxR[nGv_per_block] = s1xR;
+                _gxI[nGv_per_block] = s1xI;
+                for (int i = 1; i < lij; i++) {
+                    double ia2 = i * a2;
+                    multiply(RpaR, RpaI, s1xR, s1xI, s2xR, s2xI);
+                    s2xR += ia2 * s0xR;
+                    s2xI += ia2 * s0xI;
+                    _gxR[(i+1)*nGv_per_block] = s2xR;
+                    _gxI[(i+1)*nGv_per_block] = s2xI;
+                    s0xR = s1xR;
+                    s0xI = s1xI;
+                    s1xR = s2xR;
+                    s1xI = s2xI;
+                }
+            }
+
+            // hrr
+            if (lj > 0) {
+                __syncthreads();
+                for (int n = gout_id; n < 3*OF_COMPLEX; n += gout_stride) {
+                    double *_gx = gxR + n * gx_len;
+                    // The real and imaginary parts call the same expression
+                    int _ix = n / 2;
+                    double xjxi = rjri[_ix*nsp_per_block];
+                    for (int j = 0; j < lj; ++j) {
+                        int ij = (lij-j) + j*stride_j;
+                        s1xR = _gx[ij*nGv_per_block];
+                        for (--ij; ij >= j*stride_j; --ij) {
+                            s0xR = _gx[ij*nGv_per_block];
+                            _gx[(ij+stride_j)*nGv_per_block] = s1xR - xjxi * s0xR;
+                            s1xR = s0xR;
+                        }
+                    }
+                }
+            }
+            __syncthreads();
+            if (pair_ij >= shl_pair1 || Gv_id >= nGv) {
+                continue;
+            }
+            for (int ij = gout_id; ij < nfij; ij += gout_stride) {
+                int i = ij % nfi;
+                int j = ij / nfi;
+                double dm_vR, dm_vI;
+                if (vG == NULL) {
+                    int addr = (j*nao+i)*nGv * OF_COMPLEX;
+                    dm_vR = dm_ij[addr];
+                    dm_vI = dm_ij[addr+1];
+                } else {
+                    double tmp = dm_ij[j*nao+i];
+                    dm_vR = tmp * vG[Gv_id*OF_COMPLEX  ];
+                    dm_vI = tmp * vG[Gv_id*OF_COMPLEX+1];
+                }
+                int ix = idx_i[i*3+0];
+                int iy = idx_i[i*3+1];
+                int iz = idx_i[i*3+2];
+                int jx = idx_j[j*3+0];
+                int jy = idx_j[j*3+1];
+                int jz = idx_j[j*3+2];
+                int addrx = (ix + jx*stride_j) * nGv_per_block;
+                int addry = (iy + jy*stride_j) * nGv_per_block;
+                int addrz = (iz + jz*stride_j) * nGv_per_block;
+                double IxR = gxR[addrx];
+                double IxI = gxI[addrx];
+                double IyR = gyR[addry];
+                double IyI = gyI[addry];
+                double IzR = gzR[addrz];
+                double IzI = gzI[addrz];
+                double prod_xyR, prod_xyI;
+                double prod_xzR, prod_xzI;
+                double prod_yzR, prod_yzI;
+                multiply(IxR, IxI, IyR, IyI, prod_xyR, prod_xyI);
+                multiply(IxR, IxI, IzR, IzI, prod_xzR, prod_xzI);
+                multiply(IyR, IyI, IzR, IzI, prod_yzR, prod_yzI);
+                multiply(prod_xyR, prod_xyI, dm_vR, dm_vI, prod_xyR, prod_xyI);
+                multiply(prod_xzR, prod_xzI, dm_vR, dm_vI, prod_xzR, prod_xzI);
+                multiply(prod_yzR, prod_yzI, dm_vR, dm_vI, prod_yzR, prod_yzI);
+                double gixR = gxR[addrx+i_1];
+                double gixI = gxI[addrx+i_1];
+                double giyR = gyR[addry+i_1];
+                double giyI = gyI[addry+i_1];
+                double gizR = gzR[addrz+i_1];
+                double gizI = gzI[addrz+i_1];
+                // <i|exp(-iGr)|\nabla j>
+                double fjxR = aj2 * (gixR - rjri[0*nsp_per_block] * IxR);
+                double fjxI = aj2 * (gixI - rjri[0*nsp_per_block] * IxI);
+                double fjyR = aj2 * (giyR - rjri[1*nsp_per_block] * IyR);
+                double fjyI = aj2 * (giyI - rjri[1*nsp_per_block] * IyI);
+                double fjzR = aj2 * (gizR - rjri[2*nsp_per_block] * IzR);
+                double fjzI = aj2 * (gizI - rjri[2*nsp_per_block] * IzI);
+                if (jx > 0) { fjxR -= jx * gxR[addrx-j_1]; fjxI -= jx * gxI[addrx-j_1]; }
+                if (jy > 0) { fjyR -= jy * gyR[addry-j_1]; fjyI -= jy * gyI[addry-j_1]; }
+                if (jz > 0) { fjzR -= jz * gzR[addrz-j_1]; fjzI -= jz * gzI[addrz-j_1]; }
+                goutx = fjxR * prod_yzR - fjxI * prod_yzI;
+                gouty = fjyR * prod_xzR - fjyI * prod_xzI;
+                goutz = fjzR * prod_xyR - fjzI * prod_xyI;
+                v_jx += goutx;
+                v_jy += gouty;
+                v_jz += goutz;
+                sigma_xx += goutx * xj;
+                sigma_xy += goutx * yj;
+                sigma_xz += goutx * zj;
+                sigma_yx += gouty * xj;
+                sigma_yy += gouty * yj;
+                sigma_yz += gouty * zj;
+                sigma_zx += goutz * xj;
+                sigma_zy += goutz * yj;
+                sigma_zz += goutz * zj;
+                // <\nabla i|exp(-iGr)|j>
+                double fixR = ai2 * gixR;
+                double fiyR = ai2 * giyR;
+                double fizR = ai2 * gizR;
+                double fixI = ai2 * gixI;
+                double fiyI = ai2 * giyI;
+                double fizI = ai2 * gizI;
+                if (ix > 0) { fixR -= ix * gxR[addrx-i_1]; fixI -= ix * gxI[addrx-i_1]; }
+                if (iy > 0) { fiyR -= iy * gyR[addry-i_1]; fiyI -= iy * gyI[addry-i_1]; }
+                if (iz > 0) { fizR -= iz * gzR[addrz-i_1]; fizI -= iz * gzI[addrz-i_1]; }
+                goutx = fixR * prod_yzR - fixI * prod_yzI;
+                gouty = fiyR * prod_xzR - fiyI * prod_xzI;
+                goutz = fizR * prod_xyR - fizI * prod_xyI;
+                v_ix += goutx;
+                v_iy += gouty;
+                v_iz += goutz;
+                sigma_xx += goutx * xi;
+                sigma_xy += goutx * yi;
+                sigma_xz += goutx * zi;
+                sigma_yx += gouty * xi;
+                sigma_yy += gouty * yi;
+                sigma_yz += gouty * zi;
+                sigma_zx += goutz * xi;
+                sigma_zy += goutz * yi;
+                sigma_zz += goutz * zi;
+                // <i|\nabla_(e_ij) exp(-iGr)|j> = <i|-iy exp(-iGr)|j> Gx
+                //   = -i <(y-Yi + Yi)i|exp(-iGr)|j> Gx
+                goutx = (gixR + xi * IxR) * prod_yzI + (gixI + xi * IxI) * prod_yzR;
+                gouty = (giyR + yi * IyR) * prod_xzI + (giyI + yi * IyI) * prod_xzR;
+                goutz = (gizR + zi * IzR) * prod_xyI + (gizI + zi * IzI) * prod_xyR;
+                sigma_xx -= kx * goutx;
+                sigma_xy -= kx * gouty;
+                sigma_xz -= kx * goutz;
+                sigma_yx -= ky * goutx;
+                sigma_yy -= ky * gouty;
+                sigma_yz -= ky * goutz;
+                sigma_zx -= kz * goutx;
+                sigma_zy -= kz * gouty;
+                sigma_zz -= kz * goutz;
+            }
+        }
+
+        double *reduce = shared_memory + thread_id;
+        __syncthreads();
+        reduce[0*threads] = v_ix;
+        reduce[1*threads] = v_iy;
+        reduce[2*threads] = v_iz;
+        reduce[3*threads] = v_jx;
+        reduce[4*threads] = v_jy;
+        reduce[5*threads] = v_jz;
+        for (int i = nGv_gout/2; i > 0; i >>= 1) {
+            __syncthreads();
+            if (Gv_gout_id < i) {
+#pragma unroll
+                for (int n = 0; n < 6; ++n) {
+                    reduce[n*threads] += reduce[n*threads+i];
+                }
+            }
+        }
+        if (Gv_gout_id == 0 && pair_ij < shl_pair1) {
+            int ia = bas[ish_cell0*BAS_SLOTS+ATOM_OF];
+            int ja = bas[jsh_cell0*BAS_SLOTS+ATOM_OF];
+            atomicAdd(out+ia*3+0, reduce[0*threads]);
+            atomicAdd(out+ia*3+1, reduce[1*threads]);
+            atomicAdd(out+ia*3+2, reduce[2*threads]);
+            atomicAdd(out+ja*3+0, reduce[3*threads]);
+            atomicAdd(out+ja*3+1, reduce[4*threads]);
+            atomicAdd(out+ja*3+2, reduce[5*threads]);
+        }
+    }
+    atomicAdd(sigma+0, sigma_xx);
+    atomicAdd(sigma+1, sigma_xy);
+    atomicAdd(sigma+2, sigma_xz);
+    atomicAdd(sigma+3, sigma_yx);
+    atomicAdd(sigma+4, sigma_yy);
+    atomicAdd(sigma+5, sigma_yz);
+    atomicAdd(sigma+6, sigma_zx);
+    atomicAdd(sigma+7, sigma_zy);
+    atomicAdd(sigma+8, sigma_zz);
+}
+
 extern "C" {
 int PBC_ft_aopair_ej_ip1(double *out, double *dm, double *vG, double *GvT,
                          PBCIntEnvVars *envs,
@@ -383,4 +754,46 @@ int PBC_ft_aopair_ek_ip1(double *out, double *dm_vG, double *GvT, PBCIntEnvVars
     }
     return 0;
 }
+
+int PBC_ft_aopair_ej_strain_deriv(double *out, double *sigma, double *dm,
+                         double *vG, double *GvT, PBCIntEnvVars *envs,
+                         int nbatches_shl_pair, int ngrids, int shm_size,
+                         int *bas_ij_idx, int *bas_ij_img_idx, int *shl_pair_offsets,
+                         int *atm, int natm, int *bas, int nbas, double *env)
+{
+    cudaFuncSetAttribute(ft_aopair_strain_deriv_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, shm_size);
+    dim3 threads(NGV_PER_BLOCK, NSP_PER_BLOCK);
+    int Gv_batches = (ngrids + NGV_PER_BLOCK - 1) / NGV_PER_BLOCK;
+    dim3 blocks(nbatches_shl_pair, Gv_batches);
+    ft_aopair_strain_deriv_kernel<<<blocks, threads, shm_size>>>(
+            out, sigma, dm, vG, GvT, *envs, ngrids, shm_size,
+            bas_ij_idx, bas_ij_img_idx, shl_pair_offsets);
+    cudaError_t err = cudaGetLastError();
+    if (err != cudaSuccess) {
+        fprintf(stderr, "CUDA Error in ft_aopair_ej_strain_deriv: %s\n", cudaGetErrorString(err));
+        return 1;
+    }
+    return 0;
+}
+
+int PBC_ft_aopair_ek_strain_deriv(double *out, double *sigma,
+                         double *dm_vG, double *GvT, PBCIntEnvVars *envs,
+                         int nbatches_shl_pair, int ngrids, int shm_size,
+                         int *bas_ij_idx, int *bas_ij_img_idx, int *shl_pair_offsets,
+                         int *atm, int natm, int *bas, int nbas, double *env)
+{
+    cudaFuncSetAttribute(ft_aopair_strain_deriv_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, shm_size);
+    dim3 threads(NGV_PER_BLOCK, NSP_PER_BLOCK);
+    int Gv_batches = (ngrids + NGV_PER_BLOCK - 1) / NGV_PER_BLOCK;
+    dim3 blocks(nbatches_shl_pair, Gv_batches);
+    ft_aopair_strain_deriv_kernel<<<blocks, threads, shm_size>>>(
+            out, sigma, dm_vG, NULL, GvT, *envs, ngrids, shm_size,
+            bas_ij_idx, bas_ij_img_idx, shl_pair_offsets);
+    cudaError_t err = cudaGetLastError();
+    if (err != cudaSuccess) {
+        fprintf(stderr, "CUDA Error in ft_aopair_ek_strain_deriv: %s\n", cudaGetErrorString(err));
+        return 1;
+    }
+    return 0;
+}
 }
diff --git a/gpu4pyscf/lib/pbc/overlap.cu b/gpu4pyscf/lib/pbc/overlap.cu
index caaaa4d83..23ca997c8 100644
--- a/gpu4pyscf/lib/pbc/overlap.cu
+++ b/gpu4pyscf/lib/pbc/overlap.cu
@@ -20,6 +20,7 @@
 #include <cuda.h>
 #include <cuda_runtime.h>
 
+#include "gvhf-rys/rys_contract_k.cuh"
 #include "pbc.cuh"
 #include "int3c2e.cuh"
 
@@ -34,6 +35,7 @@ typedef struct {
 
 #define GOUT_WIDTH      36
 #define GOUT_WIDTH_IP1  18
+#define REMOTE_THRESHOLD 50
 
 static __global__
 void int1e_ovlp_kernel(double *out, PBCIntEnvVars envs, PBCInt2c2eBounds bounds)
@@ -795,6 +797,289 @@ void int1e_ipkin_kernel(double *out, PBCIntEnvVars envs, PBCInt2c2eBounds bounds
     }
 }
 
+static __global__
+void ovlp_strain_deriv_kernel(double *out, double *dm, PBCIntEnvVars envs,
+                              int *shl_pair_offsets, int *bas_ij_idx,
+                              int *gout_stride_lookup, int is_gamma_point)
+{
+    int sp_block_id = blockIdx.x;
+    int thread_id = threadIdx.x;
+    int shl_pair0 = shl_pair_offsets[sp_block_id];
+    int shl_pair1 = shl_pair_offsets[sp_block_id+1];
+    int bas_ij0 = bas_ij_idx[shl_pair0];
+    int cell0_nbas = envs.cell0_nbas;
+    int supmol_nbas = cell0_nbas * envs.nimgs;
+    int ish0 = bas_ij0 / supmol_nbas;
+    int jsh0 = bas_ij0 % cell0_nbas;
+    int nao = envs.ao_loc[cell0_nbas];
+
+    int *bas = envs.bas;
+    int *ao_loc = envs.ao_loc;
+    double *env = envs.env;
+    double *img_coords = envs.img_coords;
+    int li = bas[ish0*BAS_SLOTS+ANG_OF];
+    int lj = bas[jsh0*BAS_SLOTS+ANG_OF];
+    int nfi = (li + 1) * (li + 2) / 2;
+    int nfj = (lj + 1) * (lj + 2) / 2;
+    int iprim = bas[ish0*BAS_SLOTS+NPRIM_OF];
+    int jprim = bas[jsh0*BAS_SLOTS+NPRIM_OF];
+    int lij = li + lj + 1;
+    int stride_j = li + 2;
+    int ijprim = iprim * jprim;
+    int nfij = nfi * nfj;
+
+    int gout_stride = gout_stride_lookup[li*L_AUX1+lj];
+    int nsp_per_block = THREADS / gout_stride;
+    int sp_id = thread_id % nsp_per_block;
+    int gout_id = thread_id / nsp_per_block;
+    int i_1 =          nsp_per_block;
+    int j_1 = stride_j*nsp_per_block;
+
+    int g_size = (li + 2) * (lj + 1);
+    int gx_len = g_size * nsp_per_block;
+    extern __shared__ double g[];
+    double *gx = g + sp_id;
+    double *gy = gx + gx_len;
+    double *gz = gx + gx_len * 2;
+    double *rjri = gx + gx_len * 3;
+    gx[0] = PI_POW_1_5;
+    gy[0] = 1.;
+    int *idx_i = _c_cartesian_lexical_xyz + lex_xyz_offset(li);
+    int *idx_j = _c_cartesian_lexical_xyz + lex_xyz_offset(lj);
+
+    double sigma_xx = 0;
+    double sigma_xy = 0;
+    double sigma_xz = 0;
+    double sigma_yx = 0;
+    double sigma_yy = 0;
+    double sigma_yz = 0;
+    double sigma_zx = 0;
+    double sigma_zy = 0;
+    double sigma_zz = 0;
+    for (int task_id = shl_pair0; task_id < shl_pair1; task_id += nsp_per_block) {
+        int pair_ij = task_id + sp_id;
+        if (pair_ij >= shl_pair1) {
+            pair_ij = shl_pair0;
+            gx[0] = 0;
+        }
+        int bas_ij = bas_ij_idx[pair_ij];
+        int ish = bas_ij / supmol_nbas;
+        int _jsh = bas_ij % supmol_nbas;
+        int cell_j = _jsh / cell0_nbas;
+        int jsh = _jsh % cell0_nbas;
+        if (ish == jsh) {
+            gy[0] = .5;
+        } else if (ish < jsh) {
+            gx[0] = 0;
+        }
+        int i0 = ao_loc[ish];
+        int j0 = ao_loc[jsh];
+        double *dm_ji;
+        if (is_gamma_point) {
+            dm_ji = dm + j0*nao+i0;
+        } else {
+            dm_ji = dm + (cell_j*nao+j0)*nao+i0;
+        }
+        double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD];
+        double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD];
+        double *expi = env + bas[ish*BAS_SLOTS+PTR_EXP];
+        double *expj = env + bas[jsh*BAS_SLOTS+PTR_EXP];
+        double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF];
+        double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF];
+        double xj = rj[0] + img_coords[cell_j*3+0];
+        double yj = rj[1] + img_coords[cell_j*3+1];
+        double zj = rj[2] + img_coords[cell_j*3+2];
+        if (gout_id == 0) {
+            double xjxi = xj - ri[0];
+            double yjyi = yj - ri[1];
+            double zjzi = zj - ri[2];
+            double rr_ij = xjxi*xjxi + yjyi*yjyi + zjzi*zjzi;
+            rjri[0*nsp_per_block] = xjxi;
+            rjri[1*nsp_per_block] = yjyi;
+            rjri[2*nsp_per_block] = zjzi;
+            rjri[3*nsp_per_block] = rr_ij;
+        }
+        for (int ijp = 0; ijp < ijprim; ++ijp) {
+            __syncthreads();
+            int ip = ijp % iprim;
+            int jp = ijp / iprim;
+            double ai = expi[ip];
+            double aj = expj[jp];
+            double ai2 = ai * 2;
+            double aj2 = aj * 2;
+            double aij = ai + aj;
+            double aj_aij = aj / aij;
+            if (gout_id == 0) {
+                double theta = ai * aj_aij;
+                double theta_rr = theta * rjri[3*nsp_per_block];
+                double cicj = ci[ip] * cj[jp];
+                gz[0] = cicj / (aij*sqrt(aij)) * exp(-theta_rr);
+            }
+            __syncthreads();
+            double s0x, s1x, s2x;
+            double b = .5 / aij;
+            for (int n = gout_id; n < 3; n += gout_stride) {
+                double *_gx = gx + n * gx_len;
+                double xjxi = rjri[n*nsp_per_block];
+                double xpa = xjxi * aj_aij;
+                s0x = _gx[0];
+                s1x = xpa * s0x;
+                _gx[nsp_per_block] = s1x;
+                for (int i = 1; i < lij; ++i) {
+                    s2x = xpa * s1x + i * b * s0x;
+                    _gx[(i+1)*nsp_per_block] = s2x;
+                    s0x = s1x;
+                    s1x = s2x;
+                }
+                for (int j = 0; j < lj; ++j) {
+                    int ij = (lij-j) + j*stride_j;
+                    s1x = _gx[ij*nsp_per_block];
+                    for (--ij; ij >= j*stride_j; --ij) {
+                        s0x = _gx[ij*nsp_per_block];
+                        _gx[(ij+stride_j)*nsp_per_block] = s1x - xjxi * s0x;
+                        s1x = s0x;
+                    }
+                }
+            }
+            __syncthreads();
+            if (task_id + sp_id >= shl_pair1) {
+                continue;
+            }
+#pragma unroll
+            for (int n = 0; n < GOUT_WIDTH; ++n) {
+                int ij = n*gout_stride+gout_id;
+                if (ij >= nfij) break;
+                int i = ij % nfi;
+                int j = ij / nfi;
+                int ix = idx_i[i*3+0];
+                int iy = idx_i[i*3+1];
+                int iz = idx_i[i*3+2];
+                int jx = idx_j[j*3+0];
+                int jy = idx_j[j*3+1];
+                int jz = idx_j[j*3+2];
+                int addrx = (ix + jx*stride_j) * nsp_per_block;
+                int addry = (iy + jy*stride_j + g_size) * nsp_per_block;
+                int addrz = (iz + jz*stride_j + g_size*2) * nsp_per_block;
+                double Ix = gx[addrx];
+                double Iy = gx[addry];
+                double Iz = gx[addrz];
+                double dm_val = dm_ji[j*nao+i];
+                double prod_xy = Ix * Iy * dm_val;
+                double prod_xz = Ix * Iz * dm_val;
+                double prod_yz = Iy * Iz * dm_val;
+                double gix = gx[addrx+i_1];
+                double giy = gx[addry+i_1];
+                double giz = gx[addrz+i_1];
+                double fix = ai2 * gix; if (ix > 0) { fix -= ix * gx[addrx-i_1]; }
+                double fiy = ai2 * giy; if (iy > 0) { fiy -= iy * gx[addry-i_1]; }
+                double fiz = ai2 * giz; if (iz > 0) { fiz -= iz * gx[addrz-i_1]; }
+                double v_ix = fix * prod_yz;
+                double v_iy = fiy * prod_xz;
+                double v_iz = fiz * prod_xy;
+                double fjx = aj2 * (gix - rjri[0*nsp_per_block] * Ix); if (jx > 0) { fjx -= jx * gx[addrx-j_1]; }
+                double fjy = aj2 * (giy - rjri[1*nsp_per_block] * Iy); if (jy > 0) { fjy -= jy * gx[addry-j_1]; }
+                double fjz = aj2 * (giz - rjri[2*nsp_per_block] * Iz); if (jz > 0) { fjz -= jz * gx[addrz-j_1]; }
+                double v_jx = fjx * prod_yz;
+                double v_jy = fjy * prod_xz;
+                double v_jz = fjz * prod_xy;
+                double xi = ri[0];
+                double yi = ri[1];
+                double zi = ri[2];
+                sigma_xx += v_ix * xi;
+                sigma_xy += v_ix * yi;
+                sigma_xz += v_ix * zi;
+                sigma_yx += v_iy * xi;
+                sigma_yy += v_iy * yi;
+                sigma_yz += v_iy * zi;
+                sigma_zx += v_iz * xi;
+                sigma_zy += v_iz * yi;
+                sigma_zz += v_iz * zi;
+                sigma_xx += v_jx * xj;
+                sigma_xy += v_jx * yj;
+                sigma_xz += v_jx * zj;
+                sigma_yx += v_jy * xj;
+                sigma_yy += v_jy * yj;
+                sigma_yz += v_jy * zj;
+                sigma_zx += v_jz * xj;
+                sigma_zy += v_jz * yj;
+                sigma_zz += v_jz * zj;
+            }
+        }
+    }
+    atomicAdd(out+0, sigma_xx);
+    atomicAdd(out+1, sigma_xy);
+    atomicAdd(out+2, sigma_xz);
+    atomicAdd(out+3, sigma_yx);
+    atomicAdd(out+4, sigma_yy);
+    atomicAdd(out+5, sigma_yz);
+    atomicAdd(out+6, sigma_zx);
+    atomicAdd(out+7, sigma_zy);
+    atomicAdd(out+8, sigma_zz);
+}
+
+// An estimation of the upper bound of the overlap |<cell0|supcmol>| for
+// shell pairs between the primitve cell and the super-mol
+__global__ static
+void ovlp_mask_estimation_kernel(int8_t *ovlp_mask, float *exps, float *log_coeff,
+                                 PBCIntEnvVars envs, int hermi, float log_cutoff)
+{
+    int bas_ij = blockIdx.x * blockDim.x + threadIdx.x;
+    int nbas = envs.cell0_nbas;
+    int nbas2 = nbas * nbas;
+    if (bas_ij >= nbas2) {
+        return;
+    }
+    int ish = bas_ij / nbas;
+    int jsh = bas_ij % nbas;
+    if (hermi && ish < jsh) {
+        return;
+    }
+    int nimgs = envs.nimgs;
+    int supmol_nbas = nbas * nimgs;
+    ovlp_mask += ish * supmol_nbas + jsh;
+    int *bas = envs.bas;
+    double *env = envs.env;
+    double *img_coords = envs.img_coords;
+    int li = bas[ish*BAS_SLOTS+ANG_OF];
+    int lj = bas[jsh*BAS_SLOTS+ANG_OF];
+    double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD];
+    double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD];
+    float xi = ri[0];
+    float yi = ri[1];
+    float zi = ri[2];
+    float xj = rj[0];
+    float yj = rj[1];
+    float zj = rj[2];
+    float ai = exps[ish];
+    float aj = exps[jsh];
+    float aij = ai + aj;
+    float fi = ai / aij;
+    float fj = aj / aij;
+    float theta = ai * fj;
+    float xjxi = xj - xi;
+    float yjyi = yj - yi;
+    float zjzi = zj - zi;
+    float fac_norm = log_coeff[ish] + log_coeff[jsh] + 1.717f - 1.5f * logf(aij);
+    for (int img = 0; img < nimgs; ++img) {
+        float xjLxi = xjxi + img_coords[img*3+0];
+        float yjLyi = yjyi + img_coords[img*3+1];
+        float zjLzi = zjzi + img_coords[img*3+2];
+        float rr_ij = xjLxi * xjLxi + yjLyi * yjLyi + zjLzi * zjLzi;
+        if (theta*rr_ij > REMOTE_THRESHOLD) {
+            continue;
+        }
+        float dr = sqrtf(rr_ij);
+        float dri = fj * dr;
+        float drj = fi * dr;
+        float dri_fac = .5f*li * logf(.5f*li/aij + dri*dri + 1e-9f);
+        float drj_fac = .5f*lj * logf(.5f*lj/aij + drj*drj + 1e-9f);
+        float log_ovlp = fac_norm - theta*rr_ij + dri_fac + drj_fac;
+        if (log_ovlp > log_cutoff) {
+            ovlp_mask[img*nbas] = 1;
+        }
+    }
+}
+
 extern "C" {
 int PBCint1e_ovlp(double *out, PBCIntEnvVars *envs, int shm_size,
                   int nbatches_shl_pair, int *bas_ij_idx,
@@ -863,4 +1148,28 @@ int PBCint1e_ipkin(double *out, PBCIntEnvVars *envs, int shm_size,
     }
     return 0;
 }
+
+int PBCovlp_strain_deriv(double *out, double *dm,
+                    PBCIntEnvVars *envs, int shm_size, int nbatches_shl_pair,
+                    int *shl_pair_offsets, int *bas_ij_idx, int *gout_stride_lookup,
+                    int is_gamma_point)
+{
+    cudaFuncSetAttribute(ovlp_strain_deriv_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, shm_size);
+    ovlp_strain_deriv_kernel<<<nbatches_shl_pair, THREADS, shm_size>>>(
+            out, dm, *envs, shl_pair_offsets, bas_ij_idx, gout_stride_lookup, is_gamma_point);
+    cudaError_t err = cudaGetLastError();
+    if (err != cudaSuccess) {
+        fprintf(stderr, "CUDA Error in ovlp_strain_deriv kernel: %s\n", cudaGetErrorString(err));
+        return 1;
+    }
+    return 0;
+}
+void PBCovlp_mask_estimation(int8_t *ovlp_mask, float *exps, float *log_coeff,
+                             PBCIntEnvVars *envs, int hermi, float log_cutoff)
+{
+    int nbas = envs->cell0_nbas;
+    int nbatches = (nbas * nbas + THREADS-1) / THREADS;
+    ovlp_mask_estimation_kernel<<<nbatches, THREADS>>>(
+            ovlp_mask, exps, log_coeff, *envs, hermi, log_cutoff);
+}
 }
diff --git a/gpu4pyscf/lib/pbc/pbc.cuh b/gpu4pyscf/lib/pbc/pbc.cuh
index 7b7f44178..778d6ae2a 100644
--- a/gpu4pyscf/lib/pbc/pbc.cuh
+++ b/gpu4pyscf/lib/pbc/pbc.cuh
@@ -66,12 +66,12 @@
 typedef struct {
     int cell0_natm; // in bvk-cell
     int cell0_nbas; // in bvk-cell
-    int bvk_ncells; // number of images in the BvK cell
-    int nimgs; // number of images in lattice sum
     int *atm;
     int *bas;
     double *env;
     int *ao_loc; // in bvk-cell
+    int bvk_ncells; // number of images in the BvK cell
+    int nimgs; // number of images in lattice sum
     double *img_coords; // vectors in lattice sum
 } PBCIntEnvVars;
 
diff --git a/gpu4pyscf/lib/pbc/rys_contract_jk_ip1.cu b/gpu4pyscf/lib/pbc/rys_contract_jk_ip1.cu
index d25182e5f..8bda9b0be 100644
--- a/gpu4pyscf/lib/pbc/rys_contract_jk_ip1.cu
+++ b/gpu4pyscf/lib/pbc/rys_contract_jk_ip1.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright 2021-2024 The PySCF Developers. All Rights Reserved.
+ * Copyright 2025 The PySCF Developers. All Rights Reserved.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -550,13 +550,629 @@ while (1) {
 }
 }
 
+__global__ static
+void rys_ejk_strain_deriv_kernel(RysIntEnvVars envs, JKEnergy jk, BoundsInfo bounds,
+                        double *sigma, int *bas_mask_idx, int *Ts_ji_lookup,
+                        int nimgs, int nimgs_uniq_pair, int nbas_cell0, int nao,
+                        uint32_t *pool, double *dd_pool, int *head,
+                        int reserved_shm_size)
+{
+    int sq_id = threadIdx.x;
+    int nsq_per_block = blockDim.x;
+    int gout_id = threadIdx.y;
+    int gout_stride = blockDim.y;
+    int threads = blockDim.x * blockDim.y;
+    int thread_id = threadIdx.x + blockDim.x * threadIdx.y;
+    int nbas = envs.nbas;
+    int *bas = envs.bas;
+    double *env = envs.env;
+    int li = bounds.li;
+    int lj = bounds.lj;
+    int lk = bounds.lk;
+    int ll = bounds.ll;
+    int iprim = bounds.iprim;
+    int jprim = bounds.jprim;
+    int kprim = bounds.kprim;
+    int lprim = bounds.lprim;
+    int stride_j = bounds.stride_j;
+    int stride_k = bounds.stride_k;
+    int stride_l = bounds.stride_l;
+    int g_size = bounds.g_size;
+    int nfi = bounds.nfi;
+    int nfj = bounds.nfj;
+    int nfk = bounds.nfk;
+    int nfl = bounds.nfl;
+    int nfij = nfi * nfj;
+    int nfkl = nfk * nfl;
+    int lij = li + lj + 1;
+    int lkl = lk + ll + 1;
+    int i_1 =          nsq_per_block;
+    int j_1 = stride_j*nsq_per_block;
+    int k_1 = stride_k*nsq_per_block;
+    int l_1 = stride_l*nsq_per_block;
+
+    extern __shared__ double shared_memory[];
+    double *rlrk = shared_memory + sq_id;
+    double *Rpq = shared_memory + nsq_per_block * 3 + sq_id;
+    double *gx = shared_memory + nsq_per_block * 6 + sq_id;
+    double *rw = shared_memory + nsq_per_block * (g_size*3+6) + sq_id;
+    double *cicj_cache = shared_memory + reserved_shm_size;
+    int *idx_i = _c_cartesian_lexical_xyz + lex_xyz_offset(bounds.li);
+    int *idx_j = _c_cartesian_lexical_xyz + lex_xyz_offset(bounds.lj);
+    int *idx_k = _c_cartesian_lexical_xyz + lex_xyz_offset(bounds.lk);
+    int *idx_l = _c_cartesian_lexical_xyz + lex_xyz_offset(bounds.ll);
+
+    int do_j = jk.j_factor != 0.;
+    int do_k = jk.k_factor != 0.;
+    int *ao_loc = envs.ao_loc;
+    double *dm = jk.dm;
+
+    uint32_t *bas_kl_idx = pool + blockIdx.x * QUEUE_DEPTH;
+    int nf = bounds.nfi * bounds.nfj * bounds.nfk * bounds.nfl;
+    double *dd_cache = dd_pool + blockIdx.x * nf * blockDim.x + sq_id;
+    __shared__ int ntasks, pair_ij, pair_kl0;
+    double sigma_xx = 0;
+    double sigma_xy = 0;
+    double sigma_xz = 0;
+    double sigma_yx = 0;
+    double sigma_yy = 0;
+    double sigma_yz = 0;
+    double sigma_zx = 0;
+    double sigma_zy = 0;
+    double sigma_zz = 0;
+while (1) {
+    if (thread_id == 0) {
+        pair_ij = atomicAdd(head, 1);
+    }
+    __syncthreads();
+    if (pair_ij >= bounds.npairs_ij) {
+        break;
+    }
+
+    uint32_t bas_ij = bounds.pair_ij_mapping[pair_ij];
+    int ish = bas_ij / nbas;
+    int jsh = bas_ij % nbas;
+    __shared__ int cell_j, ish_cell0, jsh_cell0, i0, j0;
+    __shared__ double ri[3];
+    __shared__ double rj[3];
+    __shared__ double rjri[3];
+    if (thread_id == 0) {
+        int _jsh = bas_mask_idx[jsh];
+        ish_cell0 = ish;
+        jsh_cell0 = _jsh % nbas_cell0;
+        cell_j = _jsh / nbas_cell0;
+        i0 = ao_loc[ish_cell0];
+        j0 = ao_loc[jsh_cell0];
+    }
+    __syncthreads();
+    double *expi = env + bas[ish*BAS_SLOTS+PTR_EXP];
+    double *expj = env + bas[jsh*BAS_SLOTS+PTR_EXP];
+    double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF];
+    double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF];
+    if (thread_id < 3) {
+        int ri_ptr = bas[ish*BAS_SLOTS+PTR_BAS_COORD];
+        int rj_ptr = bas[jsh*BAS_SLOTS+PTR_BAS_COORD];
+        ri[thread_id] = env[ri_ptr+thread_id];
+        rj[thread_id] = env[rj_ptr+thread_id];
+        rjri[thread_id] = env[rj_ptr+thread_id] - ri[thread_id];
+    }
+    __syncthreads();
+    double xjxi = rjri[0];
+    double yjyi = rjri[1];
+    double zjzi = rjri[2];
+    for (int ij = thread_id; ij < iprim*jprim; ij += threads) {
+        int ip = ij / jprim;
+        int jp = ij % jprim;
+        double ai = expi[ip];
+        double aj = expj[jp];
+        double aij = ai + aj;
+        double theta_ij = ai * aj / aij;
+        double rr_ij = xjxi*xjxi + yjyi*yjyi + zjzi*zjzi;
+        double Kab = exp(-theta_ij * rr_ij);
+        double cicj = ci[ip] * cj[jp];
+        if (ish_cell0 == jsh_cell0) {
+            cicj *= .5;
+        }
+        cicj_cache[ij] = cicj * Kab;
+    }
+    double v_ix = 0;
+    double v_iy = 0;
+    double v_iz = 0;
+    double v_jx = 0;
+    double v_jy = 0;
+    double v_jz = 0;
+    double goutx, gouty, goutz;
+
+    if (thread_id == 0) {
+        pair_kl0 = 0;
+    }
+    __syncthreads();
+    while (pair_kl0 < bounds.npairs_kl) {
+        _fill_sr_ejk_tasks(ntasks, pair_kl0, bas_kl_idx, bas_ij, bas_mask_idx,
+                           Ts_ji_lookup, nimgs, nbas_cell0, jk, envs, bounds);
+        if (ntasks == 0) {
+            continue;
+        }
+
+        for (int task_id = sq_id; task_id < ntasks+sq_id; task_id += nsq_per_block) {
+            __syncthreads();
+            int bas_kl = bas_kl_idx[task_id];
+            int ksh = bas_kl / nbas;
+            int lsh = bas_kl % nbas;
+            int _ksh = bas_mask_idx[ksh];
+            int cell_k = _ksh / nbas_cell0;
+            int ksh_cell0 = _ksh % nbas_cell0;
+            int _lsh = bas_mask_idx[lsh];
+            int cell_l = _lsh / nbas_cell0;
+            int lsh_cell0 = _lsh % nbas_cell0;
+            double fac_sym = PI_FAC;
+            if (task_id < ntasks) {
+                if (ksh_cell0 == lsh_cell0) fac_sym *= .5;
+                if (ish_cell0 == ksh_cell0 && jsh_cell0 == lsh_cell0) fac_sym *= .5;
+            } else {
+                fac_sym = 0;
+            }
+            int k0 = ao_loc[ksh_cell0];
+            int l0 = ao_loc[lsh_cell0];
+            double *expi = env + bas[ish_cell0*BAS_SLOTS+PTR_EXP];
+            double *expj = env + bas[jsh_cell0*BAS_SLOTS+PTR_EXP];
+            double *expk = env + bas[ksh*BAS_SLOTS+PTR_EXP];
+            double *expl = env + bas[lsh*BAS_SLOTS+PTR_EXP];
+            double *ck = env + bas[ksh*BAS_SLOTS+PTR_COEFF];
+            double *cl = env + bas[lsh*BAS_SLOTS+PTR_COEFF];
+            double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD];
+            double *rl = env + bas[lsh*BAS_SLOTS+PTR_BAS_COORD];
+            double xk = rk[0];
+            double yk = rk[1];
+            double zk = rk[2];
+            double xl = rl[0];
+            double yl = rl[1];
+            double zl = rl[2];
+            if (gout_id == 0) {
+                double xlxk = xl - xk;
+                double ylyk = yl - yk;
+                double zlzk = zl - zk;
+                rlrk[0*nsq_per_block] = xlxk;
+                rlrk[1*nsq_per_block] = ylyk;
+                rlrk[2*nsq_per_block] = zlzk;
+            }
+
+            double v_kx = 0;
+            double v_ky = 0;
+            double v_kz = 0;
+            double v_lx = 0;
+            double v_ly = 0;
+            double v_lz = 0;
+            int nao2 = nao * nao;
+            double *dm_jk = dm + Ts_ji_lookup[cell_j+cell_k*nimgs] * nao2;
+            double *dm_jl = dm + Ts_ji_lookup[cell_j+cell_l*nimgs] * nao2;
+            double *dm_ki = dm + Ts_ji_lookup[cell_k             ] * nao2;
+            double *dm_li = dm + Ts_ji_lookup[cell_l             ] * nao2;
+            double *dm_ji = dm + Ts_ji_lookup[cell_j             ] * nao2;
+            double *dm_lk = dm + Ts_ji_lookup[cell_l+cell_k*nimgs] * nao2;
+            if (jk.n_dm == 1) {
+                for (int n = gout_id; n < nfij*nfkl; n+=gout_stride) {
+                    int kl = n / nfij;
+                    int ij = n % nfij;
+                    int i = ij % nfi;
+                    int j = ij / nfi;
+                    int k = kl % nfk;
+                    int l = kl / nfk;
+                    int _i = i + i0;
+                    int _j = j + j0;
+                    int _k = k + k0;
+                    int _l = l + l0;
+                    int _jl = _j*nao+_l;
+                    int _jk = _j*nao+_k;
+                    int _li = _l*nao+_i;
+                    int _ki = _k*nao+_i;
+                    int _ji = _j*nao+_i;
+                    int _lk = _l*nao+_k;
+                    double dd = 0;
+                    if (do_k) {
+                        dd += jk.k_factor * (dm_jk[_jk] * dm_li[_li] + dm_jl[_jl] * dm_ki[_ki]);
+                    }
+                    if (do_j) {
+                        dd += jk.j_factor * dm_ji[_ji] * dm_lk[_lk];
+                    }
+                    dd_cache[n*nsq_per_block] = fac_sym * dd;
+                }
+            } else {
+                int dm_size = nao2 * nimgs_uniq_pair;
+                for (int n = gout_id; n < nfij*nfkl; n+=gout_stride) {
+                    int kl = n / nfij;
+                    int ij = n % nfij;
+                    int i = ij % nfi;
+                    int j = ij / nfi;
+                    int k = kl % nfk;
+                    int l = kl / nfk;
+                    int _i = i + i0;
+                    int _j = j + j0;
+                    int _k = k + k0;
+                    int _l = l + l0;
+                    int _jl = _j*nao+_l;
+                    int _jk = _j*nao+_k;
+                    int _li = _l*nao+_i;
+                    int _ki = _k*nao+_i;
+                    int _ji = _j*nao+_i;
+                    int _lk = _l*nao+_k;
+                    double dd = 0;
+                    if (do_k) {
+                        dd += dm_jk[_jk] * dm_li[_li] + dm_jl[_jl] * dm_ki[_ki];
+                        dd += dm_jk[dm_size+_jk] * dm_li[dm_size+_li] +
+                              dm_jl[dm_size+_jl] * dm_ki[dm_size+_ki];
+                        dd *= jk.k_factor;
+                    }
+                    if (do_j) {
+                        dd += jk.j_factor * (dm_ji[_ji] + dm_ji[dm_size+_ji]) *
+                                            (dm_lk[_lk] + dm_lk[dm_size+_lk]);
+                    }
+                    dd_cache[n*nsq_per_block] = fac_sym * dd;
+                }
+            }
+
+            for (int klp = 0; klp < kprim*lprim; ++klp) {
+                __syncthreads();
+                int kp = klp / lprim;
+                int lp = klp % lprim;
+                double ak = expk[kp];
+                double al = expl[lp];
+                double akl = ak + al;
+                double al_akl = al / akl;
+                double ak2 = ak * 2;
+                double al2 = al * 2;
+                if (gout_id == 0) {
+                    double xlxk = rlrk[0*nsq_per_block];
+                    double ylyk = rlrk[1*nsq_per_block];
+                    double zlzk = rlrk[2*nsq_per_block];
+                    double rr_kl = xlxk*xlxk + ylyk*ylyk + zlzk*zlzk;
+                    double theta_kl = ak * al_akl;
+                    double Kcd = exp(-theta_kl * rr_kl);
+                    double ckcl = ck[kp] * cl[lp] * Kcd;
+                    gx[0] = ckcl;
+                }
+                for (int ijp = 0; ijp < iprim*jprim; ++ijp) {
+                    __syncthreads();
+                    int ip = ijp / jprim;
+                    int jp = ijp % jprim;
+                    double ai = expi[ip];
+                    double aj = expj[jp];
+                    double ai2 = ai * 2;
+                    double aj2 = aj * 2;
+                    double aij = ai + aj;
+                    double aj_aij = aj / aij;
+                    double xij = ri[0] + (rjri[0]) * aj_aij;
+                    double yij = ri[1] + (rjri[1]) * aj_aij;
+                    double zij = ri[2] + (rjri[2]) * aj_aij;
+                    double xkl = xk + rlrk[0*nsq_per_block] * al_akl;
+                    double ykl = yk + rlrk[1*nsq_per_block] * al_akl;
+                    double zkl = zk + rlrk[2*nsq_per_block] * al_akl;
+                    double xpq = xij - xkl;
+                    double ypq = yij - ykl;
+                    double zpq = zij - zkl;
+                    if (gout_id == 0) {
+                        Rpq[0*nsq_per_block] = xpq;
+                        Rpq[1*nsq_per_block] = ypq;
+                        Rpq[2*nsq_per_block] = zpq;
+                        double cicj = cicj_cache[ijp];
+                        gx[nsq_per_block*g_size] = cicj / (aij*akl*sqrt(aij+akl));
+                    }
+                    double rr = xpq*xpq + ypq*ypq + zpq*zpq;
+                    double theta = aij * akl / (aij + akl);
+                    int nroots = bounds.nroots;
+                    rys_roots_for_k(nroots, theta, rr, rw, jk.omega, jk.lr_factor, jk.sr_factor);
+                    for (int irys = 0; irys < nroots; ++irys) {
+                        __syncthreads();
+                        if (gout_id == 0) {
+                            gx[nsq_per_block*g_size*2] = rw[(irys*2+1)*nsq_per_block];
+                        }
+                        double rt = rw[irys*2*nsq_per_block];
+                        double rt_aa = rt / (aij + akl);
+                        double rt_aij = rt_aa * akl;
+                        double rt_akl = rt_aa * aij;
+                        double b00 = .5 * rt_aa;
+                        double b10 = .5/aij * (1 - rt_aij);
+                        double b01 = .5/akl * (1 - rt_akl);
+                        double s0x, s1x, s2x;
+                        __syncthreads();
+                        // gx(0,n+1) = c0*gx(0,n) + n*b10*gx(0,n-1)
+                        for (int n = gout_id; n < 3; n += gout_stride) {
+                            double *_gx = gx + n * g_size * nsq_per_block;
+                            double Rpa = (rjri[n]) * aj_aij;
+                            double c0x = Rpa - rt_aij * Rpq[n*nsq_per_block];
+                            s0x = _gx[0];
+                            s1x = c0x * s0x;
+                            _gx[nsq_per_block] = s1x;
+                            for (int i = 1; i < lij; ++i) {
+                                s2x = c0x * s1x + i * b10 * s0x;
+                                _gx[(i+1)*nsq_per_block] = s2x;
+                                s0x = s1x;
+                                s1x = s2x;
+                            }
+                        }
+
+                        int lij3 = (lij+1)*3;
+                        for (int n = gout_id; n < lij3+gout_id; n += gout_stride) {
+                            __syncthreads();
+                            int i = n / 3; //for i in range(lij+1):
+                            int _ix = n % 3; // TODO: remove _ix for nroots > 2
+                            double *_gx = gx + (i + _ix * g_size) * nsq_per_block;
+                            double Rqc = rlrk[_ix*nsq_per_block] * al_akl;
+                            double cpx = Rqc + rt_akl * Rpq[_ix*nsq_per_block];
+                            //for i in range(lij+1):
+                            //    trr(i,1) = c0p * trr(i,0) + i*b00 * trr(i-1,0)
+                            if (n < lij3) {
+                                s0x = _gx[0];
+                                s1x = cpx * s0x;
+                                if (i > 0) {
+                                    s1x += i * b00 * _gx[-nsq_per_block];
+                                }
+                                _gx[stride_k*nsq_per_block] = s1x;
+                            }
+
+                            //for k in range(1, lkl):
+                            //    for i in range(lij+1):
+                            //        trr(i,k+1) = cp * trr(i,k) + k*b01 * trr(i,k-1) + i*b00 * trr(i-1,k)
+                            for (int k = 1; k < lkl; ++k) {
+                                __syncthreads();
+                                if (n < lij3) {
+                                    s2x = cpx*s1x + k*b01*s0x;
+                                    if (i > 0) {
+                                        s2x += i * b00 * _gx[(k*stride_k-1)*nsq_per_block];
+                                    }
+                                    _gx[(k*stride_k+stride_k)*nsq_per_block] = s2x;
+                                    s0x = s1x;
+                                    s1x = s2x;
+                                }
+                            }
+                        }
+
+                        // hrr
+                        // g(i,j+1) = rirj * g(i,j) +  g(i+1,j)
+                        // g(...,k,l+1) = rkrl * g(...,k,l) + g(...,k+1,l)
+                        if (lj > 0) {
+                            __syncthreads();
+                            if (task_id < ntasks) {
+                                int lkl3 = (lkl+1)*3;
+                                for (int m = gout_id; m < lkl3; m += gout_stride) {
+                                    int k = m / 3;
+                                    int _ix = m % 3;
+                                    double xjxi = rjri[_ix];
+                                    double *_gx = gx + (_ix*g_size + k*stride_k) * nsq_per_block;
+                                    for (int j = 0; j < lj; ++j) {
+                                        int ij = (lij-j) + j*stride_j;
+                                        s1x = _gx[ij*nsq_per_block];
+                                        for (--ij; ij >= j*stride_j; --ij) {
+                                            s0x = _gx[ij*nsq_per_block];
+                                            _gx[(ij+stride_j)*nsq_per_block] = s1x - xjxi * s0x;
+                                            s1x = s0x;
+                                        }
+                                    }
+                                }
+                            }
+                        }
+                        if (ll > 0) {
+                            __syncthreads();
+                            if (task_id < ntasks) {
+                                for (int n = gout_id; n < stride_k*3; n += gout_stride) {
+                                    int i = n / 3;
+                                    int _ix = n % 3;
+                                    double xlxk = rlrk[_ix*nsq_per_block];
+                                    double *_gx = gx + (_ix*g_size + i) * nsq_per_block;
+                                    for (int l = 0; l < ll; ++l) {
+                                        int kl = (lkl-l)*stride_k + l*stride_l;
+                                        s1x = _gx[kl*nsq_per_block];
+                                        for (kl-=stride_k; kl >= l*stride_l; kl-=stride_k) {
+                                            s0x = _gx[kl*nsq_per_block];
+                                            _gx[(kl+stride_l)*nsq_per_block] = s1x - xlxk * s0x;
+                                            s1x = s0x;
+                                        }
+                                    }
+                                }
+                            }
+                        }
+
+                        __syncthreads();
+                        if (task_id >= ntasks) {
+                            continue;
+                        }
+                        for (int n = gout_id; n < nfij*nfkl; n+=gout_stride) {
+                            int kl = n / nfij;
+                            int ij = n % nfij;
+                            int i = ij % nfi;
+                            int j = ij / nfi;
+                            int k = kl % nfk;
+                            int l = kl / nfk;
+                            int ix = idx_i[i*3+0];
+                            int iy = idx_i[i*3+1];
+                            int iz = idx_i[i*3+2];
+                            int jx = idx_j[j*3+0];
+                            int jy = idx_j[j*3+1];
+                            int jz = idx_j[j*3+2];
+                            int kx = idx_k[k*3+0];
+                            int ky = idx_k[k*3+1];
+                            int kz = idx_k[k*3+2];
+                            int lx = idx_l[l*3+0];
+                            int ly = idx_l[l*3+1];
+                            int lz = idx_l[l*3+2];
+                            double dd = dd_cache[n*nsq_per_block];
+                            int addrx = (ix + jx*stride_j + kx*stride_k + lx*stride_l) * nsq_per_block;
+                            int addry = (iy + jy*stride_j + ky*stride_k + ly*stride_l + g_size) * nsq_per_block;
+                            int addrz = (iz + jz*stride_j + kz*stride_k + lz*stride_l + g_size*2) * nsq_per_block;
+                            double Ix = gx[addrx];
+                            double Iy = gx[addry];
+                            double Iz = gx[addrz];
+                            double prod_xy = Ix * Iy * dd;
+                            double prod_xz = Ix * Iz * dd;
+                            double prod_yz = Iy * Iz * dd;
+                            double gix = gx[addrx+i_1];
+                            double giy = gx[addry+i_1];
+                            double giz = gx[addrz+i_1];
+                            double gkx = gx[addrx+k_1];
+                            double gky = gx[addry+k_1];
+                            double gkz = gx[addrz+k_1];
+                            double fix = ai2 * gix; if (ix > 0) { fix -= ix * gx[addrx-i_1]; }
+                            double fiy = ai2 * giy; if (iy > 0) { fiy -= iy * gx[addry-i_1]; }
+                            double fiz = ai2 * giz; if (iz > 0) { fiz -= iz * gx[addrz-i_1]; }
+                            goutx = fix * prod_yz;
+                            gouty = fiy * prod_xz;
+                            goutz = fiz * prod_xy;
+                            v_ix += goutx;
+                            v_iy += gouty;
+                            v_iz += goutz;
+                            double xi = ri[0];
+                            double yi = ri[1];
+                            double zi = ri[2];
+                            sigma_xx += goutx * xi;
+                            sigma_xy += goutx * yi;
+                            sigma_xz += goutx * zi;
+                            sigma_yx += gouty * xi;
+                            sigma_yy += gouty * yi;
+                            sigma_yz += gouty * zi;
+                            sigma_zx += goutz * xi;
+                            sigma_zy += goutz * yi;
+                            sigma_zz += goutz * zi;
+                            double fkx = ak2 * gkx; if (kx > 0) { fkx -= kx * gx[addrx-k_1]; }
+                            double fky = ak2 * gky; if (ky > 0) { fky -= ky * gx[addry-k_1]; }
+                            double fkz = ak2 * gkz; if (kz > 0) { fkz -= kz * gx[addrz-k_1]; }
+                            goutx = fkx * prod_yz;
+                            gouty = fky * prod_xz;
+                            goutz = fkz * prod_xy;
+                            v_kx += goutx;
+                            v_ky += gouty;
+                            v_kz += goutz;
+                            sigma_xx += goutx * xk;
+                            sigma_xy += goutx * yk;
+                            sigma_xz += goutx * zk;
+                            sigma_yx += gouty * xk;
+                            sigma_yy += gouty * yk;
+                            sigma_yz += gouty * zk;
+                            sigma_zx += goutz * xk;
+                            sigma_zy += goutz * yk;
+                            sigma_zz += goutz * zk;
+                            double fjx = aj2 * (gix - rjri[0] * Ix); if (jx > 0) { fjx -= jx * gx[addrx-j_1]; }
+                            double fjy = aj2 * (giy - rjri[1] * Iy); if (jy > 0) { fjy -= jy * gx[addry-j_1]; }
+                            double fjz = aj2 * (giz - rjri[2] * Iz); if (jz > 0) { fjz -= jz * gx[addrz-j_1]; }
+                            goutx = fjx * prod_yz;
+                            gouty = fjy * prod_xz;
+                            goutz = fjz * prod_xy;
+                            v_jx += goutx;
+                            v_jy += gouty;
+                            v_jz += goutz;
+                            double xj = rj[0];
+                            double yj = rj[1];
+                            double zj = rj[2];
+                            sigma_xx += goutx * xj;
+                            sigma_xy += goutx * yj;
+                            sigma_xz += goutx * zj;
+                            sigma_yx += gouty * xj;
+                            sigma_yy += gouty * yj;
+                            sigma_yz += gouty * zj;
+                            sigma_zx += goutz * xj;
+                            sigma_zy += goutz * yj;
+                            sigma_zz += goutz * zj;
+                            double flx = al2 * (gkx - rlrk[0*nsq_per_block] * Ix); if (lx > 0) { flx -= lx * gx[addrx-l_1]; }
+                            double fly = al2 * (gky - rlrk[1*nsq_per_block] * Iy); if (ly > 0) { fly -= ly * gx[addry-l_1]; }
+                            double flz = al2 * (gkz - rlrk[2*nsq_per_block] * Iz); if (lz > 0) { flz -= lz * gx[addrz-l_1]; }
+                            goutx = flx * prod_yz;
+                            gouty = fly * prod_xz;
+                            goutz = flz * prod_xy;
+                            v_lx += goutx;
+                            v_ly += gouty;
+                            v_lz += goutz;
+                            sigma_xx += goutx * xl;
+                            sigma_xy += goutx * yl;
+                            sigma_xz += goutx * zl;
+                            sigma_yx += gouty * xl;
+                            sigma_yy += gouty * yl;
+                            sigma_yz += gouty * zl;
+                            sigma_zx += goutz * xl;
+                            sigma_zy += goutz * yl;
+                            sigma_zz += goutz * zl;
+                        }
+                    }
+                }
+            }
+            int ka = bas[ksh_cell0*BAS_SLOTS+ATOM_OF];
+            int la = bas[lsh_cell0*BAS_SLOTS+ATOM_OF];
+            int threads = nsq_per_block * gout_stride;
+            double *reduce = shared_memory + thread_id;
+            __syncthreads();
+            if (task_id < ntasks) {
+                reduce[0*threads] = v_kx;
+                reduce[1*threads] = v_ky;
+                reduce[2*threads] = v_kz;
+                reduce[3*threads] = v_lx;
+                reduce[4*threads] = v_ly;
+                reduce[5*threads] = v_lz;
+            }
+            for (int i = gout_stride/2; i > 0; i >>= 1) {
+                __syncthreads();
+                if (gout_id < i && task_id < ntasks) {
+#pragma unroll
+                    for (int n = 0; n < 6; ++n) {
+                        reduce[n*threads] += reduce[n*threads+i*nsq_per_block];
+                    }
+                }
+            }
+            if (gout_id == 0 && task_id < ntasks) {
+                double *ejk = jk.ejk;
+                atomicAdd(ejk+ka*3+0, reduce[0*threads]);
+                atomicAdd(ejk+ka*3+1, reduce[1*threads]);
+                atomicAdd(ejk+ka*3+2, reduce[2*threads]);
+                atomicAdd(ejk+la*3+0, reduce[3*threads]);
+                atomicAdd(ejk+la*3+1, reduce[4*threads]);
+                atomicAdd(ejk+la*3+2, reduce[5*threads]);
+            }
+        }
+    }
+    int ia = bas[ish_cell0*BAS_SLOTS+ATOM_OF];
+    int ja = bas[jsh_cell0*BAS_SLOTS+ATOM_OF];
+    double *reduce = shared_memory + thread_id;
+    __syncthreads();
+    reduce[0*threads] = v_ix;
+    reduce[1*threads] = v_iy;
+    reduce[2*threads] = v_iz;
+    reduce[3*threads] = v_jx;
+    reduce[4*threads] = v_jy;
+    reduce[5*threads] = v_jz;
+    for (int i = gout_stride/2; i > 0; i >>= 1) {
+        __syncthreads();
+        if (gout_id < i) {
+#pragma unroll
+            for (int n = 0; n < 6; ++n) {
+                reduce[n*threads] += reduce[n*threads+i*nsq_per_block];
+            }
+        }
+    }
+    if (gout_id == 0) {
+        double *ejk = jk.ejk;
+        atomicAdd(ejk+ia*3+0, reduce[0*threads]);
+        atomicAdd(ejk+ia*3+1, reduce[1*threads]);
+        atomicAdd(ejk+ia*3+2, reduce[2*threads]);
+        atomicAdd(ejk+ja*3+0, reduce[3*threads]);
+        atomicAdd(ejk+ja*3+1, reduce[4*threads]);
+        atomicAdd(ejk+ja*3+2, reduce[5*threads]);
+    }
+}
+    atomicAdd(sigma+0, sigma_xx);
+    atomicAdd(sigma+1, sigma_xy);
+    atomicAdd(sigma+2, sigma_xz);
+    atomicAdd(sigma+3, sigma_yx);
+    atomicAdd(sigma+4, sigma_yy);
+    atomicAdd(sigma+5, sigma_yz);
+    atomicAdd(sigma+6, sigma_zx);
+    atomicAdd(sigma+7, sigma_zy);
+    atomicAdd(sigma+8, sigma_zz);
+}
+
 //extern int rys_ejk_ip1_unrolled(RysIntEnvVars *envs, JKEnergy *jk, BoundsInfo *bounds,
 //                        int *pool, double *dd_pool);
 
 extern "C" {
 int PBC_per_atom_jk_ip1(double *ejk, double j_factor, double k_factor,
                         double *dm, int n_dm, int nao,
-                        RysIntEnvVars envs, int *scheme, int *shls_slice,
+                        RysIntEnvVars *envs, int *scheme, int *shls_slice,
                         int npairs_ij, int npairs_kl,
                         uint32_t *pair_ij_mapping, uint32_t *pair_kl_mapping,
                         int *bas_mask_idx, int *Ts_ji_lookup, int nimgs, int nimgs_uniq_pair,
@@ -609,7 +1225,7 @@ int PBC_per_atom_jk_ip1(double *ejk, double j_factor, double k_factor,
     int *head = (int *)(pool + workers * QUEUE_DEPTH);
     cudaMemset(head, 0, sizeof(int));
 
-    if (1){//!rys_ejk_ip1_unrolled(&envs, &jk, &bounds, pool, dd_pool)) {
+    if (1){//!rys_ejk_ip1_unrolled(envs, &jk, &bounds, pool, dd_pool)) {
         int quartets_per_block = scheme[0];
         int gout_stride = scheme[1];
         int ij_prims = iprim * jprim;
@@ -619,7 +1235,7 @@ int PBC_per_atom_jk_ip1(double *ejk, double j_factor, double k_factor,
         buflen = (reserved_shm_size + ij_prims)*sizeof(double);
 
         rys_ejk_ip1_kernel<<<workers, threads, buflen>>>(
-                envs, jk, bounds, bas_mask_idx, Ts_ji_lookup,
+                *envs, jk, bounds, bas_mask_idx, Ts_ji_lookup,
                 nimgs, nimgs_uniq_pair, nbas_cell0, nao,
                 pool, dd_pool, head, reserved_shm_size);
     }
@@ -632,9 +1248,88 @@ int PBC_per_atom_jk_ip1(double *ejk, double j_factor, double k_factor,
     return 0;
 }
 
+int PBC_jk_strain_deriv(double *ejk, double j_factor, double k_factor,
+                        double *sigma, double *dm, int n_dm, int nao,
+                        RysIntEnvVars *envs, int *scheme, int *shls_slice,
+                        int npairs_ij, int npairs_kl,
+                        uint32_t *pair_ij_mapping, uint32_t *pair_kl_mapping,
+                        int *bas_mask_idx, int *Ts_ji_lookup, int nimgs, int nimgs_uniq_pair,
+                        float *q_cond, float *s_estimator, float *dm_cond, float cutoff,
+                        uint32_t *pool, double *dd_pool, int nbas_cell0,
+                        int *atm, int natm, int *bas, int nbas, double *env)
+{
+    int ish0 = shls_slice[0];
+    int jsh0 = shls_slice[2];
+    int ksh0 = shls_slice[4];
+    int lsh0 = shls_slice[6];
+    int li = bas[ANG_OF + ish0*BAS_SLOTS];
+    int lj = bas[ANG_OF + jsh0*BAS_SLOTS];
+    int lk = bas[ANG_OF + ksh0*BAS_SLOTS];
+    int ll = bas[ANG_OF + lsh0*BAS_SLOTS];
+    int iprim = bas[NPRIM_OF + ish0*BAS_SLOTS];
+    int jprim = bas[NPRIM_OF + jsh0*BAS_SLOTS];
+    int kprim = bas[NPRIM_OF + ksh0*BAS_SLOTS];
+    int lprim = bas[NPRIM_OF + lsh0*BAS_SLOTS];
+    int nfi = (li+1)*(li+2)/2;
+    int nfj = (lj+1)*(lj+2)/2;
+    int nfk = (lk+1)*(lk+2)/2;
+    int nfl = (ll+1)*(ll+2)/2;
+    int order = li + lj + lk + ll;
+    int nroots = (order + 1) / 2 + 1;
+    double omega = env[PTR_RANGE_OMEGA];
+    if (omega < 0) { // SR ERIs
+        nroots *= 2;
+    }
+    int stride_j = li + 2;
+    int stride_k = stride_j * (lj + 1);
+    int stride_l = stride_k * (lk + 2);
+    int g_size = stride_l * (ll + 1);
+    BoundsInfo bounds = {li, lj, lk, ll, nfi, nfj, nfk, nfl,
+        nroots, stride_j, stride_k, stride_l, g_size,
+        iprim, jprim, kprim, lprim,
+        npairs_ij, npairs_kl, pair_ij_mapping, pair_kl_mapping,
+        q_cond, s_estimator, dm_cond, cutoff};
+
+    if (n_dm == 1) { // RHF
+        k_factor *= .5;
+    }
+    // *4 for the symmetry (i,j) = (j,i), (k,l) = (l,k) in J contraction
+    // Additional factor 1/2 from the two-electron Coulomb operator
+    JKEnergy jk = {ejk, dm, 2.*j_factor, -k_factor, n_dm, omega, 0, 1};
+
+    cudaDeviceProp prop;
+    cudaGetDeviceProperties(&prop, 0);
+    int workers = prop.multiProcessorCount;
+    int *head = (int *)(pool + workers * QUEUE_DEPTH);
+    cudaMemset(head, 0, sizeof(int));
+
+    if (1){//!rys_ejk_strain_deriv_unrolled(envs, &jk, &bounds, pool, dd_pool)) {
+        int quartets_per_block = scheme[0];
+        int gout_stride = scheme[1];
+        int ij_prims = iprim * jprim;
+        dim3 threads(quartets_per_block, gout_stride);
+        int buflen = (nroots*2 + g_size*3 + 6) * quartets_per_block;
+        int reserved_shm_size = MAX(buflen, 6*gout_stride*quartets_per_block);
+        buflen = (reserved_shm_size + ij_prims)*sizeof(double);
+
+        rys_ejk_strain_deriv_kernel<<<workers, threads, buflen>>>(
+                *envs, jk, bounds, sigma, bas_mask_idx, Ts_ji_lookup,
+                nimgs, nimgs_uniq_pair, nbas_cell0, nao,
+                pool, dd_pool, head, reserved_shm_size);
+    }
+    cudaError_t err = cudaGetLastError();
+    if (err != cudaSuccess) {
+        fprintf(stderr, "CUDA Error in PBC_jk_strain_deriv, li,lj,lk,ll = %d,%d,%d,%d, error message = %s\n",
+                li,lj,lk,ll, cudaGetErrorString(err));
+        return 1;
+    }
+    return 0;
+}
+
 int PBC_build_jk_ip1_init(int shm_size)
 {
     cudaFuncSetAttribute(rys_ejk_ip1_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, shm_size);
+    cudaFuncSetAttribute(rys_ejk_strain_deriv_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, shm_size);
     cudaError_t err = cudaGetLastError();
     if (err != cudaSuccess) {
         fprintf(stderr, "Failed to set CUDA shm size %d: %s\n", shm_size,
diff --git a/gpu4pyscf/lib/pbc/rys_contract_k.cu b/gpu4pyscf/lib/pbc/rys_contract_k.cu
index f8265774d..d0c5efc02 100644
--- a/gpu4pyscf/lib/pbc/rys_contract_k.cu
+++ b/gpu4pyscf/lib/pbc/rys_contract_k.cu
@@ -589,7 +589,7 @@ static size_t threads_scheme_for_k(dim3& threads, BoundsInfo &bounds,
 
 extern "C" {
 int PBC_build_k(double *vk, double *dm, int n_dm, int nao,
-                RysIntEnvVars envs, int *shls_slice, int shm_size,
+                RysIntEnvVars *envs, int *shls_slice, int shm_size,
                 int npairs_ij, int npairs_kl,
                 uint32_t *pair_ij_mapping, uint32_t *pair_kl_mapping,
                 int *bas_mask_idx, int *Ts_ji_lookup, int nimgs, int nimgs_uniq_pair,
@@ -642,7 +642,7 @@ int PBC_build_k(double *vk, double *dm, int n_dm, int nao,
     int *head = (int *)(pool + workers * QUEUE_DEPTH);
     cudaMemset(head, 0, sizeof(int));
 
-    if (1){//!rys_k_unrolled(&envs, &kmat, &bounds, pool)) {
+    if (1){//!rys_k_unrolled(envs, &kmat, &bounds, pool)) {
         GXYZOffset* p_gxyz_offset = RYS_make_gxyz_offset(bounds);
         int gout_pattern = (((li == 0) >> 3) |
                             ((lj == 0) >> 2) |
@@ -654,7 +654,7 @@ int PBC_build_k(double *vk, double *dm, int n_dm, int nao,
         int reserved_shm_size = (buflen - cart_idx_size*4)/8;
 
         rys_k_kernel<<<workers, threads, buflen>>>(
-            envs, kmat, bounds, bas_mask_idx, Ts_ji_lookup,
+            *envs, kmat, bounds, bas_mask_idx, Ts_ji_lookup,
             nimgs, nimgs_uniq_pair, nbas_cell0, nao,
             pool, head, p_gxyz_offset, gout_pattern, reserved_shm_size);
 
@@ -664,7 +664,7 @@ int PBC_build_k(double *vk, double *dm, int n_dm, int nao,
                                           min(256, n_tiles-256));
             int reserved_shm_size = (buflen - cart_idx_size*4)/8;
             rys_k_kernel<<<workers, threads, buflen>>>(
-                envs, kmat, bounds, bas_mask_idx, Ts_ji_lookup,
+                *envs, kmat, bounds, bas_mask_idx, Ts_ji_lookup,
                 nimgs, nimgs_uniq_pair, nbas_cell0, nao,
                 pool, head, p_gxyz_offset+256, gout_pattern, reserved_shm_size);
         }
@@ -674,7 +674,7 @@ int PBC_build_k(double *vk, double *dm, int n_dm, int nao,
                                           min(256, n_tiles-512));
             int reserved_shm_size = (buflen - cart_idx_size*4)/8;
             rys_k_kernel<<<workers, threads, buflen>>>(
-                envs, kmat, bounds, bas_mask_idx, Ts_ji_lookup,
+                *envs, kmat, bounds, bas_mask_idx, Ts_ji_lookup,
                 nimgs, nimgs_uniq_pair, nbas_cell0, nao,
                 pool, head, p_gxyz_offset+512, gout_pattern, reserved_shm_size);
         }
diff --git a/gpu4pyscf/lib/pbc/supmol_sr_estimator.cu b/gpu4pyscf/lib/pbc/supmol_sr_estimator.cu
index 33b67ce94..57ee9c26a 100644
--- a/gpu4pyscf/lib/pbc/supmol_sr_estimator.cu
+++ b/gpu4pyscf/lib/pbc/supmol_sr_estimator.cu
@@ -114,7 +114,7 @@ void filter_q_cond_by_distance_kernel(float *q_cond, float *s_estimator, RysIntE
 }
 
 extern "C" {
-int filter_q_cond_by_distance(float *q_cond, float *s_estimator, RysIntEnvVars envs,
+int filter_q_cond_by_distance(float *q_cond, float *s_estimator, RysIntEnvVars *envs,
                               float *diffuse_exps_per_atom, float *s_max_per_atom,
                               float log_cutoff, int natm_cell0, int nbas)
 {
@@ -123,7 +123,7 @@ int filter_q_cond_by_distance(float *q_cond, float *s_estimator, RysIntEnvVars e
     dim3 blocks(sh_blocks, sh_blocks);
     int buflen = natm_cell0 * 3 * sizeof(float);
     filter_q_cond_by_distance_kernel<<<blocks, threads, buflen>>>(
-        q_cond, s_estimator, envs, diffuse_exps_per_atom, s_max_per_atom,
+        q_cond, s_estimator, *envs, diffuse_exps_per_atom, s_max_per_atom,
         log_cutoff, natm_cell0);
 
     cudaError_t err = cudaGetLastError();
diff --git a/gpu4pyscf/pbc/df/aft_jk.py b/gpu4pyscf/pbc/df/aft_jk.py
index d04269c22..97934861d 100644
--- a/gpu4pyscf/pbc/df/aft_jk.py
+++ b/gpu4pyscf/pbc/df/aft_jk.py
@@ -32,7 +32,7 @@
 from gpu4pyscf.pbc.tools.k2gamma import kpts_to_kmesh
 from gpu4pyscf.pbc.df.ft_ao import FTOpt, libpbc
 from gpu4pyscf.pbc.df.fft_jk import _format_dms, _format_jks, _ewald_exxdiv_for_G0
-from gpu4pyscf.lib.cupy_helper import contract, get_avail_mem
+from gpu4pyscf.lib.cupy_helper import contract, get_avail_mem, asarray
 from gpu4pyscf.lib import logger
 from gpu4pyscf.scf.jk import apply_coeff_C_mat_CT, SHM_SIZE
 
@@ -213,11 +213,11 @@ def get_k_kpts(mydf, dm_kpts, hermi=1, kpts=None, kpts_band=None,
 
     for group_id, (kpt, ki_idx, kj_idx, self_conj) \
             in enumerate(kk_adapted_iter(cell, kpts)):
-        vkcoulG = mydf.weighted_coulG(kpt, exxdiv, mesh)
+        vkcoulG = mydf.weighted_coulG(kpt, exxdiv, mesh) * weight
         for p0, p1 in lib.prange(0, ngrids, Gblksize):
             log.debug3('update_vk [%s:%s]', p0, p1)
             Gpq = ft_kern(Gv[p0:p1], kpt, kpts)
-            update_vk(vk_kpts, Gpq, dms, vkcoulG[p0:p1] * weight, ki_idx, kj_idx,
+            update_vk(vk_kpts, Gpq, dms, vkcoulG[p0:p1], ki_idx, kj_idx,
                       not self_conj, k_to_compute, t_rev_pairs)
             Gpq = None
         cpu1 = log.timer_debug1(f'get_k_kpts group {group_id}', *cpu1)
@@ -361,9 +361,9 @@ def get_ej_ip1(mydf, dm, kpts=None):
     blksize = min(blksize, ngrids, 16384)
 
     kpt_allow = np.zeros(3)
-    coulG = mydf.weighted_coulG()
+    wcoulG = mydf.weighted_coulG()
 
-    bas_ij_idx, bas_ij_img_idx, shl_pair_offsets = _screen_shl_pairs(ft_opt)
+    bas_ij_idx, bas_ij_img_idx, shl_pair_offsets = _generate_shl_pairs(ft_opt)
     nbatches_shl_pair = len(shl_pair_offsets) - 1
     aft_envs = ft_opt.aft_envs
 
@@ -389,7 +389,7 @@ def get_ej_ip1(mydf, dm, kpts=None):
         Gpq = ft_kern(Gv[p0:p1], kpt_allow, kpts, transform_ao=False)
         Gpq = Gpq.transpose(0,2,3,1)
         vG[:nGv] = contract('kji,kijg->g', dms, Gpq).conj()
-        vG[:nGv] *= coulG[p0:p1]
+        vG[:nGv] *= wcoulG[p0:p1]
         GvT[:3*nGv].set(Gv[p0:p1].T.ravel())
         Gpq = None
         err = kern(
@@ -449,7 +449,7 @@ def get_ek_ip1(mydf, dm, kpts=None, exxdiv=None):
     blksize = max(16, int(avail_mem/(nao**2*bvk_ncells*16*2))//16*16)
     blksize = min(blksize, ngrids, 16384)
 
-    bas_ij_idx, bas_ij_img_idx, shl_pair_offsets = _screen_shl_pairs(ft_opt)
+    bas_ij_idx, bas_ij_img_idx, shl_pair_offsets = _generate_shl_pairs(ft_opt)
     nbatches_shl_pair = len(shl_pair_offsets) - 1
     aft_envs = ft_opt.aft_envs
 
@@ -530,7 +530,7 @@ def get_ek_ip1(mydf, dm, kpts=None, exxdiv=None):
     log.timer_debug1('get_ek_ip1', *cpu0)
     return ek.get()
 
-def _screen_shl_pairs(ft_opt):
+def _generate_shl_pairs(ft_opt):
     img_idx_cache = ft_opt.make_img_idx_cache(permutation_symmetry=True)
     bas_ij_idx = []
     bas_ij_img_idx = []
@@ -551,6 +551,291 @@ def _screen_shl_pairs(ft_opt):
     shl_pair_offsets = cp.hstack(shl_pair_offsets, dtype=np.int32)
     return bas_ij_idx, bas_ij_img_idx, shl_pair_offsets
 
+def get_ej_strain_deriv(mydf, dm, kpts=None):
+    '''Strain derivatives from Coulomb matrix'''
+    from gpu4pyscf.pbc.grad import rks_stress
+    log = logger.new_logger(mydf)
+    cell = mydf.cell
+    if kpts is None:
+        kpts = np.zeros((1,3))
+    else:
+        kpts = kpts.reshape(-1, 3)
+    is_gamma_point = is_zero(kpts)
+    dms = _format_dms(dm, kpts)
+    n_dm, nkpts, nao = dms.shape[:3]
+    assert nkpts == len(kpts)
+    if n_dm == 2:
+        dms = dms[0] + dms[1]
+    elif n_dm > 1:
+        raise NotImplementedError
+
+    ft_opt = FTOpt(cell, kpts).build()
+    ft_kern = ft_opt.gen_ft_kernel()
+    sorted_cell = ft_opt.sorted_cell
+    dms = cp.asarray(dms.reshape(-1,nao,nao))
+    dms = apply_coeff_C_mat_CT(dms, cell, sorted_cell, ft_opt.uniq_l_ctr,
+                               ft_opt.l_ctr_offsets, ft_opt.ao_idx)
+    if is_gamma_point:
+        dms_bvkcell = cp.asarray(dms.real, order='C')
+    else:
+        expLk = cp.exp(1j*cp.asarray(ft_opt.bvkmesh_Ls).dot(cp.asarray(kpts).T))
+        dms_bvkcell = contract('Lk,kpq->Lpq', expLk, dms)
+        assert abs(dms_bvkcell.imag).max() < 1e-6
+        dms_bvkcell = cp.asarray(dms_bvkcell.real, order='C')
+        expLk = None
+
+    bvk_ncells = np.prod(ft_opt.bvk_kmesh)
+    nao = ft_opt.sorted_cell.nao
+    Gv = cell.get_Gv(mydf.mesh)
+    ngrids = len(Gv)
+    # memory buffer required by ft_kern
+    avail_mem = get_avail_mem() * .8
+    blksize = max(16, int(avail_mem/(nao**2*bvk_ncells*16*2))//16*16)
+    blksize = min(blksize, ngrids, 16384)
+
+    kpt_allow = np.zeros(3)
+    coulG_0, coulG_1 = rks_stress._get_coulG_strain_derivatives(cell, Gv)
+    coulG_0 = asarray(coulG_0)
+    coulG_1 = asarray(coulG_1)
+    weight_0 = 1/cell.vol
+    weight_1 = -1/cell.vol * cp.eye(3)
+    wcoulG_0 = weight_0 * coulG_0
+    # wcoulG_1 includes two terms, weight_0*coulG_1 + weight_1*coulG_0
+    wcoulG_1 = weight_0 * coulG_1
+    wcoulG_1 += weight_1[:,:,None] * coulG_0
+
+    bas_ij_idx, bas_ij_img_idx, shl_pair_offsets = _generate_shl_pairs(ft_opt)
+    nbatches_shl_pair = len(shl_pair_offsets) - 1
+    aft_envs = ft_opt.aft_envs
+
+    lmax = ft_opt.uniq_l_ctr[:,0].max()
+    ls = np.arange(lmax+1)
+    gx_len = (ls[:,None]+2)*(ls+1) * 6*32
+    nsp_per_block = np.ones_like(gx_len)
+    for m in [2, 4, 8]:
+        nsp_per_block[(gx_len + 6)*m*8 < SHM_SIZE] = m
+    shm_size = (nsp_per_block * (gx_len + 6)).max() * 8
+
+    log.debug('bas_ij_idx=%d nbatches=%d shm_size=%d blksize=%d',
+              len(bas_ij_idx), nbatches_shl_pair, shm_size, blksize)
+
+    kern = libpbc.PBC_ft_aopair_ej_strain_deriv
+    vG = cp.zeros(blksize+256, dtype=np.complex128)
+    GvT = cp.zeros(3*blksize+256)
+    ej = cp.zeros((cell.natm, 3))
+    sigma = cp.zeros((3, 3))
+    for p0, p1 in lib.prange(0, ngrids, blksize):
+        nGv = p1 - p0
+        # TODO: Gpq are transformed to the k-points adapted representation in
+        # gen_ft_kernel. This transfomration can be skipped.
+        Gpq = ft_kern(Gv[p0:p1], kpt_allow, kpts, transform_ao=False)
+        Gpq = Gpq.transpose(0,2,3,1)
+        rhoG = contract('kji,kijg->g', dms, Gpq)
+        sigma += .25*cp.einsum('xyg,g,g->xy', wcoulG_1[:,:,p0:p1], rhoG.conj(), rhoG).real
+
+        vG[:nGv] = rhoG.conj()
+        vG[:nGv] *= wcoulG_0[p0:p1]
+        GvT[:3*nGv].set(Gv[p0:p1].T.ravel())
+        Gpq = None
+        err = kern(
+            ctypes.cast(ej.data.ptr, ctypes.c_void_p),
+            ctypes.cast(sigma.data.ptr, ctypes.c_void_p),
+            ctypes.cast(dms_bvkcell.data.ptr, ctypes.c_void_p),
+            ctypes.cast(vG.data.ptr, ctypes.c_void_p),
+            ctypes.cast(GvT.data.ptr, ctypes.c_void_p),
+            ctypes.byref(aft_envs),
+            ctypes.c_int(nbatches_shl_pair),
+            ctypes.c_int(nGv),
+            ctypes.c_int(shm_size),
+            ctypes.cast(bas_ij_idx.data.ptr, ctypes.c_void_p),
+            ctypes.cast(bas_ij_img_idx.data.ptr, ctypes.c_void_p),
+            ctypes.cast(shl_pair_offsets.data.ptr, ctypes.c_void_p),
+            sorted_cell._atm.ctypes, ctypes.c_int(sorted_cell.natm),
+            sorted_cell._bas.ctypes, ctypes.c_int(sorted_cell.nbas),
+            sorted_cell._env.ctypes)
+        if err != 0:
+            raise RuntimeError('PBC_ft_aopair_ej_strain_deriv failed')
+    if nkpts != 1:
+        ej /= nkpts
+    sigma *= 2 / nkpts**2
+    return sigma.get()
+
+def get_ek_strain_deriv(mydf, dm, kpts=None, exxdiv=None, omega=None):
+    '''Strain derivatives from exact exchange'''
+    from gpu4pyscf.pbc.grad import rks_stress
+    log = logger.new_logger(mydf)
+    cpu0 = cpu1 = log.init_timer()
+    if kpts is None:
+        kpts = np.zeros((1,3))
+    else:
+        kpts = kpts.reshape(-1, 3)
+    is_gamma_point = is_zero(kpts)
+    cell = mydf.cell
+    dm0 = _format_dms(dm, kpts)
+    n_dm, nkpts, nao = dm0.shape[:3]
+    assert nkpts == len(kpts)
+    if n_dm > 2:
+        raise NotImplementedError
+
+    ft_opt = FTOpt(cell, kpts).build()
+    ft_kern = ft_opt.gen_ft_kernel()
+    sorted_cell = ft_opt.sorted_cell
+    dms = cp.asarray(dm0.reshape(-1,nao,nao))
+    dms = apply_coeff_C_mat_CT(dms, cell, sorted_cell, ft_opt.uniq_l_ctr,
+                               ft_opt.l_ctr_offsets, ft_opt.ao_idx)
+    nao = dms.shape[-1]
+    dms = dms.reshape(n_dm,nkpts,nao,nao)
+
+    if not is_gamma_point:
+        expLk = cp.exp(1j*cp.asarray(ft_opt.bvkmesh_Ls).dot(cp.asarray(kpts).T))
+
+    bvk_ncells = np.prod(ft_opt.bvk_kmesh)
+    Gv = cell.get_Gv(mydf.mesh)
+    ngrids = len(Gv)
+    # memory buffer required by ft_kern
+    avail_mem = get_avail_mem() * .8
+    blksize = max(16, int(avail_mem/(nao**2*bvk_ncells*16*2))//16*16)
+    blksize = min(blksize, ngrids, 16384)
+
+    bas_ij_idx, bas_ij_img_idx, shl_pair_offsets = _generate_shl_pairs(ft_opt)
+    nbatches_shl_pair = len(shl_pair_offsets) - 1
+    aft_envs = ft_opt.aft_envs
+
+    lmax = ft_opt.uniq_l_ctr[:,0].max()
+    ls = np.arange(lmax+1)
+    gx_len = (ls[:,None]+2)*(ls+1) * 6*32
+    nsp_per_block = np.ones_like(gx_len)
+    for m in [2, 4, 8]:
+        nsp_per_block[(gx_len + 3)*m*8 < SHM_SIZE] = m
+    shm_size = (nsp_per_block * (gx_len + 3)).max() * 8
+
+    log.debug('bas_ij_idx=%d nbatches=%d shm_size=%d blksize=%d',
+              len(bas_ij_idx), nbatches_shl_pair, shm_size, blksize)
+
+    kern = libpbc.PBC_ft_aopair_ek_strain_deriv
+    GvT = cp.zeros(3*blksize+256)
+    ek = cp.zeros((cell.natm, 3))
+    sigma = cp.zeros((3, 3))
+    for group_id, (kpt, ki_idx, kj_idx, self_conj) \
+            in enumerate(kk_adapted_iter(cell, kpts)):
+        Gvk = Gv + kpt
+        coulG_0, coulG_1 = rks_stress._get_coulG_strain_derivatives(
+            cell, Gvk, omega=omega, remove_G0=is_zero(kpt))
+        coulG_0 = asarray(coulG_0)
+        coulG_1 = asarray(coulG_1)
+        weight_0 = 1/cell.vol
+        weight_1 = -1/cell.vol * cp.eye(3)
+        wcoulG_0 = weight_0 * coulG_0
+        wcoulG_1 = weight_0 * coulG_1
+        wcoulG_1 += weight_1[:,:,None] * coulG_0
+
+        swap_2e = not self_conj
+        for p0, p1 in lib.prange(0, ngrids, blksize):
+            nGv = p1 - p0
+            Gpq = ft_kern(Gv[p0:p1], kpt, kpts, transform_ao=False)
+            Gpq = Gpq.transpose(0,2,3,1)
+            Gpq_conj = Gpq.conj()
+            # Gpq.conj() can be computed equivalently as
+            #Gpq_conj = ft_kern(-Gv[p0:p1], -kpt, -kpts, transform_ao=False)
+            #Gpq_conj = Gpq_conj.transpose(0,2,3,1)
+
+            if is_gamma_point:
+                tmp = contract('sjk,lkg->sjlg', dms[:,0], Gpq_conj[0])
+                dm_vG = contract('sjlg,sli->jig', tmp, dms[:,0])
+                vkG = cp.einsum('pqg,qpg->g', dm_vG, Gpq[0]).real
+                sigma += .25*cp.einsum('xyg,g->xy', wcoulG_1[:,:,p0:p1], vkG)
+            else:
+                # einsum(nijG[kj_idx],jk[kj_idx],nlkG*[kj_idx],li[ki_idx])
+                # apply derivatives to nlkG*
+                #:tmp = contract('nijg,snjk->snikg', Gpq[kj_idx], dms[:,kj_idx])
+                #:tmp = contract('snikg,snli->nklg', tmp, dms[:,ki_idx])
+                #:dm_vG = contract('Lk,kpqg->Lpqg', expLk[:,kj_idx].conj(), tmp).conj()
+                #:if swap_2e:
+                # apply derivatives to nijG. This term is equivalent to the
+                # derivatives of nlkG*.
+                #:    tmp = contract('snjk,nlkg->snjlg', dms[:,kj_idx], Gpq.conj()[kj_idx])
+                #:    tmp = contract('snjlg,snli->njig', tmp, dms[:,ki_idx])
+                #:    dm_vG += contract('Lk,kpqg->Lpqg', expLk[:,kj_idx], tmp)
+                idx = np.empty_like(ki_idx)
+                idx[kj_idx] = ki_idx
+                dm_k = contract('snjk,nlkg->snjlg', dms, Gpq_conj)
+                dm_k = contract('snjlg,snli->njig', dm_k, dms[:,idx])
+                dm_vG = contract('Lk,kpqg->Lpqg', expLk, dm_k)
+
+                vkG = cp.einsum('njig,nijg->g', dm_k, Gpq).real
+                tmp = .25*cp.einsum('xyg,g->xy', wcoulG_1[:,:,p0:p1], vkG)
+                if swap_2e:
+                    sigma += tmp * 2
+                else:
+                    sigma += tmp
+
+            if swap_2e:
+                dm_vG *= wcoulG_0[p0:p1] * 2
+            else:
+                dm_vG *= wcoulG_0[p0:p1]
+            dm_vG = cp.asarray(dm_vG, order='C')
+
+            GvT[:3*nGv].set(Gvk[p0:p1].T.ravel())
+            err = kern(
+                ctypes.cast(ek.data.ptr, ctypes.c_void_p),
+                ctypes.cast(sigma.data.ptr, ctypes.c_void_p),
+                ctypes.cast(dm_vG.data.ptr, ctypes.c_void_p),
+                ctypes.cast(GvT.data.ptr, ctypes.c_void_p),
+                ctypes.byref(aft_envs),
+                ctypes.c_int(nbatches_shl_pair),
+                ctypes.c_int(nGv),
+                ctypes.c_int(shm_size),
+                ctypes.cast(bas_ij_idx.data.ptr, ctypes.c_void_p),
+                ctypes.cast(bas_ij_img_idx.data.ptr, ctypes.c_void_p),
+                ctypes.cast(shl_pair_offsets.data.ptr, ctypes.c_void_p),
+                sorted_cell._atm.ctypes, ctypes.c_int(sorted_cell.natm),
+                sorted_cell._bas.ctypes, ctypes.c_int(sorted_cell.nbas),
+                sorted_cell._env.ctypes)
+            Gpq_conj = tmp = dm_vG = None
+            if err != 0:
+                raise RuntimeError('PBC_ft_aopair_ek_strain_deriv failed')
+        cpu1 = log.timer_debug1(f'get_k_kpts group {group_id}', *cpu1)
+    if nkpts != 1:
+        ek /= nkpts
+    sigma *= 2 / nkpts**2
+    sigma = sigma.get()
+
+    if (exxdiv == 'ewald' and
+        (cell.dimension == 3 or
+         (cell.dimension == 2 and cell.low_dim_ft_type != 'inf_vacuum'))):
+        from pyscf.pbc.tools.pbc import madelung
+        from gpu4pyscf.pbc.gto import int1e
+        int1e_opt = int1e._Int1eOpt(cell, kpts)
+        s0 = int1e_opt.intor('PBCint1e_ovlp', 1, 1, (0, 0))
+        k_dm = contract('nkpq,kqr->nkpr', dm0, s0)
+        k_dm = contract('nkpr,nkrs->kps', k_dm, dm0)
+        ek_G0 = cp.einsum('kij,kji->', s0, k_dm).real.get()
+
+        scaled_kpts = kpts.dot(cell.lattice_vectors().T)
+        ewald_G0 = np.empty((3,3))
+        disp = max(1e-5, (cell.precision*.1)**.5)
+        for i in range(3):
+            for j in range(i+1):
+                cell1, cell2 = rks_stress._finite_diff_cells(cell, i, j, disp)
+                kpts1 = scaled_kpts.dot(cell1.reciprocal_vectors(norm_to=1))
+                kpts2 = scaled_kpts.dot(cell2.reciprocal_vectors(norm_to=1))
+                e1 = madelung(cell1, kpts1, omega=omega)
+                e2 = madelung(cell2, kpts2, omega=omega)
+                ewald_G0[j,i] = ewald_G0[i,j] = (e1-e2)/(2*disp)
+        if n_dm == 1: # RHF
+            ewald_G0 *= .5 * ek_G0
+            k_dm *= .5 * madelung(cell, kpts, omega=omega)
+        else:
+            ewald_G0 *= ek_G0
+            k_dm *= madelung(cell, kpts, omega=omega)
+        int1e_opt = int1e._Int1eOptV2(cell)
+        ewald_G0 += int1e_opt.get_ovlp_strain_deriv(k_dm, kpts) * 2
+        ewald_G0 /= nkpts
+        sigma += ewald_G0
+
+    log.timer_debug1('get_ek_ip1', *cpu0)
+    return sigma
+
 ##################################################
 #
 # Single k-point
diff --git a/gpu4pyscf/pbc/df/ft_ao.py b/gpu4pyscf/pbc/df/ft_ao.py
index 976690357..183b933a5 100644
--- a/gpu4pyscf/pbc/df/ft_ao.py
+++ b/gpu4pyscf/pbc/df/ft_ao.py
@@ -91,8 +91,8 @@ def ft_ao(cell, Gv, shls_slice=None, b=None,
     ao_loc_cpu = sorted_cell.ao_loc
     ao_loc_gpu = cp.array(ao_loc_cpu)
     envs = PBCIntEnvVars(
-        sorted_cell.natm, sorted_cell.nbas, 1, 1, _atm.data.ptr,
-        _bas.data.ptr, _env.data.ptr, ao_loc_gpu.data.ptr, 0,
+        sorted_cell.natm, sorted_cell.nbas, _atm.data.ptr,
+        _bas.data.ptr, _env.data.ptr, ao_loc_gpu.data.ptr, 1, 1, 0,
     )
     ngrids = len(Gv)
     GvT = (asarray(Gv).T + asarray(kpt[:,None])).ravel()
@@ -504,21 +504,21 @@ def most_diffuse_pgto(cell):
 
 class PBCIntEnvVars(ctypes.Structure):
     _fields_ = [
-        ('cell0_natm', ctypes.c_int),
-        ('cell0_nbas', ctypes.c_int),
-        ('bvk_ncells', ctypes.c_int),
-        ('nimgs', ctypes.c_int),
+        ('natm', ctypes.c_int),
+        ('nbas', ctypes.c_int),
         ('atm', ctypes.c_void_p),
         ('bas', ctypes.c_void_p),
         ('env', ctypes.c_void_p),
         ('ao_loc', ctypes.c_void_p),
+        ('bvk_ncells', ctypes.c_int),
+        ('nimgs', ctypes.c_int),
         ('img_coords', ctypes.c_void_p),
     ]
 
     @classmethod
     def new(cls, natm, nbas, ncells, nimgs, atm, bas, env, ao_loc, Ls):
-        obj = PBCIntEnvVars(natm, nbas, ncells, nimgs, atm.data.ptr, bas.data.ptr,
-                            env.data.ptr, ao_loc.data.ptr, Ls.data.ptr)
+        obj = PBCIntEnvVars(natm, nbas, atm.data.ptr, bas.data.ptr, env.data.ptr,
+                            ao_loc.data.ptr, ncells, nimgs, Ls.data.ptr)
         # Keep a reference to these arrays, prevent releasing them upon returning
         obj._env_ref_holder = (atm, bas, env, ao_loc, Ls)
         obj._device = cp.cuda.device.get_device_id()
diff --git a/gpu4pyscf/pbc/df/rsdf_builder.py b/gpu4pyscf/pbc/df/rsdf_builder.py
index 89c0d6235..aafa8cefe 100644
--- a/gpu4pyscf/pbc/df/rsdf_builder.py
+++ b/gpu4pyscf/pbc/df/rsdf_builder.py
@@ -429,8 +429,8 @@ def _int3c2e_overlap_mask(int3c2e_opt, cutoff):
     _bas = cp.array(pcell._bas)
     _env = cp.array(_scale_sp_ctr_coeff(pcell))
     int3c2e_envs = PBCIntEnvVars(
-        pcell.natm, p_nbas, 1, nimgs, _atm.data.ptr, _bas.data.ptr,
-        _env.data.ptr, 0, Ls.data.ptr,
+        pcell.natm, p_nbas, _atm.data.ptr, _bas.data.ptr,
+        _env.data.ptr, 0, 1, nimgs, Ls.data.ptr,
     )
     err = libpbc.bvk_overlap_img_counts(
         ctypes.cast(ovlp_img_counts.data.ptr, ctypes.c_void_p),
diff --git a/gpu4pyscf/pbc/df/tests/test_pbc_aft.py b/gpu4pyscf/pbc/df/tests/test_pbc_aft.py
index 5522ac591..71088675c 100644
--- a/gpu4pyscf/pbc/df/tests/test_pbc_aft.py
+++ b/gpu4pyscf/pbc/df/tests/test_pbc_aft.py
@@ -14,6 +14,7 @@
 
 import unittest
 import numpy as np
+import cupy as cp
 import pyscf
 from pyscf import lib
 from pyscf.pbc import gto as pgto
@@ -22,6 +23,8 @@
 from gpu4pyscf.pbc.df import aft, aft_jk
 from gpu4pyscf.pbc.df import fft
 from gpu4pyscf.lib.cupy_helper import tag_array
+from gpu4pyscf.pbc.grad import rks_stress
+from gpu4pyscf.pbc.grad import krks_stress
 from packaging import version
 
 def setUpModule():
@@ -362,6 +365,116 @@ def test_ek_ip1_kpts(self):
                 ref[i] = np.einsum('xkpq,kqp->x', vk[:,:,p0:p1], dm[:,:,p0:p1]).real
             assert abs(ek_ewald - ref).max() < 1e-8
 
+    def test_ej_strain_deriv_gamma_point(self):
+        cell = pgto.M(
+            atom = '''
+            C   1.      1.    0.
+            H   4.      0.    3.
+            H   0.      1.    .6
+            ''',
+            a=np.eye(3)*4.,
+            basis=[[0, [.25, 1]], [1, [.3, 1]]],
+        )
+        np.random.seed(9)
+        nao = cell.nao
+        dm = np.random.rand(nao, nao) * .5
+        dm = dm.dot(dm.T)
+        mydf = aft.AFTDF(cell)
+        sigma = aft_jk.get_ej_strain_deriv(mydf, dm)
+
+        xc = 'lda,'
+        mf_grad = cell.RKS(xc=xc).to_gpu().Gradients()
+        ref = rks_stress.get_vxc(mf_grad, cell, dm, with_j=True, with_nuc=False)
+        ref -= rks_stress.get_vxc(mf_grad, cell, dm, with_j=False, with_nuc=False)
+        assert abs(ref - sigma).max() < 1e-8
+
+    def test_ej_strain_deriv_kpts(self):
+        cell = pgto.M(
+            atom = '''
+            C   1.      1.    0.
+            H   4.      0.    3.
+            H   0.      1.    .6
+            ''',
+            a=np.eye(3)*4.,
+            basis=[[0, [.25, 1]], [1, [.3, 1]]],
+        )
+        kmesh = [3,2,1]
+        kpts = cell.make_kpts(kmesh)
+        nkpts = len(kpts)
+        dm = cp.asarray(cell.pbc_intor('int1e_ovlp', kpts=kpts))
+        mydf = aft.AFTDF(cell)
+        sigma = aft_jk.get_ej_strain_deriv(mydf, dm, kpts)
+
+        xc = 'lda,'
+        mf_grad = cell.KRKS(xc=xc, kpts=kpts).to_gpu().Gradients()
+        ref = krks_stress.get_vxc(mf_grad, cell, dm, kpts=kpts, with_j=True, with_nuc=False)
+        ref -= krks_stress.get_vxc(mf_grad, cell, dm, kpts=kpts, with_j=False, with_nuc=False)
+        assert abs(ref - sigma).max() < 1e-8
+
+        for (i, j) in [(0, 0), (0, 1), (1, 2), (2, 1), (2, 2)]:
+            cell1, cell2 = rks_stress._finite_diff_cells(cell, i, j, disp=1e-4)
+            mydf = aft.AFTDF(cell1, kpts=cell1.make_kpts(kmesh))
+            vj = aft_jk.get_j_kpts(mydf, dm, hermi=1, kpts=mydf.kpts)
+            e1 = cp.einsum('kij,kji->', vj, dm).real / nkpts
+            mydf = aft.AFTDF(cell2, kpts=cell2.make_kpts(kmesh))
+            vj = aft_jk.get_j_kpts(mydf, dm, hermi=1, kpts=mydf.kpts)
+            e2 = cp.einsum('kij,kji->', vj, dm).real / nkpts
+            assert abs(sigma[i,j] - (e1-e2)/2e-4) < 1e-7
+
+    def test_ek_strain_deriv_gamma_point(self):
+        cell = pgto.M(
+            atom = '''
+            C   1.      1.    0.
+            H   4.      0.5   3.
+            H   0.5     1.    .6
+            ''',
+            a=np.eye(3)*4.,
+            basis=[[0, [.25, 1]], [1, [.3, 1]]],
+        )
+        np.random.seed(9)
+        nao = cell.nao
+        dm = np.random.rand(nao, nao) * .5
+        dm = cp.array(dm.dot(dm.T))
+        mydf = aft.AFTDF(cell)
+        sigma = aft_jk.get_ek_strain_deriv(mydf, dm)
+
+        for (i, j) in [(0, 0), (0, 1), (1, 2), (2, 1), (2, 2)]:
+            cell1, cell2 = rks_stress._finite_diff_cells(cell, i, j, disp=1e-4)
+            mydf = aft.AFTDF(cell1)
+            vk = aft_jk.get_jk(mydf, dm, hermi=1, with_j=False, exxdiv=None)[1]
+            e1 = .5 * cp.einsum('ij,ji->', vk, dm).real
+            mydf = aft.AFTDF(cell2)
+            vk = aft_jk.get_jk(mydf, dm, hermi=1, with_j=False, exxdiv=None)[1]
+            e2 = .5 * cp.einsum('ij,ji->', vk, dm).real
+            assert abs(sigma[i, j] - (e1-e2)/2e-4).max() < 2e-7
+
+    def test_ek_strain_deriv_kpts(self):
+        cell = pgto.M(
+            atom = '''
+            C   1.      1.    0.
+            H   4.      0.5   3.
+            H   0.5     1.    .6
+            ''',
+            a=np.eye(3)*4.,
+            basis=[[0, [.25, 1]], [1, [.3, 1]]],
+        )
+        kmesh = [1,3,1]
+        kpts = cell.make_kpts(kmesh)
+        nkpts = len(kpts)
+        dm = cp.asarray(cell.pbc_intor('int1e_ovlp', kpts=kpts))
+        mydf = aft.AFTDF(cell)
+        sigma = aft_jk.get_ek_strain_deriv(mydf, dm, kpts, exxdiv='ewald')
+
+        for (i, j) in [(0, 0), (0, 1), (1, 2), (2, 1), (2, 2)]:
+            cell1, cell2 = rks_stress._finite_diff_cells(cell, i, j, disp=1e-4)
+            mydf = aft.AFTDF(cell1, kpts=cell1.make_kpts(kmesh))
+            vk = aft_jk.get_k_kpts(mydf, dm, hermi=1, kpts=mydf.kpts, exxdiv='ewald')
+            e1 = .5 * cp.einsum('kij,kji->', vk, dm).real / nkpts
+            mydf = aft.AFTDF(cell2, kpts=cell2.make_kpts(kmesh))
+            vk = aft_jk.get_k_kpts(mydf, dm, hermi=1, kpts=mydf.kpts, exxdiv='ewald')
+            e2 = .5 * cp.einsum('kij,kji->', vk, dm).real / nkpts
+            assert abs(sigma[i, j] - (e1-e2)/2e-4).max() < 5e-7
+
 if __name__ == '__main__':
     print("Full Tests for aft")
     unittest.main()
diff --git a/gpu4pyscf/pbc/grad/rks_stress.py b/gpu4pyscf/pbc/grad/rks_stress.py
index 3528e4ba8..d3906e6cb 100644
--- a/gpu4pyscf/pbc/grad/rks_stress.py
+++ b/gpu4pyscf/pbc/grad/rks_stress.py
@@ -90,14 +90,28 @@ def _finite_diff_cells(cell, x, y, disp=1e-4, precision=None):
         cell2.build(False, False)
     return cell1, cell2
 
-def _get_coulG_strain_derivatives(cell, Gv):
+def _get_coulG_strain_derivatives(cell, Gv, omega=None, remove_G0=True):
     '''derivatives of 4pi/G^2'''
     Gv = asarray(Gv)
     G2 = cp.einsum('gx,gx->g', Gv, Gv)
-    G2[0] = np.inf
+    if remove_G0:
+        G2[0] = np.inf
     coulG_0 = 4 * np.pi / G2
-    coulG_1 = cp.einsum('gx,gy->xyg', Gv, Gv)
-    coulG_1 *= coulG_0 * 2/G2
+    if omega is None:
+        omega = cell.omega
+    coulGxy = cp.einsum('gx,gy->xyg', Gv, Gv)
+    coulGxy *= coulG_0
+    coulG_1 = coulGxy * 2/G2
+    if omega < 0:
+        coulG_1 *= (1 - cp.exp(-.25/omega**2 * G2))
+        coulG_1 -= cp.exp(-.25/omega**2 * G2) * (.25/omega**2*2) * coulGxy
+        coulG_0 *= (1 - cp.exp(-.25/omega**2 * G2))
+        #coulG_0[0] = np.pi/omega**2
+    elif omega > 0:
+        coulG_1 *= cp.exp(-.25/omega**2 * G2)
+        coulG_1 += cp.exp(-.25/omega**2 * G2) * (.25/omega**2*2) * coulGxy
+        coulG_0 *= cp.exp(-.25/omega**2 * G2)
+        #coulG_0[0] = -np.pi/omega**2
     return coulG_0, coulG_1
 
 def _get_weight_strain_derivatives(cell, grids):
@@ -276,25 +290,25 @@ def partial_dot(bra, ket):
     rho0, rho1 = rho0_fft_order, rho1_fft_order
 
     exc, vxc = ni.eval_xc_eff(xc_code, rho0, 1, xctype=xctype, spin=0)[:2]
-    out += contract('xyng,ng->xy', rho1, vxc).real.get() * weight_0
-    out += contract('g,g->', rho0[0], exc.ravel()).real.get() * weight_1
+    out += cp.einsum('xyng,ng->xy', rho1, vxc).real.get() * weight_0
+    out += cp.einsum('g,g->', rho0[0], exc.ravel()).real.get() * weight_1
 
     Gv = cell.get_Gv(mesh)
     coulG_0, coulG_1 = _get_coulG_strain_derivatives(cell, Gv)
     rhoG = pbctools.fft(rho0[0], mesh)
     if with_j:
         vR = pbctools.ifft(rhoG * coulG_0, mesh)
-        EJ = contract('xyg,g->xy', rho1[:,:,0], vR).real.get() * weight_0 * 2
-        EJ += contract('g,g->', rho0[0], vR).real.get() * weight_1
-        EJ += contract('xyg,g->xy', coulG_1, rhoG.conj()*rhoG).real.get() * (weight_0/ngrids)
+        EJ = cp.einsum('xyg,g->xy', rho1[:,:,0], vR).real.get() * weight_0 * 2
+        EJ += cp.einsum('g,g->', rho0[0], vR).real.get() * weight_1
+        EJ += cp.einsum('xyg,g,g->xy', coulG_1, rhoG.conj(), rhoG).real.get() * (weight_0/ngrids)
         out += .5 * EJ
 
     if with_nuc:
         if cell._pseudo:
             vpplocG_0, vpplocG_1 = _get_vpplocG_strain_derivatives(cell, mesh)
             vpplocR = pbctools.ifft(vpplocG_0, mesh).real
-            Ene = contract('xyg,g->xy', rho1[:,:,0], vpplocR).real.get()
-            Ene += contract('g,xyg->xy', rhoG.conj(), vpplocG_1).real.get() * (1./ngrids)
+            Ene = cp.einsum('xyg,g->xy', rho1[:,:,0], vpplocR).real.get()
+            Ene += cp.einsum('g,xyg->xy', rhoG.conj(), vpplocG_1).real.get() * (1./ngrids)
             Ene += _get_pp_nonloc_strain_derivatives(cell, mesh, dm)
         else:
             charge = -cell.atom_charges()
@@ -304,8 +318,8 @@ def partial_dot(bra, ket):
             SI = cell.get_SI(mesh=mesh)
             ZG = asarray(np.dot(charge, SI))
             vR = pbctools.ifft(ZG * coulG_0, mesh).real
-            Ene = contract('xyg,g->xy', rho1[:,:,0], vR).real.get()
-            Ene += contract('xyg,g->xy', coulG_1, rhoG.conj()*ZG).real.get() * (1./ngrids)
+            Ene = cp.einsum('xyg,g->xy', rho1[:,:,0], vR).real.get()
+            Ene += cp.einsum('xyg,g,g->xy', coulG_1, rhoG.conj(), ZG).real.get() * (1./ngrids)
         out += Ene
     return out
 
diff --git a/gpu4pyscf/pbc/gto/int1e.py b/gpu4pyscf/pbc/gto/int1e.py
index 104661558..100c9237c 100644
--- a/gpu4pyscf/pbc/gto/int1e.py
+++ b/gpu4pyscf/pbc/gto/int1e.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import math
 import ctypes
 import numpy as np
 import cupy as cp
@@ -19,10 +20,13 @@
 from pyscf.pbc import tools as pbctools
 from pyscf.pbc.lib.kpts_helper import is_zero
 from pyscf.pbc.tools.k2gamma import translation_vectors_for_kmesh
+from gpu4pyscf.gto.mole import extract_pgto_params
 from gpu4pyscf.pbc.tools.k2gamma import kpts_to_kmesh
 from gpu4pyscf.lib.cupy_helper import contract, asarray, sandwich_dot
+from gpu4pyscf.lib import multi_gpu
 from gpu4pyscf.gto.mole import group_basis, PTR_BAS_COORD
-from gpu4pyscf.scf.jk import _nearest_power2, _scale_sp_ctr_coeff, SHM_SIZE
+from gpu4pyscf.scf.jk import (
+    _nearest_power2, _scale_sp_ctr_coeff, SHM_SIZE, apply_coeff_C_mat_CT)
 from gpu4pyscf.pbc.df.ft_ao import libpbc, PBCIntEnvVars
 from gpu4pyscf.pbc.df.int3c2e import (
     _estimate_shl_pairs_per_block, fill_triu_bvk_conj, LMAX, L_AUX_MAX, THREADS
@@ -118,7 +122,7 @@ def __init__(self, cell, kpts=None, bvk_kmesh=None):
             sorted_cell.natm, sorted_cell.nbas, bvk_ncells, nimgs,
             _atm, _bas, _env, ao_loc_gpu, Ls)
 
-    def generate_shl_pairs(self, hermi=1):
+    def generate_shl_pairs(self, hermi, gout_stride_lookup):
         sorted_cell = self.sorted_cell
         l_ctr_offsets = np.append(0, np.cumsum(self.l_ctr_counts))
         uniq_l = self.uniq_l_ctr[:,0]
@@ -150,7 +154,7 @@ def generate_shl_pairs(self, hermi=1):
             nshl_pair = len(idx)
             bas_ij_idx.append(idx)
             sp0, sp1 = sp1, sp1 + nshl_pair
-            nsp_per_block = _estimate_shl_pairs_per_block(li, lj, nshl_pair)
+            nsp_per_block = gout_stride_lookup[li, lj] * 8
             shl_pair_offsets.append(np.arange(sp0, sp1, nsp_per_block, dtype=np.int32))
 
         shl_pair_offsets.append(np.int32(sp1))
@@ -167,7 +171,6 @@ def create_gout_stride_lookup_table(self, deriv=None, gout_width=36):
         i_inc, j_inc = deriv
         lmax = self.uniq_l_ctr[:,0].max()
         gout_stride_lookup = np.empty([L_AUX_MAX+1,L_AUX_MAX+1], dtype=np.int32)
-        gout_width = gout_width
         shm_size = SHM_SIZE
         ls = np.arange(lmax+1)
         nf = (ls+1) * (ls+2) // 2
@@ -192,15 +195,15 @@ def intor(self, kern, hermi, comp, deriv_ij):
             gout_width = 18
 
         sorted_cell = self.sorted_cell
-        int1e_envs = self.int1e_envs
-        nao_cart, nao = self.coeff.shape
-        bas_ij_idx, shl_pair_offsets = self.generate_shl_pairs(hermi)
-        nbatches_shl_pair = len(shl_pair_offsets) - 1
         gout_stride_lookup, shm_size = self.create_gout_stride_lookup_table(
             deriv_ij, gout_width)
+        bas_ij_idx, shl_pair_offsets = self.generate_shl_pairs(hermi, gout_stride_lookup)
+        nbatches_shl_pair = len(shl_pair_offsets) - 1
         bvk_kmesh = self.bvk_kmesh
         bvk_ncells = np.prod(bvk_kmesh)
+        nao_cart, nao = self.coeff.shape
         out = cp.empty((bvk_ncells, comp, nao_cart, nao_cart))
+        int1e_envs = self.int1e_envs
         drv = getattr(libpbc, kern)
         err = drv(
             ctypes.cast(out.data.ptr, ctypes.c_void_p),
@@ -232,3 +235,166 @@ def intor(self, kern, hermi, comp, deriv_ij):
         if self.kpts is not None and self.kpts.ndim == 1:
             out = out[0]
         return out
+
+class _Int1eOptV2:
+    def __init__(self, cell):
+        self.cell = cell
+        cell, ao_idx, l_ctr_pad_counts, uniq_l_ctr, l_ctr_counts = group_basis(
+            cell, 1, sparse_coeff=True)
+        lmax = uniq_l_ctr[:,0].max()
+        assert lmax <= L_AUX_MAX
+        self.sorted_cell = cell
+        self.uniq_l_ctr = uniq_l_ctr
+        self.l_ctr_counts = l_ctr_counts
+        self.l_ctr_pad_counts = l_ctr_pad_counts
+        self.ao_idx = ao_idx
+
+        if isinstance(cell, Mole):
+            # The CUDA code for PBC integrals can be made to support Mole instances.
+            Ls = np.zeros((1, 3))
+            self.precision = 1e-16
+        else:
+            Ls = cell.get_lattice_Ls()
+            Ls = Ls[np.linalg.norm(Ls-.1, axis=1).argsort()]
+            self.precision = cell.precision * 1e-4
+        self.Ls = Ls
+        self._int1e_envs = {}
+
+    @multi_gpu.property(cache='_int1e_envs')
+    def int1e_envs(self):
+        cell = self.sorted_cell
+        atm = asarray(cell._atm)
+        bas = asarray(cell._bas)
+        env = asarray(_scale_sp_ctr_coeff(cell))
+        ao_loc = asarray(cell.ao_loc)
+        Ls = asarray(self.Ls)
+        nimgs = len(Ls)
+        return PBCIntEnvVars.new(cell.natm, cell.nbas, nimgs, nimgs, atm, bas, env, ao_loc, Ls)
+
+    def generate_shl_pairs(self, hermi=1, gout_stride_lookup=None):
+        sorted_cell = self.sorted_cell
+        Ls = asarray(self.Ls)
+        nimgs = len(Ls)
+        nbas = sorted_cell.nbas
+        ovlp_mask = cp.zeros((nbas,nimgs,nbas), dtype=bool)
+        exps, cs = extract_pgto_params(sorted_cell, 'diffuse')
+        exps = cp.asarray(exps, dtype=np.float32)
+        log_coeff = cp.log(abs(asarray(cs, dtype=np.float32)))
+        int1e_envs = self.int1e_envs
+        libpbc.PBCovlp_mask_estimation(
+            ctypes.cast(ovlp_mask.data.ptr, ctypes.c_void_p),
+            ctypes.cast(exps.data.ptr, ctypes.c_void_p),
+            ctypes.cast(log_coeff.data.ptr, ctypes.c_void_p),
+            ctypes.byref(int1e_envs), ctypes.c_int(hermi),
+            ctypes.c_float(math.log(self.precision)))
+        pairs_idx = cp.arange(ovlp_mask.size, dtype=np.int32)
+        pairs_idx = pairs_idx.reshape(ovlp_mask.shape)
+
+        l_ctr_offsets = np.append(0, np.cumsum(self.l_ctr_counts))
+        uniq_l = self.uniq_l_ctr[:,0]
+        bas_ij_idx = [] # The effective shell pair = ish*nbas+jsh
+        shl_pair_offsets = [] # the bas_ij_idx offset for each blockIdx.x
+        sp0 = sp1 = 0
+        nbas = sorted_cell.nbas
+        groups = len(uniq_l)
+        if hermi == 1:
+            ij_tasks = [(i, j) for i in range(groups) for j in range(i+1)]
+        else:
+            ij_tasks = [(i, j) for i in range(groups) for j in range(groups)]
+        for i, j in ij_tasks:
+            li = uniq_l[i]
+            lj = uniq_l[j]
+            ish0, ish1 = l_ctr_offsets[i], l_ctr_offsets[i+1]
+            jsh0, jsh1 = l_ctr_offsets[j], l_ctr_offsets[j+1]
+            mask = ovlp_mask[ish0:ish1,:,jsh0:jsh1]
+            idx = pairs_idx[ish0:ish1,:,jsh0:jsh1][mask]
+            nshl_pair = len(idx)
+            bas_ij_idx.append(idx)
+            sp0, sp1 = sp1, sp1 + nshl_pair
+            if gout_stride_lookup is None:
+                nsp_per_block = 512
+            else:
+                nsp_per_block = gout_stride_lookup[li, lj] * 8
+            shl_pair_offsets.append(np.arange(sp0, sp1, nsp_per_block, dtype=np.int32))
+
+        shl_pair_offsets.append(np.int32(sp1))
+        shl_pair_offsets = cp.array(np.hstack(shl_pair_offsets), dtype=np.int32)
+        bas_ij_idx = cp.array(cp.hstack(bas_ij_idx), dtype=np.int32)
+        return bas_ij_idx, shl_pair_offsets
+
+    def create_gout_stride_lookup_table(self, deriv=None, gout_width=36):
+        # gout_width should be identical to the setting in cuda kernel
+        # based on the shm_size, find optimal gout_stride for each (li,lj)
+        # pattern, store them in the gout_stride_lookup
+        if deriv is None:
+            deriv = (0, 0)
+        i_inc, j_inc = deriv
+        lmax = self.uniq_l_ctr[:,0].max()
+        gout_stride_lookup = np.empty([L_AUX_MAX+1,L_AUX_MAX+1], dtype=np.int32)
+        shm_size = SHM_SIZE
+        ls = np.arange(lmax+1)
+        nf = (ls+1) * (ls+2) // 2
+        max_shm_size = 0
+        for li in range(lmax+1):
+            for lj in range(lmax+1):
+                unit = (li+1+i_inc)*(lj+1+j_inc)*3 + 4
+                nsp_max = _nearest_power2(shm_size // (unit*8))
+                gout_size = nf[li] * nf[lj]
+                gout_stride = (gout_size+gout_width-1) / gout_width
+                # Round up to the next 2^n
+                gout_stride = _nearest_power2(gout_stride, return_leq=False)
+                nsp_per_block = min(nsp_max, THREADS // gout_stride)
+                gout_stride_lookup[li, lj] = THREADS // nsp_per_block
+                max_shm_size = max(max_shm_size, nsp_per_block*unit*8)
+        return cp.array(gout_stride_lookup, dtype=np.int32), max_shm_size
+
+    def get_ovlp_strain_deriv(self, dm, kpts=None):
+        cell = self.cell
+        sorted_cell = self.sorted_cell
+        nao_orig = cell.nao
+        dm = asarray(dm, order='C')
+        dm = dm.reshape(-1,nao_orig,nao_orig)
+        l_ctr_offsets = np.append(0, np.cumsum(self.l_ctr_counts))
+        dm = apply_coeff_C_mat_CT(dm, cell, sorted_cell, self.uniq_l_ctr,
+                                  l_ctr_offsets, self.ao_idx)
+        if kpts is None:
+            kpts = np.zeros((1, 3))
+        else:
+            kpts = kpts.reshape(-1, 3)
+        nkpts = len(kpts)
+        nao = dm.shape[-1]
+        dm = dm.reshape(-1, nkpts, nao, nao)
+        if len(dm) == 1:
+            dm = dm[0]
+        else:
+            dm = dm.sum(axis=0)
+
+        is_gamma_point = is_zero(kpts)
+        if is_gamma_point:
+            assert dm.dtype == np.float64
+        else:
+            expLk = cp.exp(1j * asarray(self.Ls).dot(asarray(kpts).T))
+            dm = contract('Lk,kpq->Lpq', expLk, dm)
+            expLk = None
+            dm = dm.real
+        dm = cp.asarray(dm, order='C')
+
+        hermi = 1
+        deriv = (1, 0)
+        gout_stride_lookup, shm_size = self.create_gout_stride_lookup_table(deriv)
+        bas_ij_idx, shl_pair_offsets = self.generate_shl_pairs(hermi, gout_stride_lookup)
+        nbatches_shl_pair = len(shl_pair_offsets) - 1
+        int1e_envs = self.int1e_envs
+        sigma = cp.zeros((3, 3))
+        libpbc.PBCovlp_strain_deriv(
+            ctypes.cast(sigma.data.ptr, ctypes.c_void_p),
+            ctypes.cast(dm.data.ptr, ctypes.c_void_p),
+            ctypes.byref(int1e_envs),
+            ctypes.c_int(shm_size),
+            ctypes.c_int(nbatches_shl_pair),
+            ctypes.cast(shl_pair_offsets.data.ptr, ctypes.c_void_p),
+            ctypes.cast(bas_ij_idx.data.ptr, ctypes.c_void_p),
+            ctypes.cast(gout_stride_lookup.data.ptr, ctypes.c_void_p),
+            ctypes.c_int(is_gamma_point))
+        sigma *= 2
+        return sigma.get()
diff --git a/gpu4pyscf/pbc/gto/tests/test_pbc_int1e.py b/gpu4pyscf/pbc/gto/tests/test_pbc_int1e.py
index 6e35df5a7..c29f18d45 100644
--- a/gpu4pyscf/pbc/gto/tests/test_pbc_int1e.py
+++ b/gpu4pyscf/pbc/gto/tests/test_pbc_int1e.py
@@ -14,6 +14,8 @@
 
 import numpy as np
 import pyscf
+from pyscf.gto import intor_cross
+from pyscf.pbc.tools import pbc as pbctools
 from gpu4pyscf.pbc.gto import int1e
 
 def test_int1e_ovlp():
@@ -128,17 +130,7 @@ def test_int1e_ovlp1():
                atom = [['He', (L/2.-.5,L/2.,L/2.-.5)],
                        ['He', (L/2.   ,L/2.,L/2.+.5)]],
                basis = { 'He': [[0, (0.8, 1.0)],
-                                [0, (1.0, 1.0)],
                                 [0, (1.2, 1.0)]]})
-    nk = [2, 2, 1]
-    kpts = cell.make_kpts(nk, wrap_around=True)
-    s = int1e.int1e_ovlp(cell, kpts)
-    ref = cell.pbc_intor('int1e_ovlp', kpts=kpts)
-    assert abs(s.get() - ref).max() < 1e-10
-    k = int1e.int1e_kin(cell, kpts)
-    ref = cell.pbc_intor('int1e_kin', kpts=kpts)
-    assert abs(k.get() - ref).max() < 1e-10
-
     nk = [5, 4, 1]
     kpts = cell.make_kpts(nk, wrap_around=True)[[3, 8, 11]]
     s = int1e.int1e_ovlp(cell, kpts)
@@ -148,4 +140,48 @@ def test_int1e_ovlp1():
     ref = cell.pbc_intor('int1e_kin', kpts=kpts)
     assert abs(k.get() - ref).max() < 1e-10
 
-test_int1e_ovlp1()
+def test_ovlp_stress_tensor():
+    a = np.eye(3) * 6.
+    np.random.seed(5)
+    a += np.random.rand(3, 3) - .5
+    cell = pyscf.M(
+        atom='He 1 1 1; He 2 1.5 2.4',
+        basis=[[0, [.5, 1]],
+               [1, [1.5, 1], [.5, 1]],
+               [2, [.8, 1]],
+               [3, [.7, 1]]
+              ], a=a, unit='Bohr')
+
+    Ls = cell.get_lattice_Ls()
+    Ls = Ls[np.argsort(np.linalg.norm(Ls-.1, axis=1))]
+    scell = cell.copy()
+    scell = pbctools._build_supcell_(scell, cell, Ls)
+
+    aoslices = cell.aoslice_by_atom()
+    ao_repeats = aoslices[:,3] - aoslices[:,2]
+    bas_coords = np.repeat(cell.atom_coords(), ao_repeats, axis=0)
+
+    nao = cell.nao
+    dm = np.random.rand(nao, nao) - .5
+    dm = dm.dot(dm.T)
+    ovlp10 = cell.pbc_intor('int1e_ipovlp')
+    ovlp10 = np.einsum('xij,iy->xyij', ovlp10, bas_coords)
+    sc_ovlp01 = intor_cross('int1e_ipovlp', scell, cell)
+    ovlp01 = np.einsum('xnji,njy->xyij', sc_ovlp01.reshape(3,-1,nao,nao), bas_coords+Ls[:,None])
+    ref = -(ovlp10 + ovlp01)
+    ref = np.einsum('xyij,ji->xy', ref, dm)
+    dat = int1e._Int1eOptV2(cell).get_ovlp_strain_deriv(dm)
+    assert abs(dat - ref).max() < 1e-9
+
+    nk = [5, 4, 1]
+    kpts = cell.make_kpts(nk)
+    dm = np.array(cell.pbc_intor('int1e_ovlp', kpts=kpts))
+    ovlp10 = np.array(cell.pbc_intor('int1e_ipovlp', kpts=kpts))
+    ovlp10 = np.einsum('kxij,iy->xykij', ovlp10, bas_coords)
+    expLk = np.exp(1j*Ls.dot(kpts.T))
+    ovlp01 = np.einsum('xnji,njy->xyinj', sc_ovlp01.reshape(3,-1,nao,nao), bas_coords+Ls[:,None])
+    ovlp01 = np.einsum('xyiLj,Lk->xykij', ovlp01, expLk, optimize=True)
+    ref = ovlp01 + ovlp10
+    ref = -np.einsum('xykij,kji->xy', ref, dm).real
+    dat = int1e._Int1eOptV2(cell).get_ovlp_strain_deriv(dm, kpts=kpts)
+    assert abs(dat - ref).max() < 1e-9
diff --git a/gpu4pyscf/pbc/scf/j_engine.py b/gpu4pyscf/pbc/scf/j_engine.py
index 4e6c0a5ee..2ced645d3 100644
--- a/gpu4pyscf/pbc/scf/j_engine.py
+++ b/gpu4pyscf/pbc/scf/j_engine.py
@@ -319,7 +319,7 @@ def proc(dm_xyz):
                     ctypes.c_int(n_dm),
                     ctypes.c_int(dm_xyz_size),
                     ctypes.c_int(nimgs_uniq_pair),
-                    rys_envs, (ctypes.c_int*6)(*scheme),
+                    ctypes.byref(rys_envs), (ctypes.c_int*6)(*scheme),
                     (ctypes.c_int*8)(*shls_slice),
                     ctypes.c_int(npairs_ij), ctypes.c_int(npairs_kl),
                     ctypes.cast(pair_ij_mapping.data.ptr, ctypes.c_void_p),
diff --git a/gpu4pyscf/pbc/scf/rsjk.py b/gpu4pyscf/pbc/scf/rsjk.py
index 0b1867042..4be86ad67 100644
--- a/gpu4pyscf/pbc/scf/rsjk.py
+++ b/gpu4pyscf/pbc/scf/rsjk.py
@@ -50,6 +50,7 @@
 
 libpbc.PBC_build_k.restype = ctypes.c_int
 libpbc.PBC_build_k_init(ctypes.c_int(SHM_SIZE))
+libpbc.PBC_build_jk_ip1_init(ctypes.c_int(SHM_SIZE))
 
 DD_CACHE_MAX = 101250 * (SHM_SIZE//48000)
 OMEGA = 0.3
@@ -204,7 +205,7 @@ def _filter_q_cond(self, supmol, q_cond, s_estimator, rys_envs, cutoff):
         libpbc.filter_q_cond_by_distance(
             ctypes.cast(q_cond.data.ptr, ctypes.c_void_p),
             ctypes.cast(s_estimator.data.ptr, ctypes.c_void_p),
-            rys_envs,
+            ctypes.byref(rys_envs),
             ctypes.cast(diffuse_exps_per_atom.data.ptr, ctypes.c_void_p),
             ctypes.cast(s_max_per_atom.data.ptr, ctypes.c_void_p),
             ctypes.c_float(math.log(cutoff)),
@@ -382,7 +383,7 @@ def proc(dms, dm_cond):
                     ctypes.cast(vk.data.ptr, ctypes.c_void_p),
                     ctypes.cast(dms.data.ptr, ctypes.c_void_p),
                     ctypes.c_int(n_dm), ctypes.c_int(nao),
-                    rys_envs, (ctypes.c_int*8)(*shls_slice),
+                    ctypes.byref(rys_envs), (ctypes.c_int*8)(*shls_slice),
                     ctypes.c_int(SHM_SIZE),
                     ctypes.c_int(npairs_ij), ctypes.c_int(npairs_kl),
                     ctypes.cast(pair_ij_mapping.data.ptr, ctypes.c_void_p),
@@ -541,6 +542,11 @@ def _get_ejk_sr_ip1(self, dm, kpts=None, exxdiv=None,
         #:dms = cp.einsum('pi,nij,qj->npq', self.coeff, dms, self.coeff)
         dms = apply_coeff_C_mat_CT(dms, cell, sorted_cell, self.uniq_l_ctr,
                                    self.l_ctr_offsets, self.ao_idx)
+        # Symmetrize density matrices because 8-fold symmetry is utilized when
+        # computing integrals. Fold the contribution of the upper triangular
+        # part of the density matrices into the lower triangular part.
+        dms = transpose_sum(dms)
+        dms *= .5
 
         if kpts is None:
             kpts = np.zeros((1, 3))
@@ -575,7 +581,6 @@ def _get_ejk_sr_ip1(self, dm, kpts=None, exxdiv=None,
         log_cutoff = math.log(cutoff)
 
         libpbc.PBC_per_atom_jk_ip1.restype = ctypes.c_int
-        libpbc.PBC_build_jk_ip1_init(ctypes.c_int(SHM_SIZE))
 
         uniq_l_ctr = self.uniq_l_ctr
         uniq_l = uniq_l_ctr[:,0]
@@ -638,7 +643,7 @@ def proc(dms, dm_cond):
                     ctypes.c_double(j_factor), ctypes.c_double(k_factor),
                     ctypes.cast(dms.data.ptr, ctypes.c_void_p),
                     ctypes.c_int(n_dm), ctypes.c_int(nao),
-                    rys_envs, (ctypes.c_int*2)(*scheme),
+                    ctypes.byref(rys_envs), (ctypes.c_int*2)(*scheme),
                     (ctypes.c_int*8)(*shls_slice),
                     ctypes.c_int(npairs_ij), ctypes.c_int(npairs_kl),
                     ctypes.cast(pair_ij_mapping.data.ptr, ctypes.c_void_p),
@@ -742,6 +747,219 @@ def _get_ejk_lr_ip1(self, dm, kpts=None, exxdiv=None,
             ek *= k_factor
         return ej - ek
 
+    def _get_jk_sr_strain_deriv(self, dm, kpts=None, exxdiv=None,
+                        j_factor=1., k_factor=1., verbose=None):
+        '''Compute the derivatives of the short-range part of the aggregated
+        J/K contribution. The aggregated J/K contribution is given by
+        j_factor - k_factor / 2.
+        '''
+        log = logger.new_logger(self, verbose)
+        cell = self.cell
+        assert cell.dimension == 3
+        sorted_cell = self.sorted_cell
+        nao_orig = cell.nao
+        nao = sorted_cell.nao
+        supmol = self.supmol
+
+        dm = asarray(dm, order='C')
+        dms = dm.reshape(-1,nao_orig,nao_orig)
+        #:dms = cp.einsum('pi,nij,qj->npq', self.coeff, dms, self.coeff)
+        dms = apply_coeff_C_mat_CT(dms, cell, sorted_cell, self.uniq_l_ctr,
+                                   self.l_ctr_offsets, self.ao_idx)
+        # Symmetrize density matrices because 8-fold symmetry is utilized when
+        # computing integrals. Fold the contribution of the upper triangular
+        # part of the density matrices into the lower triangular part.
+        dms = transpose_sum(dms)
+        dms *= .5
+
+        if kpts is None:
+            kpts = np.zeros((1, 3))
+        else:
+            kpts = kpts.reshape(-1, 3)
+        is_gamma_point = is_zero(kpts)
+        if is_gamma_point:
+            assert dms.dtype == np.float64
+            nkpts = 1
+            ao_loc = asarray(sorted_cell.ao_loc)
+            dms = cp.asarray(dms, order='C')
+            dm_cond = condense('absmax', dms, ao_loc)
+            # Add the dimension for kpts
+            dms = dms[:,None,:,:]
+        else:
+            scaled_kpts = kpts.dot(cell.lattice_vectors().T)
+            Ts = cp.asarray(supmol.double_latsum_Ts, dtype=np.float64)
+            expLk = cp.exp(1j * Ts.dot(asarray(scaled_kpts).T))
+            nkpts = expLk.shape[1]
+            dms = dms.reshape(-1, nkpts, nao, nao)
+            dms = contract('skpq,Lk->sLpq', dms, expLk)
+            # Are dms always real for super-mol?
+            assert abs(dms.imag).max() < 1e-6
+            expLk = None
+            dms = dms.real
+            dms = cp.asarray(dms, order='C')
+            dm_cond = _dm_cond_from_compressed_dm(supmol, dms)
+        dm_cond = cp.log(dm_cond + 1e-300).astype(np.float32)
+        n_dm = len(dms)
+        assert n_dm <= 2
+        cutoff = self.estimate_cutoff_with_penalty(cell.precision**.5*1e-2)
+        log_cutoff = math.log(cutoff)
+
+        libpbc.PBC_jk_strain_deriv.restype = ctypes.c_int
+
+        uniq_l_ctr = self.uniq_l_ctr
+        uniq_l = uniq_l_ctr[:,0]
+        l_ctr_bas_loc = self.l_ctr_offsets
+        l_symb = [lib.param.ANGULAR[i] for i in uniq_l]
+        n_groups = np.count_nonzero(uniq_l <= LMAX)
+
+        tasks = ((i,j,k,l)
+                 for i in range(n_groups)
+                 for j in range(i+1)
+                 for k in range(i+1)
+                 for l in range(k+1))
+
+        def proc(dms, dm_cond):
+            device_id = cp.cuda.device.get_device_id()
+            stream = cp.cuda.stream.get_current_stream()
+            log = logger.new_logger(cell, verbose)
+            t0 = log.init_timer()
+            dms = cp.asarray(dms)
+            dm_cond = cp.asarray(dm_cond)
+
+            q_cond = cp.asarray(self.q_cond)
+            s_estimator = cp.asarray(self.s_estimator)
+            pair_ij_mappings = _make_pair_ij_mappings(
+                supmol, l_ctr_bas_loc, q_cond, log_cutoff, tile=6)
+            pair_kl_mappings = _make_tril_pair_mappings(
+                supmol, l_ctr_bas_loc, q_cond, log_cutoff, tile=6)
+            bas_mask_idx = cp.asarray(supmol.bas_mask_idx)
+            nimgs = len(supmol.Ls)
+            if is_gamma_point:
+                Ts_ji_lookup = cp.zeros_like(supmol.Ts_ji_lookup)
+                nimgs_uniq_pair = 1
+            else:
+                Ts_ji_lookup = cp.asarray(supmol.Ts_ji_lookup)
+                nimgs_uniq_pair = len(supmol.double_latsum_Ts)
+            ejk = cp.zeros((cell.natm, 3))
+            sigma = cp.zeros((3, 3))
+
+            t1 = log.timer_debug1(f'q_cond and dm_cond on Device {device_id}', *t0)
+            workers = gpu_specs['multiProcessorCount']
+            pool = cp.empty(workers*QUEUE_DEPTH+1, dtype=np.uint32)
+            dd_pool = cp.empty((workers, DD_CACHE_MAX), dtype=np.float64)
+
+            timing_counter = Counter()
+            kern_counts = 0
+            kern = libpbc.PBC_jk_strain_deriv
+            rys_envs = self.rys_envs
+
+            for task in tasks:
+                i, j, k, l = task
+                shls_slice = l_ctr_bas_loc[[i, i+1, j, j+1, k, k+1, l, l+1]]
+                pair_ij_mapping = pair_ij_mappings[i,j]
+                pair_kl_mapping = pair_kl_mappings[k,l]
+                npairs_ij = pair_ij_mapping.size
+                npairs_kl = pair_kl_mapping.size
+                if npairs_ij == 0 or npairs_kl == 0:
+                    continue
+                scheme = _ejk_quartets_scheme(supmol, uniq_l_ctr[[i, j, k, l]])
+                err = kern(
+                    ctypes.cast(ejk.data.ptr, ctypes.c_void_p),
+                    ctypes.c_double(j_factor), ctypes.c_double(k_factor),
+                    ctypes.cast(sigma.data.ptr, ctypes.c_void_p),
+                    ctypes.cast(dms.data.ptr, ctypes.c_void_p),
+                    ctypes.c_int(n_dm), ctypes.c_int(nao),
+                    ctypes.byref(rys_envs), (ctypes.c_int*2)(*scheme),
+                    (ctypes.c_int*8)(*shls_slice),
+                    ctypes.c_int(npairs_ij), ctypes.c_int(npairs_kl),
+                    ctypes.cast(pair_ij_mapping.data.ptr, ctypes.c_void_p),
+                    ctypes.cast(pair_kl_mapping.data.ptr, ctypes.c_void_p),
+                    ctypes.cast(bas_mask_idx.data.ptr, ctypes.c_void_p),
+                    ctypes.cast(Ts_ji_lookup.data.ptr, ctypes.c_void_p),
+                    ctypes.c_int(nimgs), ctypes.c_int(nimgs_uniq_pair),
+                    ctypes.cast(q_cond.data.ptr, ctypes.c_void_p),
+                    ctypes.cast(s_estimator.data.ptr, ctypes.c_void_p),
+                    ctypes.cast(dm_cond.data.ptr, ctypes.c_void_p),
+                    ctypes.c_float(log_cutoff),
+                    ctypes.cast(pool.data.ptr, ctypes.c_void_p),
+                    ctypes.cast(dd_pool.data.ptr, ctypes.c_void_p),
+                    ctypes.c_int(sorted_cell.nbas),
+                    supmol._atm.ctypes, ctypes.c_int(supmol.natm),
+                    supmol._bas.ctypes, ctypes.c_int(supmol.nbas),
+                    supmol._env.ctypes)
+                llll = f'({l_symb[i]}{l_symb[j]}|{l_symb[k]}{l_symb[l]})'
+                if err != 0:
+                    raise RuntimeError(f'PBC_jk_strain_deriv kernel for {llll} failed')
+                if log.verbose >= logger.DEBUG1:
+                    ntasks = npairs_ij * npairs_kl
+                    msg = f'processing {llll} on Device {device_id} tasks ~= {ntasks}'
+                    t1, t1p = log.timer_debug1(msg, *t1), t1
+                    timing_counter[llll] += t1[1] - t1p[1]
+                    kern_counts += 1
+                if num_devices > 1:
+                    stream.synchronize()
+            return ejk, sigma, kern_counts, timing_counter
+
+        results = multi_gpu.run(proc, args=(dms, dm_cond), non_blocking=True)
+
+        kern_counts = 0
+        timing_collection = Counter()
+        ejk_dist = []
+        sigma_dist = []
+        for ejk, sigma, counts, t_counter in results:
+            kern_counts += counts
+            timing_collection += t_counter
+            ejk_dist.append(ejk)
+            sigma_dist.append(sigma)
+
+        log = logger.new_logger(cell, verbose)
+        if log.verbose >= logger.DEBUG1:
+            log.debug1('kernel launches %d', kern_counts)
+            for llll, t in timing_collection.items():
+                log.debug1('%s wall time %.2f', llll, t)
+
+        ejk = multi_gpu.array_reduce(ejk_dist, inplace=True)
+        sigma = multi_gpu.array_reduce(sigma_dist, inplace=True)
+        sigma *= 2
+        sigma = sigma.get()
+
+        if ((cell.dimension == 3 or
+             (cell.dimension == 2 and cell.low_dim_ft_type != 'inf_vacuum'))):
+            # difference associated to the G=0 term between the real space
+            # integrals and the AFT integrals
+            dm0 = dm.reshape(n_dm, nkpts, nao_orig, nao_orig)
+            omega = self.omega
+            wcoulG_SR_at_G0 = np.pi / omega**2 / cell.vol
+            if exxdiv == 'ewald':
+                wcoulG_for_k = probe_charge_sr_coulomb(cell, omega, kpts)
+            else:
+                wcoulG_for_k = wcoulG_SR_at_G0
+            int1e_opt = int1e._Int1eOpt(cell, kpts)
+            s0 = int1e_opt.intor('PBCint1e_ovlp', 1, 1, (0, 0))
+            s1 = int1e_opt.intor('PBCint1e_ipovlp', 0, 3, (1, 0))
+            s_dm = cp.einsum('kij,nkji->', s0, dm0)
+            j_dm = dm0.sum(axis=0) * (j_factor * s_dm * wcoulG_SR_at_G0)
+            k_dm = contract('nkpq,kqr->nkpr', dm0, s0)
+            k_dm = contract('nkpr,nkrs->kps', k_dm, dm0)
+            if n_dm == 1: # RHF
+                k_dm *= .5 * k_factor * wcoulG_for_k
+            else:
+                k_dm *= k_factor * wcoulG_for_k
+            aoslices = cell.aoslice_by_atom()
+            for i, (p0, p1) in enumerate(aoslices[:,2:]):
+                ejk[i] += cp.einsum('kxpq,kqp->x', s1[:,:,p0:p1], j_dm[:,:,p0:p1]).real
+                ejk[i] -= cp.einsum('kxpq,kqp->x', s1[:,:,p0:p1], k_dm[:,:,p0:p1]).real
+
+            int1e_opt = int1e._Int1eOptV2(cell)
+            sigma -= int1e_opt.get_ovlp_strain_deriv(j_dm, kpts)
+            sigma += .5 * cp.einsum('kij,kji->', s0, j_dm).real.get() * np.eye(3)
+            sigma += int1e_opt.get_ovlp_strain_deriv(k_dm, kpts)
+            sigma -= .5 * cp.einsum('kij,kji->', s0, k_dm).real.get() * np.eye(3)
+
+        if not is_gamma_point:
+            ejk *= 1. / nkpts
+        return sigma
+
 class ExtendedMole(gto.Mole):
     '''A super-Mole cluster to mimic periodicity within the unit cell'''
     def __init__(self):
diff --git a/gpu4pyscf/pbc/scf/tests/test_pbc_scf_jk.py b/gpu4pyscf/pbc/scf/tests/test_pbc_scf_jk.py
index a9f72becd..342d8798e 100644
--- a/gpu4pyscf/pbc/scf/tests/test_pbc_scf_jk.py
+++ b/gpu4pyscf/pbc/scf/tests/test_pbc_scf_jk.py
@@ -19,6 +19,7 @@
 from pyscf import lib, gto
 from pyscf.pbc.scf.rsjk import RangeSeparationJKBuilder
 from pyscf.pbc.df import fft as fft_cpu
+from pyscf.pbc.tools import pbc as pbctools
 from gpu4pyscf.pbc.df import fft
 from gpu4pyscf.pbc.scf import rsjk
 from gpu4pyscf.pbc.tools.pbc import probe_charge_sr_coulomb
@@ -430,3 +431,97 @@ def test_ejk_ip1_per_atom_kpts():
         p0, p1 = aoslices[i, 2:]
         ref[i] = np.einsum('xkpq,kqp->x', vhf[:,:,p0:p1], dm[:,:,p0:p1]).real
     assert abs(ejk - ref).max() < 5e-6
+
+def test_jk_sr_strain_deriv1():
+    from gpu4pyscf.pbc.grad import rks, rks_stress
+    from gpu4pyscf.pbc.grad import krks, krks_stress
+    a = np.eye(3) * 6.
+    np.random.seed(5)
+    a += np.random.rand(3, 3) - .5
+    cell = pyscf.M(
+        atom='He 1 1 1; He 2 1.5 2.4',
+        basis=[[0, [1.5, 1]]], a=a, unit='Bohr')
+    # gamma point calculations
+    nao = cell.nao
+    dm = np.random.rand(nao, nao) - .5
+    dm = dm.dot(dm.T)
+    with_rsjk = rsjk.PBCJKMatrixOpt(cell).build()
+    sigma = with_rsjk._get_jk_sr_strain_deriv(dm)
+    #sigma_wo_exxdiv = with_rsjk._get_jk_sr_strain_deriv(dm, exxdiv=False)
+
+    Ls = cell.get_lattice_Ls(rcut=cell.rcut+4)
+    Ls = Ls[np.argsort(np.linalg.norm(Ls-.1, axis=1))]
+    scell = cell.copy()
+    scell = pbctools._build_supcell_(scell, cell, Ls)
+    scell.omega = -.3
+    nimgs = len(Ls)
+    aoslices = cell.aoslice_by_atom()
+    ao_repeats = aoslices[:,3] - aoslices[:,2]
+    bas_coords = np.repeat(cell.atom_coords(), ao_repeats, axis=0)
+    sc_eri = scell.intor('int2e_ip1').reshape(3,nimgs,nao,nimgs,nao,nimgs,nao,nimgs,nao)
+    eri1 = np.einsum('xokplinj,oky->xyijkl', sc_eri[:,:,:,:,:,0], bas_coords+Ls[:,None])
+    eri1+= np.einsum('xplokinj,ply->xyijkl', sc_eri[:,:,:,:,:,0], bas_coords+Ls[:,None])
+    eri1+= np.einsum('xinjokpl,iy->xyijkl', sc_eri[:,0], bas_coords)
+    eri1+= np.einsum('xnjiokpl,njy->xyijkl', sc_eri[:,:,:,0], bas_coords+Ls[:,None])
+    ej = -.5 * np.einsum('xyijkl,ji,lk->xy', eri1, dm, dm)
+    ek = -.5 * np.einsum('xyijkl,jk,li->xy', eri1, dm, dm)
+    #ref_wo_exxdiv = ej - ek*.5
+
+    sc_s1 = scell.intor('int1e_ipovlp').reshape(3,nimgs,nao,nimgs,nao)
+    s1 = np.einsum('xmij,miy->xyij', sc_s1[:,:,:,0], bas_coords+Ls[:,None])
+    s1+= np.einsum('xinj,iy->xyij', sc_s1[:,0], bas_coords)
+    s0 = cell.pbc_intor('int1e_ovlp')
+    wcoulG_SR_at_G0 = np.pi/scell.omega**2/cell.vol
+    j_dm = np.einsum('ij,ji->', s0, dm)
+    ej += np.einsum('xyij,ji->xy', s1, dm) * wcoulG_SR_at_G0 * j_dm
+    ej += .5 * j_dm * wcoulG_SR_at_G0 * j_dm * np.eye(3)
+    ek += np.einsum('xyij,jk,kl,li->xy', s1, dm, s0, dm) * wcoulG_SR_at_G0
+    ek += .5 *np.einsum('ij,jk,kl,li->', s0, dm, s0, dm) * wcoulG_SR_at_G0 * np.eye(3)
+    ref = ej - ek*.5
+    assert abs(ref - sigma).max() < 1e-6
+
+    sigma = with_rsjk._get_jk_sr_strain_deriv(dm, k_factor=0)
+    cell.omega = scell.omega
+    xc = 'lda,'
+    mf_grad = cell.RKS(xc=xc).to_gpu().Gradients()
+    ref = rks_stress.get_vxc(mf_grad, cell, dm, with_j=True, with_nuc=False)
+    ref -= rks_stress.get_vxc(mf_grad, cell, dm, with_j=False, with_nuc=False)
+    assert abs(ref - sigma).max() < 1e-7
+
+    # kpts calculations
+    kpts = cell.make_kpts([1,1,1])
+    dm = np.array(cell.pbc_intor('int1e_ovlp', kpts=kpts))
+    sigma = with_rsjk._get_jk_sr_strain_deriv(dm, kpts)
+    #sigma_wo_exxdiv = with_rsjk._get_jk_sr_strain_deriv(dm, exxdiv=False)
+
+    eri1 = np.einsum('xokplinj,oky->xyinjokpl', sc_eri[:,:,:,:,:,0], bas_coords+Ls[:,None])
+    eri1+= np.einsum('xplokinj,ply->xyinjokpl', sc_eri[:,:,:,:,:,0], bas_coords+Ls[:,None])
+    eri1+= np.einsum('xinjokpl,iy->xyinjokpl', sc_eri[:,0], bas_coords)
+    eri1+= np.einsum('xnjiokpl,njy->xyinjokpl', sc_eri[:,:,:,0], bas_coords+Ls[:,None])
+    expLk = np.exp(1j*Ls.dot(kpts.T))
+    eri1 = np.einsum('xyiNjOkPl,Nn,Oo,Pp->xyinjokpl', eri1, expLk, expLk.conj(), expLk, optimize=True)
+    nkpts = len(kpts)
+    ej = -.5 * np.einsum('xyinjokpl,op,nji,plk->xy', eri1, np.eye(nkpts), dm, dm).real
+    ek = -.5 * np.einsum('xyinjokpl,no,ojk,pli->xy', eri1, np.eye(nkpts), dm, dm).real
+    #ref_wo_exxdiv = ej - ek*.5
+
+    sc_s1 = scell.intor('int1e_ipovlp').reshape(3,nimgs,nao,nimgs,nao)
+    s1 = np.einsum('xmij,miy,mk->xykij', sc_s1[:,:,:,0], bas_coords+Ls[:,None], expLk)
+    s1+= np.einsum('xinj,iy,nk->xykij', sc_s1[:,0], bas_coords, expLk)
+    s0 = np.array(cell.pbc_intor('int1e_ovlp', kpts=kpts))
+    wcoulG_SR_at_G0 = np.pi/scell.omega**2/cell.vol
+    j_dm = np.einsum('kij,kji->', s0, dm).real
+    ej += np.einsum('xykij,kji->xy', s1, dm).real * wcoulG_SR_at_G0 * j_dm
+    ej += .5 * j_dm * wcoulG_SR_at_G0 * j_dm * np.eye(3)
+    ek += np.einsum('xytij,tjk,tkl,tli->xy', s1, dm, s0, dm).real * wcoulG_SR_at_G0
+    ek += .5 *np.einsum('tij,tjk,tkl,tli->', s0, dm, s0, dm).real * wcoulG_SR_at_G0 * np.eye(3)
+    ref = ej - ek*.5
+    assert abs(ref - sigma).max() < 2e-5
+
+    sigma = with_rsjk._get_jk_sr_strain_deriv(dm, kpts, k_factor=0)
+    cell.omega = scell.omega
+    xc = 'lda,'
+    mf_grad = cell.KRKS(xc=xc, kpts=kpts).to_gpu().Gradients()
+    ref = krks_stress.get_vxc(mf_grad, cell, dm, kpts=kpts, with_j=True, with_nuc=False)
+    ref -= krks_stress.get_vxc(mf_grad, cell, dm, kpts=kpts, with_j=False, with_nuc=False)
+    assert abs(ref - sigma).max() < 1e-7
diff --git a/gpu4pyscf/scf/j_engine.py b/gpu4pyscf/scf/j_engine.py
index ad11f0c25..46fb82ab2 100644
--- a/gpu4pyscf/scf/j_engine.py
+++ b/gpu4pyscf/scf/j_engine.py
@@ -286,7 +286,7 @@ def proc(dm_xyz):
                     ctypes.cast(vj_xyz.data.ptr, ctypes.c_void_p),
                     ctypes.cast(dm_xyz.data.ptr, ctypes.c_void_p),
                     ctypes.c_int(n_dm), ctypes.c_int(dm_xyz_size),
-                    rys_envs, (ctypes.c_int*6)(*scheme),
+                    ctypes.byref(rys_envs), (ctypes.c_int*6)(*scheme),
                     (ctypes.c_int*8)(*shls_slice),
                     ctypes.c_int(npairs_ij), ctypes.c_int(npairs_kl),
                     ctypes.cast(pair_ij_mapping.data.ptr, ctypes.c_void_p),
diff --git a/gpu4pyscf/scf/jk.py b/gpu4pyscf/scf/jk.py
index 2d56ac03e..7b1091fab 100644
--- a/gpu4pyscf/scf/jk.py
+++ b/gpu4pyscf/scf/jk.py
@@ -607,7 +607,7 @@ def proc(dms, dm_cond):
                         ctypes.cast(vk.data.ptr, ctypes.c_void_p),
                         ctypes.cast(dms.data.ptr, ctypes.c_void_p),
                         ctypes.c_int(n_dm), ctypes.c_int(nao),
-                        rys_envs, (ctypes.c_int*8)(*shls_slice),
+                        ctypes.byref(rys_envs), (ctypes.c_int*8)(*shls_slice),
                         ctypes.c_int(SHM_SIZE),
                         ctypes.c_int(npairs_ij), ctypes.c_int(_npairs_kl),
                         ctypes.cast(pair_ij_mapping.data.ptr, ctypes.c_void_p),
@@ -773,7 +773,7 @@ def proc(dm_xyz, dm_cond):
                         ctypes.cast(vj_xyz.data.ptr, ctypes.c_void_p),
                         ctypes.cast(dm_xyz.data.ptr, ctypes.c_void_p),
                         ctypes.c_int(n_dm), ctypes.c_int(nao),
-                        rys_envs, (ctypes.c_int*3)(*scheme),
+                        ctypes.byref(rys_envs), (ctypes.c_int*3)(*scheme),
                         (ctypes.c_int*8)(*shls_slice),
                         ctypes.c_int(npairs_ij), ctypes.c_int(_npairs_kl),
                         ctypes.cast(pair_ij_mapping.data.ptr, ctypes.c_void_p),
@@ -912,7 +912,7 @@ def proc(dms, dm_cond):
                         ctypes.cast(vk.data.ptr, ctypes.c_void_p),
                         ctypes.cast(dms.data.ptr, ctypes.c_void_p),
                         ctypes.c_int(n_dm), ctypes.c_int(nao),
-                        rys_envs, (ctypes.c_int*8)(*shls_slice),
+                        ctypes.byref(rys_envs), (ctypes.c_int*8)(*shls_slice),
                         ctypes.c_int(SHM_SIZE),
                         ctypes.c_int(npairs_ij), ctypes.c_int(_npairs_kl),
                         ctypes.cast(pair_ij_mapping.data.ptr, ctypes.c_void_p),