diff --git a/gpu4pyscf/lib/gvhf-md/md_contract_j.cu b/gpu4pyscf/lib/gvhf-md/md_contract_j.cu index ed39e07e5..bba773880 100644 --- a/gpu4pyscf/lib/gvhf-md/md_contract_j.cu +++ b/gpu4pyscf/lib/gvhf-md/md_contract_j.cu @@ -882,7 +882,7 @@ int md_j_4dm_unrolled(RysIntEnvVars *envs, JKMatrix *jk, MDBoundsInfo *bounds, d extern "C" { int MD_build_j(double *vj, double *dm, int n_dm, int dm_size, - RysIntEnvVars envs, int *scheme, int *shls_slice, + RysIntEnvVars *envs, int *scheme, int *shls_slice, int npairs_ij, int npairs_kl, int *pair_ij_mapping, int *pair_kl_mapping, int *pair_ij_loc, int *pair_kl_loc, @@ -935,15 +935,15 @@ int MD_build_j(double *vj, double *dm, int n_dm, int dm_size, pRt2_kl_ij += offset_for_Rt2_idx(lij, lkl); efg_phase += offset_for_Rt2_idx(0, lkl); if (n_dm == 1) { - if (!md_j_unrolled(&envs, &jk, &bounds, omega)) { + if (!md_j_unrolled(envs, &jk, &bounds, omega)) { bounds.qd_ij_max = qd_ij_max + qd_offset_for_threads(npairs_ij, threads_ij); bounds.qd_kl_max = qd_kl_max + qd_offset_for_threads(npairs_kl, threads_kl); md_j_1dm_kernel<<>>( - envs, jk, bounds, threads_ij, threads_kl, tilex, tiley, + *envs, jk, bounds, threads_ij, threads_kl, tilex, tiley, pRt2_kl_ij, efg_phase); } } else { - if (!md_j_4dm_unrolled(&envs, &jk, &bounds, omega, dm_size)) { + if (!md_j_4dm_unrolled(envs, &jk, &bounds, omega, dm_size)) { bounds.qd_ij_max = qd_ij_max + qd_offset_for_threads(npairs_ij, threads_ij); bounds.qd_kl_max = qd_kl_max + qd_offset_for_threads(npairs_kl, threads_kl); for (int dm_offset = 0; dm_offset < n_dm; dm_offset+=4) { @@ -951,7 +951,7 @@ int MD_build_j(double *vj, double *dm, int n_dm, int dm_size, jk.dm = dm + dm_offset * dm_size; jk.n_dm = n_dm - dm_offset; md_j_4dm_kernel<<>>( - envs, jk, bounds, threads_ij, threads_kl, tilex, tiley, dm_size, + *envs, jk, bounds, threads_ij, threads_kl, tilex, tiley, dm_size, pRt2_kl_ij, efg_phase); } } diff --git a/gpu4pyscf/lib/gvhf-md/pbc_md_contract_j.cu b/gpu4pyscf/lib/gvhf-md/pbc_md_contract_j.cu index d82eb5600..64eddba8b 100644 --- a/gpu4pyscf/lib/gvhf-md/pbc_md_contract_j.cu +++ b/gpu4pyscf/lib/gvhf-md/pbc_md_contract_j.cu @@ -333,7 +333,7 @@ void pbc_md_j_kernel(RysIntEnvVars envs, JKMatrix jmat, MDBoundsInfo bounds, extern "C" { int PBC_build_j(double *vj, double *dm, int n_dm, int dm_xyz_size, int nimgs_uniq_pair, - RysIntEnvVars envs, int *scheme, int *shls_slice, + RysIntEnvVars *envs, int *scheme, int *shls_slice, int npairs_ij, int npairs_kl, int *pair_ij_mapping, int *pair_kl_mapping, int *pair_ij_loc, int *pair_kl_loc, @@ -386,11 +386,11 @@ int PBC_build_j(double *vj, double *dm, int n_dm, int dm_size = dm_xyz_size * nimgs_uniq_pair; for (int i_dm = 0; i_dm < n_dm; ++i_dm) { JKMatrix jmat = {vj+i_dm*dm_size, NULL, dm+i_dm*dm_size, n_dm, 0, omega}; - if (1){//!pbc_md_j_unrolled(&envs, &jmat, &bounds, omega)) { + if (1){//!pbc_md_j_unrolled(envs, &jmat, &bounds, omega)) { bounds.qd_ij_max = qd_ij_max + qd_offset_for_threads(npairs_ij, threads_ij); bounds.qd_kl_max = qd_kl_max + qd_offset_for_threads(npairs_kl, threads_kl); pbc_md_j_kernel<<>>( - envs, jmat, bounds, threads_ij, threads_kl, tilex, tiley, + *envs, jmat, bounds, threads_ij, threads_kl, tilex, tiley, pRt2_kl_ij, efg_phase); } } diff --git a/gpu4pyscf/lib/gvhf-rys/rys_contract_j.cu b/gpu4pyscf/lib/gvhf-rys/rys_contract_j.cu index 31da55ddf..921a4cff2 100644 --- a/gpu4pyscf/lib/gvhf-rys/rys_contract_j.cu +++ b/gpu4pyscf/lib/gvhf-rys/rys_contract_j.cu @@ -793,7 +793,7 @@ extern int rys_j_unrolled(RysIntEnvVars *envs, JKMatrix *jk, BoundsInfo *bounds, extern "C" { int RYS_build_j(double *vj, double *dm, int n_dm, int nao, - RysIntEnvVars envs, int *scheme, int *shls_slice, + RysIntEnvVars *envs, int *scheme, int *shls_slice, int npairs_ij, int npairs_kl, int *pair_ij_mapping, int *pair_kl_mapping, float *q_cond, float *s_estimator, float *dm_cond, float cutoff, int *pool, int *atm, int natm, int *bas, int nbas, double *env) @@ -834,7 +834,7 @@ int RYS_build_j(double *vj, double *dm, int n_dm, int nao, JKMatrix jk = {vj, NULL, dm, n_dm, 0, omega}; - if (!rys_j_unrolled(&envs, &jk, &bounds, pool)) { + if (!rys_j_unrolled(envs, &jk, &bounds, pool)) { int quartets_per_block = scheme[0]; int gout_stride = scheme[1]; int with_gout = scheme[2]; @@ -846,11 +846,11 @@ int RYS_build_j(double *vj, double *dm, int n_dm, int nao, int buflen = (nroots*2 + g_size*3 + 6) * quartets_per_block + iprim*jprim; if (with_gout) { buflen += nf3_ij*nf3_kl * quartets_per_block; - rys_j_with_gout_kernel<<>>(envs, jk, bounds, pool); + rys_j_with_gout_kernel<<>>(*envs, jk, bounds, pool); } else { buflen += (nf3_ij+nf3_kl*2+(lij+1)*(lkl+1)*(nmax+2)) * quartets_per_block; buflen += nf3_ij; // dm_ij_cache - rys_j_kernel<<>>(envs, jk, bounds, pool); + rys_j_kernel<<>>(*envs, jk, bounds, pool); } } cudaError_t err = cudaGetLastError(); diff --git a/gpu4pyscf/lib/gvhf-rys/rys_contract_jk.cu b/gpu4pyscf/lib/gvhf-rys/rys_contract_jk.cu index 1bee928ab..569d71e7f 100644 --- a/gpu4pyscf/lib/gvhf-rys/rys_contract_jk.cu +++ b/gpu4pyscf/lib/gvhf-rys/rys_contract_jk.cu @@ -523,7 +523,7 @@ extern int rys_jk_unrolled(RysIntEnvVars *envs, JKMatrix *jk, BoundsInfo *bounds extern "C" { int RYS_build_jk(double *vj, double *vk, double *dm, int n_dm, int nao, - RysIntEnvVars envs, int *shls_slice, int shm_size, + RysIntEnvVars *envs, int *shls_slice, int shm_size, int npairs_ij, int npairs_kl, uint32_t *pair_ij_mapping, uint32_t *pair_kl_mapping, float *q_cond, float *s_estimator, float *dm_cond, float cutoff, @@ -567,7 +567,7 @@ int RYS_build_jk(double *vj, double *vk, double *dm, int n_dm, int nao, ntiles_i, ntiles_j, ntiles_k, ntiles_l}; JKMatrix jk = {vj, vk, dm, n_dm, 0, omega}; - if (!rys_jk_unrolled(&envs, &jk, &bounds, pool)) { + if (!rys_jk_unrolled(envs, &jk, &bounds, pool)) { GXYZOffset* p_gxyz_offset = RYS_make_gxyz_offset(bounds); int gout_pattern = (((li == 0) >> 3) | ((lj == 0) >> 2) | @@ -579,7 +579,7 @@ int RYS_build_jk(double *vj, double *vk, double *dm, int n_dm, int nao, int reserved_shm_size = (buflen - cart_idx_size*4)/8; rys_jk_kernel<<>>( - envs, jk, bounds, pool, p_gxyz_offset, + *envs, jk, bounds, pool, p_gxyz_offset, gout_pattern, reserved_shm_size); int n_tiles = ntiles_i * ntiles_j * ntiles_k * ntiles_l; @@ -588,7 +588,7 @@ int RYS_build_jk(double *vj, double *vk, double *dm, int n_dm, int nao, min(256, n_tiles-256)); int reserved_shm_size = (buflen - cart_idx_size*4)/8; rys_jk_kernel<<>>( - envs, jk, bounds, pool, p_gxyz_offset+256, + *envs, jk, bounds, pool, p_gxyz_offset+256, gout_pattern, reserved_shm_size); } @@ -597,7 +597,7 @@ int RYS_build_jk(double *vj, double *vk, double *dm, int n_dm, int nao, min(256, n_tiles-512)); int reserved_shm_size = (buflen - cart_idx_size*4)/8; rys_jk_kernel<<>>( - envs, jk, bounds, pool, p_gxyz_offset+512, + *envs, jk, bounds, pool, p_gxyz_offset+512, gout_pattern, reserved_shm_size); } } diff --git a/gpu4pyscf/lib/gvhf-rys/rys_contract_k.cu b/gpu4pyscf/lib/gvhf-rys/rys_contract_k.cu index e5434f491..119dac73a 100644 --- a/gpu4pyscf/lib/gvhf-rys/rys_contract_k.cu +++ b/gpu4pyscf/lib/gvhf-rys/rys_contract_k.cu @@ -612,7 +612,7 @@ extern int rys_k_unrolled(RysIntEnvVars *envs, JKMatrix *kmat, BoundsInfo *bound extern "C" { int RYS_build_k(double *vk, double *dm, int n_dm, int nao, - RysIntEnvVars envs, int *shls_slice, int shm_size, + RysIntEnvVars *envs, int *shls_slice, int shm_size, int npairs_ij, int npairs_kl, uint32_t *pair_ij_mapping, uint32_t *pair_kl_mapping, float *q_cond, float *s_estimator, float *dm_cond, float cutoff, @@ -664,7 +664,7 @@ int RYS_build_k(double *vk, double *dm, int n_dm, int nao, kmat.sr_factor = 1; } - if (!rys_k_unrolled(&envs, &kmat, &bounds, pool)) { + if (!rys_k_unrolled(envs, &kmat, &bounds, pool)) { GXYZOffset* p_gxyz_offset = RYS_make_gxyz_offset(bounds); int gout_pattern = (((li == 0) >> 3) | ((lj == 0) >> 2) | @@ -676,7 +676,7 @@ int RYS_build_k(double *vk, double *dm, int n_dm, int nao, int reserved_shm_size = (buflen - cart_idx_size*4)/8; rys_k_kernel<<>>( - envs, kmat, bounds, pool, p_gxyz_offset, + *envs, kmat, bounds, pool, p_gxyz_offset, gout_pattern, reserved_shm_size); int n_tiles = ntiles_i * ntiles_j * ntiles_k * ntiles_l; @@ -685,7 +685,7 @@ int RYS_build_k(double *vk, double *dm, int n_dm, int nao, min(256, n_tiles-256)); int reserved_shm_size = (buflen - cart_idx_size*4)/8; rys_k_kernel<<>>( - envs, kmat, bounds, pool, p_gxyz_offset+256, + *envs, kmat, bounds, pool, p_gxyz_offset+256, gout_pattern, reserved_shm_size); } @@ -694,7 +694,7 @@ int RYS_build_k(double *vk, double *dm, int n_dm, int nao, min(256, n_tiles-512)); int reserved_shm_size = (buflen - cart_idx_size*4)/8; rys_k_kernel<<>>( - envs, kmat, bounds, pool, p_gxyz_offset+512, + *envs, kmat, bounds, pool, p_gxyz_offset+512, gout_pattern, reserved_shm_size); } } diff --git a/gpu4pyscf/lib/pbc/ft_ao_ip1.cu b/gpu4pyscf/lib/pbc/ft_ao_ip1.cu index 66246464e..2e7aa6f24 100644 --- a/gpu4pyscf/lib/pbc/ft_ao_ip1.cu +++ b/gpu4pyscf/lib/pbc/ft_ao_ip1.cu @@ -342,6 +342,377 @@ void ft_aopair_ejk_ip1_kernel(double *out, double *dm, double *vG, double *Gv, } } +__global__ +void ft_aopair_strain_deriv_kernel(double *out, double *sigma, + double *dm, double *vG, double *Gv, + PBCIntEnvVars envs, int nGv, int shm_size, + int *bas_ij_idx, int *bas_ij_img_idx, + int *shl_pair_offsets) +{ + constexpr int nGv_per_block = NGV_PER_BLOCK; + constexpr int threads = NGV_PER_BLOCK * NSP_PER_BLOCK; + int sp_block_id = blockIdx.x; + int Gv_block_id = blockIdx.y; + int Gv_id_in_block = threadIdx.x; + + int thread_id = threadIdx.x + threadIdx.y * blockDim.x; + int shl_pair0 = shl_pair_offsets[sp_block_id]; + int shl_pair1 = shl_pair_offsets[sp_block_id+1]; + int bas_ij0 = bas_ij_idx[shl_pair0]; + int nbas = envs.cell0_nbas * envs.bvk_ncells; + int ish0 = bas_ij0 / nbas; + int jsh0 = bas_ij0 % nbas; + + int *bas = envs.bas; + double *env = envs.env; + double *img_coords = envs.img_coords; + int li = bas[ish0*BAS_SLOTS+ANG_OF]; + int lj = bas[jsh0*BAS_SLOTS+ANG_OF]; + int stride_j = li + 2; + int g_size = stride_j * (lj + 1); + int gx_len = g_size * nGv_per_block * NSP_PER_BLOCK; + int gout_stride = 1; + while (8*6*gx_len > shm_size) { + gx_len /= 2; + gout_stride *= 2; + } + int nsp_per_block = NSP_PER_BLOCK / gout_stride; + int gout_id = threadIdx.y % gout_stride; + int sp_id = threadIdx.y / gout_stride; + int Gv_gout_id = Gv_id_in_block + nGv_per_block * gout_id; + int nGv_gout = nGv_per_block * gout_stride; + int lij = li + lj + 1; + int nfi = (li + 1) * (li + 2) / 2; + int nfj = (lj + 1) * (lj + 2) / 2; + int nfij = nfi * nfj; + int iprim = bas[ish0*BAS_SLOTS+NPRIM_OF]; + int jprim = bas[jsh0*BAS_SLOTS+NPRIM_OF]; + int ijprim = iprim * jprim; + int i_1 = nGv_per_block; + int j_1 = stride_j*nGv_per_block; + int *ao_loc = envs.ao_loc; + int nao = ao_loc[envs.cell0_nbas]; + + int Gv_id = Gv_block_id * nGv_per_block + Gv_id_in_block; + Gv += Gv_id; + double kx = Gv[0]; + double ky = Gv[nGv]; + double kz = Gv[nGv * 2]; + double kk = kx * kx + ky * ky + kz * kz; + + extern __shared__ double shared_memory[]; + double *gxR = shared_memory + g_size * nGv_per_block * sp_id + Gv_id_in_block; + double *gxI = gxR + gx_len*1; + double *gyR = gxR + gx_len*2; + double *gyI = gxR + gx_len*3; + double *gzR = gxR + gx_len*4; + double *gzI = gxR + gx_len*5; + double *rjri = shared_memory + gx_len * 6 + sp_id; + int *idx_i = _c_cartesian_lexical_xyz + lex_xyz_offset(li); + int *idx_j = _c_cartesian_lexical_xyz + lex_xyz_offset(lj); + + double sigma_xx = 0; + double sigma_xy = 0; + double sigma_xz = 0; + double sigma_yx = 0; + double sigma_yy = 0; + double sigma_yz = 0; + double sigma_zx = 0; + double sigma_zy = 0; + double sigma_zz = 0; + + for (int pair_ij = shl_pair0+sp_id; pair_ij < shl_pair1+sp_id; pair_ij += nsp_per_block) { + __syncthreads(); + int bas_ij, jL; + if (pair_ij < shl_pair1) { + bas_ij = bas_ij_idx[pair_ij]; + jL = bas_ij_img_idx[pair_ij]; + } else { + bas_ij = bas_ij_idx[shl_pair0]; + jL = bas_ij_img_idx[shl_pair0]; + } + int ish = bas_ij / nbas; + int jsh = bas_ij % nbas; + int ish_cell0 = ish; + int jsh_cell0 = jsh % envs.cell0_nbas; + double *expi = env + bas[ish*BAS_SLOTS+PTR_EXP]; + double *expj = env + bas[jsh*BAS_SLOTS+PTR_EXP]; + double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF]; + double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF]; + double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD]; + double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD]; + double xi = ri[0]; + double yi = ri[1]; + double zi = ri[2]; + double xj = rj[0] + img_coords[jL*3+0]; + double yj = rj[1] + img_coords[jL*3+1]; + double zj = rj[2] + img_coords[jL*3+2]; + if (Gv_gout_id == 0) { + double xjxi = xj - xi; + double yjyi = yj - yi; + double zjzi = zj - zi; + rjri[0*nsp_per_block] = xjxi; + rjri[1*nsp_per_block] = yjyi; + rjri[2*nsp_per_block] = zjzi; + } + int i0 = ao_loc[ish]; + int j0 = ao_loc[jsh]; + // Note the density matrix is assumed to be real in get_ej_ip1 function + double *dm_ij; + if (vG == NULL) { + dm_ij = dm + (Gv_id + (j0*nao+i0) * nGv) * OF_COMPLEX; + } else { + dm_ij = dm + (j0*nao+i0); + } + + double v_ix = 0; + double v_iy = 0; + double v_iz = 0; + double v_jx = 0; + double v_jy = 0; + double v_jz = 0; + double s0xR, s1xR, s2xR; + double s0xI, s1xI, s2xI; + double goutx, gouty, goutz; + + for (int ijp = 0; ijp < ijprim; ++ijp) { + __syncthreads(); + int ip = ijp / jprim; + int jp = ijp % jprim; + double ai = expi[ip]; + double aj = expj[jp]; + double ai2 = ai * 2; + double aj2 = aj * 2; + double aij = ai + aj; + double aj_aij = aj / aij; + double a2 = .5 / aij; + if (gout_id == 0) { + double theta_ij = ai * aj_aij; + double fac = OVERLAP_FAC * ci[ip] * cj[jp] / (aij * sqrt(aij)); + if (ish_cell0 == jsh_cell0) { + fac *= .5; + } + if (Gv_id >= nGv) { + fac = 0; + } + double xjxi = rjri[0*nsp_per_block]; + double yjyi = rjri[1*nsp_per_block]; + double zjzi = rjri[2*nsp_per_block]; + double xij = xjxi * aj_aij + ri[0]; + double yij = yjyi * aj_aij + ri[1]; + double zij = zjzi * aj_aij + ri[2]; + double kR = kx * xij + ky * yij + kz * zij; + sincos(-kR, gzI, gzR); + double rr = xjxi*xjxi + yjyi*yjyi + zjzi*zjzi; + double theta_rr = theta_ij*rr + .5*a2*kk; + double Kab = exp(-theta_rr); + gxR[0] = fac; + gxI[0] = 0.; + gyR[0] = 1.; + gyI[0] = 0.; + // exp(-theta_rr-kR*1j) + gzR[0] *= Kab; + gzI[0] *= Kab; + } + + // gx[i+1] = ia2 * gx[i-1] + (rijrx[0] - kx*a2*_Complex_I) * gx[i]; + __syncthreads(); + for (int n = gout_id; n < 3; n += gout_stride) { + double *_gxR = gxR + n * gx_len * OF_COMPLEX; + double *_gxI = _gxR + gx_len; + double RpaR = rjri[n*nsp_per_block] * aj_aij; // Rp - Ra + double RpaI = -a2 * Gv[nGv*n]; + s0xR = _gxR[0]; + s0xI = _gxI[0]; + multiply(RpaR, RpaI, s0xR, s0xI, s1xR, s1xI); + _gxR[nGv_per_block] = s1xR; + _gxI[nGv_per_block] = s1xI; + for (int i = 1; i < lij; i++) { + double ia2 = i * a2; + multiply(RpaR, RpaI, s1xR, s1xI, s2xR, s2xI); + s2xR += ia2 * s0xR; + s2xI += ia2 * s0xI; + _gxR[(i+1)*nGv_per_block] = s2xR; + _gxI[(i+1)*nGv_per_block] = s2xI; + s0xR = s1xR; + s0xI = s1xI; + s1xR = s2xR; + s1xI = s2xI; + } + } + + // hrr + if (lj > 0) { + __syncthreads(); + for (int n = gout_id; n < 3*OF_COMPLEX; n += gout_stride) { + double *_gx = gxR + n * gx_len; + // The real and imaginary parts call the same expression + int _ix = n / 2; + double xjxi = rjri[_ix*nsp_per_block]; + for (int j = 0; j < lj; ++j) { + int ij = (lij-j) + j*stride_j; + s1xR = _gx[ij*nGv_per_block]; + for (--ij; ij >= j*stride_j; --ij) { + s0xR = _gx[ij*nGv_per_block]; + _gx[(ij+stride_j)*nGv_per_block] = s1xR - xjxi * s0xR; + s1xR = s0xR; + } + } + } + } + __syncthreads(); + if (pair_ij >= shl_pair1 || Gv_id >= nGv) { + continue; + } + for (int ij = gout_id; ij < nfij; ij += gout_stride) { + int i = ij % nfi; + int j = ij / nfi; + double dm_vR, dm_vI; + if (vG == NULL) { + int addr = (j*nao+i)*nGv * OF_COMPLEX; + dm_vR = dm_ij[addr]; + dm_vI = dm_ij[addr+1]; + } else { + double tmp = dm_ij[j*nao+i]; + dm_vR = tmp * vG[Gv_id*OF_COMPLEX ]; + dm_vI = tmp * vG[Gv_id*OF_COMPLEX+1]; + } + int ix = idx_i[i*3+0]; + int iy = idx_i[i*3+1]; + int iz = idx_i[i*3+2]; + int jx = idx_j[j*3+0]; + int jy = idx_j[j*3+1]; + int jz = idx_j[j*3+2]; + int addrx = (ix + jx*stride_j) * nGv_per_block; + int addry = (iy + jy*stride_j) * nGv_per_block; + int addrz = (iz + jz*stride_j) * nGv_per_block; + double IxR = gxR[addrx]; + double IxI = gxI[addrx]; + double IyR = gyR[addry]; + double IyI = gyI[addry]; + double IzR = gzR[addrz]; + double IzI = gzI[addrz]; + double prod_xyR, prod_xyI; + double prod_xzR, prod_xzI; + double prod_yzR, prod_yzI; + multiply(IxR, IxI, IyR, IyI, prod_xyR, prod_xyI); + multiply(IxR, IxI, IzR, IzI, prod_xzR, prod_xzI); + multiply(IyR, IyI, IzR, IzI, prod_yzR, prod_yzI); + multiply(prod_xyR, prod_xyI, dm_vR, dm_vI, prod_xyR, prod_xyI); + multiply(prod_xzR, prod_xzI, dm_vR, dm_vI, prod_xzR, prod_xzI); + multiply(prod_yzR, prod_yzI, dm_vR, dm_vI, prod_yzR, prod_yzI); + double gixR = gxR[addrx+i_1]; + double gixI = gxI[addrx+i_1]; + double giyR = gyR[addry+i_1]; + double giyI = gyI[addry+i_1]; + double gizR = gzR[addrz+i_1]; + double gizI = gzI[addrz+i_1]; + // + double fjxR = aj2 * (gixR - rjri[0*nsp_per_block] * IxR); + double fjxI = aj2 * (gixI - rjri[0*nsp_per_block] * IxI); + double fjyR = aj2 * (giyR - rjri[1*nsp_per_block] * IyR); + double fjyI = aj2 * (giyI - rjri[1*nsp_per_block] * IyI); + double fjzR = aj2 * (gizR - rjri[2*nsp_per_block] * IzR); + double fjzI = aj2 * (gizI - rjri[2*nsp_per_block] * IzI); + if (jx > 0) { fjxR -= jx * gxR[addrx-j_1]; fjxI -= jx * gxI[addrx-j_1]; } + if (jy > 0) { fjyR -= jy * gyR[addry-j_1]; fjyI -= jy * gyI[addry-j_1]; } + if (jz > 0) { fjzR -= jz * gzR[addrz-j_1]; fjzI -= jz * gzI[addrz-j_1]; } + goutx = fjxR * prod_yzR - fjxI * prod_yzI; + gouty = fjyR * prod_xzR - fjyI * prod_xzI; + goutz = fjzR * prod_xyR - fjzI * prod_xyI; + v_jx += goutx; + v_jy += gouty; + v_jz += goutz; + sigma_xx += goutx * xj; + sigma_xy += goutx * yj; + sigma_xz += goutx * zj; + sigma_yx += gouty * xj; + sigma_yy += gouty * yj; + sigma_yz += gouty * zj; + sigma_zx += goutz * xj; + sigma_zy += goutz * yj; + sigma_zz += goutz * zj; + // <\nabla i|exp(-iGr)|j> + double fixR = ai2 * gixR; + double fiyR = ai2 * giyR; + double fizR = ai2 * gizR; + double fixI = ai2 * gixI; + double fiyI = ai2 * giyI; + double fizI = ai2 * gizI; + if (ix > 0) { fixR -= ix * gxR[addrx-i_1]; fixI -= ix * gxI[addrx-i_1]; } + if (iy > 0) { fiyR -= iy * gyR[addry-i_1]; fiyI -= iy * gyI[addry-i_1]; } + if (iz > 0) { fizR -= iz * gzR[addrz-i_1]; fizI -= iz * gzI[addrz-i_1]; } + goutx = fixR * prod_yzR - fixI * prod_yzI; + gouty = fiyR * prod_xzR - fiyI * prod_xzI; + goutz = fizR * prod_xyR - fizI * prod_xyI; + v_ix += goutx; + v_iy += gouty; + v_iz += goutz; + sigma_xx += goutx * xi; + sigma_xy += goutx * yi; + sigma_xz += goutx * zi; + sigma_yx += gouty * xi; + sigma_yy += gouty * yi; + sigma_yz += gouty * zi; + sigma_zx += goutz * xi; + sigma_zy += goutz * yi; + sigma_zz += goutz * zi; + // = Gx + // = -i <(y-Yi + Yi)i|exp(-iGr)|j> Gx + goutx = (gixR + xi * IxR) * prod_yzI + (gixI + xi * IxI) * prod_yzR; + gouty = (giyR + yi * IyR) * prod_xzI + (giyI + yi * IyI) * prod_xzR; + goutz = (gizR + zi * IzR) * prod_xyI + (gizI + zi * IzI) * prod_xyR; + sigma_xx -= kx * goutx; + sigma_xy -= kx * gouty; + sigma_xz -= kx * goutz; + sigma_yx -= ky * goutx; + sigma_yy -= ky * gouty; + sigma_yz -= ky * goutz; + sigma_zx -= kz * goutx; + sigma_zy -= kz * gouty; + sigma_zz -= kz * goutz; + } + } + + double *reduce = shared_memory + thread_id; + __syncthreads(); + reduce[0*threads] = v_ix; + reduce[1*threads] = v_iy; + reduce[2*threads] = v_iz; + reduce[3*threads] = v_jx; + reduce[4*threads] = v_jy; + reduce[5*threads] = v_jz; + for (int i = nGv_gout/2; i > 0; i >>= 1) { + __syncthreads(); + if (Gv_gout_id < i) { +#pragma unroll + for (int n = 0; n < 6; ++n) { + reduce[n*threads] += reduce[n*threads+i]; + } + } + } + if (Gv_gout_id == 0 && pair_ij < shl_pair1) { + int ia = bas[ish_cell0*BAS_SLOTS+ATOM_OF]; + int ja = bas[jsh_cell0*BAS_SLOTS+ATOM_OF]; + atomicAdd(out+ia*3+0, reduce[0*threads]); + atomicAdd(out+ia*3+1, reduce[1*threads]); + atomicAdd(out+ia*3+2, reduce[2*threads]); + atomicAdd(out+ja*3+0, reduce[3*threads]); + atomicAdd(out+ja*3+1, reduce[4*threads]); + atomicAdd(out+ja*3+2, reduce[5*threads]); + } + } + atomicAdd(sigma+0, sigma_xx); + atomicAdd(sigma+1, sigma_xy); + atomicAdd(sigma+2, sigma_xz); + atomicAdd(sigma+3, sigma_yx); + atomicAdd(sigma+4, sigma_yy); + atomicAdd(sigma+5, sigma_yz); + atomicAdd(sigma+6, sigma_zx); + atomicAdd(sigma+7, sigma_zy); + atomicAdd(sigma+8, sigma_zz); +} + extern "C" { int PBC_ft_aopair_ej_ip1(double *out, double *dm, double *vG, double *GvT, PBCIntEnvVars *envs, @@ -383,4 +754,46 @@ int PBC_ft_aopair_ek_ip1(double *out, double *dm_vG, double *GvT, PBCIntEnvVars } return 0; } + +int PBC_ft_aopair_ej_strain_deriv(double *out, double *sigma, double *dm, + double *vG, double *GvT, PBCIntEnvVars *envs, + int nbatches_shl_pair, int ngrids, int shm_size, + int *bas_ij_idx, int *bas_ij_img_idx, int *shl_pair_offsets, + int *atm, int natm, int *bas, int nbas, double *env) +{ + cudaFuncSetAttribute(ft_aopair_strain_deriv_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, shm_size); + dim3 threads(NGV_PER_BLOCK, NSP_PER_BLOCK); + int Gv_batches = (ngrids + NGV_PER_BLOCK - 1) / NGV_PER_BLOCK; + dim3 blocks(nbatches_shl_pair, Gv_batches); + ft_aopair_strain_deriv_kernel<<>>( + out, sigma, dm, vG, GvT, *envs, ngrids, shm_size, + bas_ij_idx, bas_ij_img_idx, shl_pair_offsets); + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + fprintf(stderr, "CUDA Error in ft_aopair_ej_strain_deriv: %s\n", cudaGetErrorString(err)); + return 1; + } + return 0; +} + +int PBC_ft_aopair_ek_strain_deriv(double *out, double *sigma, + double *dm_vG, double *GvT, PBCIntEnvVars *envs, + int nbatches_shl_pair, int ngrids, int shm_size, + int *bas_ij_idx, int *bas_ij_img_idx, int *shl_pair_offsets, + int *atm, int natm, int *bas, int nbas, double *env) +{ + cudaFuncSetAttribute(ft_aopair_strain_deriv_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, shm_size); + dim3 threads(NGV_PER_BLOCK, NSP_PER_BLOCK); + int Gv_batches = (ngrids + NGV_PER_BLOCK - 1) / NGV_PER_BLOCK; + dim3 blocks(nbatches_shl_pair, Gv_batches); + ft_aopair_strain_deriv_kernel<<>>( + out, sigma, dm_vG, NULL, GvT, *envs, ngrids, shm_size, + bas_ij_idx, bas_ij_img_idx, shl_pair_offsets); + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + fprintf(stderr, "CUDA Error in ft_aopair_ek_strain_deriv: %s\n", cudaGetErrorString(err)); + return 1; + } + return 0; +} } diff --git a/gpu4pyscf/lib/pbc/overlap.cu b/gpu4pyscf/lib/pbc/overlap.cu index caaaa4d83..23ca997c8 100644 --- a/gpu4pyscf/lib/pbc/overlap.cu +++ b/gpu4pyscf/lib/pbc/overlap.cu @@ -20,6 +20,7 @@ #include #include +#include "gvhf-rys/rys_contract_k.cuh" #include "pbc.cuh" #include "int3c2e.cuh" @@ -34,6 +35,7 @@ typedef struct { #define GOUT_WIDTH 36 #define GOUT_WIDTH_IP1 18 +#define REMOTE_THRESHOLD 50 static __global__ void int1e_ovlp_kernel(double *out, PBCIntEnvVars envs, PBCInt2c2eBounds bounds) @@ -795,6 +797,289 @@ void int1e_ipkin_kernel(double *out, PBCIntEnvVars envs, PBCInt2c2eBounds bounds } } +static __global__ +void ovlp_strain_deriv_kernel(double *out, double *dm, PBCIntEnvVars envs, + int *shl_pair_offsets, int *bas_ij_idx, + int *gout_stride_lookup, int is_gamma_point) +{ + int sp_block_id = blockIdx.x; + int thread_id = threadIdx.x; + int shl_pair0 = shl_pair_offsets[sp_block_id]; + int shl_pair1 = shl_pair_offsets[sp_block_id+1]; + int bas_ij0 = bas_ij_idx[shl_pair0]; + int cell0_nbas = envs.cell0_nbas; + int supmol_nbas = cell0_nbas * envs.nimgs; + int ish0 = bas_ij0 / supmol_nbas; + int jsh0 = bas_ij0 % cell0_nbas; + int nao = envs.ao_loc[cell0_nbas]; + + int *bas = envs.bas; + int *ao_loc = envs.ao_loc; + double *env = envs.env; + double *img_coords = envs.img_coords; + int li = bas[ish0*BAS_SLOTS+ANG_OF]; + int lj = bas[jsh0*BAS_SLOTS+ANG_OF]; + int nfi = (li + 1) * (li + 2) / 2; + int nfj = (lj + 1) * (lj + 2) / 2; + int iprim = bas[ish0*BAS_SLOTS+NPRIM_OF]; + int jprim = bas[jsh0*BAS_SLOTS+NPRIM_OF]; + int lij = li + lj + 1; + int stride_j = li + 2; + int ijprim = iprim * jprim; + int nfij = nfi * nfj; + + int gout_stride = gout_stride_lookup[li*L_AUX1+lj]; + int nsp_per_block = THREADS / gout_stride; + int sp_id = thread_id % nsp_per_block; + int gout_id = thread_id / nsp_per_block; + int i_1 = nsp_per_block; + int j_1 = stride_j*nsp_per_block; + + int g_size = (li + 2) * (lj + 1); + int gx_len = g_size * nsp_per_block; + extern __shared__ double g[]; + double *gx = g + sp_id; + double *gy = gx + gx_len; + double *gz = gx + gx_len * 2; + double *rjri = gx + gx_len * 3; + gx[0] = PI_POW_1_5; + gy[0] = 1.; + int *idx_i = _c_cartesian_lexical_xyz + lex_xyz_offset(li); + int *idx_j = _c_cartesian_lexical_xyz + lex_xyz_offset(lj); + + double sigma_xx = 0; + double sigma_xy = 0; + double sigma_xz = 0; + double sigma_yx = 0; + double sigma_yy = 0; + double sigma_yz = 0; + double sigma_zx = 0; + double sigma_zy = 0; + double sigma_zz = 0; + for (int task_id = shl_pair0; task_id < shl_pair1; task_id += nsp_per_block) { + int pair_ij = task_id + sp_id; + if (pair_ij >= shl_pair1) { + pair_ij = shl_pair0; + gx[0] = 0; + } + int bas_ij = bas_ij_idx[pair_ij]; + int ish = bas_ij / supmol_nbas; + int _jsh = bas_ij % supmol_nbas; + int cell_j = _jsh / cell0_nbas; + int jsh = _jsh % cell0_nbas; + if (ish == jsh) { + gy[0] = .5; + } else if (ish < jsh) { + gx[0] = 0; + } + int i0 = ao_loc[ish]; + int j0 = ao_loc[jsh]; + double *dm_ji; + if (is_gamma_point) { + dm_ji = dm + j0*nao+i0; + } else { + dm_ji = dm + (cell_j*nao+j0)*nao+i0; + } + double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD]; + double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD]; + double *expi = env + bas[ish*BAS_SLOTS+PTR_EXP]; + double *expj = env + bas[jsh*BAS_SLOTS+PTR_EXP]; + double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF]; + double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF]; + double xj = rj[0] + img_coords[cell_j*3+0]; + double yj = rj[1] + img_coords[cell_j*3+1]; + double zj = rj[2] + img_coords[cell_j*3+2]; + if (gout_id == 0) { + double xjxi = xj - ri[0]; + double yjyi = yj - ri[1]; + double zjzi = zj - ri[2]; + double rr_ij = xjxi*xjxi + yjyi*yjyi + zjzi*zjzi; + rjri[0*nsp_per_block] = xjxi; + rjri[1*nsp_per_block] = yjyi; + rjri[2*nsp_per_block] = zjzi; + rjri[3*nsp_per_block] = rr_ij; + } + for (int ijp = 0; ijp < ijprim; ++ijp) { + __syncthreads(); + int ip = ijp % iprim; + int jp = ijp / iprim; + double ai = expi[ip]; + double aj = expj[jp]; + double ai2 = ai * 2; + double aj2 = aj * 2; + double aij = ai + aj; + double aj_aij = aj / aij; + if (gout_id == 0) { + double theta = ai * aj_aij; + double theta_rr = theta * rjri[3*nsp_per_block]; + double cicj = ci[ip] * cj[jp]; + gz[0] = cicj / (aij*sqrt(aij)) * exp(-theta_rr); + } + __syncthreads(); + double s0x, s1x, s2x; + double b = .5 / aij; + for (int n = gout_id; n < 3; n += gout_stride) { + double *_gx = gx + n * gx_len; + double xjxi = rjri[n*nsp_per_block]; + double xpa = xjxi * aj_aij; + s0x = _gx[0]; + s1x = xpa * s0x; + _gx[nsp_per_block] = s1x; + for (int i = 1; i < lij; ++i) { + s2x = xpa * s1x + i * b * s0x; + _gx[(i+1)*nsp_per_block] = s2x; + s0x = s1x; + s1x = s2x; + } + for (int j = 0; j < lj; ++j) { + int ij = (lij-j) + j*stride_j; + s1x = _gx[ij*nsp_per_block]; + for (--ij; ij >= j*stride_j; --ij) { + s0x = _gx[ij*nsp_per_block]; + _gx[(ij+stride_j)*nsp_per_block] = s1x - xjxi * s0x; + s1x = s0x; + } + } + } + __syncthreads(); + if (task_id + sp_id >= shl_pair1) { + continue; + } +#pragma unroll + for (int n = 0; n < GOUT_WIDTH; ++n) { + int ij = n*gout_stride+gout_id; + if (ij >= nfij) break; + int i = ij % nfi; + int j = ij / nfi; + int ix = idx_i[i*3+0]; + int iy = idx_i[i*3+1]; + int iz = idx_i[i*3+2]; + int jx = idx_j[j*3+0]; + int jy = idx_j[j*3+1]; + int jz = idx_j[j*3+2]; + int addrx = (ix + jx*stride_j) * nsp_per_block; + int addry = (iy + jy*stride_j + g_size) * nsp_per_block; + int addrz = (iz + jz*stride_j + g_size*2) * nsp_per_block; + double Ix = gx[addrx]; + double Iy = gx[addry]; + double Iz = gx[addrz]; + double dm_val = dm_ji[j*nao+i]; + double prod_xy = Ix * Iy * dm_val; + double prod_xz = Ix * Iz * dm_val; + double prod_yz = Iy * Iz * dm_val; + double gix = gx[addrx+i_1]; + double giy = gx[addry+i_1]; + double giz = gx[addrz+i_1]; + double fix = ai2 * gix; if (ix > 0) { fix -= ix * gx[addrx-i_1]; } + double fiy = ai2 * giy; if (iy > 0) { fiy -= iy * gx[addry-i_1]; } + double fiz = ai2 * giz; if (iz > 0) { fiz -= iz * gx[addrz-i_1]; } + double v_ix = fix * prod_yz; + double v_iy = fiy * prod_xz; + double v_iz = fiz * prod_xy; + double fjx = aj2 * (gix - rjri[0*nsp_per_block] * Ix); if (jx > 0) { fjx -= jx * gx[addrx-j_1]; } + double fjy = aj2 * (giy - rjri[1*nsp_per_block] * Iy); if (jy > 0) { fjy -= jy * gx[addry-j_1]; } + double fjz = aj2 * (giz - rjri[2*nsp_per_block] * Iz); if (jz > 0) { fjz -= jz * gx[addrz-j_1]; } + double v_jx = fjx * prod_yz; + double v_jy = fjy * prod_xz; + double v_jz = fjz * prod_xy; + double xi = ri[0]; + double yi = ri[1]; + double zi = ri[2]; + sigma_xx += v_ix * xi; + sigma_xy += v_ix * yi; + sigma_xz += v_ix * zi; + sigma_yx += v_iy * xi; + sigma_yy += v_iy * yi; + sigma_yz += v_iy * zi; + sigma_zx += v_iz * xi; + sigma_zy += v_iz * yi; + sigma_zz += v_iz * zi; + sigma_xx += v_jx * xj; + sigma_xy += v_jx * yj; + sigma_xz += v_jx * zj; + sigma_yx += v_jy * xj; + sigma_yy += v_jy * yj; + sigma_yz += v_jy * zj; + sigma_zx += v_jz * xj; + sigma_zy += v_jz * yj; + sigma_zz += v_jz * zj; + } + } + } + atomicAdd(out+0, sigma_xx); + atomicAdd(out+1, sigma_xy); + atomicAdd(out+2, sigma_xz); + atomicAdd(out+3, sigma_yx); + atomicAdd(out+4, sigma_yy); + atomicAdd(out+5, sigma_yz); + atomicAdd(out+6, sigma_zx); + atomicAdd(out+7, sigma_zy); + atomicAdd(out+8, sigma_zz); +} + +// An estimation of the upper bound of the overlap || for +// shell pairs between the primitve cell and the super-mol +__global__ static +void ovlp_mask_estimation_kernel(int8_t *ovlp_mask, float *exps, float *log_coeff, + PBCIntEnvVars envs, int hermi, float log_cutoff) +{ + int bas_ij = blockIdx.x * blockDim.x + threadIdx.x; + int nbas = envs.cell0_nbas; + int nbas2 = nbas * nbas; + if (bas_ij >= nbas2) { + return; + } + int ish = bas_ij / nbas; + int jsh = bas_ij % nbas; + if (hermi && ish < jsh) { + return; + } + int nimgs = envs.nimgs; + int supmol_nbas = nbas * nimgs; + ovlp_mask += ish * supmol_nbas + jsh; + int *bas = envs.bas; + double *env = envs.env; + double *img_coords = envs.img_coords; + int li = bas[ish*BAS_SLOTS+ANG_OF]; + int lj = bas[jsh*BAS_SLOTS+ANG_OF]; + double *ri = env + bas[ish*BAS_SLOTS+PTR_BAS_COORD]; + double *rj = env + bas[jsh*BAS_SLOTS+PTR_BAS_COORD]; + float xi = ri[0]; + float yi = ri[1]; + float zi = ri[2]; + float xj = rj[0]; + float yj = rj[1]; + float zj = rj[2]; + float ai = exps[ish]; + float aj = exps[jsh]; + float aij = ai + aj; + float fi = ai / aij; + float fj = aj / aij; + float theta = ai * fj; + float xjxi = xj - xi; + float yjyi = yj - yi; + float zjzi = zj - zi; + float fac_norm = log_coeff[ish] + log_coeff[jsh] + 1.717f - 1.5f * logf(aij); + for (int img = 0; img < nimgs; ++img) { + float xjLxi = xjxi + img_coords[img*3+0]; + float yjLyi = yjyi + img_coords[img*3+1]; + float zjLzi = zjzi + img_coords[img*3+2]; + float rr_ij = xjLxi * xjLxi + yjLyi * yjLyi + zjLzi * zjLzi; + if (theta*rr_ij > REMOTE_THRESHOLD) { + continue; + } + float dr = sqrtf(rr_ij); + float dri = fj * dr; + float drj = fi * dr; + float dri_fac = .5f*li * logf(.5f*li/aij + dri*dri + 1e-9f); + float drj_fac = .5f*lj * logf(.5f*lj/aij + drj*drj + 1e-9f); + float log_ovlp = fac_norm - theta*rr_ij + dri_fac + drj_fac; + if (log_ovlp > log_cutoff) { + ovlp_mask[img*nbas] = 1; + } + } +} + extern "C" { int PBCint1e_ovlp(double *out, PBCIntEnvVars *envs, int shm_size, int nbatches_shl_pair, int *bas_ij_idx, @@ -863,4 +1148,28 @@ int PBCint1e_ipkin(double *out, PBCIntEnvVars *envs, int shm_size, } return 0; } + +int PBCovlp_strain_deriv(double *out, double *dm, + PBCIntEnvVars *envs, int shm_size, int nbatches_shl_pair, + int *shl_pair_offsets, int *bas_ij_idx, int *gout_stride_lookup, + int is_gamma_point) +{ + cudaFuncSetAttribute(ovlp_strain_deriv_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, shm_size); + ovlp_strain_deriv_kernel<<>>( + out, dm, *envs, shl_pair_offsets, bas_ij_idx, gout_stride_lookup, is_gamma_point); + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + fprintf(stderr, "CUDA Error in ovlp_strain_deriv kernel: %s\n", cudaGetErrorString(err)); + return 1; + } + return 0; +} +void PBCovlp_mask_estimation(int8_t *ovlp_mask, float *exps, float *log_coeff, + PBCIntEnvVars *envs, int hermi, float log_cutoff) +{ + int nbas = envs->cell0_nbas; + int nbatches = (nbas * nbas + THREADS-1) / THREADS; + ovlp_mask_estimation_kernel<<>>( + ovlp_mask, exps, log_coeff, *envs, hermi, log_cutoff); +} } diff --git a/gpu4pyscf/lib/pbc/pbc.cuh b/gpu4pyscf/lib/pbc/pbc.cuh index 7b7f44178..778d6ae2a 100644 --- a/gpu4pyscf/lib/pbc/pbc.cuh +++ b/gpu4pyscf/lib/pbc/pbc.cuh @@ -66,12 +66,12 @@ typedef struct { int cell0_natm; // in bvk-cell int cell0_nbas; // in bvk-cell - int bvk_ncells; // number of images in the BvK cell - int nimgs; // number of images in lattice sum int *atm; int *bas; double *env; int *ao_loc; // in bvk-cell + int bvk_ncells; // number of images in the BvK cell + int nimgs; // number of images in lattice sum double *img_coords; // vectors in lattice sum } PBCIntEnvVars; diff --git a/gpu4pyscf/lib/pbc/rys_contract_jk_ip1.cu b/gpu4pyscf/lib/pbc/rys_contract_jk_ip1.cu index d25182e5f..8bda9b0be 100644 --- a/gpu4pyscf/lib/pbc/rys_contract_jk_ip1.cu +++ b/gpu4pyscf/lib/pbc/rys_contract_jk_ip1.cu @@ -1,5 +1,5 @@ /* - * Copyright 2021-2024 The PySCF Developers. All Rights Reserved. + * Copyright 2025 The PySCF Developers. All Rights Reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -550,13 +550,629 @@ while (1) { } } +__global__ static +void rys_ejk_strain_deriv_kernel(RysIntEnvVars envs, JKEnergy jk, BoundsInfo bounds, + double *sigma, int *bas_mask_idx, int *Ts_ji_lookup, + int nimgs, int nimgs_uniq_pair, int nbas_cell0, int nao, + uint32_t *pool, double *dd_pool, int *head, + int reserved_shm_size) +{ + int sq_id = threadIdx.x; + int nsq_per_block = blockDim.x; + int gout_id = threadIdx.y; + int gout_stride = blockDim.y; + int threads = blockDim.x * blockDim.y; + int thread_id = threadIdx.x + blockDim.x * threadIdx.y; + int nbas = envs.nbas; + int *bas = envs.bas; + double *env = envs.env; + int li = bounds.li; + int lj = bounds.lj; + int lk = bounds.lk; + int ll = bounds.ll; + int iprim = bounds.iprim; + int jprim = bounds.jprim; + int kprim = bounds.kprim; + int lprim = bounds.lprim; + int stride_j = bounds.stride_j; + int stride_k = bounds.stride_k; + int stride_l = bounds.stride_l; + int g_size = bounds.g_size; + int nfi = bounds.nfi; + int nfj = bounds.nfj; + int nfk = bounds.nfk; + int nfl = bounds.nfl; + int nfij = nfi * nfj; + int nfkl = nfk * nfl; + int lij = li + lj + 1; + int lkl = lk + ll + 1; + int i_1 = nsq_per_block; + int j_1 = stride_j*nsq_per_block; + int k_1 = stride_k*nsq_per_block; + int l_1 = stride_l*nsq_per_block; + + extern __shared__ double shared_memory[]; + double *rlrk = shared_memory + sq_id; + double *Rpq = shared_memory + nsq_per_block * 3 + sq_id; + double *gx = shared_memory + nsq_per_block * 6 + sq_id; + double *rw = shared_memory + nsq_per_block * (g_size*3+6) + sq_id; + double *cicj_cache = shared_memory + reserved_shm_size; + int *idx_i = _c_cartesian_lexical_xyz + lex_xyz_offset(bounds.li); + int *idx_j = _c_cartesian_lexical_xyz + lex_xyz_offset(bounds.lj); + int *idx_k = _c_cartesian_lexical_xyz + lex_xyz_offset(bounds.lk); + int *idx_l = _c_cartesian_lexical_xyz + lex_xyz_offset(bounds.ll); + + int do_j = jk.j_factor != 0.; + int do_k = jk.k_factor != 0.; + int *ao_loc = envs.ao_loc; + double *dm = jk.dm; + + uint32_t *bas_kl_idx = pool + blockIdx.x * QUEUE_DEPTH; + int nf = bounds.nfi * bounds.nfj * bounds.nfk * bounds.nfl; + double *dd_cache = dd_pool + blockIdx.x * nf * blockDim.x + sq_id; + __shared__ int ntasks, pair_ij, pair_kl0; + double sigma_xx = 0; + double sigma_xy = 0; + double sigma_xz = 0; + double sigma_yx = 0; + double sigma_yy = 0; + double sigma_yz = 0; + double sigma_zx = 0; + double sigma_zy = 0; + double sigma_zz = 0; +while (1) { + if (thread_id == 0) { + pair_ij = atomicAdd(head, 1); + } + __syncthreads(); + if (pair_ij >= bounds.npairs_ij) { + break; + } + + uint32_t bas_ij = bounds.pair_ij_mapping[pair_ij]; + int ish = bas_ij / nbas; + int jsh = bas_ij % nbas; + __shared__ int cell_j, ish_cell0, jsh_cell0, i0, j0; + __shared__ double ri[3]; + __shared__ double rj[3]; + __shared__ double rjri[3]; + if (thread_id == 0) { + int _jsh = bas_mask_idx[jsh]; + ish_cell0 = ish; + jsh_cell0 = _jsh % nbas_cell0; + cell_j = _jsh / nbas_cell0; + i0 = ao_loc[ish_cell0]; + j0 = ao_loc[jsh_cell0]; + } + __syncthreads(); + double *expi = env + bas[ish*BAS_SLOTS+PTR_EXP]; + double *expj = env + bas[jsh*BAS_SLOTS+PTR_EXP]; + double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF]; + double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF]; + if (thread_id < 3) { + int ri_ptr = bas[ish*BAS_SLOTS+PTR_BAS_COORD]; + int rj_ptr = bas[jsh*BAS_SLOTS+PTR_BAS_COORD]; + ri[thread_id] = env[ri_ptr+thread_id]; + rj[thread_id] = env[rj_ptr+thread_id]; + rjri[thread_id] = env[rj_ptr+thread_id] - ri[thread_id]; + } + __syncthreads(); + double xjxi = rjri[0]; + double yjyi = rjri[1]; + double zjzi = rjri[2]; + for (int ij = thread_id; ij < iprim*jprim; ij += threads) { + int ip = ij / jprim; + int jp = ij % jprim; + double ai = expi[ip]; + double aj = expj[jp]; + double aij = ai + aj; + double theta_ij = ai * aj / aij; + double rr_ij = xjxi*xjxi + yjyi*yjyi + zjzi*zjzi; + double Kab = exp(-theta_ij * rr_ij); + double cicj = ci[ip] * cj[jp]; + if (ish_cell0 == jsh_cell0) { + cicj *= .5; + } + cicj_cache[ij] = cicj * Kab; + } + double v_ix = 0; + double v_iy = 0; + double v_iz = 0; + double v_jx = 0; + double v_jy = 0; + double v_jz = 0; + double goutx, gouty, goutz; + + if (thread_id == 0) { + pair_kl0 = 0; + } + __syncthreads(); + while (pair_kl0 < bounds.npairs_kl) { + _fill_sr_ejk_tasks(ntasks, pair_kl0, bas_kl_idx, bas_ij, bas_mask_idx, + Ts_ji_lookup, nimgs, nbas_cell0, jk, envs, bounds); + if (ntasks == 0) { + continue; + } + + for (int task_id = sq_id; task_id < ntasks+sq_id; task_id += nsq_per_block) { + __syncthreads(); + int bas_kl = bas_kl_idx[task_id]; + int ksh = bas_kl / nbas; + int lsh = bas_kl % nbas; + int _ksh = bas_mask_idx[ksh]; + int cell_k = _ksh / nbas_cell0; + int ksh_cell0 = _ksh % nbas_cell0; + int _lsh = bas_mask_idx[lsh]; + int cell_l = _lsh / nbas_cell0; + int lsh_cell0 = _lsh % nbas_cell0; + double fac_sym = PI_FAC; + if (task_id < ntasks) { + if (ksh_cell0 == lsh_cell0) fac_sym *= .5; + if (ish_cell0 == ksh_cell0 && jsh_cell0 == lsh_cell0) fac_sym *= .5; + } else { + fac_sym = 0; + } + int k0 = ao_loc[ksh_cell0]; + int l0 = ao_loc[lsh_cell0]; + double *expi = env + bas[ish_cell0*BAS_SLOTS+PTR_EXP]; + double *expj = env + bas[jsh_cell0*BAS_SLOTS+PTR_EXP]; + double *expk = env + bas[ksh*BAS_SLOTS+PTR_EXP]; + double *expl = env + bas[lsh*BAS_SLOTS+PTR_EXP]; + double *ck = env + bas[ksh*BAS_SLOTS+PTR_COEFF]; + double *cl = env + bas[lsh*BAS_SLOTS+PTR_COEFF]; + double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD]; + double *rl = env + bas[lsh*BAS_SLOTS+PTR_BAS_COORD]; + double xk = rk[0]; + double yk = rk[1]; + double zk = rk[2]; + double xl = rl[0]; + double yl = rl[1]; + double zl = rl[2]; + if (gout_id == 0) { + double xlxk = xl - xk; + double ylyk = yl - yk; + double zlzk = zl - zk; + rlrk[0*nsq_per_block] = xlxk; + rlrk[1*nsq_per_block] = ylyk; + rlrk[2*nsq_per_block] = zlzk; + } + + double v_kx = 0; + double v_ky = 0; + double v_kz = 0; + double v_lx = 0; + double v_ly = 0; + double v_lz = 0; + int nao2 = nao * nao; + double *dm_jk = dm + Ts_ji_lookup[cell_j+cell_k*nimgs] * nao2; + double *dm_jl = dm + Ts_ji_lookup[cell_j+cell_l*nimgs] * nao2; + double *dm_ki = dm + Ts_ji_lookup[cell_k ] * nao2; + double *dm_li = dm + Ts_ji_lookup[cell_l ] * nao2; + double *dm_ji = dm + Ts_ji_lookup[cell_j ] * nao2; + double *dm_lk = dm + Ts_ji_lookup[cell_l+cell_k*nimgs] * nao2; + if (jk.n_dm == 1) { + for (int n = gout_id; n < nfij*nfkl; n+=gout_stride) { + int kl = n / nfij; + int ij = n % nfij; + int i = ij % nfi; + int j = ij / nfi; + int k = kl % nfk; + int l = kl / nfk; + int _i = i + i0; + int _j = j + j0; + int _k = k + k0; + int _l = l + l0; + int _jl = _j*nao+_l; + int _jk = _j*nao+_k; + int _li = _l*nao+_i; + int _ki = _k*nao+_i; + int _ji = _j*nao+_i; + int _lk = _l*nao+_k; + double dd = 0; + if (do_k) { + dd += jk.k_factor * (dm_jk[_jk] * dm_li[_li] + dm_jl[_jl] * dm_ki[_ki]); + } + if (do_j) { + dd += jk.j_factor * dm_ji[_ji] * dm_lk[_lk]; + } + dd_cache[n*nsq_per_block] = fac_sym * dd; + } + } else { + int dm_size = nao2 * nimgs_uniq_pair; + for (int n = gout_id; n < nfij*nfkl; n+=gout_stride) { + int kl = n / nfij; + int ij = n % nfij; + int i = ij % nfi; + int j = ij / nfi; + int k = kl % nfk; + int l = kl / nfk; + int _i = i + i0; + int _j = j + j0; + int _k = k + k0; + int _l = l + l0; + int _jl = _j*nao+_l; + int _jk = _j*nao+_k; + int _li = _l*nao+_i; + int _ki = _k*nao+_i; + int _ji = _j*nao+_i; + int _lk = _l*nao+_k; + double dd = 0; + if (do_k) { + dd += dm_jk[_jk] * dm_li[_li] + dm_jl[_jl] * dm_ki[_ki]; + dd += dm_jk[dm_size+_jk] * dm_li[dm_size+_li] + + dm_jl[dm_size+_jl] * dm_ki[dm_size+_ki]; + dd *= jk.k_factor; + } + if (do_j) { + dd += jk.j_factor * (dm_ji[_ji] + dm_ji[dm_size+_ji]) * + (dm_lk[_lk] + dm_lk[dm_size+_lk]); + } + dd_cache[n*nsq_per_block] = fac_sym * dd; + } + } + + for (int klp = 0; klp < kprim*lprim; ++klp) { + __syncthreads(); + int kp = klp / lprim; + int lp = klp % lprim; + double ak = expk[kp]; + double al = expl[lp]; + double akl = ak + al; + double al_akl = al / akl; + double ak2 = ak * 2; + double al2 = al * 2; + if (gout_id == 0) { + double xlxk = rlrk[0*nsq_per_block]; + double ylyk = rlrk[1*nsq_per_block]; + double zlzk = rlrk[2*nsq_per_block]; + double rr_kl = xlxk*xlxk + ylyk*ylyk + zlzk*zlzk; + double theta_kl = ak * al_akl; + double Kcd = exp(-theta_kl * rr_kl); + double ckcl = ck[kp] * cl[lp] * Kcd; + gx[0] = ckcl; + } + for (int ijp = 0; ijp < iprim*jprim; ++ijp) { + __syncthreads(); + int ip = ijp / jprim; + int jp = ijp % jprim; + double ai = expi[ip]; + double aj = expj[jp]; + double ai2 = ai * 2; + double aj2 = aj * 2; + double aij = ai + aj; + double aj_aij = aj / aij; + double xij = ri[0] + (rjri[0]) * aj_aij; + double yij = ri[1] + (rjri[1]) * aj_aij; + double zij = ri[2] + (rjri[2]) * aj_aij; + double xkl = xk + rlrk[0*nsq_per_block] * al_akl; + double ykl = yk + rlrk[1*nsq_per_block] * al_akl; + double zkl = zk + rlrk[2*nsq_per_block] * al_akl; + double xpq = xij - xkl; + double ypq = yij - ykl; + double zpq = zij - zkl; + if (gout_id == 0) { + Rpq[0*nsq_per_block] = xpq; + Rpq[1*nsq_per_block] = ypq; + Rpq[2*nsq_per_block] = zpq; + double cicj = cicj_cache[ijp]; + gx[nsq_per_block*g_size] = cicj / (aij*akl*sqrt(aij+akl)); + } + double rr = xpq*xpq + ypq*ypq + zpq*zpq; + double theta = aij * akl / (aij + akl); + int nroots = bounds.nroots; + rys_roots_for_k(nroots, theta, rr, rw, jk.omega, jk.lr_factor, jk.sr_factor); + for (int irys = 0; irys < nroots; ++irys) { + __syncthreads(); + if (gout_id == 0) { + gx[nsq_per_block*g_size*2] = rw[(irys*2+1)*nsq_per_block]; + } + double rt = rw[irys*2*nsq_per_block]; + double rt_aa = rt / (aij + akl); + double rt_aij = rt_aa * akl; + double rt_akl = rt_aa * aij; + double b00 = .5 * rt_aa; + double b10 = .5/aij * (1 - rt_aij); + double b01 = .5/akl * (1 - rt_akl); + double s0x, s1x, s2x; + __syncthreads(); + // gx(0,n+1) = c0*gx(0,n) + n*b10*gx(0,n-1) + for (int n = gout_id; n < 3; n += gout_stride) { + double *_gx = gx + n * g_size * nsq_per_block; + double Rpa = (rjri[n]) * aj_aij; + double c0x = Rpa - rt_aij * Rpq[n*nsq_per_block]; + s0x = _gx[0]; + s1x = c0x * s0x; + _gx[nsq_per_block] = s1x; + for (int i = 1; i < lij; ++i) { + s2x = c0x * s1x + i * b10 * s0x; + _gx[(i+1)*nsq_per_block] = s2x; + s0x = s1x; + s1x = s2x; + } + } + + int lij3 = (lij+1)*3; + for (int n = gout_id; n < lij3+gout_id; n += gout_stride) { + __syncthreads(); + int i = n / 3; //for i in range(lij+1): + int _ix = n % 3; // TODO: remove _ix for nroots > 2 + double *_gx = gx + (i + _ix * g_size) * nsq_per_block; + double Rqc = rlrk[_ix*nsq_per_block] * al_akl; + double cpx = Rqc + rt_akl * Rpq[_ix*nsq_per_block]; + //for i in range(lij+1): + // trr(i,1) = c0p * trr(i,0) + i*b00 * trr(i-1,0) + if (n < lij3) { + s0x = _gx[0]; + s1x = cpx * s0x; + if (i > 0) { + s1x += i * b00 * _gx[-nsq_per_block]; + } + _gx[stride_k*nsq_per_block] = s1x; + } + + //for k in range(1, lkl): + // for i in range(lij+1): + // trr(i,k+1) = cp * trr(i,k) + k*b01 * trr(i,k-1) + i*b00 * trr(i-1,k) + for (int k = 1; k < lkl; ++k) { + __syncthreads(); + if (n < lij3) { + s2x = cpx*s1x + k*b01*s0x; + if (i > 0) { + s2x += i * b00 * _gx[(k*stride_k-1)*nsq_per_block]; + } + _gx[(k*stride_k+stride_k)*nsq_per_block] = s2x; + s0x = s1x; + s1x = s2x; + } + } + } + + // hrr + // g(i,j+1) = rirj * g(i,j) + g(i+1,j) + // g(...,k,l+1) = rkrl * g(...,k,l) + g(...,k+1,l) + if (lj > 0) { + __syncthreads(); + if (task_id < ntasks) { + int lkl3 = (lkl+1)*3; + for (int m = gout_id; m < lkl3; m += gout_stride) { + int k = m / 3; + int _ix = m % 3; + double xjxi = rjri[_ix]; + double *_gx = gx + (_ix*g_size + k*stride_k) * nsq_per_block; + for (int j = 0; j < lj; ++j) { + int ij = (lij-j) + j*stride_j; + s1x = _gx[ij*nsq_per_block]; + for (--ij; ij >= j*stride_j; --ij) { + s0x = _gx[ij*nsq_per_block]; + _gx[(ij+stride_j)*nsq_per_block] = s1x - xjxi * s0x; + s1x = s0x; + } + } + } + } + } + if (ll > 0) { + __syncthreads(); + if (task_id < ntasks) { + for (int n = gout_id; n < stride_k*3; n += gout_stride) { + int i = n / 3; + int _ix = n % 3; + double xlxk = rlrk[_ix*nsq_per_block]; + double *_gx = gx + (_ix*g_size + i) * nsq_per_block; + for (int l = 0; l < ll; ++l) { + int kl = (lkl-l)*stride_k + l*stride_l; + s1x = _gx[kl*nsq_per_block]; + for (kl-=stride_k; kl >= l*stride_l; kl-=stride_k) { + s0x = _gx[kl*nsq_per_block]; + _gx[(kl+stride_l)*nsq_per_block] = s1x - xlxk * s0x; + s1x = s0x; + } + } + } + } + } + + __syncthreads(); + if (task_id >= ntasks) { + continue; + } + for (int n = gout_id; n < nfij*nfkl; n+=gout_stride) { + int kl = n / nfij; + int ij = n % nfij; + int i = ij % nfi; + int j = ij / nfi; + int k = kl % nfk; + int l = kl / nfk; + int ix = idx_i[i*3+0]; + int iy = idx_i[i*3+1]; + int iz = idx_i[i*3+2]; + int jx = idx_j[j*3+0]; + int jy = idx_j[j*3+1]; + int jz = idx_j[j*3+2]; + int kx = idx_k[k*3+0]; + int ky = idx_k[k*3+1]; + int kz = idx_k[k*3+2]; + int lx = idx_l[l*3+0]; + int ly = idx_l[l*3+1]; + int lz = idx_l[l*3+2]; + double dd = dd_cache[n*nsq_per_block]; + int addrx = (ix + jx*stride_j + kx*stride_k + lx*stride_l) * nsq_per_block; + int addry = (iy + jy*stride_j + ky*stride_k + ly*stride_l + g_size) * nsq_per_block; + int addrz = (iz + jz*stride_j + kz*stride_k + lz*stride_l + g_size*2) * nsq_per_block; + double Ix = gx[addrx]; + double Iy = gx[addry]; + double Iz = gx[addrz]; + double prod_xy = Ix * Iy * dd; + double prod_xz = Ix * Iz * dd; + double prod_yz = Iy * Iz * dd; + double gix = gx[addrx+i_1]; + double giy = gx[addry+i_1]; + double giz = gx[addrz+i_1]; + double gkx = gx[addrx+k_1]; + double gky = gx[addry+k_1]; + double gkz = gx[addrz+k_1]; + double fix = ai2 * gix; if (ix > 0) { fix -= ix * gx[addrx-i_1]; } + double fiy = ai2 * giy; if (iy > 0) { fiy -= iy * gx[addry-i_1]; } + double fiz = ai2 * giz; if (iz > 0) { fiz -= iz * gx[addrz-i_1]; } + goutx = fix * prod_yz; + gouty = fiy * prod_xz; + goutz = fiz * prod_xy; + v_ix += goutx; + v_iy += gouty; + v_iz += goutz; + double xi = ri[0]; + double yi = ri[1]; + double zi = ri[2]; + sigma_xx += goutx * xi; + sigma_xy += goutx * yi; + sigma_xz += goutx * zi; + sigma_yx += gouty * xi; + sigma_yy += gouty * yi; + sigma_yz += gouty * zi; + sigma_zx += goutz * xi; + sigma_zy += goutz * yi; + sigma_zz += goutz * zi; + double fkx = ak2 * gkx; if (kx > 0) { fkx -= kx * gx[addrx-k_1]; } + double fky = ak2 * gky; if (ky > 0) { fky -= ky * gx[addry-k_1]; } + double fkz = ak2 * gkz; if (kz > 0) { fkz -= kz * gx[addrz-k_1]; } + goutx = fkx * prod_yz; + gouty = fky * prod_xz; + goutz = fkz * prod_xy; + v_kx += goutx; + v_ky += gouty; + v_kz += goutz; + sigma_xx += goutx * xk; + sigma_xy += goutx * yk; + sigma_xz += goutx * zk; + sigma_yx += gouty * xk; + sigma_yy += gouty * yk; + sigma_yz += gouty * zk; + sigma_zx += goutz * xk; + sigma_zy += goutz * yk; + sigma_zz += goutz * zk; + double fjx = aj2 * (gix - rjri[0] * Ix); if (jx > 0) { fjx -= jx * gx[addrx-j_1]; } + double fjy = aj2 * (giy - rjri[1] * Iy); if (jy > 0) { fjy -= jy * gx[addry-j_1]; } + double fjz = aj2 * (giz - rjri[2] * Iz); if (jz > 0) { fjz -= jz * gx[addrz-j_1]; } + goutx = fjx * prod_yz; + gouty = fjy * prod_xz; + goutz = fjz * prod_xy; + v_jx += goutx; + v_jy += gouty; + v_jz += goutz; + double xj = rj[0]; + double yj = rj[1]; + double zj = rj[2]; + sigma_xx += goutx * xj; + sigma_xy += goutx * yj; + sigma_xz += goutx * zj; + sigma_yx += gouty * xj; + sigma_yy += gouty * yj; + sigma_yz += gouty * zj; + sigma_zx += goutz * xj; + sigma_zy += goutz * yj; + sigma_zz += goutz * zj; + double flx = al2 * (gkx - rlrk[0*nsq_per_block] * Ix); if (lx > 0) { flx -= lx * gx[addrx-l_1]; } + double fly = al2 * (gky - rlrk[1*nsq_per_block] * Iy); if (ly > 0) { fly -= ly * gx[addry-l_1]; } + double flz = al2 * (gkz - rlrk[2*nsq_per_block] * Iz); if (lz > 0) { flz -= lz * gx[addrz-l_1]; } + goutx = flx * prod_yz; + gouty = fly * prod_xz; + goutz = flz * prod_xy; + v_lx += goutx; + v_ly += gouty; + v_lz += goutz; + sigma_xx += goutx * xl; + sigma_xy += goutx * yl; + sigma_xz += goutx * zl; + sigma_yx += gouty * xl; + sigma_yy += gouty * yl; + sigma_yz += gouty * zl; + sigma_zx += goutz * xl; + sigma_zy += goutz * yl; + sigma_zz += goutz * zl; + } + } + } + } + int ka = bas[ksh_cell0*BAS_SLOTS+ATOM_OF]; + int la = bas[lsh_cell0*BAS_SLOTS+ATOM_OF]; + int threads = nsq_per_block * gout_stride; + double *reduce = shared_memory + thread_id; + __syncthreads(); + if (task_id < ntasks) { + reduce[0*threads] = v_kx; + reduce[1*threads] = v_ky; + reduce[2*threads] = v_kz; + reduce[3*threads] = v_lx; + reduce[4*threads] = v_ly; + reduce[5*threads] = v_lz; + } + for (int i = gout_stride/2; i > 0; i >>= 1) { + __syncthreads(); + if (gout_id < i && task_id < ntasks) { +#pragma unroll + for (int n = 0; n < 6; ++n) { + reduce[n*threads] += reduce[n*threads+i*nsq_per_block]; + } + } + } + if (gout_id == 0 && task_id < ntasks) { + double *ejk = jk.ejk; + atomicAdd(ejk+ka*3+0, reduce[0*threads]); + atomicAdd(ejk+ka*3+1, reduce[1*threads]); + atomicAdd(ejk+ka*3+2, reduce[2*threads]); + atomicAdd(ejk+la*3+0, reduce[3*threads]); + atomicAdd(ejk+la*3+1, reduce[4*threads]); + atomicAdd(ejk+la*3+2, reduce[5*threads]); + } + } + } + int ia = bas[ish_cell0*BAS_SLOTS+ATOM_OF]; + int ja = bas[jsh_cell0*BAS_SLOTS+ATOM_OF]; + double *reduce = shared_memory + thread_id; + __syncthreads(); + reduce[0*threads] = v_ix; + reduce[1*threads] = v_iy; + reduce[2*threads] = v_iz; + reduce[3*threads] = v_jx; + reduce[4*threads] = v_jy; + reduce[5*threads] = v_jz; + for (int i = gout_stride/2; i > 0; i >>= 1) { + __syncthreads(); + if (gout_id < i) { +#pragma unroll + for (int n = 0; n < 6; ++n) { + reduce[n*threads] += reduce[n*threads+i*nsq_per_block]; + } + } + } + if (gout_id == 0) { + double *ejk = jk.ejk; + atomicAdd(ejk+ia*3+0, reduce[0*threads]); + atomicAdd(ejk+ia*3+1, reduce[1*threads]); + atomicAdd(ejk+ia*3+2, reduce[2*threads]); + atomicAdd(ejk+ja*3+0, reduce[3*threads]); + atomicAdd(ejk+ja*3+1, reduce[4*threads]); + atomicAdd(ejk+ja*3+2, reduce[5*threads]); + } +} + atomicAdd(sigma+0, sigma_xx); + atomicAdd(sigma+1, sigma_xy); + atomicAdd(sigma+2, sigma_xz); + atomicAdd(sigma+3, sigma_yx); + atomicAdd(sigma+4, sigma_yy); + atomicAdd(sigma+5, sigma_yz); + atomicAdd(sigma+6, sigma_zx); + atomicAdd(sigma+7, sigma_zy); + atomicAdd(sigma+8, sigma_zz); +} + //extern int rys_ejk_ip1_unrolled(RysIntEnvVars *envs, JKEnergy *jk, BoundsInfo *bounds, // int *pool, double *dd_pool); extern "C" { int PBC_per_atom_jk_ip1(double *ejk, double j_factor, double k_factor, double *dm, int n_dm, int nao, - RysIntEnvVars envs, int *scheme, int *shls_slice, + RysIntEnvVars *envs, int *scheme, int *shls_slice, int npairs_ij, int npairs_kl, uint32_t *pair_ij_mapping, uint32_t *pair_kl_mapping, int *bas_mask_idx, int *Ts_ji_lookup, int nimgs, int nimgs_uniq_pair, @@ -609,7 +1225,7 @@ int PBC_per_atom_jk_ip1(double *ejk, double j_factor, double k_factor, int *head = (int *)(pool + workers * QUEUE_DEPTH); cudaMemset(head, 0, sizeof(int)); - if (1){//!rys_ejk_ip1_unrolled(&envs, &jk, &bounds, pool, dd_pool)) { + if (1){//!rys_ejk_ip1_unrolled(envs, &jk, &bounds, pool, dd_pool)) { int quartets_per_block = scheme[0]; int gout_stride = scheme[1]; int ij_prims = iprim * jprim; @@ -619,7 +1235,7 @@ int PBC_per_atom_jk_ip1(double *ejk, double j_factor, double k_factor, buflen = (reserved_shm_size + ij_prims)*sizeof(double); rys_ejk_ip1_kernel<<>>( - envs, jk, bounds, bas_mask_idx, Ts_ji_lookup, + *envs, jk, bounds, bas_mask_idx, Ts_ji_lookup, nimgs, nimgs_uniq_pair, nbas_cell0, nao, pool, dd_pool, head, reserved_shm_size); } @@ -632,9 +1248,88 @@ int PBC_per_atom_jk_ip1(double *ejk, double j_factor, double k_factor, return 0; } +int PBC_jk_strain_deriv(double *ejk, double j_factor, double k_factor, + double *sigma, double *dm, int n_dm, int nao, + RysIntEnvVars *envs, int *scheme, int *shls_slice, + int npairs_ij, int npairs_kl, + uint32_t *pair_ij_mapping, uint32_t *pair_kl_mapping, + int *bas_mask_idx, int *Ts_ji_lookup, int nimgs, int nimgs_uniq_pair, + float *q_cond, float *s_estimator, float *dm_cond, float cutoff, + uint32_t *pool, double *dd_pool, int nbas_cell0, + int *atm, int natm, int *bas, int nbas, double *env) +{ + int ish0 = shls_slice[0]; + int jsh0 = shls_slice[2]; + int ksh0 = shls_slice[4]; + int lsh0 = shls_slice[6]; + int li = bas[ANG_OF + ish0*BAS_SLOTS]; + int lj = bas[ANG_OF + jsh0*BAS_SLOTS]; + int lk = bas[ANG_OF + ksh0*BAS_SLOTS]; + int ll = bas[ANG_OF + lsh0*BAS_SLOTS]; + int iprim = bas[NPRIM_OF + ish0*BAS_SLOTS]; + int jprim = bas[NPRIM_OF + jsh0*BAS_SLOTS]; + int kprim = bas[NPRIM_OF + ksh0*BAS_SLOTS]; + int lprim = bas[NPRIM_OF + lsh0*BAS_SLOTS]; + int nfi = (li+1)*(li+2)/2; + int nfj = (lj+1)*(lj+2)/2; + int nfk = (lk+1)*(lk+2)/2; + int nfl = (ll+1)*(ll+2)/2; + int order = li + lj + lk + ll; + int nroots = (order + 1) / 2 + 1; + double omega = env[PTR_RANGE_OMEGA]; + if (omega < 0) { // SR ERIs + nroots *= 2; + } + int stride_j = li + 2; + int stride_k = stride_j * (lj + 1); + int stride_l = stride_k * (lk + 2); + int g_size = stride_l * (ll + 1); + BoundsInfo bounds = {li, lj, lk, ll, nfi, nfj, nfk, nfl, + nroots, stride_j, stride_k, stride_l, g_size, + iprim, jprim, kprim, lprim, + npairs_ij, npairs_kl, pair_ij_mapping, pair_kl_mapping, + q_cond, s_estimator, dm_cond, cutoff}; + + if (n_dm == 1) { // RHF + k_factor *= .5; + } + // *4 for the symmetry (i,j) = (j,i), (k,l) = (l,k) in J contraction + // Additional factor 1/2 from the two-electron Coulomb operator + JKEnergy jk = {ejk, dm, 2.*j_factor, -k_factor, n_dm, omega, 0, 1}; + + cudaDeviceProp prop; + cudaGetDeviceProperties(&prop, 0); + int workers = prop.multiProcessorCount; + int *head = (int *)(pool + workers * QUEUE_DEPTH); + cudaMemset(head, 0, sizeof(int)); + + if (1){//!rys_ejk_strain_deriv_unrolled(envs, &jk, &bounds, pool, dd_pool)) { + int quartets_per_block = scheme[0]; + int gout_stride = scheme[1]; + int ij_prims = iprim * jprim; + dim3 threads(quartets_per_block, gout_stride); + int buflen = (nroots*2 + g_size*3 + 6) * quartets_per_block; + int reserved_shm_size = MAX(buflen, 6*gout_stride*quartets_per_block); + buflen = (reserved_shm_size + ij_prims)*sizeof(double); + + rys_ejk_strain_deriv_kernel<<>>( + *envs, jk, bounds, sigma, bas_mask_idx, Ts_ji_lookup, + nimgs, nimgs_uniq_pair, nbas_cell0, nao, + pool, dd_pool, head, reserved_shm_size); + } + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + fprintf(stderr, "CUDA Error in PBC_jk_strain_deriv, li,lj,lk,ll = %d,%d,%d,%d, error message = %s\n", + li,lj,lk,ll, cudaGetErrorString(err)); + return 1; + } + return 0; +} + int PBC_build_jk_ip1_init(int shm_size) { cudaFuncSetAttribute(rys_ejk_ip1_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, shm_size); + cudaFuncSetAttribute(rys_ejk_strain_deriv_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, shm_size); cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { fprintf(stderr, "Failed to set CUDA shm size %d: %s\n", shm_size, diff --git a/gpu4pyscf/lib/pbc/rys_contract_k.cu b/gpu4pyscf/lib/pbc/rys_contract_k.cu index f8265774d..d0c5efc02 100644 --- a/gpu4pyscf/lib/pbc/rys_contract_k.cu +++ b/gpu4pyscf/lib/pbc/rys_contract_k.cu @@ -589,7 +589,7 @@ static size_t threads_scheme_for_k(dim3& threads, BoundsInfo &bounds, extern "C" { int PBC_build_k(double *vk, double *dm, int n_dm, int nao, - RysIntEnvVars envs, int *shls_slice, int shm_size, + RysIntEnvVars *envs, int *shls_slice, int shm_size, int npairs_ij, int npairs_kl, uint32_t *pair_ij_mapping, uint32_t *pair_kl_mapping, int *bas_mask_idx, int *Ts_ji_lookup, int nimgs, int nimgs_uniq_pair, @@ -642,7 +642,7 @@ int PBC_build_k(double *vk, double *dm, int n_dm, int nao, int *head = (int *)(pool + workers * QUEUE_DEPTH); cudaMemset(head, 0, sizeof(int)); - if (1){//!rys_k_unrolled(&envs, &kmat, &bounds, pool)) { + if (1){//!rys_k_unrolled(envs, &kmat, &bounds, pool)) { GXYZOffset* p_gxyz_offset = RYS_make_gxyz_offset(bounds); int gout_pattern = (((li == 0) >> 3) | ((lj == 0) >> 2) | @@ -654,7 +654,7 @@ int PBC_build_k(double *vk, double *dm, int n_dm, int nao, int reserved_shm_size = (buflen - cart_idx_size*4)/8; rys_k_kernel<<>>( - envs, kmat, bounds, bas_mask_idx, Ts_ji_lookup, + *envs, kmat, bounds, bas_mask_idx, Ts_ji_lookup, nimgs, nimgs_uniq_pair, nbas_cell0, nao, pool, head, p_gxyz_offset, gout_pattern, reserved_shm_size); @@ -664,7 +664,7 @@ int PBC_build_k(double *vk, double *dm, int n_dm, int nao, min(256, n_tiles-256)); int reserved_shm_size = (buflen - cart_idx_size*4)/8; rys_k_kernel<<>>( - envs, kmat, bounds, bas_mask_idx, Ts_ji_lookup, + *envs, kmat, bounds, bas_mask_idx, Ts_ji_lookup, nimgs, nimgs_uniq_pair, nbas_cell0, nao, pool, head, p_gxyz_offset+256, gout_pattern, reserved_shm_size); } @@ -674,7 +674,7 @@ int PBC_build_k(double *vk, double *dm, int n_dm, int nao, min(256, n_tiles-512)); int reserved_shm_size = (buflen - cart_idx_size*4)/8; rys_k_kernel<<>>( - envs, kmat, bounds, bas_mask_idx, Ts_ji_lookup, + *envs, kmat, bounds, bas_mask_idx, Ts_ji_lookup, nimgs, nimgs_uniq_pair, nbas_cell0, nao, pool, head, p_gxyz_offset+512, gout_pattern, reserved_shm_size); } diff --git a/gpu4pyscf/lib/pbc/supmol_sr_estimator.cu b/gpu4pyscf/lib/pbc/supmol_sr_estimator.cu index 33b67ce94..57ee9c26a 100644 --- a/gpu4pyscf/lib/pbc/supmol_sr_estimator.cu +++ b/gpu4pyscf/lib/pbc/supmol_sr_estimator.cu @@ -114,7 +114,7 @@ void filter_q_cond_by_distance_kernel(float *q_cond, float *s_estimator, RysIntE } extern "C" { -int filter_q_cond_by_distance(float *q_cond, float *s_estimator, RysIntEnvVars envs, +int filter_q_cond_by_distance(float *q_cond, float *s_estimator, RysIntEnvVars *envs, float *diffuse_exps_per_atom, float *s_max_per_atom, float log_cutoff, int natm_cell0, int nbas) { @@ -123,7 +123,7 @@ int filter_q_cond_by_distance(float *q_cond, float *s_estimator, RysIntEnvVars e dim3 blocks(sh_blocks, sh_blocks); int buflen = natm_cell0 * 3 * sizeof(float); filter_q_cond_by_distance_kernel<<>>( - q_cond, s_estimator, envs, diffuse_exps_per_atom, s_max_per_atom, + q_cond, s_estimator, *envs, diffuse_exps_per_atom, s_max_per_atom, log_cutoff, natm_cell0); cudaError_t err = cudaGetLastError(); diff --git a/gpu4pyscf/pbc/df/aft_jk.py b/gpu4pyscf/pbc/df/aft_jk.py index d04269c22..97934861d 100644 --- a/gpu4pyscf/pbc/df/aft_jk.py +++ b/gpu4pyscf/pbc/df/aft_jk.py @@ -32,7 +32,7 @@ from gpu4pyscf.pbc.tools.k2gamma import kpts_to_kmesh from gpu4pyscf.pbc.df.ft_ao import FTOpt, libpbc from gpu4pyscf.pbc.df.fft_jk import _format_dms, _format_jks, _ewald_exxdiv_for_G0 -from gpu4pyscf.lib.cupy_helper import contract, get_avail_mem +from gpu4pyscf.lib.cupy_helper import contract, get_avail_mem, asarray from gpu4pyscf.lib import logger from gpu4pyscf.scf.jk import apply_coeff_C_mat_CT, SHM_SIZE @@ -213,11 +213,11 @@ def get_k_kpts(mydf, dm_kpts, hermi=1, kpts=None, kpts_band=None, for group_id, (kpt, ki_idx, kj_idx, self_conj) \ in enumerate(kk_adapted_iter(cell, kpts)): - vkcoulG = mydf.weighted_coulG(kpt, exxdiv, mesh) + vkcoulG = mydf.weighted_coulG(kpt, exxdiv, mesh) * weight for p0, p1 in lib.prange(0, ngrids, Gblksize): log.debug3('update_vk [%s:%s]', p0, p1) Gpq = ft_kern(Gv[p0:p1], kpt, kpts) - update_vk(vk_kpts, Gpq, dms, vkcoulG[p0:p1] * weight, ki_idx, kj_idx, + update_vk(vk_kpts, Gpq, dms, vkcoulG[p0:p1], ki_idx, kj_idx, not self_conj, k_to_compute, t_rev_pairs) Gpq = None cpu1 = log.timer_debug1(f'get_k_kpts group {group_id}', *cpu1) @@ -361,9 +361,9 @@ def get_ej_ip1(mydf, dm, kpts=None): blksize = min(blksize, ngrids, 16384) kpt_allow = np.zeros(3) - coulG = mydf.weighted_coulG() + wcoulG = mydf.weighted_coulG() - bas_ij_idx, bas_ij_img_idx, shl_pair_offsets = _screen_shl_pairs(ft_opt) + bas_ij_idx, bas_ij_img_idx, shl_pair_offsets = _generate_shl_pairs(ft_opt) nbatches_shl_pair = len(shl_pair_offsets) - 1 aft_envs = ft_opt.aft_envs @@ -389,7 +389,7 @@ def get_ej_ip1(mydf, dm, kpts=None): Gpq = ft_kern(Gv[p0:p1], kpt_allow, kpts, transform_ao=False) Gpq = Gpq.transpose(0,2,3,1) vG[:nGv] = contract('kji,kijg->g', dms, Gpq).conj() - vG[:nGv] *= coulG[p0:p1] + vG[:nGv] *= wcoulG[p0:p1] GvT[:3*nGv].set(Gv[p0:p1].T.ravel()) Gpq = None err = kern( @@ -449,7 +449,7 @@ def get_ek_ip1(mydf, dm, kpts=None, exxdiv=None): blksize = max(16, int(avail_mem/(nao**2*bvk_ncells*16*2))//16*16) blksize = min(blksize, ngrids, 16384) - bas_ij_idx, bas_ij_img_idx, shl_pair_offsets = _screen_shl_pairs(ft_opt) + bas_ij_idx, bas_ij_img_idx, shl_pair_offsets = _generate_shl_pairs(ft_opt) nbatches_shl_pair = len(shl_pair_offsets) - 1 aft_envs = ft_opt.aft_envs @@ -530,7 +530,7 @@ def get_ek_ip1(mydf, dm, kpts=None, exxdiv=None): log.timer_debug1('get_ek_ip1', *cpu0) return ek.get() -def _screen_shl_pairs(ft_opt): +def _generate_shl_pairs(ft_opt): img_idx_cache = ft_opt.make_img_idx_cache(permutation_symmetry=True) bas_ij_idx = [] bas_ij_img_idx = [] @@ -551,6 +551,291 @@ def _screen_shl_pairs(ft_opt): shl_pair_offsets = cp.hstack(shl_pair_offsets, dtype=np.int32) return bas_ij_idx, bas_ij_img_idx, shl_pair_offsets +def get_ej_strain_deriv(mydf, dm, kpts=None): + '''Strain derivatives from Coulomb matrix''' + from gpu4pyscf.pbc.grad import rks_stress + log = logger.new_logger(mydf) + cell = mydf.cell + if kpts is None: + kpts = np.zeros((1,3)) + else: + kpts = kpts.reshape(-1, 3) + is_gamma_point = is_zero(kpts) + dms = _format_dms(dm, kpts) + n_dm, nkpts, nao = dms.shape[:3] + assert nkpts == len(kpts) + if n_dm == 2: + dms = dms[0] + dms[1] + elif n_dm > 1: + raise NotImplementedError + + ft_opt = FTOpt(cell, kpts).build() + ft_kern = ft_opt.gen_ft_kernel() + sorted_cell = ft_opt.sorted_cell + dms = cp.asarray(dms.reshape(-1,nao,nao)) + dms = apply_coeff_C_mat_CT(dms, cell, sorted_cell, ft_opt.uniq_l_ctr, + ft_opt.l_ctr_offsets, ft_opt.ao_idx) + if is_gamma_point: + dms_bvkcell = cp.asarray(dms.real, order='C') + else: + expLk = cp.exp(1j*cp.asarray(ft_opt.bvkmesh_Ls).dot(cp.asarray(kpts).T)) + dms_bvkcell = contract('Lk,kpq->Lpq', expLk, dms) + assert abs(dms_bvkcell.imag).max() < 1e-6 + dms_bvkcell = cp.asarray(dms_bvkcell.real, order='C') + expLk = None + + bvk_ncells = np.prod(ft_opt.bvk_kmesh) + nao = ft_opt.sorted_cell.nao + Gv = cell.get_Gv(mydf.mesh) + ngrids = len(Gv) + # memory buffer required by ft_kern + avail_mem = get_avail_mem() * .8 + blksize = max(16, int(avail_mem/(nao**2*bvk_ncells*16*2))//16*16) + blksize = min(blksize, ngrids, 16384) + + kpt_allow = np.zeros(3) + coulG_0, coulG_1 = rks_stress._get_coulG_strain_derivatives(cell, Gv) + coulG_0 = asarray(coulG_0) + coulG_1 = asarray(coulG_1) + weight_0 = 1/cell.vol + weight_1 = -1/cell.vol * cp.eye(3) + wcoulG_0 = weight_0 * coulG_0 + # wcoulG_1 includes two terms, weight_0*coulG_1 + weight_1*coulG_0 + wcoulG_1 = weight_0 * coulG_1 + wcoulG_1 += weight_1[:,:,None] * coulG_0 + + bas_ij_idx, bas_ij_img_idx, shl_pair_offsets = _generate_shl_pairs(ft_opt) + nbatches_shl_pair = len(shl_pair_offsets) - 1 + aft_envs = ft_opt.aft_envs + + lmax = ft_opt.uniq_l_ctr[:,0].max() + ls = np.arange(lmax+1) + gx_len = (ls[:,None]+2)*(ls+1) * 6*32 + nsp_per_block = np.ones_like(gx_len) + for m in [2, 4, 8]: + nsp_per_block[(gx_len + 6)*m*8 < SHM_SIZE] = m + shm_size = (nsp_per_block * (gx_len + 6)).max() * 8 + + log.debug('bas_ij_idx=%d nbatches=%d shm_size=%d blksize=%d', + len(bas_ij_idx), nbatches_shl_pair, shm_size, blksize) + + kern = libpbc.PBC_ft_aopair_ej_strain_deriv + vG = cp.zeros(blksize+256, dtype=np.complex128) + GvT = cp.zeros(3*blksize+256) + ej = cp.zeros((cell.natm, 3)) + sigma = cp.zeros((3, 3)) + for p0, p1 in lib.prange(0, ngrids, blksize): + nGv = p1 - p0 + # TODO: Gpq are transformed to the k-points adapted representation in + # gen_ft_kernel. This transfomration can be skipped. + Gpq = ft_kern(Gv[p0:p1], kpt_allow, kpts, transform_ao=False) + Gpq = Gpq.transpose(0,2,3,1) + rhoG = contract('kji,kijg->g', dms, Gpq) + sigma += .25*cp.einsum('xyg,g,g->xy', wcoulG_1[:,:,p0:p1], rhoG.conj(), rhoG).real + + vG[:nGv] = rhoG.conj() + vG[:nGv] *= wcoulG_0[p0:p1] + GvT[:3*nGv].set(Gv[p0:p1].T.ravel()) + Gpq = None + err = kern( + ctypes.cast(ej.data.ptr, ctypes.c_void_p), + ctypes.cast(sigma.data.ptr, ctypes.c_void_p), + ctypes.cast(dms_bvkcell.data.ptr, ctypes.c_void_p), + ctypes.cast(vG.data.ptr, ctypes.c_void_p), + ctypes.cast(GvT.data.ptr, ctypes.c_void_p), + ctypes.byref(aft_envs), + ctypes.c_int(nbatches_shl_pair), + ctypes.c_int(nGv), + ctypes.c_int(shm_size), + ctypes.cast(bas_ij_idx.data.ptr, ctypes.c_void_p), + ctypes.cast(bas_ij_img_idx.data.ptr, ctypes.c_void_p), + ctypes.cast(shl_pair_offsets.data.ptr, ctypes.c_void_p), + sorted_cell._atm.ctypes, ctypes.c_int(sorted_cell.natm), + sorted_cell._bas.ctypes, ctypes.c_int(sorted_cell.nbas), + sorted_cell._env.ctypes) + if err != 0: + raise RuntimeError('PBC_ft_aopair_ej_strain_deriv failed') + if nkpts != 1: + ej /= nkpts + sigma *= 2 / nkpts**2 + return sigma.get() + +def get_ek_strain_deriv(mydf, dm, kpts=None, exxdiv=None, omega=None): + '''Strain derivatives from exact exchange''' + from gpu4pyscf.pbc.grad import rks_stress + log = logger.new_logger(mydf) + cpu0 = cpu1 = log.init_timer() + if kpts is None: + kpts = np.zeros((1,3)) + else: + kpts = kpts.reshape(-1, 3) + is_gamma_point = is_zero(kpts) + cell = mydf.cell + dm0 = _format_dms(dm, kpts) + n_dm, nkpts, nao = dm0.shape[:3] + assert nkpts == len(kpts) + if n_dm > 2: + raise NotImplementedError + + ft_opt = FTOpt(cell, kpts).build() + ft_kern = ft_opt.gen_ft_kernel() + sorted_cell = ft_opt.sorted_cell + dms = cp.asarray(dm0.reshape(-1,nao,nao)) + dms = apply_coeff_C_mat_CT(dms, cell, sorted_cell, ft_opt.uniq_l_ctr, + ft_opt.l_ctr_offsets, ft_opt.ao_idx) + nao = dms.shape[-1] + dms = dms.reshape(n_dm,nkpts,nao,nao) + + if not is_gamma_point: + expLk = cp.exp(1j*cp.asarray(ft_opt.bvkmesh_Ls).dot(cp.asarray(kpts).T)) + + bvk_ncells = np.prod(ft_opt.bvk_kmesh) + Gv = cell.get_Gv(mydf.mesh) + ngrids = len(Gv) + # memory buffer required by ft_kern + avail_mem = get_avail_mem() * .8 + blksize = max(16, int(avail_mem/(nao**2*bvk_ncells*16*2))//16*16) + blksize = min(blksize, ngrids, 16384) + + bas_ij_idx, bas_ij_img_idx, shl_pair_offsets = _generate_shl_pairs(ft_opt) + nbatches_shl_pair = len(shl_pair_offsets) - 1 + aft_envs = ft_opt.aft_envs + + lmax = ft_opt.uniq_l_ctr[:,0].max() + ls = np.arange(lmax+1) + gx_len = (ls[:,None]+2)*(ls+1) * 6*32 + nsp_per_block = np.ones_like(gx_len) + for m in [2, 4, 8]: + nsp_per_block[(gx_len + 3)*m*8 < SHM_SIZE] = m + shm_size = (nsp_per_block * (gx_len + 3)).max() * 8 + + log.debug('bas_ij_idx=%d nbatches=%d shm_size=%d blksize=%d', + len(bas_ij_idx), nbatches_shl_pair, shm_size, blksize) + + kern = libpbc.PBC_ft_aopair_ek_strain_deriv + GvT = cp.zeros(3*blksize+256) + ek = cp.zeros((cell.natm, 3)) + sigma = cp.zeros((3, 3)) + for group_id, (kpt, ki_idx, kj_idx, self_conj) \ + in enumerate(kk_adapted_iter(cell, kpts)): + Gvk = Gv + kpt + coulG_0, coulG_1 = rks_stress._get_coulG_strain_derivatives( + cell, Gvk, omega=omega, remove_G0=is_zero(kpt)) + coulG_0 = asarray(coulG_0) + coulG_1 = asarray(coulG_1) + weight_0 = 1/cell.vol + weight_1 = -1/cell.vol * cp.eye(3) + wcoulG_0 = weight_0 * coulG_0 + wcoulG_1 = weight_0 * coulG_1 + wcoulG_1 += weight_1[:,:,None] * coulG_0 + + swap_2e = not self_conj + for p0, p1 in lib.prange(0, ngrids, blksize): + nGv = p1 - p0 + Gpq = ft_kern(Gv[p0:p1], kpt, kpts, transform_ao=False) + Gpq = Gpq.transpose(0,2,3,1) + Gpq_conj = Gpq.conj() + # Gpq.conj() can be computed equivalently as + #Gpq_conj = ft_kern(-Gv[p0:p1], -kpt, -kpts, transform_ao=False) + #Gpq_conj = Gpq_conj.transpose(0,2,3,1) + + if is_gamma_point: + tmp = contract('sjk,lkg->sjlg', dms[:,0], Gpq_conj[0]) + dm_vG = contract('sjlg,sli->jig', tmp, dms[:,0]) + vkG = cp.einsum('pqg,qpg->g', dm_vG, Gpq[0]).real + sigma += .25*cp.einsum('xyg,g->xy', wcoulG_1[:,:,p0:p1], vkG) + else: + # einsum(nijG[kj_idx],jk[kj_idx],nlkG*[kj_idx],li[ki_idx]) + # apply derivatives to nlkG* + #:tmp = contract('nijg,snjk->snikg', Gpq[kj_idx], dms[:,kj_idx]) + #:tmp = contract('snikg,snli->nklg', tmp, dms[:,ki_idx]) + #:dm_vG = contract('Lk,kpqg->Lpqg', expLk[:,kj_idx].conj(), tmp).conj() + #:if swap_2e: + # apply derivatives to nijG. This term is equivalent to the + # derivatives of nlkG*. + #: tmp = contract('snjk,nlkg->snjlg', dms[:,kj_idx], Gpq.conj()[kj_idx]) + #: tmp = contract('snjlg,snli->njig', tmp, dms[:,ki_idx]) + #: dm_vG += contract('Lk,kpqg->Lpqg', expLk[:,kj_idx], tmp) + idx = np.empty_like(ki_idx) + idx[kj_idx] = ki_idx + dm_k = contract('snjk,nlkg->snjlg', dms, Gpq_conj) + dm_k = contract('snjlg,snli->njig', dm_k, dms[:,idx]) + dm_vG = contract('Lk,kpqg->Lpqg', expLk, dm_k) + + vkG = cp.einsum('njig,nijg->g', dm_k, Gpq).real + tmp = .25*cp.einsum('xyg,g->xy', wcoulG_1[:,:,p0:p1], vkG) + if swap_2e: + sigma += tmp * 2 + else: + sigma += tmp + + if swap_2e: + dm_vG *= wcoulG_0[p0:p1] * 2 + else: + dm_vG *= wcoulG_0[p0:p1] + dm_vG = cp.asarray(dm_vG, order='C') + + GvT[:3*nGv].set(Gvk[p0:p1].T.ravel()) + err = kern( + ctypes.cast(ek.data.ptr, ctypes.c_void_p), + ctypes.cast(sigma.data.ptr, ctypes.c_void_p), + ctypes.cast(dm_vG.data.ptr, ctypes.c_void_p), + ctypes.cast(GvT.data.ptr, ctypes.c_void_p), + ctypes.byref(aft_envs), + ctypes.c_int(nbatches_shl_pair), + ctypes.c_int(nGv), + ctypes.c_int(shm_size), + ctypes.cast(bas_ij_idx.data.ptr, ctypes.c_void_p), + ctypes.cast(bas_ij_img_idx.data.ptr, ctypes.c_void_p), + ctypes.cast(shl_pair_offsets.data.ptr, ctypes.c_void_p), + sorted_cell._atm.ctypes, ctypes.c_int(sorted_cell.natm), + sorted_cell._bas.ctypes, ctypes.c_int(sorted_cell.nbas), + sorted_cell._env.ctypes) + Gpq_conj = tmp = dm_vG = None + if err != 0: + raise RuntimeError('PBC_ft_aopair_ek_strain_deriv failed') + cpu1 = log.timer_debug1(f'get_k_kpts group {group_id}', *cpu1) + if nkpts != 1: + ek /= nkpts + sigma *= 2 / nkpts**2 + sigma = sigma.get() + + if (exxdiv == 'ewald' and + (cell.dimension == 3 or + (cell.dimension == 2 and cell.low_dim_ft_type != 'inf_vacuum'))): + from pyscf.pbc.tools.pbc import madelung + from gpu4pyscf.pbc.gto import int1e + int1e_opt = int1e._Int1eOpt(cell, kpts) + s0 = int1e_opt.intor('PBCint1e_ovlp', 1, 1, (0, 0)) + k_dm = contract('nkpq,kqr->nkpr', dm0, s0) + k_dm = contract('nkpr,nkrs->kps', k_dm, dm0) + ek_G0 = cp.einsum('kij,kji->', s0, k_dm).real.get() + + scaled_kpts = kpts.dot(cell.lattice_vectors().T) + ewald_G0 = np.empty((3,3)) + disp = max(1e-5, (cell.precision*.1)**.5) + for i in range(3): + for j in range(i+1): + cell1, cell2 = rks_stress._finite_diff_cells(cell, i, j, disp) + kpts1 = scaled_kpts.dot(cell1.reciprocal_vectors(norm_to=1)) + kpts2 = scaled_kpts.dot(cell2.reciprocal_vectors(norm_to=1)) + e1 = madelung(cell1, kpts1, omega=omega) + e2 = madelung(cell2, kpts2, omega=omega) + ewald_G0[j,i] = ewald_G0[i,j] = (e1-e2)/(2*disp) + if n_dm == 1: # RHF + ewald_G0 *= .5 * ek_G0 + k_dm *= .5 * madelung(cell, kpts, omega=omega) + else: + ewald_G0 *= ek_G0 + k_dm *= madelung(cell, kpts, omega=omega) + int1e_opt = int1e._Int1eOptV2(cell) + ewald_G0 += int1e_opt.get_ovlp_strain_deriv(k_dm, kpts) * 2 + ewald_G0 /= nkpts + sigma += ewald_G0 + + log.timer_debug1('get_ek_ip1', *cpu0) + return sigma + ################################################## # # Single k-point diff --git a/gpu4pyscf/pbc/df/ft_ao.py b/gpu4pyscf/pbc/df/ft_ao.py index 976690357..183b933a5 100644 --- a/gpu4pyscf/pbc/df/ft_ao.py +++ b/gpu4pyscf/pbc/df/ft_ao.py @@ -91,8 +91,8 @@ def ft_ao(cell, Gv, shls_slice=None, b=None, ao_loc_cpu = sorted_cell.ao_loc ao_loc_gpu = cp.array(ao_loc_cpu) envs = PBCIntEnvVars( - sorted_cell.natm, sorted_cell.nbas, 1, 1, _atm.data.ptr, - _bas.data.ptr, _env.data.ptr, ao_loc_gpu.data.ptr, 0, + sorted_cell.natm, sorted_cell.nbas, _atm.data.ptr, + _bas.data.ptr, _env.data.ptr, ao_loc_gpu.data.ptr, 1, 1, 0, ) ngrids = len(Gv) GvT = (asarray(Gv).T + asarray(kpt[:,None])).ravel() @@ -504,21 +504,21 @@ def most_diffuse_pgto(cell): class PBCIntEnvVars(ctypes.Structure): _fields_ = [ - ('cell0_natm', ctypes.c_int), - ('cell0_nbas', ctypes.c_int), - ('bvk_ncells', ctypes.c_int), - ('nimgs', ctypes.c_int), + ('natm', ctypes.c_int), + ('nbas', ctypes.c_int), ('atm', ctypes.c_void_p), ('bas', ctypes.c_void_p), ('env', ctypes.c_void_p), ('ao_loc', ctypes.c_void_p), + ('bvk_ncells', ctypes.c_int), + ('nimgs', ctypes.c_int), ('img_coords', ctypes.c_void_p), ] @classmethod def new(cls, natm, nbas, ncells, nimgs, atm, bas, env, ao_loc, Ls): - obj = PBCIntEnvVars(natm, nbas, ncells, nimgs, atm.data.ptr, bas.data.ptr, - env.data.ptr, ao_loc.data.ptr, Ls.data.ptr) + obj = PBCIntEnvVars(natm, nbas, atm.data.ptr, bas.data.ptr, env.data.ptr, + ao_loc.data.ptr, ncells, nimgs, Ls.data.ptr) # Keep a reference to these arrays, prevent releasing them upon returning obj._env_ref_holder = (atm, bas, env, ao_loc, Ls) obj._device = cp.cuda.device.get_device_id() diff --git a/gpu4pyscf/pbc/df/rsdf_builder.py b/gpu4pyscf/pbc/df/rsdf_builder.py index 89c0d6235..aafa8cefe 100644 --- a/gpu4pyscf/pbc/df/rsdf_builder.py +++ b/gpu4pyscf/pbc/df/rsdf_builder.py @@ -429,8 +429,8 @@ def _int3c2e_overlap_mask(int3c2e_opt, cutoff): _bas = cp.array(pcell._bas) _env = cp.array(_scale_sp_ctr_coeff(pcell)) int3c2e_envs = PBCIntEnvVars( - pcell.natm, p_nbas, 1, nimgs, _atm.data.ptr, _bas.data.ptr, - _env.data.ptr, 0, Ls.data.ptr, + pcell.natm, p_nbas, _atm.data.ptr, _bas.data.ptr, + _env.data.ptr, 0, 1, nimgs, Ls.data.ptr, ) err = libpbc.bvk_overlap_img_counts( ctypes.cast(ovlp_img_counts.data.ptr, ctypes.c_void_p), diff --git a/gpu4pyscf/pbc/df/tests/test_pbc_aft.py b/gpu4pyscf/pbc/df/tests/test_pbc_aft.py index 5522ac591..71088675c 100644 --- a/gpu4pyscf/pbc/df/tests/test_pbc_aft.py +++ b/gpu4pyscf/pbc/df/tests/test_pbc_aft.py @@ -14,6 +14,7 @@ import unittest import numpy as np +import cupy as cp import pyscf from pyscf import lib from pyscf.pbc import gto as pgto @@ -22,6 +23,8 @@ from gpu4pyscf.pbc.df import aft, aft_jk from gpu4pyscf.pbc.df import fft from gpu4pyscf.lib.cupy_helper import tag_array +from gpu4pyscf.pbc.grad import rks_stress +from gpu4pyscf.pbc.grad import krks_stress from packaging import version def setUpModule(): @@ -362,6 +365,116 @@ def test_ek_ip1_kpts(self): ref[i] = np.einsum('xkpq,kqp->x', vk[:,:,p0:p1], dm[:,:,p0:p1]).real assert abs(ek_ewald - ref).max() < 1e-8 + def test_ej_strain_deriv_gamma_point(self): + cell = pgto.M( + atom = ''' + C 1. 1. 0. + H 4. 0. 3. + H 0. 1. .6 + ''', + a=np.eye(3)*4., + basis=[[0, [.25, 1]], [1, [.3, 1]]], + ) + np.random.seed(9) + nao = cell.nao + dm = np.random.rand(nao, nao) * .5 + dm = dm.dot(dm.T) + mydf = aft.AFTDF(cell) + sigma = aft_jk.get_ej_strain_deriv(mydf, dm) + + xc = 'lda,' + mf_grad = cell.RKS(xc=xc).to_gpu().Gradients() + ref = rks_stress.get_vxc(mf_grad, cell, dm, with_j=True, with_nuc=False) + ref -= rks_stress.get_vxc(mf_grad, cell, dm, with_j=False, with_nuc=False) + assert abs(ref - sigma).max() < 1e-8 + + def test_ej_strain_deriv_kpts(self): + cell = pgto.M( + atom = ''' + C 1. 1. 0. + H 4. 0. 3. + H 0. 1. .6 + ''', + a=np.eye(3)*4., + basis=[[0, [.25, 1]], [1, [.3, 1]]], + ) + kmesh = [3,2,1] + kpts = cell.make_kpts(kmesh) + nkpts = len(kpts) + dm = cp.asarray(cell.pbc_intor('int1e_ovlp', kpts=kpts)) + mydf = aft.AFTDF(cell) + sigma = aft_jk.get_ej_strain_deriv(mydf, dm, kpts) + + xc = 'lda,' + mf_grad = cell.KRKS(xc=xc, kpts=kpts).to_gpu().Gradients() + ref = krks_stress.get_vxc(mf_grad, cell, dm, kpts=kpts, with_j=True, with_nuc=False) + ref -= krks_stress.get_vxc(mf_grad, cell, dm, kpts=kpts, with_j=False, with_nuc=False) + assert abs(ref - sigma).max() < 1e-8 + + for (i, j) in [(0, 0), (0, 1), (1, 2), (2, 1), (2, 2)]: + cell1, cell2 = rks_stress._finite_diff_cells(cell, i, j, disp=1e-4) + mydf = aft.AFTDF(cell1, kpts=cell1.make_kpts(kmesh)) + vj = aft_jk.get_j_kpts(mydf, dm, hermi=1, kpts=mydf.kpts) + e1 = cp.einsum('kij,kji->', vj, dm).real / nkpts + mydf = aft.AFTDF(cell2, kpts=cell2.make_kpts(kmesh)) + vj = aft_jk.get_j_kpts(mydf, dm, hermi=1, kpts=mydf.kpts) + e2 = cp.einsum('kij,kji->', vj, dm).real / nkpts + assert abs(sigma[i,j] - (e1-e2)/2e-4) < 1e-7 + + def test_ek_strain_deriv_gamma_point(self): + cell = pgto.M( + atom = ''' + C 1. 1. 0. + H 4. 0.5 3. + H 0.5 1. .6 + ''', + a=np.eye(3)*4., + basis=[[0, [.25, 1]], [1, [.3, 1]]], + ) + np.random.seed(9) + nao = cell.nao + dm = np.random.rand(nao, nao) * .5 + dm = cp.array(dm.dot(dm.T)) + mydf = aft.AFTDF(cell) + sigma = aft_jk.get_ek_strain_deriv(mydf, dm) + + for (i, j) in [(0, 0), (0, 1), (1, 2), (2, 1), (2, 2)]: + cell1, cell2 = rks_stress._finite_diff_cells(cell, i, j, disp=1e-4) + mydf = aft.AFTDF(cell1) + vk = aft_jk.get_jk(mydf, dm, hermi=1, with_j=False, exxdiv=None)[1] + e1 = .5 * cp.einsum('ij,ji->', vk, dm).real + mydf = aft.AFTDF(cell2) + vk = aft_jk.get_jk(mydf, dm, hermi=1, with_j=False, exxdiv=None)[1] + e2 = .5 * cp.einsum('ij,ji->', vk, dm).real + assert abs(sigma[i, j] - (e1-e2)/2e-4).max() < 2e-7 + + def test_ek_strain_deriv_kpts(self): + cell = pgto.M( + atom = ''' + C 1. 1. 0. + H 4. 0.5 3. + H 0.5 1. .6 + ''', + a=np.eye(3)*4., + basis=[[0, [.25, 1]], [1, [.3, 1]]], + ) + kmesh = [1,3,1] + kpts = cell.make_kpts(kmesh) + nkpts = len(kpts) + dm = cp.asarray(cell.pbc_intor('int1e_ovlp', kpts=kpts)) + mydf = aft.AFTDF(cell) + sigma = aft_jk.get_ek_strain_deriv(mydf, dm, kpts, exxdiv='ewald') + + for (i, j) in [(0, 0), (0, 1), (1, 2), (2, 1), (2, 2)]: + cell1, cell2 = rks_stress._finite_diff_cells(cell, i, j, disp=1e-4) + mydf = aft.AFTDF(cell1, kpts=cell1.make_kpts(kmesh)) + vk = aft_jk.get_k_kpts(mydf, dm, hermi=1, kpts=mydf.kpts, exxdiv='ewald') + e1 = .5 * cp.einsum('kij,kji->', vk, dm).real / nkpts + mydf = aft.AFTDF(cell2, kpts=cell2.make_kpts(kmesh)) + vk = aft_jk.get_k_kpts(mydf, dm, hermi=1, kpts=mydf.kpts, exxdiv='ewald') + e2 = .5 * cp.einsum('kij,kji->', vk, dm).real / nkpts + assert abs(sigma[i, j] - (e1-e2)/2e-4).max() < 5e-7 + if __name__ == '__main__': print("Full Tests for aft") unittest.main() diff --git a/gpu4pyscf/pbc/grad/rks_stress.py b/gpu4pyscf/pbc/grad/rks_stress.py index 3528e4ba8..d3906e6cb 100644 --- a/gpu4pyscf/pbc/grad/rks_stress.py +++ b/gpu4pyscf/pbc/grad/rks_stress.py @@ -90,14 +90,28 @@ def _finite_diff_cells(cell, x, y, disp=1e-4, precision=None): cell2.build(False, False) return cell1, cell2 -def _get_coulG_strain_derivatives(cell, Gv): +def _get_coulG_strain_derivatives(cell, Gv, omega=None, remove_G0=True): '''derivatives of 4pi/G^2''' Gv = asarray(Gv) G2 = cp.einsum('gx,gx->g', Gv, Gv) - G2[0] = np.inf + if remove_G0: + G2[0] = np.inf coulG_0 = 4 * np.pi / G2 - coulG_1 = cp.einsum('gx,gy->xyg', Gv, Gv) - coulG_1 *= coulG_0 * 2/G2 + if omega is None: + omega = cell.omega + coulGxy = cp.einsum('gx,gy->xyg', Gv, Gv) + coulGxy *= coulG_0 + coulG_1 = coulGxy * 2/G2 + if omega < 0: + coulG_1 *= (1 - cp.exp(-.25/omega**2 * G2)) + coulG_1 -= cp.exp(-.25/omega**2 * G2) * (.25/omega**2*2) * coulGxy + coulG_0 *= (1 - cp.exp(-.25/omega**2 * G2)) + #coulG_0[0] = np.pi/omega**2 + elif omega > 0: + coulG_1 *= cp.exp(-.25/omega**2 * G2) + coulG_1 += cp.exp(-.25/omega**2 * G2) * (.25/omega**2*2) * coulGxy + coulG_0 *= cp.exp(-.25/omega**2 * G2) + #coulG_0[0] = -np.pi/omega**2 return coulG_0, coulG_1 def _get_weight_strain_derivatives(cell, grids): @@ -276,25 +290,25 @@ def partial_dot(bra, ket): rho0, rho1 = rho0_fft_order, rho1_fft_order exc, vxc = ni.eval_xc_eff(xc_code, rho0, 1, xctype=xctype, spin=0)[:2] - out += contract('xyng,ng->xy', rho1, vxc).real.get() * weight_0 - out += contract('g,g->', rho0[0], exc.ravel()).real.get() * weight_1 + out += cp.einsum('xyng,ng->xy', rho1, vxc).real.get() * weight_0 + out += cp.einsum('g,g->', rho0[0], exc.ravel()).real.get() * weight_1 Gv = cell.get_Gv(mesh) coulG_0, coulG_1 = _get_coulG_strain_derivatives(cell, Gv) rhoG = pbctools.fft(rho0[0], mesh) if with_j: vR = pbctools.ifft(rhoG * coulG_0, mesh) - EJ = contract('xyg,g->xy', rho1[:,:,0], vR).real.get() * weight_0 * 2 - EJ += contract('g,g->', rho0[0], vR).real.get() * weight_1 - EJ += contract('xyg,g->xy', coulG_1, rhoG.conj()*rhoG).real.get() * (weight_0/ngrids) + EJ = cp.einsum('xyg,g->xy', rho1[:,:,0], vR).real.get() * weight_0 * 2 + EJ += cp.einsum('g,g->', rho0[0], vR).real.get() * weight_1 + EJ += cp.einsum('xyg,g,g->xy', coulG_1, rhoG.conj(), rhoG).real.get() * (weight_0/ngrids) out += .5 * EJ if with_nuc: if cell._pseudo: vpplocG_0, vpplocG_1 = _get_vpplocG_strain_derivatives(cell, mesh) vpplocR = pbctools.ifft(vpplocG_0, mesh).real - Ene = contract('xyg,g->xy', rho1[:,:,0], vpplocR).real.get() - Ene += contract('g,xyg->xy', rhoG.conj(), vpplocG_1).real.get() * (1./ngrids) + Ene = cp.einsum('xyg,g->xy', rho1[:,:,0], vpplocR).real.get() + Ene += cp.einsum('g,xyg->xy', rhoG.conj(), vpplocG_1).real.get() * (1./ngrids) Ene += _get_pp_nonloc_strain_derivatives(cell, mesh, dm) else: charge = -cell.atom_charges() @@ -304,8 +318,8 @@ def partial_dot(bra, ket): SI = cell.get_SI(mesh=mesh) ZG = asarray(np.dot(charge, SI)) vR = pbctools.ifft(ZG * coulG_0, mesh).real - Ene = contract('xyg,g->xy', rho1[:,:,0], vR).real.get() - Ene += contract('xyg,g->xy', coulG_1, rhoG.conj()*ZG).real.get() * (1./ngrids) + Ene = cp.einsum('xyg,g->xy', rho1[:,:,0], vR).real.get() + Ene += cp.einsum('xyg,g,g->xy', coulG_1, rhoG.conj(), ZG).real.get() * (1./ngrids) out += Ene return out diff --git a/gpu4pyscf/pbc/gto/int1e.py b/gpu4pyscf/pbc/gto/int1e.py index 104661558..100c9237c 100644 --- a/gpu4pyscf/pbc/gto/int1e.py +++ b/gpu4pyscf/pbc/gto/int1e.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import math import ctypes import numpy as np import cupy as cp @@ -19,10 +20,13 @@ from pyscf.pbc import tools as pbctools from pyscf.pbc.lib.kpts_helper import is_zero from pyscf.pbc.tools.k2gamma import translation_vectors_for_kmesh +from gpu4pyscf.gto.mole import extract_pgto_params from gpu4pyscf.pbc.tools.k2gamma import kpts_to_kmesh from gpu4pyscf.lib.cupy_helper import contract, asarray, sandwich_dot +from gpu4pyscf.lib import multi_gpu from gpu4pyscf.gto.mole import group_basis, PTR_BAS_COORD -from gpu4pyscf.scf.jk import _nearest_power2, _scale_sp_ctr_coeff, SHM_SIZE +from gpu4pyscf.scf.jk import ( + _nearest_power2, _scale_sp_ctr_coeff, SHM_SIZE, apply_coeff_C_mat_CT) from gpu4pyscf.pbc.df.ft_ao import libpbc, PBCIntEnvVars from gpu4pyscf.pbc.df.int3c2e import ( _estimate_shl_pairs_per_block, fill_triu_bvk_conj, LMAX, L_AUX_MAX, THREADS @@ -118,7 +122,7 @@ def __init__(self, cell, kpts=None, bvk_kmesh=None): sorted_cell.natm, sorted_cell.nbas, bvk_ncells, nimgs, _atm, _bas, _env, ao_loc_gpu, Ls) - def generate_shl_pairs(self, hermi=1): + def generate_shl_pairs(self, hermi, gout_stride_lookup): sorted_cell = self.sorted_cell l_ctr_offsets = np.append(0, np.cumsum(self.l_ctr_counts)) uniq_l = self.uniq_l_ctr[:,0] @@ -150,7 +154,7 @@ def generate_shl_pairs(self, hermi=1): nshl_pair = len(idx) bas_ij_idx.append(idx) sp0, sp1 = sp1, sp1 + nshl_pair - nsp_per_block = _estimate_shl_pairs_per_block(li, lj, nshl_pair) + nsp_per_block = gout_stride_lookup[li, lj] * 8 shl_pair_offsets.append(np.arange(sp0, sp1, nsp_per_block, dtype=np.int32)) shl_pair_offsets.append(np.int32(sp1)) @@ -167,7 +171,6 @@ def create_gout_stride_lookup_table(self, deriv=None, gout_width=36): i_inc, j_inc = deriv lmax = self.uniq_l_ctr[:,0].max() gout_stride_lookup = np.empty([L_AUX_MAX+1,L_AUX_MAX+1], dtype=np.int32) - gout_width = gout_width shm_size = SHM_SIZE ls = np.arange(lmax+1) nf = (ls+1) * (ls+2) // 2 @@ -192,15 +195,15 @@ def intor(self, kern, hermi, comp, deriv_ij): gout_width = 18 sorted_cell = self.sorted_cell - int1e_envs = self.int1e_envs - nao_cart, nao = self.coeff.shape - bas_ij_idx, shl_pair_offsets = self.generate_shl_pairs(hermi) - nbatches_shl_pair = len(shl_pair_offsets) - 1 gout_stride_lookup, shm_size = self.create_gout_stride_lookup_table( deriv_ij, gout_width) + bas_ij_idx, shl_pair_offsets = self.generate_shl_pairs(hermi, gout_stride_lookup) + nbatches_shl_pair = len(shl_pair_offsets) - 1 bvk_kmesh = self.bvk_kmesh bvk_ncells = np.prod(bvk_kmesh) + nao_cart, nao = self.coeff.shape out = cp.empty((bvk_ncells, comp, nao_cart, nao_cart)) + int1e_envs = self.int1e_envs drv = getattr(libpbc, kern) err = drv( ctypes.cast(out.data.ptr, ctypes.c_void_p), @@ -232,3 +235,166 @@ def intor(self, kern, hermi, comp, deriv_ij): if self.kpts is not None and self.kpts.ndim == 1: out = out[0] return out + +class _Int1eOptV2: + def __init__(self, cell): + self.cell = cell + cell, ao_idx, l_ctr_pad_counts, uniq_l_ctr, l_ctr_counts = group_basis( + cell, 1, sparse_coeff=True) + lmax = uniq_l_ctr[:,0].max() + assert lmax <= L_AUX_MAX + self.sorted_cell = cell + self.uniq_l_ctr = uniq_l_ctr + self.l_ctr_counts = l_ctr_counts + self.l_ctr_pad_counts = l_ctr_pad_counts + self.ao_idx = ao_idx + + if isinstance(cell, Mole): + # The CUDA code for PBC integrals can be made to support Mole instances. + Ls = np.zeros((1, 3)) + self.precision = 1e-16 + else: + Ls = cell.get_lattice_Ls() + Ls = Ls[np.linalg.norm(Ls-.1, axis=1).argsort()] + self.precision = cell.precision * 1e-4 + self.Ls = Ls + self._int1e_envs = {} + + @multi_gpu.property(cache='_int1e_envs') + def int1e_envs(self): + cell = self.sorted_cell + atm = asarray(cell._atm) + bas = asarray(cell._bas) + env = asarray(_scale_sp_ctr_coeff(cell)) + ao_loc = asarray(cell.ao_loc) + Ls = asarray(self.Ls) + nimgs = len(Ls) + return PBCIntEnvVars.new(cell.natm, cell.nbas, nimgs, nimgs, atm, bas, env, ao_loc, Ls) + + def generate_shl_pairs(self, hermi=1, gout_stride_lookup=None): + sorted_cell = self.sorted_cell + Ls = asarray(self.Ls) + nimgs = len(Ls) + nbas = sorted_cell.nbas + ovlp_mask = cp.zeros((nbas,nimgs,nbas), dtype=bool) + exps, cs = extract_pgto_params(sorted_cell, 'diffuse') + exps = cp.asarray(exps, dtype=np.float32) + log_coeff = cp.log(abs(asarray(cs, dtype=np.float32))) + int1e_envs = self.int1e_envs + libpbc.PBCovlp_mask_estimation( + ctypes.cast(ovlp_mask.data.ptr, ctypes.c_void_p), + ctypes.cast(exps.data.ptr, ctypes.c_void_p), + ctypes.cast(log_coeff.data.ptr, ctypes.c_void_p), + ctypes.byref(int1e_envs), ctypes.c_int(hermi), + ctypes.c_float(math.log(self.precision))) + pairs_idx = cp.arange(ovlp_mask.size, dtype=np.int32) + pairs_idx = pairs_idx.reshape(ovlp_mask.shape) + + l_ctr_offsets = np.append(0, np.cumsum(self.l_ctr_counts)) + uniq_l = self.uniq_l_ctr[:,0] + bas_ij_idx = [] # The effective shell pair = ish*nbas+jsh + shl_pair_offsets = [] # the bas_ij_idx offset for each blockIdx.x + sp0 = sp1 = 0 + nbas = sorted_cell.nbas + groups = len(uniq_l) + if hermi == 1: + ij_tasks = [(i, j) for i in range(groups) for j in range(i+1)] + else: + ij_tasks = [(i, j) for i in range(groups) for j in range(groups)] + for i, j in ij_tasks: + li = uniq_l[i] + lj = uniq_l[j] + ish0, ish1 = l_ctr_offsets[i], l_ctr_offsets[i+1] + jsh0, jsh1 = l_ctr_offsets[j], l_ctr_offsets[j+1] + mask = ovlp_mask[ish0:ish1,:,jsh0:jsh1] + idx = pairs_idx[ish0:ish1,:,jsh0:jsh1][mask] + nshl_pair = len(idx) + bas_ij_idx.append(idx) + sp0, sp1 = sp1, sp1 + nshl_pair + if gout_stride_lookup is None: + nsp_per_block = 512 + else: + nsp_per_block = gout_stride_lookup[li, lj] * 8 + shl_pair_offsets.append(np.arange(sp0, sp1, nsp_per_block, dtype=np.int32)) + + shl_pair_offsets.append(np.int32(sp1)) + shl_pair_offsets = cp.array(np.hstack(shl_pair_offsets), dtype=np.int32) + bas_ij_idx = cp.array(cp.hstack(bas_ij_idx), dtype=np.int32) + return bas_ij_idx, shl_pair_offsets + + def create_gout_stride_lookup_table(self, deriv=None, gout_width=36): + # gout_width should be identical to the setting in cuda kernel + # based on the shm_size, find optimal gout_stride for each (li,lj) + # pattern, store them in the gout_stride_lookup + if deriv is None: + deriv = (0, 0) + i_inc, j_inc = deriv + lmax = self.uniq_l_ctr[:,0].max() + gout_stride_lookup = np.empty([L_AUX_MAX+1,L_AUX_MAX+1], dtype=np.int32) + shm_size = SHM_SIZE + ls = np.arange(lmax+1) + nf = (ls+1) * (ls+2) // 2 + max_shm_size = 0 + for li in range(lmax+1): + for lj in range(lmax+1): + unit = (li+1+i_inc)*(lj+1+j_inc)*3 + 4 + nsp_max = _nearest_power2(shm_size // (unit*8)) + gout_size = nf[li] * nf[lj] + gout_stride = (gout_size+gout_width-1) / gout_width + # Round up to the next 2^n + gout_stride = _nearest_power2(gout_stride, return_leq=False) + nsp_per_block = min(nsp_max, THREADS // gout_stride) + gout_stride_lookup[li, lj] = THREADS // nsp_per_block + max_shm_size = max(max_shm_size, nsp_per_block*unit*8) + return cp.array(gout_stride_lookup, dtype=np.int32), max_shm_size + + def get_ovlp_strain_deriv(self, dm, kpts=None): + cell = self.cell + sorted_cell = self.sorted_cell + nao_orig = cell.nao + dm = asarray(dm, order='C') + dm = dm.reshape(-1,nao_orig,nao_orig) + l_ctr_offsets = np.append(0, np.cumsum(self.l_ctr_counts)) + dm = apply_coeff_C_mat_CT(dm, cell, sorted_cell, self.uniq_l_ctr, + l_ctr_offsets, self.ao_idx) + if kpts is None: + kpts = np.zeros((1, 3)) + else: + kpts = kpts.reshape(-1, 3) + nkpts = len(kpts) + nao = dm.shape[-1] + dm = dm.reshape(-1, nkpts, nao, nao) + if len(dm) == 1: + dm = dm[0] + else: + dm = dm.sum(axis=0) + + is_gamma_point = is_zero(kpts) + if is_gamma_point: + assert dm.dtype == np.float64 + else: + expLk = cp.exp(1j * asarray(self.Ls).dot(asarray(kpts).T)) + dm = contract('Lk,kpq->Lpq', expLk, dm) + expLk = None + dm = dm.real + dm = cp.asarray(dm, order='C') + + hermi = 1 + deriv = (1, 0) + gout_stride_lookup, shm_size = self.create_gout_stride_lookup_table(deriv) + bas_ij_idx, shl_pair_offsets = self.generate_shl_pairs(hermi, gout_stride_lookup) + nbatches_shl_pair = len(shl_pair_offsets) - 1 + int1e_envs = self.int1e_envs + sigma = cp.zeros((3, 3)) + libpbc.PBCovlp_strain_deriv( + ctypes.cast(sigma.data.ptr, ctypes.c_void_p), + ctypes.cast(dm.data.ptr, ctypes.c_void_p), + ctypes.byref(int1e_envs), + ctypes.c_int(shm_size), + ctypes.c_int(nbatches_shl_pair), + ctypes.cast(shl_pair_offsets.data.ptr, ctypes.c_void_p), + ctypes.cast(bas_ij_idx.data.ptr, ctypes.c_void_p), + ctypes.cast(gout_stride_lookup.data.ptr, ctypes.c_void_p), + ctypes.c_int(is_gamma_point)) + sigma *= 2 + return sigma.get() diff --git a/gpu4pyscf/pbc/gto/tests/test_pbc_int1e.py b/gpu4pyscf/pbc/gto/tests/test_pbc_int1e.py index 6e35df5a7..c29f18d45 100644 --- a/gpu4pyscf/pbc/gto/tests/test_pbc_int1e.py +++ b/gpu4pyscf/pbc/gto/tests/test_pbc_int1e.py @@ -14,6 +14,8 @@ import numpy as np import pyscf +from pyscf.gto import intor_cross +from pyscf.pbc.tools import pbc as pbctools from gpu4pyscf.pbc.gto import int1e def test_int1e_ovlp(): @@ -128,17 +130,7 @@ def test_int1e_ovlp1(): atom = [['He', (L/2.-.5,L/2.,L/2.-.5)], ['He', (L/2. ,L/2.,L/2.+.5)]], basis = { 'He': [[0, (0.8, 1.0)], - [0, (1.0, 1.0)], [0, (1.2, 1.0)]]}) - nk = [2, 2, 1] - kpts = cell.make_kpts(nk, wrap_around=True) - s = int1e.int1e_ovlp(cell, kpts) - ref = cell.pbc_intor('int1e_ovlp', kpts=kpts) - assert abs(s.get() - ref).max() < 1e-10 - k = int1e.int1e_kin(cell, kpts) - ref = cell.pbc_intor('int1e_kin', kpts=kpts) - assert abs(k.get() - ref).max() < 1e-10 - nk = [5, 4, 1] kpts = cell.make_kpts(nk, wrap_around=True)[[3, 8, 11]] s = int1e.int1e_ovlp(cell, kpts) @@ -148,4 +140,48 @@ def test_int1e_ovlp1(): ref = cell.pbc_intor('int1e_kin', kpts=kpts) assert abs(k.get() - ref).max() < 1e-10 -test_int1e_ovlp1() +def test_ovlp_stress_tensor(): + a = np.eye(3) * 6. + np.random.seed(5) + a += np.random.rand(3, 3) - .5 + cell = pyscf.M( + atom='He 1 1 1; He 2 1.5 2.4', + basis=[[0, [.5, 1]], + [1, [1.5, 1], [.5, 1]], + [2, [.8, 1]], + [3, [.7, 1]] + ], a=a, unit='Bohr') + + Ls = cell.get_lattice_Ls() + Ls = Ls[np.argsort(np.linalg.norm(Ls-.1, axis=1))] + scell = cell.copy() + scell = pbctools._build_supcell_(scell, cell, Ls) + + aoslices = cell.aoslice_by_atom() + ao_repeats = aoslices[:,3] - aoslices[:,2] + bas_coords = np.repeat(cell.atom_coords(), ao_repeats, axis=0) + + nao = cell.nao + dm = np.random.rand(nao, nao) - .5 + dm = dm.dot(dm.T) + ovlp10 = cell.pbc_intor('int1e_ipovlp') + ovlp10 = np.einsum('xij,iy->xyij', ovlp10, bas_coords) + sc_ovlp01 = intor_cross('int1e_ipovlp', scell, cell) + ovlp01 = np.einsum('xnji,njy->xyij', sc_ovlp01.reshape(3,-1,nao,nao), bas_coords+Ls[:,None]) + ref = -(ovlp10 + ovlp01) + ref = np.einsum('xyij,ji->xy', ref, dm) + dat = int1e._Int1eOptV2(cell).get_ovlp_strain_deriv(dm) + assert abs(dat - ref).max() < 1e-9 + + nk = [5, 4, 1] + kpts = cell.make_kpts(nk) + dm = np.array(cell.pbc_intor('int1e_ovlp', kpts=kpts)) + ovlp10 = np.array(cell.pbc_intor('int1e_ipovlp', kpts=kpts)) + ovlp10 = np.einsum('kxij,iy->xykij', ovlp10, bas_coords) + expLk = np.exp(1j*Ls.dot(kpts.T)) + ovlp01 = np.einsum('xnji,njy->xyinj', sc_ovlp01.reshape(3,-1,nao,nao), bas_coords+Ls[:,None]) + ovlp01 = np.einsum('xyiLj,Lk->xykij', ovlp01, expLk, optimize=True) + ref = ovlp01 + ovlp10 + ref = -np.einsum('xykij,kji->xy', ref, dm).real + dat = int1e._Int1eOptV2(cell).get_ovlp_strain_deriv(dm, kpts=kpts) + assert abs(dat - ref).max() < 1e-9 diff --git a/gpu4pyscf/pbc/scf/j_engine.py b/gpu4pyscf/pbc/scf/j_engine.py index 4e6c0a5ee..2ced645d3 100644 --- a/gpu4pyscf/pbc/scf/j_engine.py +++ b/gpu4pyscf/pbc/scf/j_engine.py @@ -319,7 +319,7 @@ def proc(dm_xyz): ctypes.c_int(n_dm), ctypes.c_int(dm_xyz_size), ctypes.c_int(nimgs_uniq_pair), - rys_envs, (ctypes.c_int*6)(*scheme), + ctypes.byref(rys_envs), (ctypes.c_int*6)(*scheme), (ctypes.c_int*8)(*shls_slice), ctypes.c_int(npairs_ij), ctypes.c_int(npairs_kl), ctypes.cast(pair_ij_mapping.data.ptr, ctypes.c_void_p), diff --git a/gpu4pyscf/pbc/scf/rsjk.py b/gpu4pyscf/pbc/scf/rsjk.py index 0b1867042..4be86ad67 100644 --- a/gpu4pyscf/pbc/scf/rsjk.py +++ b/gpu4pyscf/pbc/scf/rsjk.py @@ -50,6 +50,7 @@ libpbc.PBC_build_k.restype = ctypes.c_int libpbc.PBC_build_k_init(ctypes.c_int(SHM_SIZE)) +libpbc.PBC_build_jk_ip1_init(ctypes.c_int(SHM_SIZE)) DD_CACHE_MAX = 101250 * (SHM_SIZE//48000) OMEGA = 0.3 @@ -204,7 +205,7 @@ def _filter_q_cond(self, supmol, q_cond, s_estimator, rys_envs, cutoff): libpbc.filter_q_cond_by_distance( ctypes.cast(q_cond.data.ptr, ctypes.c_void_p), ctypes.cast(s_estimator.data.ptr, ctypes.c_void_p), - rys_envs, + ctypes.byref(rys_envs), ctypes.cast(diffuse_exps_per_atom.data.ptr, ctypes.c_void_p), ctypes.cast(s_max_per_atom.data.ptr, ctypes.c_void_p), ctypes.c_float(math.log(cutoff)), @@ -382,7 +383,7 @@ def proc(dms, dm_cond): ctypes.cast(vk.data.ptr, ctypes.c_void_p), ctypes.cast(dms.data.ptr, ctypes.c_void_p), ctypes.c_int(n_dm), ctypes.c_int(nao), - rys_envs, (ctypes.c_int*8)(*shls_slice), + ctypes.byref(rys_envs), (ctypes.c_int*8)(*shls_slice), ctypes.c_int(SHM_SIZE), ctypes.c_int(npairs_ij), ctypes.c_int(npairs_kl), ctypes.cast(pair_ij_mapping.data.ptr, ctypes.c_void_p), @@ -541,6 +542,11 @@ def _get_ejk_sr_ip1(self, dm, kpts=None, exxdiv=None, #:dms = cp.einsum('pi,nij,qj->npq', self.coeff, dms, self.coeff) dms = apply_coeff_C_mat_CT(dms, cell, sorted_cell, self.uniq_l_ctr, self.l_ctr_offsets, self.ao_idx) + # Symmetrize density matrices because 8-fold symmetry is utilized when + # computing integrals. Fold the contribution of the upper triangular + # part of the density matrices into the lower triangular part. + dms = transpose_sum(dms) + dms *= .5 if kpts is None: kpts = np.zeros((1, 3)) @@ -575,7 +581,6 @@ def _get_ejk_sr_ip1(self, dm, kpts=None, exxdiv=None, log_cutoff = math.log(cutoff) libpbc.PBC_per_atom_jk_ip1.restype = ctypes.c_int - libpbc.PBC_build_jk_ip1_init(ctypes.c_int(SHM_SIZE)) uniq_l_ctr = self.uniq_l_ctr uniq_l = uniq_l_ctr[:,0] @@ -638,7 +643,7 @@ def proc(dms, dm_cond): ctypes.c_double(j_factor), ctypes.c_double(k_factor), ctypes.cast(dms.data.ptr, ctypes.c_void_p), ctypes.c_int(n_dm), ctypes.c_int(nao), - rys_envs, (ctypes.c_int*2)(*scheme), + ctypes.byref(rys_envs), (ctypes.c_int*2)(*scheme), (ctypes.c_int*8)(*shls_slice), ctypes.c_int(npairs_ij), ctypes.c_int(npairs_kl), ctypes.cast(pair_ij_mapping.data.ptr, ctypes.c_void_p), @@ -742,6 +747,219 @@ def _get_ejk_lr_ip1(self, dm, kpts=None, exxdiv=None, ek *= k_factor return ej - ek + def _get_jk_sr_strain_deriv(self, dm, kpts=None, exxdiv=None, + j_factor=1., k_factor=1., verbose=None): + '''Compute the derivatives of the short-range part of the aggregated + J/K contribution. The aggregated J/K contribution is given by + j_factor - k_factor / 2. + ''' + log = logger.new_logger(self, verbose) + cell = self.cell + assert cell.dimension == 3 + sorted_cell = self.sorted_cell + nao_orig = cell.nao + nao = sorted_cell.nao + supmol = self.supmol + + dm = asarray(dm, order='C') + dms = dm.reshape(-1,nao_orig,nao_orig) + #:dms = cp.einsum('pi,nij,qj->npq', self.coeff, dms, self.coeff) + dms = apply_coeff_C_mat_CT(dms, cell, sorted_cell, self.uniq_l_ctr, + self.l_ctr_offsets, self.ao_idx) + # Symmetrize density matrices because 8-fold symmetry is utilized when + # computing integrals. Fold the contribution of the upper triangular + # part of the density matrices into the lower triangular part. + dms = transpose_sum(dms) + dms *= .5 + + if kpts is None: + kpts = np.zeros((1, 3)) + else: + kpts = kpts.reshape(-1, 3) + is_gamma_point = is_zero(kpts) + if is_gamma_point: + assert dms.dtype == np.float64 + nkpts = 1 + ao_loc = asarray(sorted_cell.ao_loc) + dms = cp.asarray(dms, order='C') + dm_cond = condense('absmax', dms, ao_loc) + # Add the dimension for kpts + dms = dms[:,None,:,:] + else: + scaled_kpts = kpts.dot(cell.lattice_vectors().T) + Ts = cp.asarray(supmol.double_latsum_Ts, dtype=np.float64) + expLk = cp.exp(1j * Ts.dot(asarray(scaled_kpts).T)) + nkpts = expLk.shape[1] + dms = dms.reshape(-1, nkpts, nao, nao) + dms = contract('skpq,Lk->sLpq', dms, expLk) + # Are dms always real for super-mol? + assert abs(dms.imag).max() < 1e-6 + expLk = None + dms = dms.real + dms = cp.asarray(dms, order='C') + dm_cond = _dm_cond_from_compressed_dm(supmol, dms) + dm_cond = cp.log(dm_cond + 1e-300).astype(np.float32) + n_dm = len(dms) + assert n_dm <= 2 + cutoff = self.estimate_cutoff_with_penalty(cell.precision**.5*1e-2) + log_cutoff = math.log(cutoff) + + libpbc.PBC_jk_strain_deriv.restype = ctypes.c_int + + uniq_l_ctr = self.uniq_l_ctr + uniq_l = uniq_l_ctr[:,0] + l_ctr_bas_loc = self.l_ctr_offsets + l_symb = [lib.param.ANGULAR[i] for i in uniq_l] + n_groups = np.count_nonzero(uniq_l <= LMAX) + + tasks = ((i,j,k,l) + for i in range(n_groups) + for j in range(i+1) + for k in range(i+1) + for l in range(k+1)) + + def proc(dms, dm_cond): + device_id = cp.cuda.device.get_device_id() + stream = cp.cuda.stream.get_current_stream() + log = logger.new_logger(cell, verbose) + t0 = log.init_timer() + dms = cp.asarray(dms) + dm_cond = cp.asarray(dm_cond) + + q_cond = cp.asarray(self.q_cond) + s_estimator = cp.asarray(self.s_estimator) + pair_ij_mappings = _make_pair_ij_mappings( + supmol, l_ctr_bas_loc, q_cond, log_cutoff, tile=6) + pair_kl_mappings = _make_tril_pair_mappings( + supmol, l_ctr_bas_loc, q_cond, log_cutoff, tile=6) + bas_mask_idx = cp.asarray(supmol.bas_mask_idx) + nimgs = len(supmol.Ls) + if is_gamma_point: + Ts_ji_lookup = cp.zeros_like(supmol.Ts_ji_lookup) + nimgs_uniq_pair = 1 + else: + Ts_ji_lookup = cp.asarray(supmol.Ts_ji_lookup) + nimgs_uniq_pair = len(supmol.double_latsum_Ts) + ejk = cp.zeros((cell.natm, 3)) + sigma = cp.zeros((3, 3)) + + t1 = log.timer_debug1(f'q_cond and dm_cond on Device {device_id}', *t0) + workers = gpu_specs['multiProcessorCount'] + pool = cp.empty(workers*QUEUE_DEPTH+1, dtype=np.uint32) + dd_pool = cp.empty((workers, DD_CACHE_MAX), dtype=np.float64) + + timing_counter = Counter() + kern_counts = 0 + kern = libpbc.PBC_jk_strain_deriv + rys_envs = self.rys_envs + + for task in tasks: + i, j, k, l = task + shls_slice = l_ctr_bas_loc[[i, i+1, j, j+1, k, k+1, l, l+1]] + pair_ij_mapping = pair_ij_mappings[i,j] + pair_kl_mapping = pair_kl_mappings[k,l] + npairs_ij = pair_ij_mapping.size + npairs_kl = pair_kl_mapping.size + if npairs_ij == 0 or npairs_kl == 0: + continue + scheme = _ejk_quartets_scheme(supmol, uniq_l_ctr[[i, j, k, l]]) + err = kern( + ctypes.cast(ejk.data.ptr, ctypes.c_void_p), + ctypes.c_double(j_factor), ctypes.c_double(k_factor), + ctypes.cast(sigma.data.ptr, ctypes.c_void_p), + ctypes.cast(dms.data.ptr, ctypes.c_void_p), + ctypes.c_int(n_dm), ctypes.c_int(nao), + ctypes.byref(rys_envs), (ctypes.c_int*2)(*scheme), + (ctypes.c_int*8)(*shls_slice), + ctypes.c_int(npairs_ij), ctypes.c_int(npairs_kl), + ctypes.cast(pair_ij_mapping.data.ptr, ctypes.c_void_p), + ctypes.cast(pair_kl_mapping.data.ptr, ctypes.c_void_p), + ctypes.cast(bas_mask_idx.data.ptr, ctypes.c_void_p), + ctypes.cast(Ts_ji_lookup.data.ptr, ctypes.c_void_p), + ctypes.c_int(nimgs), ctypes.c_int(nimgs_uniq_pair), + ctypes.cast(q_cond.data.ptr, ctypes.c_void_p), + ctypes.cast(s_estimator.data.ptr, ctypes.c_void_p), + ctypes.cast(dm_cond.data.ptr, ctypes.c_void_p), + ctypes.c_float(log_cutoff), + ctypes.cast(pool.data.ptr, ctypes.c_void_p), + ctypes.cast(dd_pool.data.ptr, ctypes.c_void_p), + ctypes.c_int(sorted_cell.nbas), + supmol._atm.ctypes, ctypes.c_int(supmol.natm), + supmol._bas.ctypes, ctypes.c_int(supmol.nbas), + supmol._env.ctypes) + llll = f'({l_symb[i]}{l_symb[j]}|{l_symb[k]}{l_symb[l]})' + if err != 0: + raise RuntimeError(f'PBC_jk_strain_deriv kernel for {llll} failed') + if log.verbose >= logger.DEBUG1: + ntasks = npairs_ij * npairs_kl + msg = f'processing {llll} on Device {device_id} tasks ~= {ntasks}' + t1, t1p = log.timer_debug1(msg, *t1), t1 + timing_counter[llll] += t1[1] - t1p[1] + kern_counts += 1 + if num_devices > 1: + stream.synchronize() + return ejk, sigma, kern_counts, timing_counter + + results = multi_gpu.run(proc, args=(dms, dm_cond), non_blocking=True) + + kern_counts = 0 + timing_collection = Counter() + ejk_dist = [] + sigma_dist = [] + for ejk, sigma, counts, t_counter in results: + kern_counts += counts + timing_collection += t_counter + ejk_dist.append(ejk) + sigma_dist.append(sigma) + + log = logger.new_logger(cell, verbose) + if log.verbose >= logger.DEBUG1: + log.debug1('kernel launches %d', kern_counts) + for llll, t in timing_collection.items(): + log.debug1('%s wall time %.2f', llll, t) + + ejk = multi_gpu.array_reduce(ejk_dist, inplace=True) + sigma = multi_gpu.array_reduce(sigma_dist, inplace=True) + sigma *= 2 + sigma = sigma.get() + + if ((cell.dimension == 3 or + (cell.dimension == 2 and cell.low_dim_ft_type != 'inf_vacuum'))): + # difference associated to the G=0 term between the real space + # integrals and the AFT integrals + dm0 = dm.reshape(n_dm, nkpts, nao_orig, nao_orig) + omega = self.omega + wcoulG_SR_at_G0 = np.pi / omega**2 / cell.vol + if exxdiv == 'ewald': + wcoulG_for_k = probe_charge_sr_coulomb(cell, omega, kpts) + else: + wcoulG_for_k = wcoulG_SR_at_G0 + int1e_opt = int1e._Int1eOpt(cell, kpts) + s0 = int1e_opt.intor('PBCint1e_ovlp', 1, 1, (0, 0)) + s1 = int1e_opt.intor('PBCint1e_ipovlp', 0, 3, (1, 0)) + s_dm = cp.einsum('kij,nkji->', s0, dm0) + j_dm = dm0.sum(axis=0) * (j_factor * s_dm * wcoulG_SR_at_G0) + k_dm = contract('nkpq,kqr->nkpr', dm0, s0) + k_dm = contract('nkpr,nkrs->kps', k_dm, dm0) + if n_dm == 1: # RHF + k_dm *= .5 * k_factor * wcoulG_for_k + else: + k_dm *= k_factor * wcoulG_for_k + aoslices = cell.aoslice_by_atom() + for i, (p0, p1) in enumerate(aoslices[:,2:]): + ejk[i] += cp.einsum('kxpq,kqp->x', s1[:,:,p0:p1], j_dm[:,:,p0:p1]).real + ejk[i] -= cp.einsum('kxpq,kqp->x', s1[:,:,p0:p1], k_dm[:,:,p0:p1]).real + + int1e_opt = int1e._Int1eOptV2(cell) + sigma -= int1e_opt.get_ovlp_strain_deriv(j_dm, kpts) + sigma += .5 * cp.einsum('kij,kji->', s0, j_dm).real.get() * np.eye(3) + sigma += int1e_opt.get_ovlp_strain_deriv(k_dm, kpts) + sigma -= .5 * cp.einsum('kij,kji->', s0, k_dm).real.get() * np.eye(3) + + if not is_gamma_point: + ejk *= 1. / nkpts + return sigma + class ExtendedMole(gto.Mole): '''A super-Mole cluster to mimic periodicity within the unit cell''' def __init__(self): diff --git a/gpu4pyscf/pbc/scf/tests/test_pbc_scf_jk.py b/gpu4pyscf/pbc/scf/tests/test_pbc_scf_jk.py index a9f72becd..342d8798e 100644 --- a/gpu4pyscf/pbc/scf/tests/test_pbc_scf_jk.py +++ b/gpu4pyscf/pbc/scf/tests/test_pbc_scf_jk.py @@ -19,6 +19,7 @@ from pyscf import lib, gto from pyscf.pbc.scf.rsjk import RangeSeparationJKBuilder from pyscf.pbc.df import fft as fft_cpu +from pyscf.pbc.tools import pbc as pbctools from gpu4pyscf.pbc.df import fft from gpu4pyscf.pbc.scf import rsjk from gpu4pyscf.pbc.tools.pbc import probe_charge_sr_coulomb @@ -430,3 +431,97 @@ def test_ejk_ip1_per_atom_kpts(): p0, p1 = aoslices[i, 2:] ref[i] = np.einsum('xkpq,kqp->x', vhf[:,:,p0:p1], dm[:,:,p0:p1]).real assert abs(ejk - ref).max() < 5e-6 + +def test_jk_sr_strain_deriv1(): + from gpu4pyscf.pbc.grad import rks, rks_stress + from gpu4pyscf.pbc.grad import krks, krks_stress + a = np.eye(3) * 6. + np.random.seed(5) + a += np.random.rand(3, 3) - .5 + cell = pyscf.M( + atom='He 1 1 1; He 2 1.5 2.4', + basis=[[0, [1.5, 1]]], a=a, unit='Bohr') + # gamma point calculations + nao = cell.nao + dm = np.random.rand(nao, nao) - .5 + dm = dm.dot(dm.T) + with_rsjk = rsjk.PBCJKMatrixOpt(cell).build() + sigma = with_rsjk._get_jk_sr_strain_deriv(dm) + #sigma_wo_exxdiv = with_rsjk._get_jk_sr_strain_deriv(dm, exxdiv=False) + + Ls = cell.get_lattice_Ls(rcut=cell.rcut+4) + Ls = Ls[np.argsort(np.linalg.norm(Ls-.1, axis=1))] + scell = cell.copy() + scell = pbctools._build_supcell_(scell, cell, Ls) + scell.omega = -.3 + nimgs = len(Ls) + aoslices = cell.aoslice_by_atom() + ao_repeats = aoslices[:,3] - aoslices[:,2] + bas_coords = np.repeat(cell.atom_coords(), ao_repeats, axis=0) + sc_eri = scell.intor('int2e_ip1').reshape(3,nimgs,nao,nimgs,nao,nimgs,nao,nimgs,nao) + eri1 = np.einsum('xokplinj,oky->xyijkl', sc_eri[:,:,:,:,:,0], bas_coords+Ls[:,None]) + eri1+= np.einsum('xplokinj,ply->xyijkl', sc_eri[:,:,:,:,:,0], bas_coords+Ls[:,None]) + eri1+= np.einsum('xinjokpl,iy->xyijkl', sc_eri[:,0], bas_coords) + eri1+= np.einsum('xnjiokpl,njy->xyijkl', sc_eri[:,:,:,0], bas_coords+Ls[:,None]) + ej = -.5 * np.einsum('xyijkl,ji,lk->xy', eri1, dm, dm) + ek = -.5 * np.einsum('xyijkl,jk,li->xy', eri1, dm, dm) + #ref_wo_exxdiv = ej - ek*.5 + + sc_s1 = scell.intor('int1e_ipovlp').reshape(3,nimgs,nao,nimgs,nao) + s1 = np.einsum('xmij,miy->xyij', sc_s1[:,:,:,0], bas_coords+Ls[:,None]) + s1+= np.einsum('xinj,iy->xyij', sc_s1[:,0], bas_coords) + s0 = cell.pbc_intor('int1e_ovlp') + wcoulG_SR_at_G0 = np.pi/scell.omega**2/cell.vol + j_dm = np.einsum('ij,ji->', s0, dm) + ej += np.einsum('xyij,ji->xy', s1, dm) * wcoulG_SR_at_G0 * j_dm + ej += .5 * j_dm * wcoulG_SR_at_G0 * j_dm * np.eye(3) + ek += np.einsum('xyij,jk,kl,li->xy', s1, dm, s0, dm) * wcoulG_SR_at_G0 + ek += .5 *np.einsum('ij,jk,kl,li->', s0, dm, s0, dm) * wcoulG_SR_at_G0 * np.eye(3) + ref = ej - ek*.5 + assert abs(ref - sigma).max() < 1e-6 + + sigma = with_rsjk._get_jk_sr_strain_deriv(dm, k_factor=0) + cell.omega = scell.omega + xc = 'lda,' + mf_grad = cell.RKS(xc=xc).to_gpu().Gradients() + ref = rks_stress.get_vxc(mf_grad, cell, dm, with_j=True, with_nuc=False) + ref -= rks_stress.get_vxc(mf_grad, cell, dm, with_j=False, with_nuc=False) + assert abs(ref - sigma).max() < 1e-7 + + # kpts calculations + kpts = cell.make_kpts([1,1,1]) + dm = np.array(cell.pbc_intor('int1e_ovlp', kpts=kpts)) + sigma = with_rsjk._get_jk_sr_strain_deriv(dm, kpts) + #sigma_wo_exxdiv = with_rsjk._get_jk_sr_strain_deriv(dm, exxdiv=False) + + eri1 = np.einsum('xokplinj,oky->xyinjokpl', sc_eri[:,:,:,:,:,0], bas_coords+Ls[:,None]) + eri1+= np.einsum('xplokinj,ply->xyinjokpl', sc_eri[:,:,:,:,:,0], bas_coords+Ls[:,None]) + eri1+= np.einsum('xinjokpl,iy->xyinjokpl', sc_eri[:,0], bas_coords) + eri1+= np.einsum('xnjiokpl,njy->xyinjokpl', sc_eri[:,:,:,0], bas_coords+Ls[:,None]) + expLk = np.exp(1j*Ls.dot(kpts.T)) + eri1 = np.einsum('xyiNjOkPl,Nn,Oo,Pp->xyinjokpl', eri1, expLk, expLk.conj(), expLk, optimize=True) + nkpts = len(kpts) + ej = -.5 * np.einsum('xyinjokpl,op,nji,plk->xy', eri1, np.eye(nkpts), dm, dm).real + ek = -.5 * np.einsum('xyinjokpl,no,ojk,pli->xy', eri1, np.eye(nkpts), dm, dm).real + #ref_wo_exxdiv = ej - ek*.5 + + sc_s1 = scell.intor('int1e_ipovlp').reshape(3,nimgs,nao,nimgs,nao) + s1 = np.einsum('xmij,miy,mk->xykij', sc_s1[:,:,:,0], bas_coords+Ls[:,None], expLk) + s1+= np.einsum('xinj,iy,nk->xykij', sc_s1[:,0], bas_coords, expLk) + s0 = np.array(cell.pbc_intor('int1e_ovlp', kpts=kpts)) + wcoulG_SR_at_G0 = np.pi/scell.omega**2/cell.vol + j_dm = np.einsum('kij,kji->', s0, dm).real + ej += np.einsum('xykij,kji->xy', s1, dm).real * wcoulG_SR_at_G0 * j_dm + ej += .5 * j_dm * wcoulG_SR_at_G0 * j_dm * np.eye(3) + ek += np.einsum('xytij,tjk,tkl,tli->xy', s1, dm, s0, dm).real * wcoulG_SR_at_G0 + ek += .5 *np.einsum('tij,tjk,tkl,tli->', s0, dm, s0, dm).real * wcoulG_SR_at_G0 * np.eye(3) + ref = ej - ek*.5 + assert abs(ref - sigma).max() < 2e-5 + + sigma = with_rsjk._get_jk_sr_strain_deriv(dm, kpts, k_factor=0) + cell.omega = scell.omega + xc = 'lda,' + mf_grad = cell.KRKS(xc=xc, kpts=kpts).to_gpu().Gradients() + ref = krks_stress.get_vxc(mf_grad, cell, dm, kpts=kpts, with_j=True, with_nuc=False) + ref -= krks_stress.get_vxc(mf_grad, cell, dm, kpts=kpts, with_j=False, with_nuc=False) + assert abs(ref - sigma).max() < 1e-7 diff --git a/gpu4pyscf/scf/j_engine.py b/gpu4pyscf/scf/j_engine.py index ad11f0c25..46fb82ab2 100644 --- a/gpu4pyscf/scf/j_engine.py +++ b/gpu4pyscf/scf/j_engine.py @@ -286,7 +286,7 @@ def proc(dm_xyz): ctypes.cast(vj_xyz.data.ptr, ctypes.c_void_p), ctypes.cast(dm_xyz.data.ptr, ctypes.c_void_p), ctypes.c_int(n_dm), ctypes.c_int(dm_xyz_size), - rys_envs, (ctypes.c_int*6)(*scheme), + ctypes.byref(rys_envs), (ctypes.c_int*6)(*scheme), (ctypes.c_int*8)(*shls_slice), ctypes.c_int(npairs_ij), ctypes.c_int(npairs_kl), ctypes.cast(pair_ij_mapping.data.ptr, ctypes.c_void_p), diff --git a/gpu4pyscf/scf/jk.py b/gpu4pyscf/scf/jk.py index 2d56ac03e..7b1091fab 100644 --- a/gpu4pyscf/scf/jk.py +++ b/gpu4pyscf/scf/jk.py @@ -607,7 +607,7 @@ def proc(dms, dm_cond): ctypes.cast(vk.data.ptr, ctypes.c_void_p), ctypes.cast(dms.data.ptr, ctypes.c_void_p), ctypes.c_int(n_dm), ctypes.c_int(nao), - rys_envs, (ctypes.c_int*8)(*shls_slice), + ctypes.byref(rys_envs), (ctypes.c_int*8)(*shls_slice), ctypes.c_int(SHM_SIZE), ctypes.c_int(npairs_ij), ctypes.c_int(_npairs_kl), ctypes.cast(pair_ij_mapping.data.ptr, ctypes.c_void_p), @@ -773,7 +773,7 @@ def proc(dm_xyz, dm_cond): ctypes.cast(vj_xyz.data.ptr, ctypes.c_void_p), ctypes.cast(dm_xyz.data.ptr, ctypes.c_void_p), ctypes.c_int(n_dm), ctypes.c_int(nao), - rys_envs, (ctypes.c_int*3)(*scheme), + ctypes.byref(rys_envs), (ctypes.c_int*3)(*scheme), (ctypes.c_int*8)(*shls_slice), ctypes.c_int(npairs_ij), ctypes.c_int(_npairs_kl), ctypes.cast(pair_ij_mapping.data.ptr, ctypes.c_void_p), @@ -912,7 +912,7 @@ def proc(dms, dm_cond): ctypes.cast(vk.data.ptr, ctypes.c_void_p), ctypes.cast(dms.data.ptr, ctypes.c_void_p), ctypes.c_int(n_dm), ctypes.c_int(nao), - rys_envs, (ctypes.c_int*8)(*shls_slice), + ctypes.byref(rys_envs), (ctypes.c_int*8)(*shls_slice), ctypes.c_int(SHM_SIZE), ctypes.c_int(npairs_ij), ctypes.c_int(_npairs_kl), ctypes.cast(pair_ij_mapping.data.ptr, ctypes.c_void_p),