Skip to content

Commit 951c0cd

Browse files
authored
PBC HF gradients (#554)
* Small updates for rys_contract_k * Update SR estimator for molecular get_jk code * Add rys_contract_jk_ip1 * Update gradients cuda kernel for molecular code * 8-fold symmetry for pbc rys_contract_k * PBC JK gradients withou permutation symmetry * pbc JK gradients with 8-fold symmetry * Fix hermi=0 for get_k_sr; Fix integral screening for 8-fold symmetry * Fix pbc short-range ejk_ip1 for kpts * filter q_cond based on s_estimator * Fix a bug in Et_dot_dm for pbc J-engine caused by the transpose of DM * Filter pairs based on its distance to the reference cell * Add AFT derivative integrals for Coulomb term * Bugfix for get_ej_ip1 * Validate the occupancy assigment * AFTDF get_ek_ip1 for gamma point and kpts matches fftdf results when exxdiv=None * Add LR part of the gradients for RSJK * Enable rsjk in DFT modules * Add tests for KRHF and KRKS * Add UKS tests * Validate the shape of kpts and dm dimension * Add tests for RKS, UKS, KRKS, KUKS gradients * lint * typo * Fix dimension issue and tests * Update tests * kpts dimension error in multigrid code
1 parent 67c243d commit 951c0cd

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

74 files changed

+4574
-1762
lines changed

.github/workflows/lint.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ jobs:
1111
- name: Install ruff
1212
run: pip install ruff
1313
- name: Check style
14-
run: ruff check --config .ruff.toml gpu4pyscf
14+
run: ruff check --config .ruff.toml --unsafe-fixes gpu4pyscf
1515
- name: Check NumPy
1616
run: ruff check --select NPY --ignore NPY002 gpu4pyscf
1717
flake:

gpu4pyscf/dft/rks.py

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -92,8 +92,6 @@ def get_veff(ks, mol=None, dm=None, dm_last=0, vhf_last=0, hermi=1):
9292
t0 = logger.init_timer(ks)
9393
initialize_grids(ks, mol, dm)
9494

95-
ground_state = getattr(dm, 'ndim', 0) == 2
96-
9795
ni = ks._numint
9896
if hermi == 2: # because rho = 0
9997
n, exc, vxc = 0, 0, 0
@@ -120,10 +118,7 @@ def get_veff(ks, mol=None, dm=None, dm_last=0, vhf_last=0, hermi=1):
120118
if vj_last is not None:
121119
vj += asarray(vj_last)
122120
vxc += vj
123-
if ground_state:
124-
ecoul = float(cupy.einsum('ij,ij', dm_orig, vj).real) * .5
125-
else:
126-
ecoul = None
121+
ecoul = float(cupy.einsum('ij,ij', dm_orig, vj).real) * .5
127122

128123
vk = None
129124
if ni.libxc.is_hybrid_xc(ks.xc):
@@ -147,8 +142,7 @@ def get_veff(ks, mol=None, dm=None, dm_last=0, vhf_last=0, hermi=1):
147142
if vj_last is not None:
148143
vk += asarray(vhf_last.vk)
149144
vxc -= vk
150-
if ground_state:
151-
exc -= float(cupy.einsum('ij,ij', dm_orig, vk).real) * .5
145+
exc -= float(cupy.einsum('ij,ij', dm_orig, vk).real) * .5
152146
t0 = logger.timer(ks, 'veff', *t0)
153147
vxc = tag_array(vxc, ecoul=ecoul, exc=exc, vj=vj, vk=vk)
154148
return vxc

gpu4pyscf/grad/rhf.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -195,10 +195,10 @@ def _ejk_quartets_scheme(mol, l_ctr_pattern, shm_size=SHM_SIZE):
195195
nps = l_ctr_pattern[:,1]
196196
ij_prims = nps[0] * nps[1]
197197
nroots = (order + 1) // 2 + 1
198-
unit = nroots*2 + g_size*3 + ij_prims + 9
198+
unit = nroots*2 + g_size*3 + 6
199199
if mol.omega < 0: # SR
200200
unit += nroots * 2
201-
counts = shm_size // (unit*8)
201+
counts = (shm_size - ij_prims*8) // (unit*8)
202202
n = min(THREADS, _nearest_power2(counts))
203203
gout_stride = THREADS // n
204204
return n, gout_stride

gpu4pyscf/lib/gvhf-md/md_contract_j.cu

Lines changed: 0 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -963,17 +963,4 @@ int MD_build_j(double *vj, double *dm, int n_dm, int dm_size,
963963
}
964964
return 0;
965965
}
966-
967-
int init_mdj_constant(int shm_size)
968-
{
969-
cudaFuncSetAttribute(md_j_1dm_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, shm_size);
970-
cudaFuncSetAttribute(md_j_4dm_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, shm_size);
971-
cudaError_t err = cudaGetLastError();
972-
if (err != cudaSuccess) {
973-
fprintf(stderr, "Failed to set CUDA shm size %d: %s\n", shm_size,
974-
cudaGetErrorString(err));
975-
return 1;
976-
}
977-
return 0;
978-
}
979966
}

gpu4pyscf/lib/gvhf-md/md_j_driver.cu

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,3 +51,32 @@ int qd_offset_for_threads(int npairs, int threads)
5151
}
5252
return address;
5353
}
54+
55+
extern __global__
56+
void md_j_1dm_kernel(RysIntEnvVars envs, JKMatrix jk, MDBoundsInfo bounds,
57+
int threadsx, int threadsy, int tilex, int tiley,
58+
uint16_t *pRt2_kl_ij, int8_t *efg_phase);
59+
extern __global__
60+
void md_j_4dm_kernel(RysIntEnvVars envs, JKMatrix jk, MDBoundsInfo bounds,
61+
int threadsx, int threadsy, int tilex, int tiley, int dm_size,
62+
uint16_t *pRt2_kl_ij, int8_t *efg_phase);
63+
extern __global__
64+
void pbc_md_j_kernel(RysIntEnvVars envs, JKMatrix jmat, MDBoundsInfo bounds,
65+
int threadsx, int threadsy, int tilex, int tiley,
66+
uint16_t *pRt2_kl_ij, int8_t *efg_phase);
67+
68+
extern "C" {
69+
int init_mdj_constant(int shm_size)
70+
{
71+
cudaFuncSetAttribute(md_j_1dm_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, shm_size);
72+
cudaFuncSetAttribute(md_j_4dm_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, shm_size);
73+
cudaFuncSetAttribute(pbc_md_j_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, shm_size);
74+
cudaError_t err = cudaGetLastError();
75+
if (err != cudaSuccess) {
76+
fprintf(stderr, "Failed to set CUDA shm size %d: %s\n", shm_size,
77+
cudaGetErrorString(err));
78+
return 1;
79+
}
80+
return 0;
81+
}
82+
}

gpu4pyscf/lib/gvhf-md/md_pairdata.c

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -283,8 +283,9 @@ void PBC_Et_dot_dm(double *Et_dm, double *dm, int n_dm, int Et_dm_size,
283283
int nfj = (lj + 1) * (lj + 2) / 2;
284284
int Et_len = (lij + 1) * (lij + 2) * (lij + 3) / 6;
285285
double cc = ci * cj;
286+
// Be careful with the transpose of dm. Here, dm is not symmetric.
286287
double *Et_dm_ij = Et_dm + pair_loc[bas_ij];
287-
double *dm_ij = dm + ao_loc[ctr_ish] * nao + ao_loc[ctr_jsh];
288+
double *dm_ij = dm + ao_loc[ctr_jsh] * nao + ao_loc[ctr_ish];
288289
for (int img = 0; img < nimgs_uniq_pair; img++) {
289290
double cc_with_img = cc;
290291
// The diagonal elements of the AO-pairs within the
@@ -306,7 +307,7 @@ void PBC_Et_dot_dm(double *Et_dm, double *dm, int n_dm, int Et_dm_size,
306307
double rho_t = 0.;
307308
for (int i = 0; i < nfi; i++) {
308309
for (int j = 0; j < nfj; j++, n++) {
309-
rho_t += Et[n] * cc_with_img * pdm[i*nao+j];
310+
rho_t += Et[n] * cc_with_img * pdm[j*nao+i];
310311
} }
311312
rho[t] = rho_t;
312313
}

gpu4pyscf/lib/gvhf-md/pbc_md_contract_j.cu

Lines changed: 12 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,7 @@ inline void iter_Rt_n(double *Rt, double rx, double ry, double rz, int l,
7474
}
7575

7676
// gout_pattern = ((li == 0) >> 3) | ((lj == 0) >> 2) | ((lk == 0) >> 1) | (ll == 0);
77-
__global__ static
77+
__global__
7878
void pbc_md_j_kernel(RysIntEnvVars envs, JKMatrix jmat, MDBoundsInfo bounds,
7979
int threadsx, int threadsy, int tilex, int tiley,
8080
uint16_t *pRt2_kl_ij, int8_t *efg_phase)
@@ -332,6 +332,7 @@ void pbc_md_j_kernel(RysIntEnvVars envs, JKMatrix jmat, MDBoundsInfo bounds,
332332

333333
extern "C" {
334334
int PBC_build_j(double *vj, double *dm, int n_dm,
335+
int dm_xyz_size, int nimgs_uniq_pair,
335336
RysIntEnvVars envs, int *scheme, int *shls_slice,
336337
int npairs_ij, int npairs_kl,
337338
int *pair_ij_mapping, int *pair_kl_mapping,
@@ -363,8 +364,6 @@ int PBC_build_j(double *vj, double *dm, int n_dm,
363364
q_cond, cutoff};
364365

365366
double omega = env[PTR_RANGE_OMEGA];
366-
JKMatrix jmat = {vj, NULL, dm, n_dm, 0, omega};
367-
368367
int threads_ij = scheme[0];
369368
int threads_kl = scheme[1];
370369
int gout_stride = scheme[2];
@@ -384,12 +383,16 @@ int PBC_build_j(double *vj, double *dm, int n_dm,
384383
cudaGetSymbolAddress((void**)&efg_phase, c_Rt2_efg_phase);
385384
pRt2_kl_ij += offset_for_Rt2_idx(lij, lkl);
386385
efg_phase += offset_for_Rt2_idx(0, lkl);
387-
if (1){//!pbc_md_j_unrolled(&envs, &jmat, &bounds, omega)) {
388-
bounds.qd_ij_max = qd_ij_max + qd_offset_for_threads(npairs_ij, threads_ij);
389-
bounds.qd_kl_max = qd_kl_max + qd_offset_for_threads(npairs_kl, threads_kl);
390-
pbc_md_j_kernel<<<blocks, threads, buflen>>>(
391-
envs, jmat, bounds, threads_ij, threads_kl, tilex, tiley,
392-
pRt2_kl_ij, efg_phase);
386+
int dm_size = dm_xyz_size * nimgs_uniq_pair;
387+
for (int i_dm = 0; i_dm < n_dm; ++i_dm) {
388+
JKMatrix jmat = {vj+i_dm*dm_size, NULL, dm+i_dm*dm_size, n_dm, 0, omega};
389+
if (1){//!pbc_md_j_unrolled(&envs, &jmat, &bounds, omega)) {
390+
bounds.qd_ij_max = qd_ij_max + qd_offset_for_threads(npairs_ij, threads_ij);
391+
bounds.qd_kl_max = qd_kl_max + qd_offset_for_threads(npairs_kl, threads_kl);
392+
pbc_md_j_kernel<<<blocks, threads, buflen>>>(
393+
envs, jmat, bounds, threads_ij, threads_kl, tilex, tiley,
394+
pRt2_kl_ij, efg_phase);
395+
}
393396
}
394397
cudaError_t err = cudaGetLastError();
395398
if (err != cudaSuccess) {
@@ -398,16 +401,4 @@ int PBC_build_j(double *vj, double *dm, int n_dm,
398401
}
399402
return 0;
400403
}
401-
402-
int PBC_build_j_init(int shm_size)
403-
{
404-
cudaFuncSetAttribute(pbc_md_j_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, shm_size);
405-
cudaError_t err = cudaGetLastError();
406-
if (err != cudaSuccess) {
407-
fprintf(stderr, "Failed to set CUDA shm size %d: %s\n", shm_size,
408-
cudaGetErrorString(err));
409-
return 1;
410-
}
411-
return 0;
412-
}
413404
}

0 commit comments

Comments
 (0)