Commit 1c9489d

Faster gth pseudopotential evaluation on big system (#555)
* Faster gth pseudopotential evaluation on big system
* Forgot to push one file
* More check before pseudopotential local term kernel
1 parent b71a499 commit 1c9489d

File tree (3 files changed: +146, -28 lines)

gpu4pyscf/lib/cupy_helper.py
gpu4pyscf/pbc/dft/multigrid.py
gpu4pyscf/pbc/tools/pbc.py

gpu4pyscf/lib/cupy_helper.py

Lines changed: 40 additions & 10 deletions
@@ -110,30 +110,30 @@ def reduce_to_device(array_list, inplace=False):
     assert len(array_list) == num_devices
     if num_devices == 1:
         return array_list[0]
-
+
     out_shape = array_list[0].shape
     for s in _streams:
         s.synchronize()
-
+
     if inplace:
         result = array_list[0]
     else:
         result = array_list[0].copy()
-
+
     # Transfer data chunk by chunk, reduce memory footprint,
     result = result.reshape(-1)
     for device_id, matrix in enumerate(array_list):
         if device_id == 0:
             continue
-
+
         assert matrix.device.id == device_id
         matrix = matrix.reshape(-1)
         blksize = 1024*1024*1024 // matrix.itemsize # 1GB
         for p0, p1 in lib.prange(0,len(matrix), blksize):
             result[p0:p1] += copy_array(matrix[p0:p1])
-            #result[p0:p1] += cupy.asarray(matrix[p0:p1])
+            #result[p0:p1] += cupy.asarray(matrix[p0:p1])
     return result.reshape(out_shape)
-
+
 def device2host_2d(a_cpu, a_gpu, stream=None):
     if stream is None:
         stream = cupy.cuda.get_current_stream()
@@ -183,7 +183,7 @@ def asarray(a, **kwargs):
     # CuPy always allocates pinned memory as a temporary buffer during array transfer.
     # This leads to additional memory usage, and the buffer is not managed by CuPy's
     # memory pool or Python's GC.
-    # See the `cdef _ndarray_base _array_default` function in
+    # See the `cdef _ndarray_base _array_default` function in
     # cupy/_core/core.pyx, where memory buffer is allocated via
     # mem = _alloc_async_transfer_buffer(nbytes)
@@ -388,7 +388,7 @@ def _initialize_c2s_data():
 def block_c2s_diag(angular, counts):
     '''
     Diagonal blocked cartesian to spherical transformation
-    Args:
+    Args:
         angular (list): angular momentum type, e.g. [0,1,2,3]
         counts (list): count of each angular momentum
     '''
@@ -405,7 +405,7 @@ def block_c2s_diag(angular, counts):
         offsets += [c2s_offset[l]] * count
     rows = cupy.hstack(rows)
     cols = cupy.hstack(cols)
-
+
     ncart, nsph = int(rows[-1]), int(cols[-1])
     cart2sph = cupy.zeros([ncart, nsph])
     offsets = cupy.asarray(offsets, dtype='int32')
@@ -690,7 +690,7 @@ def krylov(aop, b, x0=None, tol=1e-10, max_cycle=30, dot=cupy.dot,
         x1, rmat = _stable_qr(x1, cupy.dot, lindep=lindep)
         if len(x1) == 0:
             return cupy.zeros_like(b)
-
+
         x1 *= rmat.diagonal()[:,None]
 
         innerprod = [rmat[i,i].real ** 2 for i in range(x1.shape[0])]
@@ -1153,3 +1153,33 @@ def malloc(size):
         return cuda_malloc(size)
     return default_mempool_malloc(size)
 cupy.cuda.set_allocator(malloc)
+
+def batched_vec3_norm2(batched_vec3):
+    assert type(batched_vec3) is cupy.ndarray
+    assert batched_vec3.dtype == cupy.float64
+    assert batched_vec3.ndim == 2
+    assert batched_vec3.shape[1] == 3
+    assert batched_vec3.flags.c_contiguous
+
+    fn_name = "vec3_norm2_kernel"
+    if fn_name not in _kernel_registery:
+        kernel_code = r'''
+        extern "C" __global__
+        void vec3_norm2_kernel(const double* __restrict__ vec3, double* __restrict__ norm2, const int n) {
+            const int i = blockDim.x * blockIdx.x + threadIdx.x;
+            if (i >= n) return;
+            const double x = vec3[i * 3 + 0];
+            const double y = vec3[i * 3 + 1];
+            const double z = vec3[i * 3 + 2];
+            norm2[i] = x*x + y*y + z*z;
+        }
+        '''
+        _kernel_registery[fn_name] = cupy.RawKernel(kernel_code, fn_name)
+    kernel = _kernel_registery[fn_name]
+
+    n = batched_vec3.shape[0]
+    assert n < np.iinfo(np.int32).max
+    batched_norm2 = cupy.zeros(n, dtype = cupy.float64)
+    kernel(((n + 1024 - 1) // 1024,), (1024,), (batched_vec3, batched_norm2, cupy.int32(n)))
+
+    return batched_norm2
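Review note: `batched_vec3_norm2` computes per-row squared norms of a C-contiguous `(n, 3)` float64 array with one thread per row, replacing the einsum-style `contract('px,px->p', ...)` reduction used before. A minimal usage sketch (the random input and the reference reduction below are illustrative, not part of this commit):

```python
import cupy as cp
from gpu4pyscf.lib.cupy_helper import batched_vec3_norm2

Gv = cp.random.rand(100000, 3)                   # e.g. a batch of G-vectors
G2 = batched_vec3_norm2(Gv)                      # fused kernel, one thread per row
assert cp.allclose(G2, (Gv * Gv).sum(axis=1))    # matches the generic reduction
```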

gpu4pyscf/pbc/dft/multigrid.py

Lines changed: 92 additions & 18 deletions
@@ -26,7 +26,7 @@
 from gpu4pyscf.lib import utils
 from gpu4pyscf.lib.cupy_helper import (
     load_library, tag_array, contract, sandwich_dot, block_diag, transpose_sum,
-    dist_matrix)
+    dist_matrix, batched_vec3_norm2)
 from gpu4pyscf.gto.mole import cart2sph_by_l
 from gpu4pyscf.dft import numint
 from gpu4pyscf.pbc import tools
@@ -723,22 +723,23 @@ def eval_vpplocG(cell, mesh):
     '''PRB, 58, 3641 Eq (5) first term
     '''
     assert cell.dimension != 2
-    Gv, (basex, basey, basez) = cell.get_Gv_weights(mesh)[:2]
-    basex = cp.asarray(basex)
-    basey = cp.asarray(basey)
-    basez = cp.asarray(basez)
+    Gv, (basex, basey, basez) = tools.pbc._get_Gv_with_base(cell, mesh)
     b = cell.reciprocal_vectors()
     coords = cell.atom_coords()
     rb = cp.asarray(coords.dot(b.T))
     SIx = cp.exp(-1j*rb[:,0,None] * basex)
     SIy = cp.exp(-1j*rb[:,1,None] * basey)
     SIz = cp.exp(-1j*rb[:,2,None] * basez)
-    G2 = contract('px,px->p', Gv, Gv)
+    # G2 = contract('px,px->p', Gv, Gv)
+    G2 = batched_vec3_norm2(Gv)
     charges = cell.atom_charges()
 
     coulG = tools.get_coulG(cell, Gv=Gv)
     vlocG = cp.zeros(len(G2), dtype=np.complex128)
     vlocG0 = 0
+
+    _kernel_registery = {}
+
     for ia in range(cell.natm):
         symb = cell.atom_symbol(ia)
         if symb not in cell._pseudo:
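Review note: the kernel added in the next hunk never materializes the full `ngrids`-sized structure factor `SI` per atom; each thread recovers its mesh indices from the flat row-major grid index and multiplies the three 1-D factors on the fly. A NumPy sketch of that index arithmetic (mesh sizes are illustrative):

```python
import numpy as np

nx, ny, nz = 4, 5, 6
SIx, SIy, SIz = (np.exp(1j * np.random.rand(n)) for n in (nx, ny, nz))

# Old path: build the full ngrids-sized product up front.
SI_full = (SIx[:, None, None] * SIy[:, None] * SIz).ravel()

# Kernel path: decompose the flat index exactly as the CUDA code does.
for i_grid in range(nx * ny * nz):
    ix = i_grid // (ny * nz)
    iy = (i_grid - ix * ny * nz) // nz
    iz = i_grid - ix * ny * nz - iy * nz
    assert np.isclose(SI_full[i_grid], SIx[ix] * SIy[iy] * SIz[iz])
```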
@@ -749,24 +750,97 @@ def eval_vpplocG(cell, mesh):
         if nexp == 0:
             continue
 
-        SI = (SIx[ia,:,None,None] * SIy[ia,:,None] * SIz[ia]).ravel()
-        G2_red = G2 * rloc**2
-        SI *= cp.exp(-0.5*G2_red)
         vlocG0 += 2*np.pi*charges[ia]*rloc**2
-        vlocG -= charges[ia] * coulG * SI
 
-        # Add the C1, C2, C3, C4 contributions
-        cfacs = 0
+        fn_name = f"gth_loc_reciporcal_nexp_{nexp}_kernel"
+        if fn_name not in _kernel_registery:
+            C_declaration = ''
+            C_contribution = ''
+            if nexp >= 1:
+                C_declaration += ', const double cexp0'
+                C_contribution += 'cfacs += cexp0;'
+            if nexp >= 2:
+                C_declaration += ', const double cexp1'
+                C_contribution += 'cfacs += cexp1 * (3 - G2_red);'
+            if nexp >= 3:
+                C_declaration += ', const double cexp2'
+                C_contribution += 'cfacs += cexp2 * (15 - 10 * G2_red + G2_red * G2_red);'
+            if nexp >= 4:
+                C_declaration += ', const double cexp3'
+                C_contribution += 'cfacs += cexp3 * (105 - 105 * G2_red + 21 * G2_red * G2_red - G2_red * G2_red * G2_red);'
+            kernel_code = r'''
+            #include <cupy/complex.cuh>
+            extern "C" __global__
+            void ''' + fn_name + '''(
+                const double* __restrict__ grids_G2, const double* __restrict__ grids_coulG,
+                const complex<double>* __restrict__ grids_SIx, const complex<double>* __restrict__ grids_SIy, const complex<double>* __restrict__ grids_SIz,
+                complex<double>* __restrict__ grids_vlocG,
+                const int n_mesh_x, const int n_mesh_y, const int n_mesh_z, const int i_atom,
+                const double charge, const double rloc''' + C_declaration + r''')
+            {
+                const int i_grid = blockDim.x * blockIdx.x + threadIdx.x;
+                const int ngrids = n_mesh_x * n_mesh_y * n_mesh_z;
+                if (i_grid >= ngrids) return;
+
+                const double G2 = grids_G2[i_grid];
+                const double coulG = grids_coulG[i_grid];
+                const double G2_red = G2 * rloc * rloc;
+                const int i_grid_x = i_grid / (n_mesh_y * n_mesh_z);
+                const int i_grid_y = (i_grid - i_grid_x * (n_mesh_y * n_mesh_z)) / n_mesh_z;
+                const int i_grid_z = i_grid - i_grid_x * (n_mesh_y * n_mesh_z) - i_grid_y * n_mesh_z;
+                const complex<double> SIx = grids_SIx[i_atom * n_mesh_x + i_grid_x];
+                const complex<double> SIy = grids_SIy[i_atom * n_mesh_y + i_grid_y];
+                const complex<double> SIz = grids_SIz[i_atom * n_mesh_z + i_grid_z];
+                const complex<double> SI = SIx * SIy * SIz * exp(-0.5 * G2_red);
+                complex<double> vlocG = -charge * coulG * SI;
+
+                double cfacs = 0;
+                ''' + C_contribution + r'''
+                vlocG += 15.749609945722419 * rloc * rloc * rloc * cfacs * SI;
+
+                grids_vlocG[i_grid] += vlocG;
+            }
+            '''
+            _kernel_registery[fn_name] = cp.RawKernel(kernel_code, fn_name)
+        kernel = _kernel_registery[fn_name]
+
+        ngrids = G2.shape[0]
+        assert G2.shape == (ngrids,) and G2.dtype == cp.float64
+        assert coulG.shape == (ngrids,) and coulG.dtype == cp.float64
+        assert SIx.shape == (cell.natm, mesh[0]) and SIx.dtype == cp.complex128 and SIx.flags.c_contiguous
+        assert SIy.shape == (cell.natm, mesh[1]) and SIy.dtype == cp.complex128 and SIy.flags.c_contiguous
+        assert SIz.shape == (cell.natm, mesh[2]) and SIz.dtype == cp.complex128 and SIz.flags.c_contiguous
+        assert vlocG.shape == (ngrids,) and vlocG.dtype == cp.complex128
+        assert ngrids < np.iinfo(np.int32).max
+
+        kernel_parameters = [G2, coulG, SIx, SIy, SIz, vlocG, cp.int32(mesh[0]), cp.int32(mesh[1]), cp.int32(mesh[2]),
+                             cp.int32(ia), cp.float64(charges[ia]), cp.float64(rloc)]
         if nexp >= 1:
-            cfacs += cexp[0]
+            kernel_parameters.append(cp.float64(cexp[0]))
         if nexp >= 2:
-            cfacs += cexp[1] * (3 - G2_red)
+            kernel_parameters.append(cp.float64(cexp[1]))
         if nexp >= 3:
-            cfacs += cexp[2] * (15 - 10*G2_red + G2_red**2)
+            kernel_parameters.append(cp.float64(cexp[2]))
         if nexp >= 4:
-            cfacs += cexp[3] * (105 - 105*G2_red + 21*G2_red**2 - G2_red**3)
-
-        vlocG += (2*np.pi)**(3/2.)*rloc**3 * cfacs * SI
+            kernel_parameters.append(cp.float64(cexp[3]))
+        kernel(((ngrids + 1024 - 1) // 1024, ), (1024, ), kernel_parameters)
+
+        # SI = (SIx[ia,:,None,None] * SIy[ia,:,None] * SIz[ia]).ravel()
+        # G2_red = G2 * rloc**2
+        # SI *= cp.exp(-0.5*G2_red)
+        # vlocG -= charges[ia] * coulG * SI
+
+        # # Add the C1, C2, C3, C4 contributions
+        # cfacs = 0
+        # if nexp >= 1:
+        #     cfacs += cexp[0]
+        # if nexp >= 2:
+        #     cfacs += cexp[1] * (3 - G2_red)
+        # if nexp >= 3:
+        #     cfacs += cexp[2] * (15 - 10*G2_red + G2_red**2)
+        # if nexp >= 4:
+        #     cfacs += cexp[3] * (105 - 105*G2_red + 21*G2_red**2 - G2_red**3)
+        # vlocG += (2*np.pi)**(3/2.)*rloc**3 * cfacs * SI
 
     vlocG[0] += vlocG0
     return vlocG
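Review note: the literal `15.749609945722419` in the device code is `(2*np.pi)**(3/2.)` from the replaced Python line, folded into a constant so each thread avoids a `pow` call; the `cfacs` polynomials in `G2_red` are copied verbatim from the old C1..C4 branches (kept above as comments). A quick check of the constant:

```python
import numpy as np
assert np.isclose((2 * np.pi) ** 1.5, 15.749609945722419)
```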

gpu4pyscf/pbc/tools/pbc.py

Lines changed: 14 additions & 0 deletions
@@ -101,6 +101,7 @@ def ifftk(g, mesh, expikr):
     return ifft(g, mesh) * expikr
 
 def _get_Gv(cell, mesh):
+    assert cell.dimension == 3
     # Default, the 3D uniform grids
     rx = cp.fft.fftfreq(mesh[0], 1./mesh[0])
     ry = cp.fft.fftfreq(mesh[1], 1./mesh[1])
@@ -112,6 +113,19 @@ def _get_Gv(cell, mesh):
           rz[:,None] * b[2])
     return Gv.reshape(-1, 3)
 
+def _get_Gv_with_base(cell, mesh):
+    assert cell.dimension == 3
+    # Default, the 3D uniform grids
+    rx = cp.fft.fftfreq(mesh[0], 1./mesh[0])
+    ry = cp.fft.fftfreq(mesh[1], 1./mesh[1])
+    rz = cp.fft.fftfreq(mesh[2], 1./mesh[2])
+    b = cp.asarray(cell.reciprocal_vectors())
+    #:Gv = lib.cartesian_prod(Gvbase).dot(b)
+    Gv = (rx[:,None,None,None] * b[0] +
+          ry[:,None,None] * b[1] +
+          rz[:,None] * b[2])
+    return Gv.reshape(-1, 3), (rx, ry, rz)
+
 def _Gv_wrap_around(cell, Gv, k, mesh):
     '''wrap around the high frequency k+G vectors into their lower frequency
     counterparts. Important if you want the gamma point and k-point answers to
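Review note: `_get_Gv_with_base` mirrors `_get_Gv` but also returns the raw `fftfreq` bases, so `eval_vpplocG` can build the separable structure factors `SIx`, `SIy`, `SIz` directly on the GPU instead of round-tripping through `cell.get_Gv_weights` and `cp.asarray`. A hedged usage sketch (the Cell below is illustrative and assumes a CUDA-enabled gpu4pyscf install):

```python
import numpy as np
from pyscf.pbc import gto
from gpu4pyscf.pbc.tools import pbc as pbc_tools

cell = gto.M(atom='He 0 0 0', a=np.eye(3) * 4.0,
             basis='gth-szv', pseudo='gth-pade')
mesh = [9, 9, 9]
Gv, (rx, ry, rz) = pbc_tools._get_Gv_with_base(cell, mesh)
assert Gv.shape == (np.prod(mesh), 3)   # flattened G-vectors, row-major over the mesh
assert rx.shape == (mesh[0],)           # per-axis fftfreq base for SIx
```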
