Commit 1c9489d

Faster gth pseudopotential evaluation on big system (#555)
* Faster gth pseudopotential evaluation on big system
* Forgot to push one file
* More check before pseudopotential local term kernel
1 parent b71a499 commit 1c9489d

File tree (3 files changed: +146, -28 lines)

gpu4pyscf/lib/cupy_helper.py
gpu4pyscf/pbc/dft/multigrid.py
gpu4pyscf/pbc/tools/pbc.py

gpu4pyscf/lib/cupy_helper.py

Lines changed: 40 additions & 10 deletions
@@ -110,30 +110,30 @@ def reduce_to_device(array_list, inplace=False):
     assert len(array_list) == num_devices
     if num_devices == 1:
         return array_list[0]
-
+
     out_shape = array_list[0].shape
     for s in _streams:
         s.synchronize()
-
+
     if inplace:
         result = array_list[0]
     else:
         result = array_list[0].copy()
-
+
     # Transfer data chunk by chunk, reduce memory footprint,
     result = result.reshape(-1)
     for device_id, matrix in enumerate(array_list):
         if device_id == 0:
             continue
-
+
         assert matrix.device.id == device_id
         matrix = matrix.reshape(-1)
         blksize = 1024*1024*1024 // matrix.itemsize # 1GB
         for p0, p1 in lib.prange(0,len(matrix), blksize):
             result[p0:p1] += copy_array(matrix[p0:p1])
-            #result[p0:p1] += cupy.asarray(matrix[p0:p1])
+            #result[p0:p1] += cupy.asarray(matrix[p0:p1])
     return result.reshape(out_shape)
-
+
 def device2host_2d(a_cpu, a_gpu, stream=None):
     if stream is None:
         stream = cupy.cuda.get_current_stream()
@@ -183,7 +183,7 @@ def asarray(a, **kwargs):
     # CuPy always allocates pinned memory as a temporary buffer during array transfer.
     # This leads to additional memory usage, and the buffer is not managed by CuPy's
     # memory pool or Python's GC.
-    # See the `cdef _ndarray_base _array_default` function in
+    # See the `cdef _ndarray_base _array_default` function in
     # cupy/_core/core.pyx, where memory buffer is allocated via
     # mem = _alloc_async_transfer_buffer(nbytes)
@@ -388,7 +388,7 @@ def _initialize_c2s_data():
 def block_c2s_diag(angular, counts):
     '''
     Diagonal blocked cartesian to spherical transformation
-    Args:
+    Args:
         angular (list): angular momentum type, e.g. [0,1,2,3]
         counts (list): count of each angular momentum
     '''
@@ -405,7 +405,7 @@ def block_c2s_diag(angular, counts):
         offsets += [c2s_offset[l]] * count
     rows = cupy.hstack(rows)
     cols = cupy.hstack(cols)
-
+
     ncart, nsph = int(rows[-1]), int(cols[-1])
     cart2sph = cupy.zeros([ncart, nsph])
     offsets = cupy.asarray(offsets, dtype='int32')
@@ -690,7 +690,7 @@ def krylov(aop, b, x0=None, tol=1e-10, max_cycle=30, dot=cupy.dot,
         x1, rmat = _stable_qr(x1, cupy.dot, lindep=lindep)
         if len(x1) == 0:
             return cupy.zeros_like(b)
-
+
         x1 *= rmat.diagonal()[:,None]
 
         innerprod = [rmat[i,i].real ** 2 for i in range(x1.shape[0])]
@@ -1153,3 +1153,33 @@ def malloc(size):
         return cuda_malloc(size)
     return default_mempool_malloc(size)
 cupy.cuda.set_allocator(malloc)
+
+def batched_vec3_norm2(batched_vec3):
+    assert type(batched_vec3) is cupy.ndarray
+    assert batched_vec3.dtype == cupy.float64
+    assert batched_vec3.ndim == 2
+    assert batched_vec3.shape[1] == 3
+    assert batched_vec3.flags.c_contiguous
+
+    fn_name = "vec3_norm2_kernel"
+    if fn_name not in _kernel_registery:
+        kernel_code = r'''
+        extern "C" __global__
+        void vec3_norm2_kernel(const double* __restrict__ vec3, double* __restrict__ norm2, const int n) {
+            const int i = blockDim.x * blockIdx.x + threadIdx.x;
+            if (i >= n) return;
+            const double x = vec3[i * 3 + 0];
+            const double y = vec3[i * 3 + 1];
+            const double z = vec3[i * 3 + 2];
+            norm2[i] = x*x + y*y + z*z;
+        }
+        '''
+        _kernel_registery[fn_name] = cupy.RawKernel(kernel_code, fn_name)
+    kernel = _kernel_registery[fn_name]
+
+    n = batched_vec3.shape[0]
+    assert n < np.iinfo(np.int32).max
+    batched_norm2 = cupy.zeros(n, dtype = cupy.float64)
+    kernel(((n + 1024 - 1) // 1024,), (1024,), (batched_vec3, batched_norm2, cupy.int32(n)))
+
+    return batched_norm2
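Review note: `batched_vec3_norm2` computes per-row squared norms of a C-contiguous `(n, 3)` float64 array with one thread per row, replacing the einsum-style `contract('px,px->p', ...)` reduction used before. A minimal usage sketch (the random input and the reference reduction below are illustrative, not part of this commit):

```python
import cupy as cp
from gpu4pyscf.lib.cupy_helper import batched_vec3_norm2

Gv = cp.random.rand(100000, 3)                   # e.g. a batch of G-vectors
G2 = batched_vec3_norm2(Gv)                      # fused kernel, one thread per row
assert cp.allclose(G2, (Gv * Gv).sum(axis=1))    # matches the generic reduction
```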

gpu4pyscf/pbc/dft/multigrid.py

Lines changed: 92 additions & 18 deletions
@@ -26,7 +26,7 @@
 from gpu4pyscf.lib import utils
 from gpu4pyscf.lib.cupy_helper import (
     load_library, tag_array, contract, sandwich_dot, block_diag, transpose_sum,
-    dist_matrix)
+    dist_matrix, batched_vec3_norm2)
 from gpu4pyscf.gto.mole import cart2sph_by_l
 from gpu4pyscf.dft import numint
 from gpu4pyscf.pbc import tools
@@ -723,22 +723,23 @@ def eval_vpplocG(cell, mesh):
     '''PRB, 58, 3641 Eq (5) first term
     '''
     assert cell.dimension != 2
-    Gv, (basex, basey, basez) = cell.get_Gv_weights(mesh)[:2]
-    basex = cp.asarray(basex)
-    basey = cp.asarray(basey)
-    basez = cp.asarray(basez)
+    Gv, (basex, basey, basez) = tools.pbc._get_Gv_with_base(cell, mesh)
     b = cell.reciprocal_vectors()
     coords = cell.atom_coords()
     rb = cp.asarray(coords.dot(b.T))
     SIx = cp.exp(-1j*rb[:,0,None] * basex)
     SIy = cp.exp(-1j*rb[:,1,None] * basey)
     SIz = cp.exp(-1j*rb[:,2,None] * basez)
-    G2 = contract('px,px->p', Gv, Gv)
+    # G2 = contract('px,px->p', Gv, Gv)
+    G2 = batched_vec3_norm2(Gv)
     charges = cell.atom_charges()
 
     coulG = tools.get_coulG(cell, Gv=Gv)
     vlocG = cp.zeros(len(G2), dtype=np.complex128)
     vlocG0 = 0
+
+    _kernel_registery = {}
+
     for ia in range(cell.natm):
         symb = cell.atom_symbol(ia)
         if symb not in cell._pseudo:
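Review note: the kernel added in the next hunk never materializes the full `ngrids`-sized structure factor `SI` per atom; each thread recovers its mesh indices from the flat row-major grid index and multiplies the three 1-D factors on the fly. A NumPy sketch of that index arithmetic (mesh sizes are illustrative):

```python
import numpy as np

nx, ny, nz = 4, 5, 6
SIx, SIy, SIz = (np.exp(1j * np.random.rand(n)) for n in (nx, ny, nz))

# Old path: build the full ngrids-sized product up front.
SI_full = (SIx[:, None, None] * SIy[:, None] * SIz).ravel()

# Kernel path: decompose the flat index exactly as the CUDA code does.
for i_grid in range(nx * ny * nz):
    ix = i_grid // (ny * nz)
    iy = (i_grid - ix * ny * nz) // nz
    iz = i_grid - ix * ny * nz - iy * nz
    assert np.isclose(SI_full[i_grid], SIx[ix] * SIy[iy] * SIz[iz])
```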
@@ -749,24 +750,97 @@ def eval_vpplocG(cell, mesh):
         if nexp == 0:
             continue
 
-        SI = (SIx[ia,:,None,None] * SIy[ia,:,None] * SIz[ia]).ravel()
-        G2_red = G2 * rloc**2
-        SI *= cp.exp(-0.5*G2_red)
         vlocG0 += 2*np.pi*charges[ia]*rloc**2
-        vlocG -= charges[ia] * coulG * SI
 
-        # Add the C1, C2, C3, C4 contributions
-        cfacs = 0
+        fn_name = f"gth_loc_reciporcal_nexp_{nexp}_kernel"
+        if fn_name not in _kernel_registery:
+            C_declaration = ''
+            C_contribution = ''
+            if nexp >= 1:
+                C_declaration += ', const double cexp0'
+                C_contribution += 'cfacs += cexp0;'
+            if nexp >= 2:
+                C_declaration += ', const double cexp1'
+                C_contribution += 'cfacs += cexp1 * (3 - G2_red);'
+            if nexp >= 3:
+                C_declaration += ', const double cexp2'
+                C_contribution += 'cfacs += cexp2 * (15 - 10 * G2_red + G2_red * G2_red);'
+            if nexp >= 4:
+                C_declaration += ', const double cexp3'
+                C_contribution += 'cfacs += cexp3 * (105 - 105 * G2_red + 21 * G2_red * G2_red - G2_red * G2_red * G2_red);'
+            kernel_code = r'''
+            #include <cupy/complex.cuh>
+            extern "C" __global__
+            void ''' + fn_name + '''(
+                const double* __restrict__ grids_G2, const double* __restrict__ grids_coulG,
+                const complex<double>* __restrict__ grids_SIx, const complex<double>* __restrict__ grids_SIy, const complex<double>* __restrict__ grids_SIz,
+                complex<double>* __restrict__ grids_vlocG,
+                const int n_mesh_x, const int n_mesh_y, const int n_mesh_z, const int i_atom,
+                const double charge, const double rloc''' + C_declaration + r''')
+            {
+                const int i_grid = blockDim.x * blockIdx.x + threadIdx.x;
+                const int ngrids = n_mesh_x * n_mesh_y * n_mesh_z;
+                if (i_grid >= ngrids) return;
+
+                const double G2 = grids_G2[i_grid];
+                const double coulG = grids_coulG[i_grid];
+                const double G2_red = G2 * rloc * rloc;
+                const int i_grid_x = i_grid / (n_mesh_y * n_mesh_z);
+                const int i_grid_y = (i_grid - i_grid_x * (n_mesh_y * n_mesh_z)) / n_mesh_z;
+                const int i_grid_z = i_grid - i_grid_x * (n_mesh_y * n_mesh_z) - i_grid_y * n_mesh_z;
+                const complex<double> SIx = grids_SIx[i_atom * n_mesh_x + i_grid_x];
+                const complex<double> SIy = grids_SIy[i_atom * n_mesh_y + i_grid_y];
+                const complex<double> SIz = grids_SIz[i_atom * n_mesh_z + i_grid_z];
+                const complex<double> SI = SIx * SIy * SIz * exp(-0.5 * G2_red);
+                complex<double> vlocG = -charge * coulG * SI;
+
+                double cfacs = 0;
+                ''' + C_contribution + r'''
+                vlocG += 15.749609945722419 * rloc * rloc * rloc * cfacs * SI;
+
+                grids_vlocG[i_grid] += vlocG;
+            }
+            '''
+            _kernel_registery[fn_name] = cp.RawKernel(kernel_code, fn_name)
+        kernel = _kernel_registery[fn_name]
+
+        ngrids = G2.shape[0]
+        assert G2.shape == (ngrids,) and G2.dtype == cp.float64
+        assert coulG.shape == (ngrids,) and coulG.dtype == cp.float64
+        assert SIx.shape == (cell.natm, mesh[0]) and SIx.dtype == cp.complex128 and SIx.flags.c_contiguous
+        assert SIy.shape == (cell.natm, mesh[1]) and SIy.dtype == cp.complex128 and SIy.flags.c_contiguous
+        assert SIz.shape == (cell.natm, mesh[2]) and SIz.dtype == cp.complex128 and SIz.flags.c_contiguous
+        assert vlocG.shape == (ngrids,) and vlocG.dtype == cp.complex128
+        assert ngrids < np.iinfo(np.int32).max
+
+        kernel_parameters = [G2, coulG, SIx, SIy, SIz, vlocG, cp.int32(mesh[0]), cp.int32(mesh[1]), cp.int32(mesh[2]),
+                             cp.int32(ia), cp.float64(charges[ia]), cp.float64(rloc)]
         if nexp >= 1:
-            cfacs += cexp[0]
+            kernel_parameters.append(cp.float64(cexp[0]))
         if nexp >= 2:
-            cfacs += cexp[1] * (3 - G2_red)
+            kernel_parameters.append(cp.float64(cexp[1]))
         if nexp >= 3:
-            cfacs += cexp[2] * (15 - 10*G2_red + G2_red**2)
+            kernel_parameters.append(cp.float64(cexp[2]))
         if nexp >= 4:
-            cfacs += cexp[3] * (105 - 105*G2_red + 21*G2_red**2 - G2_red**3)
-
-        vlocG += (2*np.pi)**(3/2.)*rloc**3 * cfacs * SI
+            kernel_parameters.append(cp.float64(cexp[3]))
+        kernel(((ngrids + 1024 - 1) // 1024, ), (1024, ), kernel_parameters)
+
+        # SI = (SIx[ia,:,None,None] * SIy[ia,:,None] * SIz[ia]).ravel()
+        # G2_red = G2 * rloc**2
+        # SI *= cp.exp(-0.5*G2_red)
+        # vlocG -= charges[ia] * coulG * SI
+
+        # # Add the C1, C2, C3, C4 contributions
+        # cfacs = 0
+        # if nexp >= 1:
+        #     cfacs += cexp[0]
+        # if nexp >= 2:
+        #     cfacs += cexp[1] * (3 - G2_red)
+        # if nexp >= 3:
+        #     cfacs += cexp[2] * (15 - 10*G2_red + G2_red**2)
+        # if nexp >= 4:
+        #     cfacs += cexp[3] * (105 - 105*G2_red + 21*G2_red**2 - G2_red**3)
+        # vlocG += (2*np.pi)**(3/2.)*rloc**3 * cfacs * SI
 
     vlocG[0] += vlocG0
     return vlocG
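Review note: the literal `15.749609945722419` in the device code is `(2*np.pi)**(3/2.)` from the replaced Python line, folded into a constant so each thread avoids a `pow` call; the `cfacs` polynomials in `G2_red` are copied verbatim from the old C1..C4 branches (kept above as comments). A quick check of the constant:

```python
import numpy as np
assert np.isclose((2 * np.pi) ** 1.5, 15.749609945722419)
```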

gpu4pyscf/pbc/tools/pbc.py

Lines changed: 14 additions & 0 deletions
@@ -101,6 +101,7 @@ def ifftk(g, mesh, expikr):
     return ifft(g, mesh) * expikr
 
 def _get_Gv(cell, mesh):
+    assert cell.dimension == 3
     # Default, the 3D uniform grids
     rx = cp.fft.fftfreq(mesh[0], 1./mesh[0])
     ry = cp.fft.fftfreq(mesh[1], 1./mesh[1])
@@ -112,6 +113,19 @@ def _get_Gv(cell, mesh):
           rz[:,None] * b[2])
     return Gv.reshape(-1, 3)
 
+def _get_Gv_with_base(cell, mesh):
+    assert cell.dimension == 3
+    # Default, the 3D uniform grids
+    rx = cp.fft.fftfreq(mesh[0], 1./mesh[0])
+    ry = cp.fft.fftfreq(mesh[1], 1./mesh[1])
+    rz = cp.fft.fftfreq(mesh[2], 1./mesh[2])
+    b = cp.asarray(cell.reciprocal_vectors())
+    #:Gv = lib.cartesian_prod(Gvbase).dot(b)
+    Gv = (rx[:,None,None,None] * b[0] +
+          ry[:,None,None] * b[1] +
+          rz[:,None] * b[2])
+    return Gv.reshape(-1, 3), (rx, ry, rz)
+
 def _Gv_wrap_around(cell, Gv, k, mesh):
     '''wrap around the high frequency k+G vectors into their lower frequency
     counterparts. Important if you want the gamma point and k-point answers to
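Review note: `_get_Gv_with_base` mirrors `_get_Gv` but also returns the raw `fftfreq` bases, so `eval_vpplocG` can build the separable structure factors `SIx`, `SIy`, `SIz` directly on the GPU instead of round-tripping through `cell.get_Gv_weights` and `cp.asarray`. A hedged usage sketch (the Cell below is illustrative and assumes a CUDA-enabled gpu4pyscf install):

```python
import numpy as np
from pyscf.pbc import gto
from gpu4pyscf.pbc.tools import pbc as pbc_tools

cell = gto.M(atom='He 0 0 0', a=np.eye(3) * 4.0,
             basis='gth-szv', pseudo='gth-pade')
mesh = [9, 9, 9]
Gv, (rx, ry, rz) = pbc_tools._get_Gv_with_base(cell, mesh)
assert Gv.shape == (np.prod(mesh), 3)   # flattened G-vectors, row-major over the mesh
assert rx.shape == (mesh[0],)           # per-axis fftfreq base for SIx
```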
