From 509550603b3a73783fda647eac62271d88640895 Mon Sep 17 00:00:00 2001
From: "puzhichen.996" <puzhichen.996@bytedance.com>
Date: Thu, 30 Oct 2025 16:08:58 +0800
Subject: [PATCH 01/21] begin to write

---
 gpu4pyscf/dft/tests/test_gks.py | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)
 create mode 100644 gpu4pyscf/dft/tests/test_gks.py

diff --git a/gpu4pyscf/dft/tests/test_gks.py b/gpu4pyscf/dft/tests/test_gks.py
new file mode 100644
index 000000000..6a5e12d83
--- /dev/null
+++ b/gpu4pyscf/dft/tests/test_gks.py
@@ -0,0 +1,28 @@
+# Copyright 2021-2024 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pickle
+import numpy as np
+import unittest
+import pyscf
+from gpu4pyscf.dft import rks
+
+atom = '''
+O       0.0000000000    -0.0000000000     0.1174000000
+H      -0.7570000000    -0.0000000000    -0.4696000000
+H       0.7570000000     0.0000000000    -0.4696000000
+'''
+bas='def2-tzvpp'
+grids_level = 5
+nlcgrids_level = 2
\ No newline at end of file

From 692fa966d66f7bb4a269a19547bfaa786ab0dd50 Mon Sep 17 00:00:00 2001
From: "puzhichen.996" <puzhichen.996@bytedance.com>
Date: Fri, 31 Oct 2025 07:14:55 +0800
Subject: [PATCH 02/21] in writing

---
 gpu4pyscf/dft/gks.py      | 120 ++++++-
 gpu4pyscf/dft/numint2c.py | 721 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 838 insertions(+), 3 deletions(-)
 create mode 100644 gpu4pyscf/dft/numint2c.py

diff --git a/gpu4pyscf/dft/gks.py b/gpu4pyscf/dft/gks.py
index 8d83e3f92..36b3f1ae6 100644
--- a/gpu4pyscf/dft/gks.py
+++ b/gpu4pyscf/dft/gks.py
@@ -12,19 +12,133 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import cupy as cp
+from pyscf import lib
 from pyscf.dft import gks
 from gpu4pyscf.dft import numint
 from gpu4pyscf.dft import rks
 from gpu4pyscf.scf.ghf import GHF
+from gpu4pyscf.lib import logger
+from gpu4pyscf.lib.cupy_helper import tag_array
+
+
+def get_veff(ks, mol=None, dm=None, dm_last=0, vhf_last=0, hermi=1):
+    '''Coulomb + XC functional
+
+    .. note::
+        This function will change the ks object.
+
+    Args:
+        ks : an instance of :class:`RKS`
+            XC functional are controlled by ks.xc attribute.  Attribute
+            ks.grids might be initialized.
+        dm : ndarray or list of ndarrays
+            A density matrix or a list of density matrices
+
+    Kwargs:
+        dm_last : ndarray or a list of ndarrays or 0
+            The density matrix baseline.  If not 0, this function computes the
+            increment of HF potential w.r.t. the reference HF potential matrix.
+        vhf_last : ndarray or a list of ndarrays or 0
+            The reference Vxc potential matrix.
+        hermi : int
+            Whether J, K matrix is hermitian
+
+            | 0 : no hermitian or symmetric
+            | 1 : hermitian
+            | 2 : anti-hermitian
+
+    Returns:
+        matrix Veff = J + Vxc.  Veff can be a list matrices, if the input
+        dm is a list of density matrices.
+    '''
+    if mol is None: mol = ks.mol
+    if dm is None: dm = ks.make_rdm1()
+    ks.initialize_grids(mol, dm)
+
+    t0 = (logger.process_clock(), logger.perf_counter())
+
+    ground_state = isinstance(dm, cp.ndarray) and dm.ndim == 2
+
+    if hermi == 2:  # because rho = 0
+        n, exc, vxc = 0, 0, 0
+    else:
+        max_memory = ks.max_memory - lib.current_memory()[0]
+        ni = ks._numint
+        n, exc, vxc = ni.get_vxc(mol, ks.grids, ks.xc, dm,
+                                 hermi=hermi, max_memory=max_memory)
+        logger.debug(ks, 'nelec by numeric integration = %s', n)
+        if ks.do_nlc():
+            if ni.libxc.is_nlc(ks.xc):
+                xc = ks.xc
+            else:
+                assert ni.libxc.is_nlc(ks.nlc)
+                xc = ks.nlc
+            n, enlc, vnlc = ni.nr_nlc_vxc(mol, ks.nlcgrids, xc, dm,
+                                          hermi=hermi, max_memory=max_memory)
+            exc += enlc
+            vxc += vnlc
+            logger.debug(ks, 'nelec with nlc grids = %s', n)
+        t0 = logger.timer(ks, 'vxc', *t0)
+
+    incremental_jk = (ks._eri is None and ks.direct_scf and
+                      getattr(vhf_last, 'vj', None) is not None)
+    if incremental_jk:
+        _dm = cp.asarray(dm) - cp.asarray(dm_last)
+    else:
+        _dm = dm
+    if not ni.libxc.is_hybrid_xc(ks.xc):
+        vk = None
+        vj = ks.get_j(mol, _dm, hermi)
+        if incremental_jk:
+            vj += vhf_last.vj
+        vxc += vj
+    else:
+        omega, alpha, hyb = ni.rsh_and_hybrid_coeff(ks.xc, spin=mol.spin)
+        if omega == 0:
+            vj, vk = ks.get_jk(mol, _dm, hermi)
+            vk *= hyb
+        elif alpha == 0: # LR=0, only SR exchange
+            vj = ks.get_j(mol, _dm, hermi)
+            vk = ks.get_k(mol, _dm, hermi, omega=-omega)
+            vk *= hyb
+        elif hyb == 0: # SR=0, only LR exchange
+            vj = ks.get_j(mol, _dm, hermi)
+            vk = ks.get_k(mol, _dm, hermi, omega=omega)
+            vk *= alpha
+        else: # SR and LR exchange with different ratios
+            vj, vk = ks.get_jk(mol, _dm, hermi)
+            vk *= hyb
+            vklr = ks.get_k(mol, _dm, hermi, omega=omega)
+            vklr *= (alpha - hyb)
+            vk += vklr
+        if incremental_jk:
+            vj += vhf_last.vj
+            vk += vhf_last.vk
+        vxc += vj - vk
+
+        if ground_state:
+            exc -= cp.einsum('ij,ji', dm, vk).real * .5
+
+    if ground_state:
+        ecoul = cp.einsum('ij,ji', dm, vj).real * .5
+    else:
+        ecoul = None
+
+    vxc = tag_array(vxc, ecoul=ecoul, exc=exc, vj=vj, vk=vk)
+    return vxc
+
 
 class GKS(gks.GKS, GHF):
     from gpu4pyscf.lib.utils import to_cpu, to_gpu, device
 
     def __init__(self, mol, xc='LDA,VWN'):
-        raise NotImplementedError
-
+        GHF.__init__(self, mol)
+        rks.KohnShamDFT.__init__(self, xc)
+        # self._numint = NumInt2C()
+    
+    get_veff = get_veff
     reset = rks.RKS.reset
     energy_elec = rks.RKS.energy_elec
-    get_veff = NotImplemented
     nuc_grad_method = NotImplemented
     to_hf = NotImplemented
diff --git a/gpu4pyscf/dft/numint2c.py b/gpu4pyscf/dft/numint2c.py
new file mode 100644
index 000000000..034d3fe03
--- /dev/null
+++ b/gpu4pyscf/dft/numint2c.py
@@ -0,0 +1,721 @@
+# Copyright 2021-2024 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+'''
+Numerical integration functions for (2-component) GKS with real AO basis
+'''
+
+import functools
+import numpy as np
+from pyscf import lib
+from pyscf.dft import numint
+from pyscf.dft.numint import _dot_ao_dm, _dot_ao_ao, _scale_ao, _tau_dot, BLKSIZE
+from pyscf.dft import xc_deriv
+from pyscf import __config__
+
+
+@lib.with_doc(
+    '''Calculate the electron density and magnetization spin density in the
+    framework of 2-component real basis.
+    ''' + numint.eval_rho.__doc__)
+def eval_rho(mol, ao, dm, non0tab=None, xctype='LDA', hermi=0,
+             with_lapl=True, verbose=None):
+    nao = ao.shape[-1]
+    assert dm.ndim == 2 and nao * 2 == dm.shape[0]
+
+    ngrids, nao = ao.shape[-2:]
+    xctype = xctype.upper()
+    shls_slice = (0, mol.nbas)
+    ao_loc = mol.ao_loc
+
+    if xctype == 'LDA':
+        c0a = _dot_ao_dm(mol, ao, dm[:nao], non0tab, shls_slice, ao_loc)
+        c0b = _dot_ao_dm(mol, ao, dm[nao:], non0tab, shls_slice, ao_loc)
+        rho_m = _contract_rho_m((ao, ao), (c0a, c0b), hermi, True)
+    elif xctype == 'GGA':
+        # first 4 ~ (rho, m), second 4 ~ (0th order, dx, dy, dz)
+        if hermi:
+            rho_m = np.empty((4, 4, ngrids))
+        else:
+            rho_m = np.empty((4, 4, ngrids), dtype=np.complex128)
+        c0a = _dot_ao_dm(mol, ao[0], dm[:nao], non0tab, shls_slice, ao_loc)
+        c0b = _dot_ao_dm(mol, ao[0], dm[nao:], non0tab, shls_slice, ao_loc)
+        c0 = (c0a, c0b)
+        rho_m[:,0] = _contract_rho_m((ao[0], ao[0]), c0, hermi, True)
+        for i in range(1, 4):
+            rho_m[:,i] = _contract_rho_m((ao[i], ao[i]), c0, hermi, False)
+        if hermi:
+            rho_m[:,1:4] *= 2  # *2 for |ao> dm < dx ao| + |dx ao> dm < ao|
+        else:
+            for i in range(1, 4):
+                c1a = _dot_ao_dm(mol, ao[i], dm[:nao], non0tab, shls_slice, ao_loc)
+                c1b = _dot_ao_dm(mol, ao[i], dm[nao:], non0tab, shls_slice, ao_loc)
+                rho_m[:,i] += _contract_rho_m((ao[0], ao[0]), (c1a, c1b), hermi, False)
+    else: # meta-GGA
+        if hermi:
+            dtype = np.double
+        else:
+            dtype = np.complex128
+        if with_lapl:
+            rho_m = np.empty((4, 6, ngrids), dtype=dtype)
+            tau_idx = 5
+        else:
+            rho_m = np.empty((4, 5, ngrids), dtype=dtype)
+            tau_idx = 4
+        c0a = _dot_ao_dm(mol, ao[0], dm[:nao], non0tab, shls_slice, ao_loc)
+        c0b = _dot_ao_dm(mol, ao[0], dm[nao:], non0tab, shls_slice, ao_loc)
+        c0 = (c0a, c0b)
+        rho_m[:,0] = _contract_rho_m((ao[0], ao[0]), c0, hermi, True)
+        rho_m[:,tau_idx] = 0
+        for i in range(1, 4):
+            c1a = _dot_ao_dm(mol, ao[i], dm[:nao], non0tab, shls_slice, ao_loc)
+            c1b = _dot_ao_dm(mol, ao[i], dm[nao:], non0tab, shls_slice, ao_loc)
+            rho_m[:,tau_idx] += _contract_rho_m((ao[i], ao[i]), (c1a, c1b), hermi, True)
+
+            rho_m[:,i] = _contract_rho_m((ao[i], ao[i]), c0, hermi, False)
+            if hermi:
+                rho_m[:,i] *= 2
+            else:
+                rho_m[:,i] += _contract_rho_m((ao[0], ao[0]), (c1a, c1b), hermi, False)
+        if with_lapl:
+            # TODO: rho_m[:,4] = \nabla^2 rho
+            raise NotImplementedError
+        # tau = 1/2 (\nabla f)^2
+        rho_m[:,tau_idx] *= .5
+    return rho_m
+
+def _gks_mcol_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=0,
+                  max_memory=2000, verbose=None):
+    xctype = ni._xc_type(xc_code)
+    shls_slice = (0, mol.nbas)
+    ao_loc = mol.ao_loc_nr()
+
+    make_rho, nset, n2c = ni._gen_rho_evaluator(mol, dms, hermi, False)
+    nao = n2c // 2
+
+    nelec = np.zeros(nset)
+    excsum = np.zeros(nset)
+    vmat = np.zeros((nset,n2c,n2c), dtype=np.complex128)
+
+    if xctype in ('LDA', 'GGA', 'MGGA'):
+        f_eval_mat = {
+            ('LDA' , 'n'): (_ncol_lda_vxc_mat , 0),
+            ('LDA' , 'm'): (_mcol_lda_vxc_mat , 0),
+            ('GGA' , 'm'): (_mcol_gga_vxc_mat , 1),
+            ('MGGA', 'm'): (_mcol_mgga_vxc_mat, 1),
+        }
+        fmat, ao_deriv = f_eval_mat[(xctype, ni.collinear[0])]
+
+        if ni.collinear[0] == 'm':  # mcol
+            eval_xc = ni.mcfun_eval_xc_adapter(xc_code)
+        else:
+            eval_xc = ni.eval_xc_eff
+
+        for ao, mask, weight, coords \
+                in ni.block_loop(mol, grids, nao, ao_deriv, max_memory):
+            for i in range(nset):
+                rho = make_rho(i, ao, mask, xctype)
+                exc, vxc = eval_xc(xc_code, rho, deriv=1, xctype=xctype)[:2]
+                if xctype == 'LDA':
+                    den = rho[0] * weight
+                else:
+                    den = rho[0,0] * weight
+                nelec[i] += den.sum()
+                excsum[i] += np.dot(den, exc)
+                vmat[i] += fmat(mol, ao, weight, rho, vxc, mask, shls_slice,
+                                ao_loc, hermi)
+
+    elif xctype == 'HF':
+        pass
+    else:
+        raise NotImplementedError(f'numint2c.get_vxc for functional {xc_code}')
+
+    if hermi:
+        vmat = vmat + vmat.conj().transpose(0,2,1)
+    if isinstance(dms, np.ndarray) and dms.ndim == 2:
+        vmat = vmat[0]
+        nelec = nelec[0]
+        excsum = excsum[0]
+    return nelec, excsum, vmat
+
+def _gks_mcol_fxc(ni, mol, grids, xc_code, dm0, dms, relativity=0, hermi=0,
+                  rho0=None, vxc=None, fxc=None, max_memory=2000, verbose=None):
+    assert ni.collinear[0] == 'm'  # mcol
+    xctype = ni._xc_type(xc_code)
+    if fxc is None and xctype in ('LDA', 'GGA', 'MGGA'):
+        fxc = ni.cache_xc_kernel1(mol, grids, xc_code, dm0,
+                                  max_memory=max_memory)[2]
+
+    if xctype == 'MGGA':
+        fmat, ao_deriv = (_mcol_mgga_fxc_mat , 1)
+    elif xctype == 'GGA':
+        fmat, ao_deriv = (_mcol_gga_fxc_mat  , 1)
+    else:
+        fmat, ao_deriv = (_mcol_lda_fxc_mat  , 0)
+
+    shls_slice = (0, mol.nbas)
+    ao_loc = mol.ao_loc_nr()
+    make_rho1, nset, n2c = ni._gen_rho_evaluator(mol, dms, hermi, False)
+    nao = n2c // 2
+    vmat = np.zeros((nset,n2c,n2c), dtype=np.complex128)
+
+    if xctype in ('LDA', 'GGA', 'MGGA'):
+        _rho0 = None
+        p1 = 0
+        for ao, mask, weight, coords \
+                in ni.block_loop(mol, grids, nao, ao_deriv, max_memory):
+            p0, p1 = p1, p1 + weight.size
+            _fxc = fxc[:,:,:,:,p0:p1]
+            for i in range(nset):
+                rho1 = make_rho1(i, ao, mask, xctype)
+                vmat[i] += fmat(mol, ao, weight, _rho0, rho1, _fxc,
+                                mask, shls_slice, ao_loc, hermi)
+    elif xctype == 'HF':
+        pass
+    else:
+        raise NotImplementedError(f'numint2c.get_fxc for functional {xc_code}')
+
+    if hermi:
+        vmat = vmat + vmat.conj().transpose(0,2,1)
+    if isinstance(dms, np.ndarray) and dms.ndim == 2:
+        vmat = vmat[0]
+    return vmat
+
+def _ncol_lda_vxc_mat(mol, ao, weight, rho, vxc, mask, shls_slice, ao_loc, hermi):
+    '''Vxc matrix of non-collinear LDA'''
+    # NOTE vxc in u/d representation
+    r, mx, my, mz = rho
+    vxc = xc_deriv.ud2ts(vxc)
+    vr, vs = vxc[:,0]
+    s = lib.norm(rho[1:4], axis=0)
+
+    wv = weight * vr
+    with np.errstate(divide='ignore',invalid='ignore'):
+        ws = vs * weight / s
+    ws[s < 1e-20] = 0
+
+    # * .5 because of v+v.conj().T in r_vxc
+    if hermi:
+        wv *= .5
+        ws *= .5
+    aow = None
+    aow = _scale_ao(ao, ws*mx, out=aow)  # Mx
+    tmpx = _dot_ao_ao(mol, ao, aow, mask, shls_slice, ao_loc)
+    aow = _scale_ao(ao, ws*my, out=aow)  # My
+    tmpy = _dot_ao_ao(mol, ao, aow, mask, shls_slice, ao_loc)
+    if hermi:
+        # conj(mx+my*1j) == mx-my*1j, tmpx and tmpy should be real
+        matba = (tmpx + tmpx.T) + (tmpy + tmpy.T) * 1j
+        matab = np.zeros_like(matba)
+    else:
+        # conj(mx+my*1j) != mx-my*1j, tmpx and tmpy should be complex
+        matba = tmpx + tmpy * 1j
+        matab = tmpx - tmpy * 1j
+    tmpx = tmpy = None
+    aow = _scale_ao(ao, wv+ws*mz, out=aow)  # Mz
+    mataa = _dot_ao_ao(mol, ao, aow, mask, shls_slice, ao_loc)
+    aow = _scale_ao(ao, wv-ws*mz, out=aow)  # Mz
+    matbb = _dot_ao_ao(mol, ao, aow, mask, shls_slice, ao_loc)
+    mat = np.block([[mataa, matab], [matba, matbb]])
+    return mat
+
+def _eval_xc_eff(ni, xc_code, rho, deriv=1, omega=None, xctype=None,
+                 verbose=None, spin=None):
+    r'''Returns the derivative tensor against the density parameters
+
+    [[density_a, (nabla_x)_a, (nabla_y)_a, (nabla_z)_a, tau_a],
+     [density_b, (nabla_x)_b, (nabla_y)_b, (nabla_z)_b, tau_b]].
+
+    It differs from the eval_xc method in the derivatives of non-local part.
+    The eval_xc method returns the XC functional derivatives to sigma
+    (|\nabla \rho|^2)
+
+    Args:
+        rho: 2-dimensional or 3-dimensional array
+            Total density and spin density (and their derivatives if GGA or MGGA
+            functionals) on grids
+
+    Kwargs:
+        deriv: int
+            derivative orders
+        omega: float
+            define the exponent in the attenuated Coulomb for RSH functional
+    '''
+    if omega is None: omega = ni.omega
+    if xctype is None: xctype = ni._xc_type(xc_code)
+    if ni.collinear[0] == 'c':  # collinear
+        t = rho[0]
+        s = rho[3]
+    elif ni.collinear[0] == 'n':  # ncol
+        t = rho[0]
+        m = rho[1:4]
+        s = lib.norm(m, axis=0)
+    elif ni.collinear[0] == 'm':  # mcol
+        # called by mcfun.eval_xc_eff which passes (rho, s) only
+        t, s = rho
+    else:
+        raise RuntimeError(f'Unknown collinear scheme {ni.collinear}')
+
+    rho = np.stack([(t + s) * .5, (t - s) * .5])
+    if xctype == 'MGGA' and rho.shape[1] == 6:
+        rho = np.asarray(rho[:,[0,1,2,3,5],:], order='C')
+
+    assert spin is None or spin == 1
+    spin = 1
+
+    out = ni.libxc.eval_xc1(xc_code, rho, spin, deriv, omega)
+    evfk = [out[0]]
+    for order in range(1, deriv+1):
+        evfk.append(xc_deriv.transform_xc(rho, out, xctype, spin, order))
+    if deriv < 3:
+        # The return has at least [e, v, f, k] terms
+        evfk.extend([None] * (3 - deriv))
+    return evfk
+
+# * Mcfun requires functional derivatives to total-density and spin-density.
+# * Make it a global function than a closure so as to be callable by multiprocessing
+def __mcfun_fn_eval_xc(ni, xc_code, xctype, rho, deriv):
+    evfk = ni.eval_xc_eff(xc_code, rho, deriv=deriv, xctype=xctype)
+    for order in range(1, deriv+1):
+        if evfk[order] is not None:
+            evfk[order] = xc_deriv.ud2ts(evfk[order])
+    return evfk
+
+def mcfun_eval_xc_adapter(ni, xc_code):
+    '''Wrapper to generate the eval_xc function required by mcfun'''
+    try:
+        import mcfun
+    except ImportError:
+        raise ImportError('This feature requires mcfun library.\n'
+                          'Try install mcfun with `pip install mcfun`')
+
+    xctype = ni._xc_type(xc_code)
+    fn_eval_xc = functools.partial(__mcfun_fn_eval_xc, ni, xc_code, xctype)
+    nproc = lib.num_threads()
+    def eval_xc_eff(xc_code, rho, deriv=1, omega=None, xctype=None,
+                    verbose=None, spin=None):
+        return mcfun.eval_xc_eff(
+            fn_eval_xc, rho, deriv, spin_samples=ni.spin_samples,
+            collinear_threshold=ni.collinear_thrd,
+            collinear_samples=ni.collinear_samples, workers=nproc)
+    return eval_xc_eff
+
+def _mcol_lda_vxc_mat(mol, ao, weight, rho, vxc, mask, shls_slice, ao_loc, hermi):
+    '''Vxc matrix of multi-collinear LDA'''
+    wv = weight * vxc
+    if hermi:
+        wv *= .5  # * .5 because of v+v.conj().T in r_vxc
+    wr, wmx, wmy, wmz = wv
+
+    # einsum('g,g,xgi,xgj->ij', vxc, weight, ao, ao)
+    # + einsum('xy,g,g,xgi,ygj->ij', sx, vxc, weight, ao, ao)
+    # + einsum('xy,g,g,xgi,ygj->ij', sy, vxc, weight, ao, ao)
+    # + einsum('xy,g,g,xgi,ygj->ij', sz, vxc, weight, ao, ao)
+    aow = None
+    aow = _scale_ao(ao, wmx[0], out=aow)  # Mx
+    tmpx = _dot_ao_ao(mol, ao, aow, mask, shls_slice, ao_loc)
+    aow = _scale_ao(ao, wmy[0], out=aow)  # My
+    tmpy = _dot_ao_ao(mol, ao, aow, mask, shls_slice, ao_loc)
+    if hermi:
+        # conj(mx+my*1j) == mx-my*1j, tmpx and tmpy should be real
+        matba = (tmpx + tmpx.T) + (tmpy + tmpy.T) * 1j
+        matab = np.zeros_like(matba)
+    else:
+        # conj(mx+my*1j) != mx-my*1j, tmpx and tmpy should be complex
+        matba = tmpx + tmpy * 1j
+        matab = tmpx - tmpy * 1j
+    tmpx = tmpy = None
+    aow = _scale_ao(ao, wr[0]+wmz[0], out=aow)  # Mz
+    mataa = _dot_ao_ao(mol, ao, aow, mask, shls_slice, ao_loc)
+    aow = _scale_ao(ao, wr[0]-wmz[0], out=aow)  # Mz
+    matbb = _dot_ao_ao(mol, ao, aow, mask, shls_slice, ao_loc)
+    mat = np.block([[mataa, matab], [matba, matbb]])
+    return mat
+
+def _mcol_gga_vxc_mat(mol, ao, weight, rho, vxc, mask, shls_slice, ao_loc, hermi):
+    '''Vxc matrix of multi-collinear LDA'''
+    wv = weight * vxc
+    if hermi:
+        wv[:,0] *= .5  # * .5 because of v+v.conj().T in r_vxc
+    wr, wmx, wmy, wmz = wv
+
+    aow = None
+    aow = _scale_ao(ao[:4], wr[:4]+wmz[:4], out=aow)  # Mz
+    mataa = _dot_ao_ao(mol, ao[0], aow, mask, shls_slice, ao_loc)
+    aow = _scale_ao(ao[:4], wr[:4]-wmz[:4], out=aow)  # Mz
+    matbb = _dot_ao_ao(mol, ao[0], aow, mask, shls_slice, ao_loc)
+    aow = _scale_ao(ao[:4], wmx[:4], out=aow)  # Mx
+    tmpx = _dot_ao_ao(mol, ao[0], aow, mask, shls_slice, ao_loc)
+    aow = _scale_ao(ao[:4], wmy[:4], out=aow)  # My
+    tmpy = _dot_ao_ao(mol, ao[0], aow, mask, shls_slice, ao_loc)
+    if hermi:
+        assert vxc.dtype == np.double
+        # conj(mx+my*1j) == mx-my*1j, tmpx and tmpy should be real
+        matba = (tmpx + tmpx.T) + (tmpy + tmpy.T) * 1j
+        matab = np.zeros_like(matba)
+    else:
+        # conj(mx+my*1j) != mx-my*1j, tmpx and tmpy should be complex
+        aow = _scale_ao(ao[1:4], wmx[1:4].conj(), out=aow)  # Mx
+        tmpx += _dot_ao_ao(mol, aow, ao[0], mask, shls_slice, ao_loc)
+        aow = _scale_ao(ao[1:4], wmy[1:4].conj(), out=aow)  # My
+        tmpy += _dot_ao_ao(mol, aow, ao[0], mask, shls_slice, ao_loc)
+        matba = tmpx + tmpy * 1j
+        matab = tmpx - tmpy * 1j
+        aow = _scale_ao(ao[1:4], (wr[1:4]+wmz[1:4]).conj(), out=aow)  # Mz
+        mataa += _dot_ao_ao(mol, aow, ao[0], mask, shls_slice, ao_loc)
+        aow = _scale_ao(ao[1:4], (wr[1:4]-wmz[1:4]).conj(), out=aow)  # Mz
+        matbb += _dot_ao_ao(mol, aow, ao[0], mask, shls_slice, ao_loc)
+
+    mat = np.block([[mataa, matab], [matba, matbb]])
+    return mat
+
+def _mcol_mgga_vxc_mat(mol, ao, weight, rho, vxc, mask, shls_slice, ao_loc, hermi):
+    '''Vxc matrix of multi-collinear MGGA'''
+    wv = weight * vxc
+    tau_idx = 4
+    wv[:,tau_idx] *= .5  # *.5 for 1/2 in tau
+    if hermi:
+        wv[:,0] *= .5  # * .5 because of v+v.conj().T in r_vxc
+        wv[:,tau_idx] *= .5
+    wr, wmx, wmy, wmz = wv
+
+    aow = None
+    aow = _scale_ao(ao[:4], wr[:4]+wmz[:4], out=aow)  # Mz
+    mataa = _dot_ao_ao(mol, ao[0], aow, mask, shls_slice, ao_loc)
+    mataa += _tau_dot(mol, ao, ao, wr[tau_idx]+wmz[tau_idx], mask, shls_slice, ao_loc)
+    aow = _scale_ao(ao[:4], wr[:4]-wmz[:4], out=aow)  # Mz
+    matbb = _dot_ao_ao(mol, ao[0], aow, mask, shls_slice, ao_loc)
+    matbb += _tau_dot(mol, ao, ao, wr[tau_idx]-wmz[tau_idx], mask, shls_slice, ao_loc)
+
+    aow = _scale_ao(ao[:4], wmx[:4], out=aow)  # Mx
+    tmpx = _dot_ao_ao(mol, ao[0], aow, mask, shls_slice, ao_loc)
+    tmpx += _tau_dot(mol, ao, ao, wmx[tau_idx], mask, shls_slice, ao_loc)
+    aow = _scale_ao(ao[:4], wmy[:4], out=aow)  # My
+    tmpy = _dot_ao_ao(mol, ao[0], aow, mask, shls_slice, ao_loc)
+    tmpy += _tau_dot(mol, ao, ao, wmy[tau_idx], mask, shls_slice, ao_loc)
+    if hermi:
+        assert vxc.dtype == np.double
+        # conj(mx+my*1j) == mx-my*1j, tmpx and tmpy should be real
+        matba = (tmpx + tmpx.T) + (tmpy + tmpy.T) * 1j
+        matab = np.zeros_like(matba)
+    else:
+        # conj(mx+my*1j) != mx-my*1j, tmpx and tmpy should be complex
+        aow = _scale_ao(ao[1:4], wmx[1:4].conj(), out=aow)  # Mx
+        tmpx += _dot_ao_ao(mol, aow, ao[0], mask, shls_slice, ao_loc)
+        aow = _scale_ao(ao[1:4], wmy[1:4].conj(), out=aow)  # My
+        tmpy += _dot_ao_ao(mol, aow, ao[0], mask, shls_slice, ao_loc)
+        matba = tmpx + tmpy * 1j
+        matab = tmpx - tmpy * 1j
+        aow = _scale_ao(ao[1:4], (wr[1:4]+wmz[1:4]).conj(), out=aow)  # Mz
+        mataa += _dot_ao_ao(mol, aow, ao[0], mask, shls_slice, ao_loc)
+        aow = _scale_ao(ao[1:4], (wr[1:4]-wmz[1:4]).conj(), out=aow)  # Mz
+        matbb += _dot_ao_ao(mol, aow, ao[0], mask, shls_slice, ao_loc)
+
+    mat = np.block([[mataa, matab], [matba, matbb]])
+    return mat
+
+def _mcol_lda_fxc_mat(mol, ao, weight, rho0, rho1, fxc,
+                      mask, shls_slice, ao_loc, hermi):
+    '''Kernel matrix of multi-collinear LDA'''
+    vxc1 = np.einsum('ag,abyg->byg', rho1, fxc[:,0,:])
+    return _mcol_lda_vxc_mat(mol, ao, weight, rho0, vxc1, mask, shls_slice,
+                             ao_loc, hermi)
+
+def _mcol_gga_fxc_mat(mol, ao, weight, rho0, rho1, fxc,
+                      mask, shls_slice, ao_loc, hermi):
+    '''Kernel matrix of multi-collinear GGA'''
+    vxc1 = np.einsum('axg,axbyg->byg', rho1, fxc)
+    return _mcol_gga_vxc_mat(mol, ao, weight, rho0, vxc1, mask, shls_slice,
+                             ao_loc, hermi)
+
+def _mcol_mgga_fxc_mat(mol, ao, weight, rho0, rho1, fxc,
+                       mask, shls_slice, ao_loc, hermi):
+    '''Kernel matrix of multi-collinear MGGA'''
+    vxc1 = np.einsum('axg,axbyg->byg', rho1, fxc)
+    return _mcol_mgga_vxc_mat(mol, ao, weight, rho0, vxc1, mask, shls_slice,
+                              ao_loc, hermi)
+
+def _contract_rho_m(bra, ket, hermi=0, bra_eq_ket=False):
+    '''
+    hermi indicates whether the density matrix is hermitian.
+    bra_eq_ket indicates whether bra and ket basis are the same AOs.
+    '''
+    # rho = einsum('xgi,ij,xgj->g', ket, dm, bra.conj())
+    # mx = einsum('xy,ygi,ij,xgj->g', sx, ket, dm, bra.conj())
+    # my = einsum('xy,ygi,ij,xgj->g', sy, ket, dm, bra.conj())
+    # mz = einsum('xy,ygi,ij,xgj->g', sz, ket, dm, bra.conj())
+    ket_a, ket_b = ket
+    bra_a, bra_b = bra
+    nao = min(ket_a.shape[1], bra_a.shape[1])
+    ngrids = ket_a.shape[0]
+    if hermi:
+        raa = np.einsum('pi,pi->p', bra_a.real, ket_a[:,:nao].real)
+        raa+= np.einsum('pi,pi->p', bra_a.imag, ket_a[:,:nao].imag)
+        rab = np.einsum('pi,pi->p', bra_a.conj(), ket_b[:,:nao])
+        rbb = np.einsum('pi,pi->p', bra_b.real, ket_b[:,nao:].real)
+        rbb+= np.einsum('pi,pi->p', bra_b.imag, ket_b[:,nao:].imag)
+        rho_m = np.empty((4, ngrids))
+        rho_m[0,:] = raa + rbb     # rho
+        rho_m[1,:] = rab.real      # mx
+        rho_m[2,:] = rab.imag      # my
+        rho_m[3,:] = raa - rbb     # mz
+        if bra_eq_ket:
+            rho_m[1,:] *= 2
+            rho_m[2,:] *= 2
+        else:
+            rba = np.einsum('pi,pi->p', bra_b.conj(), ket_a[:,nao:])
+            rho_m[1,:] += rba.real
+            rho_m[2,:] -= rba.imag
+    else:
+        raa = np.einsum('pi,pi->p', bra_a.conj(), ket_a[:,:nao])
+        rba = np.einsum('pi,pi->p', bra_b.conj(), ket_a[:,nao:])
+        rab = np.einsum('pi,pi->p', bra_a.conj(), ket_b[:,:nao])
+        rbb = np.einsum('pi,pi->p', bra_b.conj(), ket_b[:,nao:])
+        rho_m = np.empty((4, ngrids), dtype=np.complex128)
+        rho_m[0,:] = raa + rbb         # rho
+        rho_m[1,:] = rab + rba         # mx
+        rho_m[2,:] = (rba - rab) * 1j  # my
+        rho_m[3,:] = raa - rbb         # mz
+    return rho_m
+
+
+class NumInt2C(lib.StreamObject, numint.LibXCMixin):
+    '''Numerical integration methods for 2-component basis (used by GKS)'''
+
+    # collinear schemes:
+    #   'col' (collinear, by default)
+    #   'ncol' (non-collinear)
+    #   'mcol' (multi-collinear)
+    collinear = getattr(__config__, 'dft_numint_RnumInt_collinear', 'col')
+    spin_samples = getattr(__config__, 'dft_numint_RnumInt_spin_samples', 770)
+    collinear_thrd = getattr(__config__, 'dft_numint_RnumInt_collinear_thrd', 0.99)
+    collinear_samples = getattr(__config__, 'dft_numint_RnumInt_collinear_samples', 200)
+
+    make_mask = staticmethod(numint.make_mask)
+    eval_ao = staticmethod(numint.eval_ao)
+    eval_rho = staticmethod(eval_rho)
+
+    def eval_rho1(self, mol, ao, dm, screen_index=None, xctype='LDA', hermi=0,
+                  with_lapl=True, cutoff=None, ao_cutoff=None, pair_mask=None,
+                  verbose=None):
+        return self.eval_rho(mol, ao, dm, screen_index, xctype, hermi,
+                             with_lapl, verbose=verbose)
+
+    def eval_rho2(self, mol, ao, mo_coeff, mo_occ, non0tab=None, xctype='LDA',
+                  with_lapl=True, verbose=None):
+        '''Calculate the electron density for LDA functional and the density
+        derivatives for GGA functional in the framework of 2-component basis.
+        '''
+        if self.collinear[0] in ('n', 'm'):
+            # TODO:
+            dm = np.dot(mo_coeff * mo_occ, mo_coeff.conj().T)
+            hermi = 1
+            rho = self.eval_rho(mol, ao, dm, non0tab, xctype, hermi, with_lapl, verbose)
+            return rho
+
+        #if mo_coeff.dtype == np.double:
+        #    nao = ao.shape[-1]
+        #    assert nao * 2 == mo_coeff.shape[0]
+        #    mo_aR = mo_coeff[:nao]
+        #    mo_bR = mo_coeff[nao:]
+        #    rho  = numint.eval_rho2(mol, ao, mo_aR, mo_occ, non0tab, xctype, with_lapl, verbose)
+        #    rho += numint.eval_rho2(mol, ao, mo_bR, mo_occ, non0tab, xctype, with_lapl, verbose)
+        #else:
+        #    dm = np.dot(mo_coeff * mo_occ, mo_coeff.conj().T)
+        #    hermi = 1
+        #    rho = self.eval_rho(mol, dm, ao, dm, non0tab, xctype, hermi, with_lapl, verbose)
+        #mx = my = mz = None
+        #return rho, mx, my, mz
+        raise NotImplementedError(self.collinear)
+
+    def cache_xc_kernel(self, mol, grids, xc_code, mo_coeff, mo_occ, spin=0,
+                        max_memory=2000):
+        '''Compute the 0th order density, Vxc and fxc.  They can be used in TDDFT,
+        DFT hessian module etc.
+        '''
+        xctype = self._xc_type(xc_code)
+        if xctype in ('GGA', 'MGGA'):
+            ao_deriv = 1
+        else:
+            ao_deriv = 0
+        n2c = mo_coeff.shape[0]
+        nao = n2c // 2
+        with_lapl = numint.MGGA_DENSITY_LAPL
+
+        if self.collinear[0] in ('m', 'n'):  # mcol or ncol
+            rho = []
+            for ao, mask, weight, coords \
+                    in self.block_loop(mol, grids, nao, ao_deriv, max_memory):
+                rho.append(self.eval_rho2(mol, ao, mo_coeff, mo_occ, mask, xctype,
+                                          with_lapl))
+            rho = np.concatenate(rho,axis=-1)
+            assert rho.dtype == np.double
+            if self.collinear[0] == 'm':  # mcol
+                eval_xc = self.mcfun_eval_xc_adapter(xc_code)
+            else:
+                eval_xc = self.eval_xc_eff
+            vxc, fxc = eval_xc(xc_code, rho, deriv=2, xctype=xctype)[1:3]
+        else:
+            # rhoa and rhob must be real
+            dm = np.dot(mo_coeff * mo_occ, mo_coeff.conj().T)
+            dm_a = dm[:nao,:nao].real.copy('C')
+            dm_b = dm[nao:,nao:].real.copy('C')
+            ni = self._to_numint1c()
+            hermi = 1
+            rhoa = []
+            rhob = []
+            for ao, mask, weight, coords \
+                    in ni.block_loop(mol, grids, nao, ao_deriv, max_memory):
+                # rhoa and rhob must be real
+                rhoa.append(ni.eval_rho(mol, ao, dm_a, mask, xctype, hermi, with_lapl))
+                rhob.append(ni.eval_rho(mol, ao, dm_b, mask, xctype, hermi, with_lapl))
+            rho = np.stack([np.concatenate(rhoa,axis=-1), np.concatenate(rhob,axis=-1)])
+            assert rho.dtype == np.double
+            vxc, fxc = ni.eval_xc_eff(xc_code, rho, deriv=2, xctype=xctype)[1:3]
+        return rho, vxc, fxc
+
+    def cache_xc_kernel1(self, mol, grids, xc_code, dm, spin=0, max_memory=2000):
+        '''Compute the 0th order density, Vxc and fxc.  They can be used in TDDFT,
+        DFT hessian module etc.
+        '''
+        xctype = self._xc_type(xc_code)
+        if xctype in ('GGA', 'MGGA'):
+            ao_deriv = 1
+        else:
+            ao_deriv = 0
+        n2c = dm.shape[0]
+        nao = n2c // 2
+
+        if self.collinear[0] in ('m', 'n'):  # mcol or ncol
+            hermi = 1 # rho must be real. We need to assume dm hermitian
+            with_lapl = False
+            rho = []
+            for ao, mask, weight, coords \
+                    in self.block_loop(mol, grids, nao, ao_deriv, max_memory):
+                rho.append(self.eval_rho1(mol, ao, dm, mask, xctype, hermi, with_lapl))
+            rho = np.concatenate(rho,axis=-1)
+            assert rho.dtype == np.double
+            if self.collinear[0] == 'm':  # mcol
+                eval_xc = self.mcfun_eval_xc_adapter(xc_code)
+            else:
+                eval_xc = self.eval_xc_eff
+            vxc, fxc = eval_xc(xc_code, rho, deriv=2, xctype=xctype)[1:3]
+        else:
+            # rhoa and rhob must be real. We need to assume dm hermitian
+            hermi = 1
+            # TODO: test if dm[:nao,:nao].imag == 0
+            dm_a = dm[:nao,:nao].real.copy('C')
+            dm_b = dm[nao:,nao:].real.copy('C')
+            ni = self._to_numint1c()
+            with_lapl = True
+            rhoa = []
+            rhob = []
+            for ao, mask, weight, coords \
+                    in ni.block_loop(mol, grids, nao, ao_deriv, max_memory):
+                # rhoa and rhob must be real
+                rhoa.append(ni.eval_rho(mol, ao, dm_a, mask, xctype, hermi, with_lapl))
+                rhob.append(ni.eval_rho(mol, ao, dm_b, mask, xctype, hermi, with_lapl))
+            rho = np.stack([np.concatenate(rhoa,axis=-1), np.concatenate(rhob,axis=-1)])
+            assert rho.dtype == np.double
+            vxc, fxc = ni.eval_xc_eff(xc_code, rho, deriv=2, xctype=xctype)[1:3]
+        return rho, vxc, fxc
+
+    def get_rho(self, mol, dm, grids, max_memory=2000):
+        '''Density in real space
+        '''
+        nao = dm.shape[-1] // 2
+        dm_a = dm[:nao,:nao].real
+        dm_b = dm[nao:,nao:].real
+        ni = self._to_numint1c()
+        return ni.get_rho(mol, dm_a+dm_b, grids, max_memory)
+
+    _gks_mcol_vxc = _gks_mcol_vxc
+    _gks_mcol_fxc = _gks_mcol_fxc
+
+    @lib.with_doc(numint.nr_rks.__doc__)
+    def nr_vxc(self, mol, grids, xc_code, dms, spin=0, relativity=0, hermi=1,
+               max_memory=2000, verbose=None):
+        if self.collinear[0] in ('m', 'n'):  # mcol or ncol
+            n, exc, vmat = self._gks_mcol_vxc(mol, grids, xc_code, dms, relativity,
+                                              hermi, max_memory, verbose)
+        else:
+            nao = dms.shape[-1] // 2
+            # ground state density is always real
+            dm_a = dms[...,:nao,:nao].real.copy('C')
+            dm_b = dms[...,nao:,nao:].real.copy('C')
+            dm1 = (dm_a, dm_b)
+            ni = self._to_numint1c()
+            n, exc, v = ni.nr_uks(mol, grids, xc_code, dm1, relativity,
+                                  hermi, max_memory, verbose)
+            vmat = np.zeros_like(dms)
+            vmat[...,:nao,:nao] = v[0]
+            vmat[...,nao:,nao:] = v[1]
+        return n.sum(), exc, vmat
+    get_vxc = nr_gks_vxc = nr_vxc
+
+    @lib.with_doc(numint.nr_nlc_vxc.__doc__)
+    def nr_nlc_vxc(self, mol, grids, xc_code, dm, spin=0, relativity=0, hermi=1,
+                   max_memory=2000, verbose=None):
+        assert dm.ndim == 2
+        nao = dm.shape[-1] // 2
+        # ground state density is always real
+        dm_a = dm[:nao,:nao].real
+        dm_b = dm[nao:,nao:].real
+        ni = self._to_numint1c()
+        n, exc, v = ni.nr_nlc_vxc(mol, grids, xc_code, dm_a+dm_b, relativity,
+                                  hermi, max_memory, verbose)
+        vmat = np.zeros_like(dm)
+        vmat[:nao,:nao] = v
+        vmat[nao:,nao:] = v
+        return n, exc, vmat
+
+    @lib.with_doc(numint.nr_rks_fxc.__doc__)
+    def nr_fxc(self, mol, grids, xc_code, dm0, dms, spin=0, relativity=0, hermi=0,
+               rho0=None, vxc=None, fxc=None, max_memory=2000, verbose=None):
+        if self.collinear[0] not in ('c', 'm'):  # col or mcol
+            raise NotImplementedError('non-collinear fxc')
+
+        if self.collinear[0] == 'm':  # mcol
+            fxcmat = self._gks_mcol_fxc(mol, grids, xc_code, dm0, dms,
+                                        relativity, hermi, rho0, vxc, fxc,
+                                        max_memory, verbose)
+        else:
+            dms = np.asarray(dms)
+            nao = dms.shape[-1] // 2
+            if dm0 is not None:
+                dm0a = dm0[:nao,:nao].real.copy('C')
+                dm0b = dm0[nao:,nao:].real.copy('C')
+                dm0 = (dm0a, dm0b)
+            # dms_a and dms_b may be complex if they are TDDFT amplitudes
+            dms_a = dms[...,:nao,:nao].copy()
+            dms_b = dms[...,nao:,nao:].copy()
+            dm1 = (dms_a, dms_b)
+            ni = self._to_numint1c()
+            vmat = ni.nr_uks_fxc(mol, grids, xc_code, dm0, dm1, relativity,
+                                 hermi, rho0, vxc, fxc, max_memory, verbose)
+            fxcmat = np.zeros_like(dms)
+            fxcmat[...,:nao,:nao] = vmat[0]
+            fxcmat[...,nao:,nao:] = vmat[1]
+        return fxcmat
+    get_fxc = nr_gks_fxc = nr_fxc
+
+    eval_xc_eff = _eval_xc_eff
+    mcfun_eval_xc_adapter = mcfun_eval_xc_adapter
+
+    block_loop = numint.NumInt.block_loop
+    _gen_rho_evaluator = numint.NumInt._gen_rho_evaluator
+
+    def _to_numint1c(self):
+        '''Converts to the associated class to handle collinear systems'''
+        return self.view(numint.NumInt)
\ No newline at end of file

From d4f7db9070e805c97d2424c5c2f753598e1e655a Mon Sep 17 00:00:00 2001
From: "puzhichen.996" <puzhichen.996@bytedance.com>
Date: Fri, 31 Oct 2025 11:46:46 +0800
Subject: [PATCH 03/21] finish eval_rho, needs debug

---
 gpu4pyscf/dft/numint2c.py            | 214 ++++++++-------------------
 gpu4pyscf/dft/tests/test_numint2c.py | 140 ++++++++++++++++++
 2 files changed, 204 insertions(+), 150 deletions(-)
 create mode 100644 gpu4pyscf/dft/tests/test_numint2c.py

diff --git a/gpu4pyscf/dft/numint2c.py b/gpu4pyscf/dft/numint2c.py
index 034d3fe03..b72c56eca 100644
--- a/gpu4pyscf/dft/numint2c.py
+++ b/gpu4pyscf/dft/numint2c.py
@@ -18,23 +18,20 @@
 
 import functools
 import numpy as np
-from pyscf import lib
-from pyscf.dft import numint
-from pyscf.dft.numint import _dot_ao_dm, _dot_ao_ao, _scale_ao, _tau_dot, BLKSIZE
-from pyscf.dft import xc_deriv
+import cupy as cp
+from gpu4pyscf import lib
+from gpu4pyscf.dft import numint
+from gpu4pyscf.dft.numint import _dot_ao_dm, _dot_ao_ao, _scale_ao, _tau_dot, BLKSIZE
+from gpu4pyscf.dft import xc_deriv
 from pyscf import __config__
 
 
-@lib.with_doc(
-    '''Calculate the electron density and magnetization spin density in the
-    framework of 2-component real basis.
-    ''' + numint.eval_rho.__doc__)
 def eval_rho(mol, ao, dm, non0tab=None, xctype='LDA', hermi=0,
              with_lapl=True, verbose=None):
-    nao = ao.shape[-1]
+    nao = ao.shape[-2]
     assert dm.ndim == 2 and nao * 2 == dm.shape[0]
 
-    ngrids, nao = ao.shape[-2:]
+    nao, ngrids = ao.shape[-2:]
     xctype = xctype.upper()
     shls_slice = (0, mol.nbas)
     ao_loc = mol.ao_loc
@@ -46,9 +43,9 @@ def eval_rho(mol, ao, dm, non0tab=None, xctype='LDA', hermi=0,
     elif xctype == 'GGA':
         # first 4 ~ (rho, m), second 4 ~ (0th order, dx, dy, dz)
         if hermi:
-            rho_m = np.empty((4, 4, ngrids))
+            rho_m = cp.empty((4, 4, ngrids))
         else:
-            rho_m = np.empty((4, 4, ngrids), dtype=np.complex128)
+            rho_m = cp.empty((4, 4, ngrids), dtype=cp.complex128)
         c0a = _dot_ao_dm(mol, ao[0], dm[:nao], non0tab, shls_slice, ao_loc)
         c0b = _dot_ao_dm(mol, ao[0], dm[nao:], non0tab, shls_slice, ao_loc)
         c0 = (c0a, c0b)
@@ -64,14 +61,14 @@ def eval_rho(mol, ao, dm, non0tab=None, xctype='LDA', hermi=0,
                 rho_m[:,i] += _contract_rho_m((ao[0], ao[0]), (c1a, c1b), hermi, False)
     else: # meta-GGA
         if hermi:
-            dtype = np.double
+            dtype = cp.double
         else:
-            dtype = np.complex128
+            dtype = cp.complex128
         if with_lapl:
-            rho_m = np.empty((4, 6, ngrids), dtype=dtype)
+            rho_m = cp.empty((4, 6, ngrids), dtype=dtype)
             tau_idx = 5
         else:
-            rho_m = np.empty((4, 5, ngrids), dtype=dtype)
+            rho_m = cp.empty((4, 5, ngrids), dtype=dtype)
             tau_idx = 4
         c0a = _dot_ao_dm(mol, ao[0], dm[:nao], non0tab, shls_slice, ao_loc)
         c0b = _dot_ao_dm(mol, ao[0], dm[nao:], non0tab, shls_slice, ao_loc)
@@ -104,9 +101,9 @@ def _gks_mcol_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=0,
     make_rho, nset, n2c = ni._gen_rho_evaluator(mol, dms, hermi, False)
     nao = n2c // 2
 
-    nelec = np.zeros(nset)
-    excsum = np.zeros(nset)
-    vmat = np.zeros((nset,n2c,n2c), dtype=np.complex128)
+    nelec = cp.zeros(nset)
+    excsum = cp.zeros(nset)
+    vmat = cp.zeros((nset,n2c,n2c), dtype=cp.complex128)
 
     if xctype in ('LDA', 'GGA', 'MGGA'):
         f_eval_mat = {
@@ -132,7 +129,7 @@ def _gks_mcol_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=0,
                 else:
                     den = rho[0,0] * weight
                 nelec[i] += den.sum()
-                excsum[i] += np.dot(den, exc)
+                excsum[i] += cp.dot(den, exc)
                 vmat[i] += fmat(mol, ao, weight, rho, vxc, mask, shls_slice,
                                 ao_loc, hermi)
 
@@ -143,7 +140,7 @@ def _gks_mcol_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=0,
 
     if hermi:
         vmat = vmat + vmat.conj().transpose(0,2,1)
-    if isinstance(dms, np.ndarray) and dms.ndim == 2:
+    if isinstance(dms, cp.ndarray) and dms.ndim == 2:
         vmat = vmat[0]
         nelec = nelec[0]
         excsum = excsum[0]
@@ -151,46 +148,7 @@ def _gks_mcol_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=0,
 
 def _gks_mcol_fxc(ni, mol, grids, xc_code, dm0, dms, relativity=0, hermi=0,
                   rho0=None, vxc=None, fxc=None, max_memory=2000, verbose=None):
-    assert ni.collinear[0] == 'm'  # mcol
-    xctype = ni._xc_type(xc_code)
-    if fxc is None and xctype in ('LDA', 'GGA', 'MGGA'):
-        fxc = ni.cache_xc_kernel1(mol, grids, xc_code, dm0,
-                                  max_memory=max_memory)[2]
-
-    if xctype == 'MGGA':
-        fmat, ao_deriv = (_mcol_mgga_fxc_mat , 1)
-    elif xctype == 'GGA':
-        fmat, ao_deriv = (_mcol_gga_fxc_mat  , 1)
-    else:
-        fmat, ao_deriv = (_mcol_lda_fxc_mat  , 0)
-
-    shls_slice = (0, mol.nbas)
-    ao_loc = mol.ao_loc_nr()
-    make_rho1, nset, n2c = ni._gen_rho_evaluator(mol, dms, hermi, False)
-    nao = n2c // 2
-    vmat = np.zeros((nset,n2c,n2c), dtype=np.complex128)
-
-    if xctype in ('LDA', 'GGA', 'MGGA'):
-        _rho0 = None
-        p1 = 0
-        for ao, mask, weight, coords \
-                in ni.block_loop(mol, grids, nao, ao_deriv, max_memory):
-            p0, p1 = p1, p1 + weight.size
-            _fxc = fxc[:,:,:,:,p0:p1]
-            for i in range(nset):
-                rho1 = make_rho1(i, ao, mask, xctype)
-                vmat[i] += fmat(mol, ao, weight, _rho0, rho1, _fxc,
-                                mask, shls_slice, ao_loc, hermi)
-    elif xctype == 'HF':
-        pass
-    else:
-        raise NotImplementedError(f'numint2c.get_fxc for functional {xc_code}')
-
-    if hermi:
-        vmat = vmat + vmat.conj().transpose(0,2,1)
-    if isinstance(dms, np.ndarray) and dms.ndim == 2:
-        vmat = vmat[0]
-    return vmat
+    raise NotImplementedError('non-collinear lda fxc')
 
 def _ncol_lda_vxc_mat(mol, ao, weight, rho, vxc, mask, shls_slice, ao_loc, hermi):
     '''Vxc matrix of non-collinear LDA'''
@@ -201,7 +159,7 @@ def _ncol_lda_vxc_mat(mol, ao, weight, rho, vxc, mask, shls_slice, ao_loc, hermi
     s = lib.norm(rho[1:4], axis=0)
 
     wv = weight * vr
-    with np.errstate(divide='ignore',invalid='ignore'):
+    with cp.errstate(divide='ignore',invalid='ignore'):
         ws = vs * weight / s
     ws[s < 1e-20] = 0
 
@@ -217,7 +175,7 @@ def _ncol_lda_vxc_mat(mol, ao, weight, rho, vxc, mask, shls_slice, ao_loc, hermi
     if hermi:
         # conj(mx+my*1j) == mx-my*1j, tmpx and tmpy should be real
         matba = (tmpx + tmpx.T) + (tmpy + tmpy.T) * 1j
-        matab = np.zeros_like(matba)
+        matab = cp.zeros_like(matba)
     else:
         # conj(mx+my*1j) != mx-my*1j, tmpx and tmpy should be complex
         matba = tmpx + tmpy * 1j
@@ -227,7 +185,7 @@ def _ncol_lda_vxc_mat(mol, ao, weight, rho, vxc, mask, shls_slice, ao_loc, hermi
     mataa = _dot_ao_ao(mol, ao, aow, mask, shls_slice, ao_loc)
     aow = _scale_ao(ao, wv-ws*mz, out=aow)  # Mz
     matbb = _dot_ao_ao(mol, ao, aow, mask, shls_slice, ao_loc)
-    mat = np.block([[mataa, matab], [matba, matbb]])
+    mat = cp.block([[mataa, matab], [matba, matbb]])
     return mat
 
 def _eval_xc_eff(ni, xc_code, rho, deriv=1, omega=None, xctype=None,
@@ -267,9 +225,9 @@ def _eval_xc_eff(ni, xc_code, rho, deriv=1, omega=None, xctype=None,
     else:
         raise RuntimeError(f'Unknown collinear scheme {ni.collinear}')
 
-    rho = np.stack([(t + s) * .5, (t - s) * .5])
+    rho = cp.stack([(t + s) * .5, (t - s) * .5])
     if xctype == 'MGGA' and rho.shape[1] == 6:
-        rho = np.asarray(rho[:,[0,1,2,3,5],:], order='C')
+        rho = cp.asarray(rho[:,[0,1,2,3,5],:], order='C')
 
     assert spin is None or spin == 1
     spin = 1
@@ -330,7 +288,7 @@ def _mcol_lda_vxc_mat(mol, ao, weight, rho, vxc, mask, shls_slice, ao_loc, hermi
     if hermi:
         # conj(mx+my*1j) == mx-my*1j, tmpx and tmpy should be real
         matba = (tmpx + tmpx.T) + (tmpy + tmpy.T) * 1j
-        matab = np.zeros_like(matba)
+        matab = cp.zeros_like(matba)
     else:
         # conj(mx+my*1j) != mx-my*1j, tmpx and tmpy should be complex
         matba = tmpx + tmpy * 1j
@@ -340,7 +298,7 @@ def _mcol_lda_vxc_mat(mol, ao, weight, rho, vxc, mask, shls_slice, ao_loc, hermi
     mataa = _dot_ao_ao(mol, ao, aow, mask, shls_slice, ao_loc)
     aow = _scale_ao(ao, wr[0]-wmz[0], out=aow)  # Mz
     matbb = _dot_ao_ao(mol, ao, aow, mask, shls_slice, ao_loc)
-    mat = np.block([[mataa, matab], [matba, matbb]])
+    mat = cp.block([[mataa, matab], [matba, matbb]])
     return mat
 
 def _mcol_gga_vxc_mat(mol, ao, weight, rho, vxc, mask, shls_slice, ao_loc, hermi):
@@ -360,10 +318,10 @@ def _mcol_gga_vxc_mat(mol, ao, weight, rho, vxc, mask, shls_slice, ao_loc, hermi
     aow = _scale_ao(ao[:4], wmy[:4], out=aow)  # My
     tmpy = _dot_ao_ao(mol, ao[0], aow, mask, shls_slice, ao_loc)
     if hermi:
-        assert vxc.dtype == np.double
+        assert vxc.dtype == cp.double
         # conj(mx+my*1j) == mx-my*1j, tmpx and tmpy should be real
         matba = (tmpx + tmpx.T) + (tmpy + tmpy.T) * 1j
-        matab = np.zeros_like(matba)
+        matab = cp.zeros_like(matba)
     else:
         # conj(mx+my*1j) != mx-my*1j, tmpx and tmpy should be complex
         aow = _scale_ao(ao[1:4], wmx[1:4].conj(), out=aow)  # Mx
@@ -377,7 +335,7 @@ def _mcol_gga_vxc_mat(mol, ao, weight, rho, vxc, mask, shls_slice, ao_loc, hermi
         aow = _scale_ao(ao[1:4], (wr[1:4]-wmz[1:4]).conj(), out=aow)  # Mz
         matbb += _dot_ao_ao(mol, aow, ao[0], mask, shls_slice, ao_loc)
 
-    mat = np.block([[mataa, matab], [matba, matbb]])
+    mat = cp.block([[mataa, matab], [matba, matbb]])
     return mat
 
 def _mcol_mgga_vxc_mat(mol, ao, weight, rho, vxc, mask, shls_slice, ao_loc, hermi):
@@ -405,10 +363,10 @@ def _mcol_mgga_vxc_mat(mol, ao, weight, rho, vxc, mask, shls_slice, ao_loc, herm
     tmpy = _dot_ao_ao(mol, ao[0], aow, mask, shls_slice, ao_loc)
     tmpy += _tau_dot(mol, ao, ao, wmy[tau_idx], mask, shls_slice, ao_loc)
     if hermi:
-        assert vxc.dtype == np.double
+        assert vxc.dtype == cp.double
         # conj(mx+my*1j) == mx-my*1j, tmpx and tmpy should be real
         matba = (tmpx + tmpx.T) + (tmpy + tmpy.T) * 1j
-        matab = np.zeros_like(matba)
+        matab = cp.zeros_like(matba)
     else:
         # conj(mx+my*1j) != mx-my*1j, tmpx and tmpy should be complex
         aow = _scale_ao(ao[1:4], wmx[1:4].conj(), out=aow)  # Mx
@@ -422,29 +380,20 @@ def _mcol_mgga_vxc_mat(mol, ao, weight, rho, vxc, mask, shls_slice, ao_loc, herm
         aow = _scale_ao(ao[1:4], (wr[1:4]-wmz[1:4]).conj(), out=aow)  # Mz
         matbb += _dot_ao_ao(mol, aow, ao[0], mask, shls_slice, ao_loc)
 
-    mat = np.block([[mataa, matab], [matba, matbb]])
+    mat = cp.block([[mataa, matab], [matba, matbb]])
     return mat
 
 def _mcol_lda_fxc_mat(mol, ao, weight, rho0, rho1, fxc,
                       mask, shls_slice, ao_loc, hermi):
-    '''Kernel matrix of multi-collinear LDA'''
-    vxc1 = np.einsum('ag,abyg->byg', rho1, fxc[:,0,:])
-    return _mcol_lda_vxc_mat(mol, ao, weight, rho0, vxc1, mask, shls_slice,
-                             ao_loc, hermi)
+    raise NotImplementedError('non-collinear lda fxc')
 
 def _mcol_gga_fxc_mat(mol, ao, weight, rho0, rho1, fxc,
                       mask, shls_slice, ao_loc, hermi):
-    '''Kernel matrix of multi-collinear GGA'''
-    vxc1 = np.einsum('axg,axbyg->byg', rho1, fxc)
-    return _mcol_gga_vxc_mat(mol, ao, weight, rho0, vxc1, mask, shls_slice,
-                             ao_loc, hermi)
+    raise NotImplementedError('non-collinear gga fxc')
 
 def _mcol_mgga_fxc_mat(mol, ao, weight, rho0, rho1, fxc,
                        mask, shls_slice, ao_loc, hermi):
-    '''Kernel matrix of multi-collinear MGGA'''
-    vxc1 = np.einsum('axg,axbyg->byg', rho1, fxc)
-    return _mcol_mgga_vxc_mat(mol, ao, weight, rho0, vxc1, mask, shls_slice,
-                              ao_loc, hermi)
+    raise NotImplementedError('non-collinear mgga fxc')
 
 def _contract_rho_m(bra, ket, hermi=0, bra_eq_ket=False):
     '''
@@ -457,15 +406,15 @@ def _contract_rho_m(bra, ket, hermi=0, bra_eq_ket=False):
     # mz = einsum('xy,ygi,ij,xgj->g', sz, ket, dm, bra.conj())
     ket_a, ket_b = ket
     bra_a, bra_b = bra
-    nao = min(ket_a.shape[1], bra_a.shape[1])
-    ngrids = ket_a.shape[0]
+    nao = min(ket_a.shape[-2], bra_a.shape[-2])
+    ngrids = ket_a.shape[-1]
     if hermi:
-        raa = np.einsum('pi,pi->p', bra_a.real, ket_a[:,:nao].real)
-        raa+= np.einsum('pi,pi->p', bra_a.imag, ket_a[:,:nao].imag)
-        rab = np.einsum('pi,pi->p', bra_a.conj(), ket_b[:,:nao])
-        rbb = np.einsum('pi,pi->p', bra_b.real, ket_b[:,nao:].real)
-        rbb+= np.einsum('pi,pi->p', bra_b.imag, ket_b[:,nao:].imag)
-        rho_m = np.empty((4, ngrids))
+        raa = cp.einsum('ip,ip->p', bra_a.real, ket_a[:nao].real)
+        raa+= cp.einsum('ip,ip->p', bra_a.imag, ket_a[:nao].imag)
+        rab = cp.einsum('ip,ip->p', bra_a.conj(), ket_b[:nao])
+        rbb = cp.einsum('ip,ip->p', bra_b.real, ket_b[nao:].real)
+        rbb+= cp.einsum('ip,ip->p', bra_b.imag, ket_b[nao:].imag)
+        rho_m = cp.empty((4, ngrids))
         rho_m[0,:] = raa + rbb     # rho
         rho_m[1,:] = rab.real      # mx
         rho_m[2,:] = rab.imag      # my
@@ -474,15 +423,15 @@ def _contract_rho_m(bra, ket, hermi=0, bra_eq_ket=False):
             rho_m[1,:] *= 2
             rho_m[2,:] *= 2
         else:
-            rba = np.einsum('pi,pi->p', bra_b.conj(), ket_a[:,nao:])
+            rba = cp.einsum('ip,ip->p', bra_b.conj(), ket_a[nao:])
             rho_m[1,:] += rba.real
             rho_m[2,:] -= rba.imag
     else:
-        raa = np.einsum('pi,pi->p', bra_a.conj(), ket_a[:,:nao])
-        rba = np.einsum('pi,pi->p', bra_b.conj(), ket_a[:,nao:])
-        rab = np.einsum('pi,pi->p', bra_a.conj(), ket_b[:,:nao])
-        rbb = np.einsum('pi,pi->p', bra_b.conj(), ket_b[:,nao:])
-        rho_m = np.empty((4, ngrids), dtype=np.complex128)
+        raa = cp.einsum('ip,ip->p', bra_a.conj(), ket_a[:nao])
+        rba = cp.einsum('ip,ip->p', bra_b.conj(), ket_a[nao:])
+        rab = cp.einsum('ip,ip->p', bra_a.conj(), ket_b[:nao])
+        rbb = cp.einsum('ip,ip->p', bra_b.conj(), ket_b[nao:])
+        rho_m = cp.empty((4, ngrids), dtype=cp.complex128)
         rho_m[0,:] = raa + rbb         # rho
         rho_m[1,:] = rab + rba         # mx
         rho_m[2,:] = (rba - rab) * 1j  # my
@@ -519,12 +468,12 @@ def eval_rho2(self, mol, ao, mo_coeff, mo_occ, non0tab=None, xctype='LDA',
         '''
         if self.collinear[0] in ('n', 'm'):
             # TODO:
-            dm = np.dot(mo_coeff * mo_occ, mo_coeff.conj().T)
+            dm = cp.dot(mo_coeff * mo_occ, mo_coeff.conj().T)
             hermi = 1
             rho = self.eval_rho(mol, ao, dm, non0tab, xctype, hermi, with_lapl, verbose)
             return rho
 
-        #if mo_coeff.dtype == np.double:
+        #if mo_coeff.dtype == cp.double:
         #    nao = ao.shape[-1]
         #    assert nao * 2 == mo_coeff.shape[0]
         #    mo_aR = mo_coeff[:nao]
@@ -532,7 +481,7 @@ def eval_rho2(self, mol, ao, mo_coeff, mo_occ, non0tab=None, xctype='LDA',
         #    rho  = numint.eval_rho2(mol, ao, mo_aR, mo_occ, non0tab, xctype, with_lapl, verbose)
         #    rho += numint.eval_rho2(mol, ao, mo_bR, mo_occ, non0tab, xctype, with_lapl, verbose)
         #else:
-        #    dm = np.dot(mo_coeff * mo_occ, mo_coeff.conj().T)
+        #    dm = cp.dot(mo_coeff * mo_occ, mo_coeff.conj().T)
         #    hermi = 1
         #    rho = self.eval_rho(mol, dm, ao, dm, non0tab, xctype, hermi, with_lapl, verbose)
         #mx = my = mz = None
@@ -559,8 +508,8 @@ def cache_xc_kernel(self, mol, grids, xc_code, mo_coeff, mo_occ, spin=0,
                     in self.block_loop(mol, grids, nao, ao_deriv, max_memory):
                 rho.append(self.eval_rho2(mol, ao, mo_coeff, mo_occ, mask, xctype,
                                           with_lapl))
-            rho = np.concatenate(rho,axis=-1)
-            assert rho.dtype == np.double
+            rho = cp.concatenate(rho,axis=-1)
+            assert rho.dtype == cp.double
             if self.collinear[0] == 'm':  # mcol
                 eval_xc = self.mcfun_eval_xc_adapter(xc_code)
             else:
@@ -568,7 +517,7 @@ def cache_xc_kernel(self, mol, grids, xc_code, mo_coeff, mo_occ, spin=0,
             vxc, fxc = eval_xc(xc_code, rho, deriv=2, xctype=xctype)[1:3]
         else:
             # rhoa and rhob must be real
-            dm = np.dot(mo_coeff * mo_occ, mo_coeff.conj().T)
+            dm = cp.dot(mo_coeff * mo_occ, mo_coeff.conj().T)
             dm_a = dm[:nao,:nao].real.copy('C')
             dm_b = dm[nao:,nao:].real.copy('C')
             ni = self._to_numint1c()
@@ -580,8 +529,8 @@ def cache_xc_kernel(self, mol, grids, xc_code, mo_coeff, mo_occ, spin=0,
                 # rhoa and rhob must be real
                 rhoa.append(ni.eval_rho(mol, ao, dm_a, mask, xctype, hermi, with_lapl))
                 rhob.append(ni.eval_rho(mol, ao, dm_b, mask, xctype, hermi, with_lapl))
-            rho = np.stack([np.concatenate(rhoa,axis=-1), np.concatenate(rhob,axis=-1)])
-            assert rho.dtype == np.double
+            rho = cp.stack([cp.concatenate(rhoa,axis=-1), cp.concatenate(rhob,axis=-1)])
+            assert rho.dtype == cp.double
             vxc, fxc = ni.eval_xc_eff(xc_code, rho, deriv=2, xctype=xctype)[1:3]
         return rho, vxc, fxc
 
@@ -604,8 +553,8 @@ def cache_xc_kernel1(self, mol, grids, xc_code, dm, spin=0, max_memory=2000):
             for ao, mask, weight, coords \
                     in self.block_loop(mol, grids, nao, ao_deriv, max_memory):
                 rho.append(self.eval_rho1(mol, ao, dm, mask, xctype, hermi, with_lapl))
-            rho = np.concatenate(rho,axis=-1)
-            assert rho.dtype == np.double
+            rho = cp.concatenate(rho,axis=-1)
+            assert rho.dtype == cp.double
             if self.collinear[0] == 'm':  # mcol
                 eval_xc = self.mcfun_eval_xc_adapter(xc_code)
             else:
@@ -626,8 +575,8 @@ def cache_xc_kernel1(self, mol, grids, xc_code, dm, spin=0, max_memory=2000):
                 # rhoa and rhob must be real
                 rhoa.append(ni.eval_rho(mol, ao, dm_a, mask, xctype, hermi, with_lapl))
                 rhob.append(ni.eval_rho(mol, ao, dm_b, mask, xctype, hermi, with_lapl))
-            rho = np.stack([np.concatenate(rhoa,axis=-1), np.concatenate(rhob,axis=-1)])
-            assert rho.dtype == np.double
+            rho = cp.stack([cp.concatenate(rhoa,axis=-1), cp.concatenate(rhob,axis=-1)])
+            assert rho.dtype == cp.double
             vxc, fxc = ni.eval_xc_eff(xc_code, rho, deriv=2, xctype=xctype)[1:3]
         return rho, vxc, fxc
 
@@ -658,7 +607,7 @@ def nr_vxc(self, mol, grids, xc_code, dms, spin=0, relativity=0, hermi=1,
             ni = self._to_numint1c()
             n, exc, v = ni.nr_uks(mol, grids, xc_code, dm1, relativity,
                                   hermi, max_memory, verbose)
-            vmat = np.zeros_like(dms)
+            vmat = cp.zeros_like(dms)
             vmat[...,:nao,:nao] = v[0]
             vmat[...,nao:,nao:] = v[1]
         return n.sum(), exc, vmat
@@ -667,47 +616,12 @@ def nr_vxc(self, mol, grids, xc_code, dms, spin=0, relativity=0, hermi=1,
     @lib.with_doc(numint.nr_nlc_vxc.__doc__)
     def nr_nlc_vxc(self, mol, grids, xc_code, dm, spin=0, relativity=0, hermi=1,
                    max_memory=2000, verbose=None):
-        assert dm.ndim == 2
-        nao = dm.shape[-1] // 2
-        # ground state density is always real
-        dm_a = dm[:nao,:nao].real
-        dm_b = dm[nao:,nao:].real
-        ni = self._to_numint1c()
-        n, exc, v = ni.nr_nlc_vxc(mol, grids, xc_code, dm_a+dm_b, relativity,
-                                  hermi, max_memory, verbose)
-        vmat = np.zeros_like(dm)
-        vmat[:nao,:nao] = v
-        vmat[nao:,nao:] = v
-        return n, exc, vmat
+        raise NotImplementedError('non-collinear nlc vxc')
 
     @lib.with_doc(numint.nr_rks_fxc.__doc__)
     def nr_fxc(self, mol, grids, xc_code, dm0, dms, spin=0, relativity=0, hermi=0,
                rho0=None, vxc=None, fxc=None, max_memory=2000, verbose=None):
-        if self.collinear[0] not in ('c', 'm'):  # col or mcol
-            raise NotImplementedError('non-collinear fxc')
-
-        if self.collinear[0] == 'm':  # mcol
-            fxcmat = self._gks_mcol_fxc(mol, grids, xc_code, dm0, dms,
-                                        relativity, hermi, rho0, vxc, fxc,
-                                        max_memory, verbose)
-        else:
-            dms = np.asarray(dms)
-            nao = dms.shape[-1] // 2
-            if dm0 is not None:
-                dm0a = dm0[:nao,:nao].real.copy('C')
-                dm0b = dm0[nao:,nao:].real.copy('C')
-                dm0 = (dm0a, dm0b)
-            # dms_a and dms_b may be complex if they are TDDFT amplitudes
-            dms_a = dms[...,:nao,:nao].copy()
-            dms_b = dms[...,nao:,nao:].copy()
-            dm1 = (dms_a, dms_b)
-            ni = self._to_numint1c()
-            vmat = ni.nr_uks_fxc(mol, grids, xc_code, dm0, dm1, relativity,
-                                 hermi, rho0, vxc, fxc, max_memory, verbose)
-            fxcmat = np.zeros_like(dms)
-            fxcmat[...,:nao,:nao] = vmat[0]
-            fxcmat[...,nao:,nao:] = vmat[1]
-        return fxcmat
+        raise NotImplementedError('non-collinear fxc')
     get_fxc = nr_gks_fxc = nr_fxc
 
     eval_xc_eff = _eval_xc_eff
diff --git a/gpu4pyscf/dft/tests/test_numint2c.py b/gpu4pyscf/dft/tests/test_numint2c.py
new file mode 100644
index 000000000..6f5cd40a2
--- /dev/null
+++ b/gpu4pyscf/dft/tests/test_numint2c.py
@@ -0,0 +1,140 @@
+# Copyright 2021-2024 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+import pyscf
+import cupy
+from pyscf import lib, scf
+from pyscf.dft.numint import NumInt as pyscf_numint
+from gpu4pyscf.dft import Grids
+from gpu4pyscf.dft import numint
+from gpu4pyscf.dft.numint import NumInt
+from gpu4pyscf import dft
+from gpu4pyscf.dft import gen_grid
+
+def setUpModule():
+    global mol, grids_cpu, grids_gpu, dm, dm0, dm1, mo_occ, mo_coeff
+    mol = pyscf.M(
+        atom = '''
+O        0.000000    0.000000    0.117790
+H        0.000000    0.755453   -0.471161
+H        0.000000   -0.755453   -0.471161''',
+        basis = 'ccpvdz',
+        charge = 1,
+        spin = 1,  # = 2S = spin_up - spin_down
+        output = '/dev/null')
+
+    np.random.seed(2)
+    mf = scf.UHF(mol)
+    mf.kernel()
+    dm1 = mf.make_rdm1().copy()
+    dm = dm1
+    mo_coeff = mf.mo_coeff
+    mo_occ = mf.mo_occ
+    dm0 = (mo_coeff[0]*mo_occ[0]).dot(mo_coeff[0].T)
+
+    grids_gpu = Grids(mol)
+    grids_gpu.level = 1
+    grids_gpu.build()
+
+    grids_cpu = grids_gpu.to_cpu()
+    grids_cpu.weights = cupy.asnumpy(grids_gpu.weights)
+    grids_cpu.coords = cupy.asnumpy(grids_gpu.coords)
+
+def tearDownModule():
+    global mol, grids_cpu, grids_gpu
+    mol.stdout.close()
+    del mol, grids_cpu, grids_gpu
+
+LDA = 'LDA_C_VWN'
+GGA_PBE = 'GGA_C_PBE'
+MGGA_M06 = 'MGGA_C_M06'
+
+class KnownValues(unittest.TestCase):
+
+    def test_eval_rho(self):
+        np.random.seed(1)
+        dm = np.random.random(dm0.shape)
+        ni_gpu = NumInt()
+        ni_cpu = pyscf_numint()
+        for xctype in ('LDA', 'GGA', 'MGGA'):
+            deriv = 1
+            if xctype == 'LDA':
+                deriv = 0
+            ao_gpu = ni_gpu.eval_ao(mol, grids_gpu.coords, deriv=deriv, transpose=False)
+            ao_cpu = ni_cpu.eval_ao(mol, grids_cpu.coords, deriv=deriv)
+            
+            rho = ni_gpu.eval_rho(mol, ao_gpu, dm, xctype=xctype, hermi=0, with_lapl=False)
+            ref = ni_cpu.eval_rho(mol, ao_cpu, dm, xctype=xctype, hermi=0, with_lapl=False)
+            self.assertAlmostEqual(abs(rho[...,:grids_cpu.size].get() - ref).max(), 0, 10)
+
+            rho = ni_gpu.eval_rho(mol, ao_gpu, dm0, xctype=xctype, hermi=1, with_lapl=False)
+            ref = ni_cpu.eval_rho(mol, ao_cpu, dm0, xctype=xctype, hermi=1, with_lapl=False)
+            self.assertAlmostEqual(abs(rho[...,:grids_cpu.size].get() - ref).max(), 0, 10)
+
+    def test_sparse_index(self):
+        mol = pyscf.M(atom='''
+O   0.     0.    0.
+H   5.5    0.3   2.8
+H   1.7   -2.0   0.4''',
+                      basis='def2-tzvpp')
+        with lib.temporary_env(numint, MIN_BLK_SIZE=128**2):
+            grids = gen_grid.Grids(mol).set(atom_grid=(200, 1454)).build()
+            ni = NumInt()
+            ni.build(mol, grids.coords)
+            opt = ni.gdftopt
+            opt.l_ctr_offsets
+            ao_loc = opt._sorted_mol.ao_loc
+            ngrids = grids.size
+            dat = grids.get_non0ao_idx(opt)
+            assert lib.fp(cupy.hstack([x[1] for x in dat]).get()) == 103.60117204957997
+            assert lib.fp(cupy.hstack([x[2] for x in dat]).get()) == 5.616197331343498
+            assert lib.fp(cupy.hstack([x[3] for x in dat]).get()) == -22.394314323727
+            assert lib.fp(cupy.hstack([x[4] for x in dat]).get()) == 351.2385939586691
+            assert [i.size for x in dat for i in x[1:]] == [
+                46, 18, 9, 19, 50, 20, 9, 21, 28, 12, 9, 13, 49, 19, 9, 20, 45, 17,
+                9, 18, 45, 17, 9, 18, 45, 17, 9, 18, 45, 17, 9, 18, 50, 20, 9, 21,
+                55, 21, 9, 22, 53, 19, 9, 20, 48, 18, 9, 19, 53, 19, 9, 20, 48, 18,
+                9, 19, 43, 17, 9, 18, 40, 16, 9, 17, 48, 18, 9, 19, 48, 18, 9, 19,
+                48, 18, 9, 19, 57, 21, 9, 22, 23, 11, 9, 12, 28, 12, 9, 13, 37, 15,
+                9, 16, 33, 13, 9, 14, 32, 14, 9, 15, 20, 10, 9, 11, 23, 11, 9, 12,
+                23, 11, 9, 12, 23, 11, 9, 12, 37, 15, 9, 16]
+            assert all(x.dtype == np.int32 for x in dat[0][1:])
+
+            if hasattr(numint, '_sparse_index'):
+                for i, i0 in enumerate(range(0, ngrids, numint.MIN_BLK_SIZE)):
+                    i1 = min(i0+numint.MIN_BLK_SIZE, ngrids)
+                    ref = numint._sparse_index(
+                        opt._sorted_mol, grids.coords[i0:i1], opt.l_ctr_offsets, ao_loc, opt)
+                    assert all(np.array_equal(r, x) for r, x in zip(ref[1:], dat[i][1:]))
+
+    def test_scale_ao(self):
+        ao = cupy.random.rand(1, 3, 256)
+        wv = cupy.random.rand(1, 256)
+        out = cupy.ones(6 * 256)
+        ref = cupy.einsum('nip,np->ip', ao, wv)
+        assert abs(ref - numint._scale_ao(ao, wv)).max() < 1e-12
+        assert abs(ref - numint._scale_ao(ao, wv, out=out)).max() < 1e-12
+        assert abs(ref - numint._scale_ao(ao+0j, wv, out=out)).max() < 1e-12
+        assert abs(ref - numint._scale_ao(ao[0]+0j, wv[0], out=out)).max() < 1e-12
+
+        ao = ao.transpose(1, 0, 2).copy(order='C').transpose(1, 0, 2)
+        assert abs(ref - numint._scale_ao(ao, wv)).max() < 1e-12
+
+        assert abs(ref - numint._scale_ao(ao[0], wv[0])).max() < 1e-12
+
+if __name__ == "__main__":
+    print("Full Tests for dft numint")
+    unittest.main()

From 1a157709b48bbf651455e2f6a86585d49293ec87 Mon Sep 17 00:00:00 2001
From: "puzhichen.996" <puzhichen.996@bytedance.com>
Date: Fri, 31 Oct 2025 14:52:19 +0800
Subject: [PATCH 04/21] finish debug eval_rho

---
 gpu4pyscf/dft/numint2c.py            |  8 ++--
 gpu4pyscf/dft/tests/test_numint2c.py | 69 ++++------------------------
 2 files changed, 14 insertions(+), 63 deletions(-)

diff --git a/gpu4pyscf/dft/numint2c.py b/gpu4pyscf/dft/numint2c.py
index b72c56eca..2511cbfb8 100644
--- a/gpu4pyscf/dft/numint2c.py
+++ b/gpu4pyscf/dft/numint2c.py
@@ -19,9 +19,9 @@
 import functools
 import numpy as np
 import cupy as cp
-from gpu4pyscf import lib
+from pyscf import lib
 from gpu4pyscf.dft import numint
-from gpu4pyscf.dft.numint import _dot_ao_dm, _dot_ao_ao, _scale_ao, _tau_dot, BLKSIZE
+from gpu4pyscf.dft.numint import _dot_ao_dm, _dot_ao_ao, _scale_ao, _tau_dot
 from gpu4pyscf.dft import xc_deriv
 from pyscf import __config__
 
@@ -30,6 +30,8 @@ def eval_rho(mol, ao, dm, non0tab=None, xctype='LDA', hermi=0,
              with_lapl=True, verbose=None):
     nao = ao.shape[-2]
     assert dm.ndim == 2 and nao * 2 == dm.shape[0]
+    if not isinstance(dm, cp.ndarray):
+        dm = cp.asarray(dm)
 
     nao, ngrids = ao.shape[-2:]
     xctype = xctype.upper()
@@ -451,7 +453,6 @@ class NumInt2C(lib.StreamObject, numint.LibXCMixin):
     collinear_thrd = getattr(__config__, 'dft_numint_RnumInt_collinear_thrd', 0.99)
     collinear_samples = getattr(__config__, 'dft_numint_RnumInt_collinear_samples', 200)
 
-    make_mask = staticmethod(numint.make_mask)
     eval_ao = staticmethod(numint.eval_ao)
     eval_rho = staticmethod(eval_rho)
 
@@ -628,7 +629,6 @@ def nr_fxc(self, mol, grids, xc_code, dm0, dms, spin=0, relativity=0, hermi=0,
     mcfun_eval_xc_adapter = mcfun_eval_xc_adapter
 
     block_loop = numint.NumInt.block_loop
-    _gen_rho_evaluator = numint.NumInt._gen_rho_evaluator
 
     def _to_numint1c(self):
         '''Converts to the associated class to handle collinear systems'''
diff --git a/gpu4pyscf/dft/tests/test_numint2c.py b/gpu4pyscf/dft/tests/test_numint2c.py
index 6f5cd40a2..a8380829e 100644
--- a/gpu4pyscf/dft/tests/test_numint2c.py
+++ b/gpu4pyscf/dft/tests/test_numint2c.py
@@ -17,10 +17,10 @@
 import pyscf
 import cupy
 from pyscf import lib, scf
-from pyscf.dft.numint import NumInt as pyscf_numint
+from pyscf.dft.numint2c import NumInt2C as pyscf_numint2c
 from gpu4pyscf.dft import Grids
-from gpu4pyscf.dft import numint
-from gpu4pyscf.dft.numint import NumInt
+from gpu4pyscf.dft import numint2c
+from gpu4pyscf.dft.numint2c import NumInt2C
 from gpu4pyscf import dft
 from gpu4pyscf.dft import gen_grid
 
@@ -34,16 +34,17 @@ def setUpModule():
         basis = 'ccpvdz',
         charge = 1,
         spin = 1,  # = 2S = spin_up - spin_down
-        output = '/dev/null')
+        output = '/dev/null'
+        )
 
     np.random.seed(2)
-    mf = scf.UHF(mol)
+    mf = scf.GHF(mol)
     mf.kernel()
     dm1 = mf.make_rdm1().copy()
     dm = dm1
     mo_coeff = mf.mo_coeff
     mo_occ = mf.mo_occ
-    dm0 = (mo_coeff[0]*mo_occ[0]).dot(mo_coeff[0].T)
+    dm0 = (mo_coeff*mo_occ).dot(mo_coeff.T)
 
     grids_gpu = Grids(mol)
     grids_gpu.level = 1
@@ -67,8 +68,8 @@ class KnownValues(unittest.TestCase):
     def test_eval_rho(self):
         np.random.seed(1)
         dm = np.random.random(dm0.shape)
-        ni_gpu = NumInt()
-        ni_cpu = pyscf_numint()
+        ni_gpu = NumInt2C()
+        ni_cpu = pyscf_numint2c()
         for xctype in ('LDA', 'GGA', 'MGGA'):
             deriv = 1
             if xctype == 'LDA':
@@ -84,57 +85,7 @@ def test_eval_rho(self):
             ref = ni_cpu.eval_rho(mol, ao_cpu, dm0, xctype=xctype, hermi=1, with_lapl=False)
             self.assertAlmostEqual(abs(rho[...,:grids_cpu.size].get() - ref).max(), 0, 10)
 
-    def test_sparse_index(self):
-        mol = pyscf.M(atom='''
-O   0.     0.    0.
-H   5.5    0.3   2.8
-H   1.7   -2.0   0.4''',
-                      basis='def2-tzvpp')
-        with lib.temporary_env(numint, MIN_BLK_SIZE=128**2):
-            grids = gen_grid.Grids(mol).set(atom_grid=(200, 1454)).build()
-            ni = NumInt()
-            ni.build(mol, grids.coords)
-            opt = ni.gdftopt
-            opt.l_ctr_offsets
-            ao_loc = opt._sorted_mol.ao_loc
-            ngrids = grids.size
-            dat = grids.get_non0ao_idx(opt)
-            assert lib.fp(cupy.hstack([x[1] for x in dat]).get()) == 103.60117204957997
-            assert lib.fp(cupy.hstack([x[2] for x in dat]).get()) == 5.616197331343498
-            assert lib.fp(cupy.hstack([x[3] for x in dat]).get()) == -22.394314323727
-            assert lib.fp(cupy.hstack([x[4] for x in dat]).get()) == 351.2385939586691
-            assert [i.size for x in dat for i in x[1:]] == [
-                46, 18, 9, 19, 50, 20, 9, 21, 28, 12, 9, 13, 49, 19, 9, 20, 45, 17,
-                9, 18, 45, 17, 9, 18, 45, 17, 9, 18, 45, 17, 9, 18, 50, 20, 9, 21,
-                55, 21, 9, 22, 53, 19, 9, 20, 48, 18, 9, 19, 53, 19, 9, 20, 48, 18,
-                9, 19, 43, 17, 9, 18, 40, 16, 9, 17, 48, 18, 9, 19, 48, 18, 9, 19,
-                48, 18, 9, 19, 57, 21, 9, 22, 23, 11, 9, 12, 28, 12, 9, 13, 37, 15,
-                9, 16, 33, 13, 9, 14, 32, 14, 9, 15, 20, 10, 9, 11, 23, 11, 9, 12,
-                23, 11, 9, 12, 23, 11, 9, 12, 37, 15, 9, 16]
-            assert all(x.dtype == np.int32 for x in dat[0][1:])
-
-            if hasattr(numint, '_sparse_index'):
-                for i, i0 in enumerate(range(0, ngrids, numint.MIN_BLK_SIZE)):
-                    i1 = min(i0+numint.MIN_BLK_SIZE, ngrids)
-                    ref = numint._sparse_index(
-                        opt._sorted_mol, grids.coords[i0:i1], opt.l_ctr_offsets, ao_loc, opt)
-                    assert all(np.array_equal(r, x) for r, x in zip(ref[1:], dat[i][1:]))
-
-    def test_scale_ao(self):
-        ao = cupy.random.rand(1, 3, 256)
-        wv = cupy.random.rand(1, 256)
-        out = cupy.ones(6 * 256)
-        ref = cupy.einsum('nip,np->ip', ao, wv)
-        assert abs(ref - numint._scale_ao(ao, wv)).max() < 1e-12
-        assert abs(ref - numint._scale_ao(ao, wv, out=out)).max() < 1e-12
-        assert abs(ref - numint._scale_ao(ao+0j, wv, out=out)).max() < 1e-12
-        assert abs(ref - numint._scale_ao(ao[0]+0j, wv[0], out=out)).max() < 1e-12
-
-        ao = ao.transpose(1, 0, 2).copy(order='C').transpose(1, 0, 2)
-        assert abs(ref - numint._scale_ao(ao, wv)).max() < 1e-12
-
-        assert abs(ref - numint._scale_ao(ao[0], wv[0])).max() < 1e-12
 
 if __name__ == "__main__":
-    print("Full Tests for dft numint")
+    print("Full Tests for dft numint2c")
     unittest.main()

From 0b2a3ebadf697e80acd56d4f0aa632a73d6f82b5 Mon Sep 17 00:00:00 2001
From: "puzhichen.996" <puzhichen.996@bytedance.com>
Date: Fri, 31 Oct 2025 16:11:02 +0800
Subject: [PATCH 05/21] test eval_rho2

---
 gpu4pyscf/dft/numint2c.py            |  4 ++++
 gpu4pyscf/dft/tests/test_numint2c.py | 20 ++++++++++++++++++++
 2 files changed, 24 insertions(+)

diff --git a/gpu4pyscf/dft/numint2c.py b/gpu4pyscf/dft/numint2c.py
index 2511cbfb8..f90a03066 100644
--- a/gpu4pyscf/dft/numint2c.py
+++ b/gpu4pyscf/dft/numint2c.py
@@ -467,6 +467,10 @@ def eval_rho2(self, mol, ao, mo_coeff, mo_occ, non0tab=None, xctype='LDA',
         '''Calculate the electron density for LDA functional and the density
         derivatives for GGA functional in the framework of 2-component basis.
         '''
+        if not isinstance(mo_occ, cp.ndarray):
+            mo_occ = cp.asarray(mo_occ)
+        if not isinstance(mo_coeff, cp.ndarray):
+            mo_coeff = cp.asarray(mo_coeff)
         if self.collinear[0] in ('n', 'm'):
             # TODO:
             dm = cp.dot(mo_coeff * mo_occ, mo_coeff.conj().T)
diff --git a/gpu4pyscf/dft/tests/test_numint2c.py b/gpu4pyscf/dft/tests/test_numint2c.py
index a8380829e..13801a86c 100644
--- a/gpu4pyscf/dft/tests/test_numint2c.py
+++ b/gpu4pyscf/dft/tests/test_numint2c.py
@@ -85,6 +85,26 @@ def test_eval_rho(self):
             ref = ni_cpu.eval_rho(mol, ao_cpu, dm0, xctype=xctype, hermi=1, with_lapl=False)
             self.assertAlmostEqual(abs(rho[...,:grids_cpu.size].get() - ref).max(), 0, 10)
 
+    def test_eval_rho2(self):
+        np.random.seed(1)
+        mo_coeff_test = np.random.random(mo_coeff.shape)
+        ni_gpu = NumInt2C()
+        ni_cpu = pyscf_numint2c()
+        for xctype in ('LDA', 'GGA', 'MGGA'):
+            deriv = 1
+            if xctype == 'LDA':
+                deriv = 0
+            ao_gpu = ni_gpu.eval_ao(mol, grids_gpu.coords, deriv=deriv, transpose=False)
+            ao_cpu = ni_cpu.eval_ao(mol, grids_cpu.coords, deriv=deriv)
+            
+            rho = ni_gpu.eval_rho2(mol, ao_gpu, mo_coeff_test, mo_occ, xctype=xctype, hermi=0, with_lapl=False)
+            ref = ni_cpu.eval_rho2(mol, ao_cpu, mo_coeff_test, mo_occ, xctype=xctype, hermi=0, with_lapl=False)
+            self.assertAlmostEqual(abs(rho[...,:grids_cpu.size].get() - ref).max(), 0, 10)
+
+            rho = ni_gpu.eval_rho2(mol, ao_gpu, mo_coeff_test, mo_occ, xctype=xctype, hermi=1, with_lapl=False)
+            ref = ni_cpu.eval_rho2(mol, ao_cpu, mo_coeff_test, mo_occ, xctype=xctype, hermi=1, with_lapl=False)
+            self.assertAlmostEqual(abs(rho[...,:grids_cpu.size].get() - ref).max(), 0, 10)
+
 
 if __name__ == "__main__":
     print("Full Tests for dft numint2c")

From a2c46a89af1b4b56a4dc66872e3f51f98ff0937a Mon Sep 17 00:00:00 2001
From: "puzhichen.996" <puzhichen.996@bytedance.com>
Date: Mon, 3 Nov 2025 09:32:03 +0800
Subject: [PATCH 06/21] finish debug the eval_rho2

---
 gpu4pyscf/dft/numint2c.py            |  2 +-
 gpu4pyscf/dft/tests/test_numint2c.py | 10 ++++------
 2 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/gpu4pyscf/dft/numint2c.py b/gpu4pyscf/dft/numint2c.py
index f90a03066..1fe420882 100644
--- a/gpu4pyscf/dft/numint2c.py
+++ b/gpu4pyscf/dft/numint2c.py
@@ -446,7 +446,7 @@ class NumInt2C(lib.StreamObject, numint.LibXCMixin):
 
     # collinear schemes:
     #   'col' (collinear, by default)
-    #   'ncol' (non-collinear)
+    #   'ncol' (non-collinear, also known as locally collinear)
     #   'mcol' (multi-collinear)
     collinear = getattr(__config__, 'dft_numint_RnumInt_collinear', 'col')
     spin_samples = getattr(__config__, 'dft_numint_RnumInt_spin_samples', 770)
diff --git a/gpu4pyscf/dft/tests/test_numint2c.py b/gpu4pyscf/dft/tests/test_numint2c.py
index 13801a86c..374e10773 100644
--- a/gpu4pyscf/dft/tests/test_numint2c.py
+++ b/gpu4pyscf/dft/tests/test_numint2c.py
@@ -89,7 +89,9 @@ def test_eval_rho2(self):
         np.random.seed(1)
         mo_coeff_test = np.random.random(mo_coeff.shape)
         ni_gpu = NumInt2C()
+        ni_gpu.collinear='m'
         ni_cpu = pyscf_numint2c()
+        ni_cpu.collinear='m'
         for xctype in ('LDA', 'GGA', 'MGGA'):
             deriv = 1
             if xctype == 'LDA':
@@ -97,12 +99,8 @@ def test_eval_rho2(self):
             ao_gpu = ni_gpu.eval_ao(mol, grids_gpu.coords, deriv=deriv, transpose=False)
             ao_cpu = ni_cpu.eval_ao(mol, grids_cpu.coords, deriv=deriv)
             
-            rho = ni_gpu.eval_rho2(mol, ao_gpu, mo_coeff_test, mo_occ, xctype=xctype, hermi=0, with_lapl=False)
-            ref = ni_cpu.eval_rho2(mol, ao_cpu, mo_coeff_test, mo_occ, xctype=xctype, hermi=0, with_lapl=False)
-            self.assertAlmostEqual(abs(rho[...,:grids_cpu.size].get() - ref).max(), 0, 10)
-
-            rho = ni_gpu.eval_rho2(mol, ao_gpu, mo_coeff_test, mo_occ, xctype=xctype, hermi=1, with_lapl=False)
-            ref = ni_cpu.eval_rho2(mol, ao_cpu, mo_coeff_test, mo_occ, xctype=xctype, hermi=1, with_lapl=False)
+            rho = ni_gpu.eval_rho2(mol, ao_gpu, mo_coeff_test, mo_occ, xctype=xctype, with_lapl=False)
+            ref = ni_cpu.eval_rho2(mol, ao_cpu, mo_coeff_test, mo_occ, xctype=xctype, with_lapl=False)
             self.assertAlmostEqual(abs(rho[...,:grids_cpu.size].get() - ref).max(), 0, 10)
 
 

From 3bb4200dd06cb7d6569d71a0a73ef5c4c0e6f06f Mon Sep 17 00:00:00 2001
From: "puzhichen.996" <puzhichen.996@bytedance.com>
Date: Mon, 3 Nov 2025 09:53:04 +0800
Subject: [PATCH 07/21] add gpu-version mcfun

---
 gpu4pyscf/dft/mcfun_gpu.py | 458 +++++++++++++++++++++++++++++++++++++
 1 file changed, 458 insertions(+)
 create mode 100644 gpu4pyscf/dft/mcfun_gpu.py

diff --git a/gpu4pyscf/dft/mcfun_gpu.py b/gpu4pyscf/dft/mcfun_gpu.py
new file mode 100644
index 000000000..59bd02e13
--- /dev/null
+++ b/gpu4pyscf/dft/mcfun_gpu.py
@@ -0,0 +1,458 @@
+# Copyright 2021-2024 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import warnings
+from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
+import numpy as np
+from .LebedevGrid import MakeAngularGrid
+
+MAX_GRIDS_PER_TASK = 200
+
+def eval_xc_eff(func, rho_tm, deriv=1, spin_samples=770,
+                collinear_threshold=None, collinear_samples=200, workers=1):
+    '''Multi-collinear effective potential and effective kernel.
+
+    Parameters
+    ----------
+    func : Function to evaluate collinear functionals.
+        The function signature is
+            exc, vxc, fxc, ... = func((rho, s), deriv)
+        The input rho and s have shape
+        * (1, Ngrids) for LDA functionals
+        * (4, Ngrids) for GGA functionals where the four variables are
+        rho, nabla_x(rho), nabla_y(rho), nabla_z(rho)
+        * (5, Ngrids) for meta-GGA functionals where the five variables are
+        rho, nabla_x(rho), nabla_y(rho), nabla_z(rho), tau
+        The returns of func should have the shape
+        * (2, Nvar, Ngrids) for vxc
+        * (2, Nvar, 2, Nvar, Ngrids) for fxc
+        * (2, Nvar, 2, Nvar, 2, Nvar, Ngrids) for kxc
+        The dimension Nvar can be
+        * 1 for LDA
+        * 4 for GGA: rho, nabla_x(rho), nabla_y(rho), nabla_z(rho)
+        * 5 for meta-GGA: rho, nabla_x(rho), nabla_y(rho), nabla_z(rho), tau
+        Note: for GGA functionals the required returns have different conventions
+        to the returns of libxc or xcfun in functional derivatives. Here func
+        needs to return the derivatives to rho, nabla_x(rho), nabla_y(rho),
+        nabla_z(rho) while libxc (or xcfun) returns derivatives to rho, sigma_uu,
+        sigma_ud, sigma_dd (sigma_ud = nabla(rho_u) dot nabla(rho_d)). Please
+        see example 04-xc_wrapper.py for the transformation between the two
+        conventions.
+    rho_tm : np array with shape (4, Nvar, Ngrids)
+        rho_tm[0] is density. rho_tm[1:4] is the magnetization spin vector. Nvar can be
+        * 1 for LDA functionals
+        * 4 for GGA functionals: rho, nabla_x(rho), nabla_y(rho), nabla_z(rho)
+        * 5 for meta-GGA functionals: rho, nabla_x(rho), nabla_y(rho), nabla_z(rho), tau
+    deriv : int, optional
+        Functional derivatives. The current version (the kernel) supports maximum value 2.
+    spin_samples : int, optional
+        Number of grid points on spherical surface
+    workers : int, optional
+        Parallel execution if workers > 1
+    collinear_threshold : float, optional
+        if specified, filters the points of strongly polarized spins and calls the
+        eval_xc_collinear_spin for these points. Recommended value 0.99.
+    collinear_samples : int, optional
+        Number of samples for eval_xc_collinear_spin.
+
+    Returns
+    -------
+    XC functional and derivatives for each grid.
+    * [exc] if deriv = 0
+    * [exc, vxc] if deriv = 1
+    * [exc, vxc, fxc] if deriv = 2
+    '''
+
+    assert deriv < 5
+    rho_tm = np.asarray(rho_tm)
+    if rho_tm.dtype != np.double:
+        raise RuntimeError('rho and m must be real')
+    ngrids = rho_tm.shape[-1]
+    grids_per_task = min(ngrids//(workers*3)+1, MAX_GRIDS_PER_TASK)
+    if workers == 1:
+        results = []
+        for p0, p1 in _prange(0, ngrids, grids_per_task):
+            r = _eval_xc_lebedev(func, rho_tm[...,p0:p1], deriv, spin_samples,
+                                 collinear_threshold, collinear_samples)
+            results.append(r)
+    else:
+        executor = ThreadPoolExecutor
+
+        with executor(max_workers=workers) as ex:
+            futures = []
+            for p0, p1 in _prange(0, ngrids, grids_per_task):
+                f = ex.submit(_eval_xc_lebedev, func, rho_tm[...,p0:p1], deriv,
+                              spin_samples, collinear_threshold, collinear_samples)
+                futures.append(f)
+            results = [f.result() for f in futures]
+
+    return [None if x[0] is None else np.concatenate(x, axis=-1) for x in zip(*results)]
+
+
+def eval_xc_eff_sf(func, rho_tmz, deriv=1, collinear_samples=200, workers=1):
+    assert deriv < 5
+    if rho_tmz.dtype != np.double:
+        raise RuntimeError('rho and mz must be real')
+    ngrids = rho_tmz.shape[-1]
+    grids_per_task = min(ngrids//(workers*3)+1, MAX_GRIDS_PER_TASK)
+
+    if workers == 1:
+        results = []
+        for p0, p1 in _prange(0, ngrids, grids_per_task):
+            r = _eval_xc_sf(func, rho_tmz[...,p0:p1], deriv, collinear_samples)
+            results.append(r)
+    else:
+        print(collinear_samples)
+        executor = ThreadPoolExecutor
+
+        with executor(max_workers=workers) as ex:
+            futures = []
+            for p0, p1 in _prange(0, ngrids, grids_per_task):
+                f = ex.submit(_eval_xc_sf, func, rho_tmz[...,p0:p1], deriv, collinear_samples)
+                futures.append(f)
+            results = [f.result() for f in futures]
+
+    return [None if x[0] is None else np.concatenate(x, axis=-1) for x in zip(*results)]
+
+def _eval_xc_sf(func, rho_tmz, deriv, collinear_samples):
+    ngrids = rho_tmz.shape[-1]
+    # samples on z=cos(theta) and their weights between [0, 1]
+    sgridz, weights = _make_paxis_samples(collinear_samples)
+    blksize = int(np.ceil(1e5 / ngrids)) * 8
+
+    if rho_tmz.ndim == 2:
+        nvar = 1
+    else:
+        nvar = rho_tmz.shape[1]
+    fxc_sf = np.zeros((nvar,nvar,ngrids))
+    kxc_sf = np.zeros((nvar,nvar,2,nvar,ngrids))
+    for p0, p1 in _prange(0, weights.size, blksize):
+        rho = _project_spin_paxis2(rho_tmz, sgridz[p0:p1])
+        xc_orig = func(rho, deriv)
+        if deriv > 1:
+            fxc = xc_orig[2].reshape(2, nvar, 2, nvar, ngrids, p1-p0)
+            fxc_sf += fxc[1,:,1].dot(weights[p0:p1])
+
+        if deriv > 2:
+            kxc = xc_orig[3].reshape(2, nvar, 2, nvar, 2, nvar, ngrids, p1-p0)
+            kxc_sf[:,:,0] += kxc[1,:,1,:,0].dot(weights[p0:p1])
+            kxc_sf[:,:,1] += kxc[1,:,1,:,1].dot(weights[p0:p1]*sgridz[p0:p1])
+    return None,None,fxc_sf,kxc_sf
+
+def eval_xc_collinear_spin(func, rho_tm, deriv, spin_samples):
+    '''Multi-collinear functional derivatives for collinear spins
+
+    Parameters
+    ----------
+    func : Function to evaluate collinear functionals.
+        The function signature is
+            exc, vxc, fxc, ... = func((rho, s), deriv)
+        The input rho and s have shape
+        * (1, Ngrids) for LDA functionals
+        * (4, Ngrids) for GGA functionals where the four variables are
+        rho, nabla_x(rho), nabla_y(rho), nabla_z(rho)
+        * (5, Ngrids) for meta-GGA functionals where the five variables are
+        rho, nabla_x(rho), nabla_y(rho), nabla_z(rho), tau
+        The returns of func should have the shape
+        * (2, Nvar, Ngrids) for vxc
+        * (2, Nvar, 2, Nvar, Ngrids) for fxc
+        * (2, Nvar, 2, Nvar, 2, Nvar, Ngrids) for kxc
+        The dimension Nvar can be
+        * 1 for LDA
+        * 4 for GGA: rho, nabla_x(rho), nabla_y(rho), nabla_z(rho)
+        * 5 for meta-GGA: rho, nabla_x(rho), nabla_y(rho), nabla_z(rho), tau
+        Note: for GGA functionals the required returns have different conventions
+        to the returns of libxc or xcfun in functional derivatives. Here func
+        needs to return the derivatives to rho, nabla_x(rho), nabla_y(rho),
+        nabla_z(rho) while libxc (or xcfun) returns derivatives to rho, sigma_uu,
+        sigma_ud, sigma_dd (sigma_ud = nabla(rho_u) dot nabla(rho_d)).
+    rho_tm : np array with shape (4, Nvar, Ngrids)
+        rho_tm[0] is density. rho_tm[1:4] is the magnetization spin vector. Nvar can be
+        * 1 for LDA functionals
+        * 4 for GGA functionals: rho, nabla_x(rho), nabla_y(rho), nabla_z(rho)
+        * 5 for meta-GGA functionals: rho, nabla_x(rho), nabla_y(rho), nabla_z(rho), tau
+    deriv : int, optional
+        Functional derivatives. The current version (the kernel) supports maximum value 2.
+    spin_samples : int, optional
+        Number of grid points on principal axis
+
+    Returns
+    -------
+    XC functional and derivatives for each grid.
+    * [exc] if deriv = 0
+    * [exc, vxc] if deriv = 1
+    * [exc, vxc, fxc] if deriv = 2
+    '''
+    ngrids = rho_tm.shape[-1]
+    # samples on z=cos(theta) and their weights between [0, 1]
+    sgridz, weights = _make_paxis_samples(spin_samples)
+    blksize = int(np.ceil(1e5 / ngrids)) * 8
+
+    if rho_tm.ndim == 2:
+        nvar = 1
+    else:
+        nvar = rho_tm.shape[1]
+
+    rho_ts = _project_spin_paxis(rho_tm)
+
+    # TODO: filter s, nabla(s) ~= 0
+    m = rho_tm[1:].reshape(3, nvar, ngrids)
+    s = rho_ts[1].reshape(nvar, ngrids)[0]
+    with np.errstate(divide='ignore', invalid='ignore'):
+        omega = m[:,0] / s
+    omega[:,s==0] = 0
+
+    xc_orig = func(rho_ts, deriv)
+    exc_eff = xc_orig[0]
+
+    omega = omega.reshape(3, ngrids)
+    if deriv > 0:
+        vxc = xc_orig[1].reshape(2, nvar, ngrids)
+        vxc_eff = np.vstack((vxc[:1], np.einsum('xg,rg->rxg', vxc[1], omega)))
+
+    if deriv > 1:
+        # spin-conserve part
+        fxc = xc_orig[2].reshape(2, nvar, 2, nvar, ngrids)
+        fxc_eff = np.empty((4, nvar, 4, nvar, ngrids))
+        fxc_eff[0,:,0] = fxc[0,:,0]
+        fz1 = np.einsum('xyg,rg->rxyg', fxc[1,:,0], omega)
+        fxc_eff[1:,:,0 ] = fz1
+        fxc_eff[0 ,:,1:] = fz1.transpose(1,0,2,3)
+        tmp = np.einsum('xyg,rg->rxyg', fxc[1,:,1,:], omega)
+        fxc_eff[1:,:,1:] = np.einsum('rxyg,sg->rxsyg', tmp, omega)
+
+        # spin-flip part
+        fxc_sf = 0
+        for p0, p1 in _prange(0, weights.size, blksize):
+            rho = _project_spin_paxis(rho_tm, sgridz[p0:p1])
+            fxc = func(rho, deriv)[2]
+            fxc = fxc.reshape(2, nvar, 2, nvar, ngrids, p1 - p0)
+            # only considers the xx+yy part
+            fxc_sf += fxc[1,:,1].dot(weights[p0:p1])
+
+        for i in range(1, 4):
+            fxc_eff[i,:,i] += fxc_sf
+        tmp = np.einsum('xyg,rg->rxyg', fxc_sf, omega)
+        fxc_eff[1:,:,1:] -= np.einsum('rxyg,sg->rxsyg', tmp, omega)
+
+    ret = [exc_eff]
+    if deriv > 0:
+        ret.append(vxc_eff)
+    if deriv > 1:
+        ret.append(fxc_eff)
+    if deriv > 2:
+        raise NotImplementedError
+    return ret
+
+def _eval_xc_lebedev(func, rho_tm, deriv, spin_samples,
+                     collinear_threshold=None, collinear_samples=200):
+    '''Multi-collinear effective potential and effective kernel with projection
+    samples on spherical surface (the Lebedev grid samples)
+    '''
+    ngrids = rho_tm.shape[-1]
+    sgrids, weights = _make_sph_samples(spin_samples)
+    blksize = int(np.ceil(1e4 / ngrids)) * 8
+    # import pdb
+    # pdb.set_trace()
+    if rho_tm.ndim == 2:
+        nvar = 1
+    else:
+        nvar = rho_tm.shape[1]
+    exc_eff = vxc_eff = fxc_eff = kxc_eff = 0
+    for p0, p1 in _prange(0, weights.size, blksize):
+        nsg = p1 - p0
+        p_sgrids = sgrids[p0:p1]
+        p_weights = weights[p0:p1]
+        rho = _project_spin_sph(rho_tm, p_sgrids)
+
+        xc_orig = func(rho, deriv+1)
+
+        exc = xc_orig[0].reshape(ngrids, nsg)
+        vxc = xc_orig[1].reshape(2, nvar, ngrids, nsg)
+
+        rho = rho.reshape(2, nvar, ngrids, nsg)
+        s = rho[1]
+        rho_pure = rho[0,0]
+        exc_rho = exc * rho_pure + np.einsum('xgo,xgo->go', vxc[1], s)
+        exc_eff += np.einsum('go,o->g', exc_rho, p_weights)
+
+        if deriv > 0:
+            fxc = xc_orig[2].reshape(2, nvar, 2, nvar, ngrids, nsg)
+            # vs * 2 + s*f_s_st
+            vxc[1] *= 2
+            vxc += np.einsum('xbygo,xgo->bygo', fxc[1], s)
+            c_tm = _ts2tm_transformation(p_sgrids)
+            cw_tm = c_tm * p_weights
+            vxc_eff += np.einsum('rao,axgo->rxg', cw_tm, vxc)
+
+        if deriv > 1:
+            kxc = xc_orig[3].reshape(2, nvar, 2, nvar, 2, nvar, ngrids, nsg)
+            fxc[1,:,1] *= 3
+            fxc[0,:,1] *= 2
+            fxc[1,:,0] *= 2
+            fxc += np.einsum('xbyczgo,xgo->byczgo', kxc[1], s)
+            fxc = np.einsum('rao,axbygo->rxbygo', c_tm, fxc)
+            fxc_eff += np.einsum('sbo,rxbygo->rxsyg', cw_tm, fxc)
+
+        if deriv > 2:
+            lxc = xc_orig[4].reshape(2, nvar, 2, nvar, 2, nvar, 2, nvar, ngrids, nsg)
+            kxc[1,:,1,:,1] *= 4
+            kxc[1,:,1,:,0] *= 3
+            kxc[1,:,0,:,1] *= 3
+            kxc[0,:,1,:,1] *= 3
+            kxc[1,:,0,:,0] *= 2
+            kxc[0,:,1,:,0] *= 2
+            kxc[0,:,0,:,1] *= 2
+
+            kxc += np.einsum('wbxcydzgo,wgo->bxcydzgo', lxc[1], s)
+            kxc = np.einsum('rao,axbyczgo->rxbyczgo', c_tm, kxc)
+            kxc = np.einsum('sbo,rxbyczgo->rxsyczgo', c_tm, kxc)
+            # kxc = np.einsum('rao,sbo,axbyczgo->rxsyczgo', c_tm, c_tm,kxc)
+            kxc_eff += np.einsum('tco,rxsyczgo->rxsytzg', cw_tm, kxc)
+
+    # exc in libxc is defined as Exc per particle. exc_eff calculated above is exc*rho.
+    # Divide exc_eff by rho so as to follow the convention of libxc
+    if rho_tm.ndim == 2:
+        rho_pure = rho_tm[0]
+    else:
+        rho_pure = rho_tm[0,0]
+
+    exc_eff[rho_pure == 0] = 0
+    exc_eff[rho_pure != 0] /= rho_pure[rho_pure != 0]
+
+    # Strongly spin-polarized points (rho ~= |m|) can be considered as collinear spins
+    if collinear_threshold is not None:
+        rho, s = _project_spin_paxis(rho_tm.reshape(4, nvar, ngrids)[:,0])
+        cs_idx = np.where(s >= rho * collinear_threshold)[0]
+        if cs_idx.size > 0:
+            xc_cs = eval_xc_collinear_spin(func, rho_tm[...,cs_idx], deriv,
+                                           collinear_samples)
+            exc_eff[...,cs_idx] = xc_cs[0]
+            if deriv > 0:
+                vxc_eff[...,cs_idx] = xc_cs[1]
+            if deriv > 1:
+                fxc_eff[...,cs_idx] = xc_cs[2]
+            if deriv > 2:
+                kxc_eff[...,cs_idx] = xc_cs[3]
+
+    ret = [exc_eff]
+    if deriv > 0:
+        ret.append(vxc_eff)
+    if deriv > 1:
+        ret.append(fxc_eff)
+    if deriv > 2:
+        ret.append(kxc_eff)
+    if deriv > 3:
+        raise NotImplementedError
+    return ret
+
+def _make_sph_samples(spin_samples):
+    '''Integration samples on spherical surface'''
+    ang_grids = MakeAngularGrid(spin_samples)
+    directions = ang_grids[:,:3].copy(order='F')
+    weights = ang_grids[:,3].copy()
+    return directions, weights
+
+def _prange(start, end, step):
+    '''Partitions range into segments: i0:i1, i1:i2, i2:i3, ...'''
+    if start < end:
+        for i in range(start, end, step):
+            yield i, min(i+step, end)
+
+def _project_spin_sph(rho_tm, sgrids):
+    '''Projects spin onto spherical surface'''
+    rho = rho_tm[0]
+    m = rho_tm[1:]
+    nsg = sgrids.shape[0]
+    ngrids = rho.shape[-1]
+    if rho_tm.ndim == 2:
+        rho_ts = np.empty((2, ngrids, nsg))
+        rho_ts[0] = rho[:, np.newaxis]
+        rho_ts[1] = np.einsum('mg,om->go', m, sgrids)
+        rho_ts = rho_ts.reshape(2, ngrids*nsg)
+    else:
+        nvar = rho_tm.shape[1]
+        rho_ts = np.empty((2, nvar, ngrids, nsg))
+        rho_ts[0] = rho[:, :, np.newaxis]
+        rho_ts[1] = np.einsum('mxg,om->xgo', m, sgrids)
+        rho_ts = rho_ts.reshape(2, nvar, ngrids*nsg)
+    return rho_ts
+
+def _ts2tm_transformation(sgrids):
+    '''
+    Transformation that projects v_ts(rho,s) to rho/m representation (rho,mx,my,mz)
+    '''
+    nsg = sgrids.shape[0]
+    c_tm = np.zeros((4, 2, nsg))
+    c_tm[0,0] = 1
+    c_tm[1:,1] = sgrids.T
+    return c_tm
+
+def _make_paxis_samples(spin_samples):
+    '''Samples on principal axis between [0, 1]'''
+    rt, wt = np.polynomial.legendre.leggauss(spin_samples)
+    rt = rt * .5 + .5
+    wt *= .5  # normalized to 1
+    return rt, wt
+
+def _project_spin_paxis(rho_tm, sgridz=None):
+    '''Projects spins onto the principal axis'''
+    rho = rho_tm[0]
+    m = rho_tm[1:]
+
+    s = np.linalg.norm(m, axis=0)
+    if sgridz is None:
+        rho_ts = np.stack([rho, s])
+    else:
+        ngrids = rho.shape[-1]
+        nsg = sgridz.shape[0]
+        if rho_tm.ndim == 2:
+            rho_ts = np.empty((2, ngrids, nsg))
+            rho_ts[0] = rho[:,np.newaxis]
+            rho_ts[1] = s[:,np.newaxis] * sgridz
+            rho_ts = rho_ts.reshape(2, ngrids * nsg)
+        else:
+            print('222')
+            nvar = rho_tm.shape[1]
+            rho_ts = np.empty((2, nvar, ngrids, nsg))
+            rho_ts[0] = rho[:,:,np.newaxis]
+            rho_ts[1] = s[:,:,np.newaxis] * sgridz
+            rho_ts = rho_ts.reshape(2, nvar, ngrids * nsg)
+    return rho_ts
+
+def _project_spin_paxis2(rho_tm, sgridz=None):
+    # ToDo: be written into the function _project_spin_paxis().
+    # Because use mz rather than |mz| here
+    '''Projects spins onto the principal axis'''
+    rho = rho_tm[0]
+    mz = rho_tm[1]
+
+    if sgridz is None:
+        rho_ts = np.stack([rho, mz])
+    else:
+        ngrids = rho.shape[-1]
+        nsg = sgridz.shape[0]
+        if rho_tm.ndim == 2:
+            rho_ts = np.empty((2, ngrids, nsg))
+            rho_ts[0] = rho[:,np.newaxis]
+            rho_ts[1] = mz[:,np.newaxis] * sgridz
+            rho_ts = rho_ts.reshape(2, ngrids * nsg)
+        else:
+            nvar = rho_tm.shape[1]
+            rho_ts = np.empty((2, nvar, ngrids, nsg))
+            rho_ts[0] = rho[:,:,np.newaxis]
+            rho_ts[1] = mz[:,:,np.newaxis] * sgridz
+            rho_ts = rho_ts.reshape(2, nvar, ngrids * nsg)
+    return rho_ts

From a427fbb10425327f32a2e023b6428d77e3689db8 Mon Sep 17 00:00:00 2001
From: "puzhichen.996" <puzhichen.996@bytedance.com>
Date: Tue, 4 Nov 2025 08:42:05 +0800
Subject: [PATCH 08/21] debug meta-GGA

---
 gpu4pyscf/dft/mcfun_gpu.py           | 194 ++++++----------
 gpu4pyscf/dft/numint.py              |  68 ++++++
 gpu4pyscf/dft/numint2c.py            | 333 ++++++++++++---------------
 gpu4pyscf/dft/tests/test_numint2c.py |  91 +++++---
 4 files changed, 347 insertions(+), 339 deletions(-)

diff --git a/gpu4pyscf/dft/mcfun_gpu.py b/gpu4pyscf/dft/mcfun_gpu.py
index 59bd02e13..c243fd1c1 100644
--- a/gpu4pyscf/dft/mcfun_gpu.py
+++ b/gpu4pyscf/dft/mcfun_gpu.py
@@ -14,14 +14,14 @@
 
 
 import warnings
-from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
+import cupy as cp
 import numpy as np
-from .LebedevGrid import MakeAngularGrid
+from pyscf.dft.LebedevGrid import MakeAngularGrid
 
-MAX_GRIDS_PER_TASK = 200
+MAX_GRIDS_PER_TASK = 4096
 
 def eval_xc_eff(func, rho_tm, deriv=1, spin_samples=770,
-                collinear_threshold=None, collinear_samples=200, workers=1):
+                collinear_threshold=None, collinear_samples=200):
     '''Multi-collinear effective potential and effective kernel.
 
     Parameters
@@ -50,7 +50,7 @@ def eval_xc_eff(func, rho_tm, deriv=1, spin_samples=770,
         sigma_ud, sigma_dd (sigma_ud = nabla(rho_u) dot nabla(rho_d)). Please
         see example 04-xc_wrapper.py for the transformation between the two
         conventions.
-    rho_tm : np array with shape (4, Nvar, Ngrids)
+    rho_tm : cp array with shape (4, Nvar, Ngrids)
         rho_tm[0] is density. rho_tm[1:4] is the magnetization spin vector. Nvar can be
         * 1 for LDA functionals
         * 4 for GGA functionals: rho, nabla_x(rho), nabla_y(rho), nabla_z(rho)
@@ -59,8 +59,6 @@ def eval_xc_eff(func, rho_tm, deriv=1, spin_samples=770,
         Functional derivatives. The current version (the kernel) supports maximum value 2.
     spin_samples : int, optional
         Number of grid points on spherical surface
-    workers : int, optional
-        Parallel execution if workers > 1
     collinear_threshold : float, optional
         if specified, filters the points of strongly polarized spins and calls the
         eval_xc_collinear_spin for these points. Recommended value 0.99.
@@ -76,80 +74,19 @@ def eval_xc_eff(func, rho_tm, deriv=1, spin_samples=770,
     '''
 
     assert deriv < 5
-    rho_tm = np.asarray(rho_tm)
-    if rho_tm.dtype != np.double:
+    rho_tm = cp.asarray(rho_tm)
+    if rho_tm.dtype != cp.double:
         raise RuntimeError('rho and m must be real')
     ngrids = rho_tm.shape[-1]
-    grids_per_task = min(ngrids//(workers*3)+1, MAX_GRIDS_PER_TASK)
-    if workers == 1:
-        results = []
-        for p0, p1 in _prange(0, ngrids, grids_per_task):
-            r = _eval_xc_lebedev(func, rho_tm[...,p0:p1], deriv, spin_samples,
-                                 collinear_threshold, collinear_samples)
-            results.append(r)
-    else:
-        executor = ThreadPoolExecutor
-
-        with executor(max_workers=workers) as ex:
-            futures = []
-            for p0, p1 in _prange(0, ngrids, grids_per_task):
-                f = ex.submit(_eval_xc_lebedev, func, rho_tm[...,p0:p1], deriv,
-                              spin_samples, collinear_threshold, collinear_samples)
-                futures.append(f)
-            results = [f.result() for f in futures]
-
-    return [None if x[0] is None else np.concatenate(x, axis=-1) for x in zip(*results)]
-
-
-def eval_xc_eff_sf(func, rho_tmz, deriv=1, collinear_samples=200, workers=1):
-    assert deriv < 5
-    if rho_tmz.dtype != np.double:
-        raise RuntimeError('rho and mz must be real')
-    ngrids = rho_tmz.shape[-1]
-    grids_per_task = min(ngrids//(workers*3)+1, MAX_GRIDS_PER_TASK)
-
-    if workers == 1:
-        results = []
-        for p0, p1 in _prange(0, ngrids, grids_per_task):
-            r = _eval_xc_sf(func, rho_tmz[...,p0:p1], deriv, collinear_samples)
-            results.append(r)
-    else:
-        print(collinear_samples)
-        executor = ThreadPoolExecutor
-
-        with executor(max_workers=workers) as ex:
-            futures = []
-            for p0, p1 in _prange(0, ngrids, grids_per_task):
-                f = ex.submit(_eval_xc_sf, func, rho_tmz[...,p0:p1], deriv, collinear_samples)
-                futures.append(f)
-            results = [f.result() for f in futures]
 
-    return [None if x[0] is None else np.concatenate(x, axis=-1) for x in zip(*results)]
+    results = []
+    for p0, p1 in _prange(0, ngrids, MAX_GRIDS_PER_TASK):
+        r = _eval_xc_lebedev(func, rho_tm[...,p0:p1], deriv, spin_samples,
+                             collinear_threshold, collinear_samples)
+        results.append(r)
 
-def _eval_xc_sf(func, rho_tmz, deriv, collinear_samples):
-    ngrids = rho_tmz.shape[-1]
-    # samples on z=cos(theta) and their weights between [0, 1]
-    sgridz, weights = _make_paxis_samples(collinear_samples)
-    blksize = int(np.ceil(1e5 / ngrids)) * 8
-
-    if rho_tmz.ndim == 2:
-        nvar = 1
-    else:
-        nvar = rho_tmz.shape[1]
-    fxc_sf = np.zeros((nvar,nvar,ngrids))
-    kxc_sf = np.zeros((nvar,nvar,2,nvar,ngrids))
-    for p0, p1 in _prange(0, weights.size, blksize):
-        rho = _project_spin_paxis2(rho_tmz, sgridz[p0:p1])
-        xc_orig = func(rho, deriv)
-        if deriv > 1:
-            fxc = xc_orig[2].reshape(2, nvar, 2, nvar, ngrids, p1-p0)
-            fxc_sf += fxc[1,:,1].dot(weights[p0:p1])
+    return [None if x[0] is None else cp.concatenate(x, axis=-1) for x in zip(*results)]
 
-        if deriv > 2:
-            kxc = xc_orig[3].reshape(2, nvar, 2, nvar, 2, nvar, ngrids, p1-p0)
-            kxc_sf[:,:,0] += kxc[1,:,1,:,0].dot(weights[p0:p1])
-            kxc_sf[:,:,1] += kxc[1,:,1,:,1].dot(weights[p0:p1]*sgridz[p0:p1])
-    return None,None,fxc_sf,kxc_sf
 
 def eval_xc_collinear_spin(func, rho_tm, deriv, spin_samples):
     '''Multi-collinear functional derivatives for collinear spins
@@ -178,7 +115,7 @@ def eval_xc_collinear_spin(func, rho_tm, deriv, spin_samples):
         needs to return the derivatives to rho, nabla_x(rho), nabla_y(rho),
         nabla_z(rho) while libxc (or xcfun) returns derivatives to rho, sigma_uu,
         sigma_ud, sigma_dd (sigma_ud = nabla(rho_u) dot nabla(rho_d)).
-    rho_tm : np array with shape (4, Nvar, Ngrids)
+    rho_tm : cp array with shape (4, Nvar, Ngrids)
         rho_tm[0] is density. rho_tm[1:4] is the magnetization spin vector. Nvar can be
         * 1 for LDA functionals
         * 4 for GGA functionals: rho, nabla_x(rho), nabla_y(rho), nabla_z(rho)
@@ -198,7 +135,7 @@ def eval_xc_collinear_spin(func, rho_tm, deriv, spin_samples):
     ngrids = rho_tm.shape[-1]
     # samples on z=cos(theta) and their weights between [0, 1]
     sgridz, weights = _make_paxis_samples(spin_samples)
-    blksize = int(np.ceil(1e5 / ngrids)) * 8
+    blksize = int(cp.ceil(1e5 / ngrids)) * 8
 
     if rho_tm.ndim == 2:
         nvar = 1
@@ -210,28 +147,27 @@ def eval_xc_collinear_spin(func, rho_tm, deriv, spin_samples):
     # TODO: filter s, nabla(s) ~= 0
     m = rho_tm[1:].reshape(3, nvar, ngrids)
     s = rho_ts[1].reshape(nvar, ngrids)[0]
-    with np.errstate(divide='ignore', invalid='ignore'):
-        omega = m[:,0] / s
-    omega[:,s==0] = 0
+    omega = cp.where(s == 0, 0, m[:, 0] / s)
 
     xc_orig = func(rho_ts, deriv)
     exc_eff = xc_orig[0]
+    exc_eff = exc_eff[:,0]
 
     omega = omega.reshape(3, ngrids)
     if deriv > 0:
         vxc = xc_orig[1].reshape(2, nvar, ngrids)
-        vxc_eff = np.vstack((vxc[:1], np.einsum('xg,rg->rxg', vxc[1], omega)))
+        vxc_eff = cp.vstack((vxc[:1], cp.einsum('xg,rg->rxg', vxc[1], omega)))
 
     if deriv > 1:
         # spin-conserve part
         fxc = xc_orig[2].reshape(2, nvar, 2, nvar, ngrids)
-        fxc_eff = np.empty((4, nvar, 4, nvar, ngrids))
+        fxc_eff = cp.empty((4, nvar, 4, nvar, ngrids))
         fxc_eff[0,:,0] = fxc[0,:,0]
-        fz1 = np.einsum('xyg,rg->rxyg', fxc[1,:,0], omega)
+        fz1 = cp.einsum('xyg,rg->rxyg', fxc[1,:,0], omega)
         fxc_eff[1:,:,0 ] = fz1
         fxc_eff[0 ,:,1:] = fz1.transpose(1,0,2,3)
-        tmp = np.einsum('xyg,rg->rxyg', fxc[1,:,1,:], omega)
-        fxc_eff[1:,:,1:] = np.einsum('rxyg,sg->rxsyg', tmp, omega)
+        tmp = cp.einsum('xyg,rg->rxyg', fxc[1,:,1,:], omega)
+        fxc_eff[1:,:,1:] = cp.einsum('rxyg,sg->rxsyg', tmp, omega)
 
         # spin-flip part
         fxc_sf = 0
@@ -244,8 +180,8 @@ def eval_xc_collinear_spin(func, rho_tm, deriv, spin_samples):
 
         for i in range(1, 4):
             fxc_eff[i,:,i] += fxc_sf
-        tmp = np.einsum('xyg,rg->rxyg', fxc_sf, omega)
-        fxc_eff[1:,:,1:] -= np.einsum('rxyg,sg->rxsyg', tmp, omega)
+        tmp = cp.einsum('xyg,rg->rxyg', fxc_sf, omega)
+        fxc_eff[1:,:,1:] -= cp.einsum('rxyg,sg->rxsyg', tmp, omega)
 
     ret = [exc_eff]
     if deriv > 0:
@@ -263,7 +199,9 @@ def _eval_xc_lebedev(func, rho_tm, deriv, spin_samples,
     '''
     ngrids = rho_tm.shape[-1]
     sgrids, weights = _make_sph_samples(spin_samples)
-    blksize = int(np.ceil(1e4 / ngrids)) * 8
+    sgrids = cp.asarray(sgrids)
+    weights = cp.asarray(weights)
+    blksize = int(cp.ceil(1e4 / ngrids)) * 8
     # import pdb
     # pdb.set_trace()
     if rho_tm.ndim == 2:
@@ -285,26 +223,26 @@ def _eval_xc_lebedev(func, rho_tm, deriv, spin_samples,
         rho = rho.reshape(2, nvar, ngrids, nsg)
         s = rho[1]
         rho_pure = rho[0,0]
-        exc_rho = exc * rho_pure + np.einsum('xgo,xgo->go', vxc[1], s)
-        exc_eff += np.einsum('go,o->g', exc_rho, p_weights)
+        exc_rho = exc * rho_pure + cp.einsum('xgo,xgo->go', vxc[1], s)
+        exc_eff += cp.einsum('go,o->g', exc_rho, p_weights)
 
         if deriv > 0:
             fxc = xc_orig[2].reshape(2, nvar, 2, nvar, ngrids, nsg)
             # vs * 2 + s*f_s_st
             vxc[1] *= 2
-            vxc += np.einsum('xbygo,xgo->bygo', fxc[1], s)
+            vxc += cp.einsum('xbygo,xgo->bygo', fxc[1], s)
             c_tm = _ts2tm_transformation(p_sgrids)
             cw_tm = c_tm * p_weights
-            vxc_eff += np.einsum('rao,axgo->rxg', cw_tm, vxc)
+            vxc_eff += cp.einsum('rao,axgo->rxg', cw_tm, vxc)
 
         if deriv > 1:
             kxc = xc_orig[3].reshape(2, nvar, 2, nvar, 2, nvar, ngrids, nsg)
             fxc[1,:,1] *= 3
             fxc[0,:,1] *= 2
             fxc[1,:,0] *= 2
-            fxc += np.einsum('xbyczgo,xgo->byczgo', kxc[1], s)
-            fxc = np.einsum('rao,axbygo->rxbygo', c_tm, fxc)
-            fxc_eff += np.einsum('sbo,rxbygo->rxsyg', cw_tm, fxc)
+            fxc += cp.einsum('xbyczgo,xgo->byczgo', kxc[1], s)
+            fxc = cp.einsum('rao,axbygo->rxbygo', c_tm, fxc)
+            fxc_eff += cp.einsum('sbo,rxbygo->rxsyg', cw_tm, fxc)
 
         if deriv > 2:
             lxc = xc_orig[4].reshape(2, nvar, 2, nvar, 2, nvar, 2, nvar, ngrids, nsg)
@@ -316,11 +254,11 @@ def _eval_xc_lebedev(func, rho_tm, deriv, spin_samples,
             kxc[0,:,1,:,0] *= 2
             kxc[0,:,0,:,1] *= 2
 
-            kxc += np.einsum('wbxcydzgo,wgo->bxcydzgo', lxc[1], s)
-            kxc = np.einsum('rao,axbyczgo->rxbyczgo', c_tm, kxc)
-            kxc = np.einsum('sbo,rxbyczgo->rxsyczgo', c_tm, kxc)
-            # kxc = np.einsum('rao,sbo,axbyczgo->rxsyczgo', c_tm, c_tm,kxc)
-            kxc_eff += np.einsum('tco,rxsyczgo->rxsytzg', cw_tm, kxc)
+            kxc += cp.einsum('wbxcydzgo,wgo->bxcydzgo', lxc[1], s)
+            kxc = cp.einsum('rao,axbyczgo->rxbyczgo', c_tm, kxc)
+            kxc = cp.einsum('sbo,rxbyczgo->rxsyczgo', c_tm, kxc)
+            # kxc = cp.einsum('rao,sbo,axbyczgo->rxsyczgo', c_tm, c_tm,kxc)
+            kxc_eff += cp.einsum('tco,rxsyczgo->rxsytzg', cw_tm, kxc)
 
     # exc in libxc is defined as Exc per particle. exc_eff calculated above is exc*rho.
     # Divide exc_eff by rho so as to follow the convention of libxc
@@ -335,10 +273,16 @@ def _eval_xc_lebedev(func, rho_tm, deriv, spin_samples,
     # Strongly spin-polarized points (rho ~= |m|) can be considered as collinear spins
     if collinear_threshold is not None:
         rho, s = _project_spin_paxis(rho_tm.reshape(4, nvar, ngrids)[:,0])
-        cs_idx = np.where(s >= rho * collinear_threshold)[0]
+        cs_idx = cp.where(s >= rho * collinear_threshold)[0]
         if cs_idx.size > 0:
             xc_cs = eval_xc_collinear_spin(func, rho_tm[...,cs_idx], deriv,
                                            collinear_samples)
+            print("debug 1")
+            print("exc_eff", exc_eff.shape)
+            print("cs_idx", cs_idx.shape)
+            print("xc_cs", len(xc_cs))
+            print("xc_cs[0]", xc_cs[0].shape)
+            print("rho_tm", rho_tm.shape)
             exc_eff[...,cs_idx] = xc_cs[0]
             if deriv > 0:
                 vxc_eff[...,cs_idx] = xc_cs[1]
@@ -378,15 +322,15 @@ def _project_spin_sph(rho_tm, sgrids):
     nsg = sgrids.shape[0]
     ngrids = rho.shape[-1]
     if rho_tm.ndim == 2:
-        rho_ts = np.empty((2, ngrids, nsg))
-        rho_ts[0] = rho[:, np.newaxis]
-        rho_ts[1] = np.einsum('mg,om->go', m, sgrids)
+        rho_ts = cp.empty((2, ngrids, nsg))
+        rho_ts[0] = rho[:, cp.newaxis]
+        rho_ts[1] = cp.einsum('mg,om->go', m, sgrids)
         rho_ts = rho_ts.reshape(2, ngrids*nsg)
     else:
         nvar = rho_tm.shape[1]
-        rho_ts = np.empty((2, nvar, ngrids, nsg))
-        rho_ts[0] = rho[:, :, np.newaxis]
-        rho_ts[1] = np.einsum('mxg,om->xgo', m, sgrids)
+        rho_ts = cp.empty((2, nvar, ngrids, nsg))
+        rho_ts[0] = rho[:, :, cp.newaxis]
+        rho_ts[1] = cp.einsum('mxg,om->xgo', m, sgrids)
         rho_ts = rho_ts.reshape(2, nvar, ngrids*nsg)
     return rho_ts
 
@@ -395,7 +339,7 @@ def _ts2tm_transformation(sgrids):
     Transformation that projects v_ts(rho,s) to rho/m representation (rho,mx,my,mz)
     '''
     nsg = sgrids.shape[0]
-    c_tm = np.zeros((4, 2, nsg))
+    c_tm = cp.zeros((4, 2, nsg))
     c_tm[0,0] = 1
     c_tm[1:,1] = sgrids.T
     return c_tm
@@ -403,6 +347,8 @@ def _ts2tm_transformation(sgrids):
 def _make_paxis_samples(spin_samples):
     '''Samples on principal axis between [0, 1]'''
     rt, wt = np.polynomial.legendre.leggauss(spin_samples)
+    rt = cp.asarray(rt)
+    wt = cp.asarray(wt)
     rt = rt * .5 + .5
     wt *= .5  # normalized to 1
     return rt, wt
@@ -412,23 +358,23 @@ def _project_spin_paxis(rho_tm, sgridz=None):
     rho = rho_tm[0]
     m = rho_tm[1:]
 
-    s = np.linalg.norm(m, axis=0)
+    s = cp.linalg.norm(m, axis=0)
     if sgridz is None:
-        rho_ts = np.stack([rho, s])
+        rho_ts = cp.stack([rho, s])
     else:
         ngrids = rho.shape[-1]
         nsg = sgridz.shape[0]
         if rho_tm.ndim == 2:
-            rho_ts = np.empty((2, ngrids, nsg))
-            rho_ts[0] = rho[:,np.newaxis]
-            rho_ts[1] = s[:,np.newaxis] * sgridz
+            rho_ts = cp.empty((2, ngrids, nsg))
+            rho_ts[0] = rho[:,cp.newaxis]
+            rho_ts[1] = s[:,cp.newaxis] * sgridz
             rho_ts = rho_ts.reshape(2, ngrids * nsg)
         else:
             print('222')
             nvar = rho_tm.shape[1]
-            rho_ts = np.empty((2, nvar, ngrids, nsg))
-            rho_ts[0] = rho[:,:,np.newaxis]
-            rho_ts[1] = s[:,:,np.newaxis] * sgridz
+            rho_ts = cp.empty((2, nvar, ngrids, nsg))
+            rho_ts[0] = rho[:,:,cp.newaxis]
+            rho_ts[1] = s[:,:,cp.newaxis] * sgridz
             rho_ts = rho_ts.reshape(2, nvar, ngrids * nsg)
     return rho_ts
 
@@ -440,19 +386,19 @@ def _project_spin_paxis2(rho_tm, sgridz=None):
     mz = rho_tm[1]
 
     if sgridz is None:
-        rho_ts = np.stack([rho, mz])
+        rho_ts = cp.stack([rho, mz])
     else:
         ngrids = rho.shape[-1]
         nsg = sgridz.shape[0]
         if rho_tm.ndim == 2:
-            rho_ts = np.empty((2, ngrids, nsg))
-            rho_ts[0] = rho[:,np.newaxis]
-            rho_ts[1] = mz[:,np.newaxis] * sgridz
+            rho_ts = cp.empty((2, ngrids, nsg))
+            rho_ts[0] = rho[:,cp.newaxis]
+            rho_ts[1] = mz[:,cp.newaxis] * sgridz
             rho_ts = rho_ts.reshape(2, ngrids * nsg)
         else:
             nvar = rho_tm.shape[1]
-            rho_ts = np.empty((2, nvar, ngrids, nsg))
-            rho_ts[0] = rho[:,:,np.newaxis]
-            rho_ts[1] = mz[:,:,np.newaxis] * sgridz
+            rho_ts = cp.empty((2, nvar, ngrids, nsg))
+            rho_ts[0] = rho[:,:,cp.newaxis]
+            rho_ts[1] = mz[:,:,cp.newaxis] * sgridz
             rho_ts = rho_ts.reshape(2, nvar, ngrids * nsg)
     return rho_ts
diff --git a/gpu4pyscf/dft/numint.py b/gpu4pyscf/dft/numint.py
index 55c25dfab..81e1be013 100644
--- a/gpu4pyscf/dft/numint.py
+++ b/gpu4pyscf/dft/numint.py
@@ -2429,6 +2429,74 @@ def unsort_orbitals(self, sorted_mat, axis=[], out=None):
         out[tuple(fancy_index)] = sorted_mat
         return out
 
+    def sort_orbitals_2c(self, mat, axis=[]):
+        ''' Transform given axis of a 2-component matrix (GKS) into sorted AO
+        
+        This assumes the axes specified in 'axis' have a dimension of 2*nao,
+        representing alpha (:nao) and beta (nao:) components. Both components
+        are sorted using the same AO sorting index.
+        '''
+        idx = self._ao_idx
+        nao = len(idx)
+        
+        # Create the 2-component sorting index:
+        # [sorted_alpha_indices, sorted_beta_indices]
+        # E.g., if idx = [0, 2, 1] (nao=3),
+        # idx_2c = [0, 2, 1, 3+0, 3+2, 3+1] = [0, 2, 1, 3, 5, 4]
+        # We use numpy to build the index, consistent with self._ao_idx
+        idx_2c = np.concatenate([idx, idx + nao])
+
+        shape_ones = (1,) * mat.ndim
+        fancy_index = []
+        for dim, n in enumerate(mat.shape):
+            if dim in axis:
+                # Check if the dimension matches the 2-component size
+                if n != 2 * nao:
+                    raise ValueError(f"Axis {dim} has dimension {n}, expected {2*nao} for 2-component sorting")
+                indices = idx_2c
+            else:
+                # Use cupy.arange for non-sorted axes, as in the original sort_orbitals
+                indices = cupy.arange(n)
+            
+            idx_shape = shape_ones[:dim] + (-1,) + shape_ones[dim+1:]
+            fancy_index.append(indices.reshape(idx_shape))
+            
+        # Perform the sorting using advanced indexing
+        return mat[tuple(fancy_index)]
+
+    def unsort_orbitals_2c(self, sorted_mat, axis=[], out=None):
+        ''' Transform given axis of a 2-component matrix from sorted AO to original AO
+        
+        This assumes the axes specified in 'axis' have a dimension of 2*nao.
+        This is the inverse operation of sort_orbitals_2c.
+        '''
+        idx = self._ao_idx
+        nao = len(idx)
+        
+        # The 2-component index is created identically to sort_orbitals_2c
+        idx_2c = np.concatenate([idx, idx + nao])
+
+        shape_ones = (1,) * sorted_mat.ndim
+        fancy_index = []
+        for dim, n in enumerate(sorted_mat.shape):
+            if dim in axis:
+                # Check if the dimension matches the 2-component size
+                if n != 2 * nao:
+                    raise ValueError(f"Axis {dim} has dimension {n}, expected {2*nao} for 2-component unsorting")
+                indices = idx_2c
+            else:
+                indices = cupy.arange(n)
+            
+            idx_shape = shape_ones[:dim] + (-1,) + shape_ones[dim+1:]
+            fancy_index.append(indices.reshape(idx_shape))
+            
+        if out is None:
+            out = cupy.empty_like(sorted_mat)
+            
+        # Perform the unsorting assignment
+        out[tuple(fancy_index)] = sorted_mat
+        return out
+
 class GTOValEnvVars(ctypes.Structure):
     _fields_ = [
         ("natm", ctypes.c_int),
diff --git a/gpu4pyscf/dft/numint2c.py b/gpu4pyscf/dft/numint2c.py
index 1fe420882..9fb9580fc 100644
--- a/gpu4pyscf/dft/numint2c.py
+++ b/gpu4pyscf/dft/numint2c.py
@@ -20,7 +20,7 @@
 import numpy as np
 import cupy as cp
 from pyscf import lib
-from gpu4pyscf.dft import numint
+from gpu4pyscf.dft import numint, mcfun_gpu
 from gpu4pyscf.dft.numint import _dot_ao_dm, _dot_ao_ao, _scale_ao, _tau_dot
 from gpu4pyscf.dft import xc_deriv
 from pyscf import __config__
@@ -29,6 +29,8 @@
 def eval_rho(mol, ao, dm, non0tab=None, xctype='LDA', hermi=0,
              with_lapl=True, verbose=None):
     nao = ao.shape[-2]
+    print("in eval_rho")
+    print(ao.shape, dm.shape, nao, ao.shape)
     assert dm.ndim == 2 and nao * 2 == dm.shape[0]
     if not isinstance(dm, cp.ndarray):
         dm = cp.asarray(dm)
@@ -39,9 +41,12 @@ def eval_rho(mol, ao, dm, non0tab=None, xctype='LDA', hermi=0,
     ao_loc = mol.ao_loc
 
     if xctype == 'LDA':
+        print("debug 0", hermi)
         c0a = _dot_ao_dm(mol, ao, dm[:nao], non0tab, shls_slice, ao_loc)
         c0b = _dot_ao_dm(mol, ao, dm[nao:], non0tab, shls_slice, ao_loc)
         rho_m = _contract_rho_m((ao, ao), (c0a, c0b), hermi, True)
+        print(rho_m.shape)
+        print(rho_m)
     elif xctype == 'GGA':
         # first 4 ~ (rho, m), second 4 ~ (0th order, dx, dy, dz)
         if hermi:
@@ -99,13 +104,14 @@ def _gks_mcol_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=0,
     xctype = ni._xc_type(xc_code)
     shls_slice = (0, mol.nbas)
     ao_loc = mol.ao_loc_nr()
-
-    make_rho, nset, n2c = ni._gen_rho_evaluator(mol, dms, hermi, False)
+    n2c = dms[0].shape[-1]
     nao = n2c // 2
+    opt = ni.gdftopt
+    _sorted_mol = opt._sorted_mol
 
-    nelec = cp.zeros(nset)
-    excsum = cp.zeros(nset)
-    vmat = cp.zeros((nset,n2c,n2c), dtype=cp.complex128)
+    nelec = 0
+    excsum = 0
+    vmat = cp.zeros((n2c,n2c), dtype=cp.complex128)
 
     if xctype in ('LDA', 'GGA', 'MGGA'):
         f_eval_mat = {
@@ -119,21 +125,24 @@ def _gks_mcol_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=0,
         if ni.collinear[0] == 'm':  # mcol
             eval_xc = ni.mcfun_eval_xc_adapter(xc_code)
         else:
+            raise NotImplementedError('locally-collinear vxc is not implemented')
             eval_xc = ni.eval_xc_eff
 
         for ao, mask, weight, coords \
-                in ni.block_loop(mol, grids, nao, ao_deriv, max_memory):
-            for i in range(nset):
-                rho = make_rho(i, ao, mask, xctype)
-                exc, vxc = eval_xc(xc_code, rho, deriv=1, xctype=xctype)[:2]
-                if xctype == 'LDA':
-                    den = rho[0] * weight
-                else:
-                    den = rho[0,0] * weight
-                nelec[i] += den.sum()
-                excsum[i] += cp.dot(den, exc)
-                vmat[i] += fmat(mol, ao, weight, rho, vxc, mask, shls_slice,
-                                ao_loc, hermi)
+                in ni.block_loop(_sorted_mol, grids, nao, ao_deriv, max_memory):
+            rho = eval_rho(_sorted_mol, ao, dms, non0tab=None, xctype=xctype, hermi=hermi,
+                    with_lapl=False, verbose=None)
+            print(rho.shape)
+            print(rho)
+            exc, vxc = eval_xc(xc_code, rho, deriv=1, xctype=xctype)[:2]
+            if xctype == 'LDA':
+                den = rho[0] * weight
+            else:
+                den = rho[0,0] * weight
+            nelec += den.sum()
+            excsum += cp.dot(den, exc)
+            vmat += fmat(mol, ao, weight, rho, vxc, mask, shls_slice,
+                            ao_loc, hermi)
 
     elif xctype == 'HF':
         pass
@@ -141,11 +150,8 @@ def _gks_mcol_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=0,
         raise NotImplementedError(f'numint2c.get_vxc for functional {xc_code}')
 
     if hermi:
-        vmat = vmat + vmat.conj().transpose(0,2,1)
-    if isinstance(dms, cp.ndarray) and dms.ndim == 2:
-        vmat = vmat[0]
-        nelec = nelec[0]
-        excsum = excsum[0]
+        vmat = vmat + vmat.conj().transpose(1,0)
+
     return nelec, excsum, vmat
 
 def _gks_mcol_fxc(ni, mol, grids, xc_code, dm0, dms, relativity=0, hermi=0,
@@ -190,58 +196,58 @@ def _ncol_lda_vxc_mat(mol, ao, weight, rho, vxc, mask, shls_slice, ao_loc, hermi
     mat = cp.block([[mataa, matab], [matba, matbb]])
     return mat
 
-def _eval_xc_eff(ni, xc_code, rho, deriv=1, omega=None, xctype=None,
-                 verbose=None, spin=None):
-    r'''Returns the derivative tensor against the density parameters
-
-    [[density_a, (nabla_x)_a, (nabla_y)_a, (nabla_z)_a, tau_a],
-     [density_b, (nabla_x)_b, (nabla_y)_b, (nabla_z)_b, tau_b]].
-
-    It differs from the eval_xc method in the derivatives of non-local part.
-    The eval_xc method returns the XC functional derivatives to sigma
-    (|\nabla \rho|^2)
-
-    Args:
-        rho: 2-dimensional or 3-dimensional array
-            Total density and spin density (and their derivatives if GGA or MGGA
-            functionals) on grids
-
-    Kwargs:
-        deriv: int
-            derivative orders
-        omega: float
-            define the exponent in the attenuated Coulomb for RSH functional
-    '''
-    if omega is None: omega = ni.omega
-    if xctype is None: xctype = ni._xc_type(xc_code)
-    if ni.collinear[0] == 'c':  # collinear
-        t = rho[0]
-        s = rho[3]
-    elif ni.collinear[0] == 'n':  # ncol
-        t = rho[0]
-        m = rho[1:4]
-        s = lib.norm(m, axis=0)
-    elif ni.collinear[0] == 'm':  # mcol
-        # called by mcfun.eval_xc_eff which passes (rho, s) only
-        t, s = rho
-    else:
-        raise RuntimeError(f'Unknown collinear scheme {ni.collinear}')
-
-    rho = cp.stack([(t + s) * .5, (t - s) * .5])
-    if xctype == 'MGGA' and rho.shape[1] == 6:
-        rho = cp.asarray(rho[:,[0,1,2,3,5],:], order='C')
-
-    assert spin is None or spin == 1
-    spin = 1
-
-    out = ni.libxc.eval_xc1(xc_code, rho, spin, deriv, omega)
-    evfk = [out[0]]
-    for order in range(1, deriv+1):
-        evfk.append(xc_deriv.transform_xc(rho, out, xctype, spin, order))
-    if deriv < 3:
-        # The return has at least [e, v, f, k] terms
-        evfk.extend([None] * (3 - deriv))
-    return evfk
+# def _eval_xc_eff(ni, xc_code, rho, deriv=1, omega=None, xctype=None,
+#                  verbose=None, spin=None):
+#     r'''Returns the derivative tensor against the density parameters
+
+#     [[density_a, (nabla_x)_a, (nabla_y)_a, (nabla_z)_a, tau_a],
+#      [density_b, (nabla_x)_b, (nabla_y)_b, (nabla_z)_b, tau_b]].
+
+#     It differs from the eval_xc method in the derivatives of non-local part.
+#     The eval_xc method returns the XC functional derivatives to sigma
+#     (|\nabla \rho|^2)
+
+#     Args:
+#         rho: 2-dimensional or 3-dimensional array
+#             Total density and spin density (and their derivatives if GGA or MGGA
+#             functionals) on grids
+
+#     Kwargs:
+#         deriv: int
+#             derivative orders
+#         omega: float
+#             define the exponent in the attenuated Coulomb for RSH functional
+#     '''
+#     if omega is None: omega = ni.omega
+#     if xctype is None: xctype = ni._xc_type(xc_code)
+#     if ni.collinear[0] == 'c':  # collinear
+#         t = rho[0]
+#         s = rho[3]
+#     elif ni.collinear[0] == 'n':  # ncol
+#         t = rho[0]
+#         m = rho[1:4]
+#         s = lib.norm(m, axis=0)
+#     elif ni.collinear[0] == 'm':  # mcol
+#         # called by mcfun.eval_xc_eff which passes (rho, s) only
+#         t, s = rho
+#     else:
+#         raise RuntimeError(f'Unknown collinear scheme {ni.collinear}')
+
+#     rho = cp.stack([(t + s) * .5, (t - s) * .5])
+#     if xctype == 'MGGA' and rho.shape[1] == 6:
+#         rho = cp.asarray(rho[:,[0,1,2,3,5],:], order='C')
+
+#     assert spin is None or spin == 1
+#     spin = 1
+
+#     out = ni.libxc.eval_xc1(xc_code, rho, spin, deriv, omega)
+#     evfk = [out[0]]
+#     for order in range(1, deriv+1):
+#         evfk.append(xc_deriv.transform_xc(rho, out, xctype, spin, order))
+#     if deriv < 3:
+#         # The return has at least [e, v, f, k] terms
+#         evfk.extend([None] * (3 - deriv))
+#     return evfk
 
 # * Mcfun requires functional derivatives to total-density and spin-density.
 # * Make it a global function than a closure so as to be callable by multiprocessing
@@ -252,23 +258,32 @@ def __mcfun_fn_eval_xc(ni, xc_code, xctype, rho, deriv):
             evfk[order] = xc_deriv.ud2ts(evfk[order])
     return evfk
 
+def __mcfun_fn_eval_xc2(ni, xc_code, xctype, rho, deriv):
+    t, s = rho
+    if not isinstance(t, cp.ndarray):
+        t = cp.asarray(t)
+    if not isinstance(s, cp.ndarray):
+        s = cp.asarray(s)
+    rho = cp.stack([(t + s) * .5, (t - s) * .5])
+    spin = 1
+    evfk = ni.eval_xc_eff(xc_code, rho, deriv=deriv, xctype=xctype, spin=spin)
+    evfk = list(evfk)
+    for order in range(1, deriv+1):
+        if evfk[order] is not None:
+            evfk[order] = xc_deriv.ud2ts(evfk[order])
+    return evfk
+
 def mcfun_eval_xc_adapter(ni, xc_code):
     '''Wrapper to generate the eval_xc function required by mcfun'''
-    try:
-        import mcfun
-    except ImportError:
-        raise ImportError('This feature requires mcfun library.\n'
-                          'Try install mcfun with `pip install mcfun`')
 
     xctype = ni._xc_type(xc_code)
-    fn_eval_xc = functools.partial(__mcfun_fn_eval_xc, ni, xc_code, xctype)
-    nproc = lib.num_threads()
+    fn_eval_xc = functools.partial(__mcfun_fn_eval_xc2, ni, xc_code, xctype)
     def eval_xc_eff(xc_code, rho, deriv=1, omega=None, xctype=None,
                     verbose=None, spin=None):
-        return mcfun.eval_xc_eff(
+        return mcfun_gpu.eval_xc_eff(
             fn_eval_xc, rho, deriv, spin_samples=ni.spin_samples,
             collinear_threshold=ni.collinear_thrd,
-            collinear_samples=ni.collinear_samples, workers=nproc)
+            collinear_samples=ni.collinear_samples)
     return eval_xc_eff
 
 def _mcol_lda_vxc_mat(mol, ao, weight, rho, vxc, mask, shls_slice, ao_loc, hermi):
@@ -300,7 +315,10 @@ def _mcol_lda_vxc_mat(mol, ao, weight, rho, vxc, mask, shls_slice, ao_loc, hermi
     mataa = _dot_ao_ao(mol, ao, aow, mask, shls_slice, ao_loc)
     aow = _scale_ao(ao, wr[0]-wmz[0], out=aow)  # Mz
     matbb = _dot_ao_ao(mol, ao, aow, mask, shls_slice, ao_loc)
-    mat = cp.block([[mataa, matab], [matba, matbb]])
+    row1 = cp.concatenate([mataa, matab], axis=1)
+    row2 = cp.concatenate([matba, matbb], axis=1)
+
+    mat = cp.concatenate([row1, row2], axis=0)
     return mat
 
 def _mcol_gga_vxc_mat(mol, ao, weight, rho, vxc, mask, shls_slice, ao_loc, hermi):
@@ -337,7 +355,22 @@ def _mcol_gga_vxc_mat(mol, ao, weight, rho, vxc, mask, shls_slice, ao_loc, hermi
         aow = _scale_ao(ao[1:4], (wr[1:4]-wmz[1:4]).conj(), out=aow)  # Mz
         matbb += _dot_ao_ao(mol, aow, ao[0], mask, shls_slice, ao_loc)
 
-    mat = cp.block([[mataa, matab], [matba, matbb]])
+    row1 = cp.concatenate([mataa, matab], axis=1)
+    row2 = cp.concatenate([matba, matbb], axis=1)
+
+    mat = cp.concatenate([row1, row2], axis=0)
+    return mat
+
+def _tau_dot(mol, bra, ket, wv, mask, shls_slice, ao_loc):
+    '''nabla_ao dot nabla_ao
+    numpy.einsum('p,xpi,xpj->ij', wv, bra[1:4].conj(), ket[1:4])
+    '''
+    aow = _scale_ao(ket[1], wv)
+    mat = _dot_ao_ao(mol, bra[1], aow, mask, shls_slice, ao_loc)
+    aow = _scale_ao(ket[2], wv, aow)
+    mat += _dot_ao_ao(mol, bra[2], aow, mask, shls_slice, ao_loc)
+    aow = _scale_ao(ket[3], wv, aow)
+    mat += _dot_ao_ao(mol, bra[3], aow, mask, shls_slice, ao_loc)
     return mat
 
 def _mcol_mgga_vxc_mat(mol, ao, weight, rho, vxc, mask, shls_slice, ao_loc, hermi):
@@ -382,7 +415,10 @@ def _mcol_mgga_vxc_mat(mol, ao, weight, rho, vxc, mask, shls_slice, ao_loc, herm
         aow = _scale_ao(ao[1:4], (wr[1:4]-wmz[1:4]).conj(), out=aow)  # Mz
         matbb += _dot_ao_ao(mol, aow, ao[0], mask, shls_slice, ao_loc)
 
-    mat = cp.block([[mataa, matab], [matba, matbb]])
+    row1 = cp.concatenate([mataa, matab], axis=1)
+    row2 = cp.concatenate([matba, matbb], axis=1)
+
+    mat = cp.concatenate([row1, row2], axis=0)
     return mat
 
 def _mcol_lda_fxc_mat(mol, ao, weight, rho0, rho1, fxc,
@@ -417,6 +453,8 @@ def _contract_rho_m(bra, ket, hermi=0, bra_eq_ket=False):
         rbb = cp.einsum('ip,ip->p', bra_b.real, ket_b[nao:].real)
         rbb+= cp.einsum('ip,ip->p', bra_b.imag, ket_b[nao:].imag)
         rho_m = cp.empty((4, ngrids))
+        print("debug")
+        print(rho_m.dtype)
         rho_m[0,:] = raa + rbb     # rho
         rho_m[1,:] = rab.real      # mx
         rho_m[2,:] = rab.imag      # my
@@ -428,6 +466,7 @@ def _contract_rho_m(bra, ket, hermi=0, bra_eq_ket=False):
             rba = cp.einsum('ip,ip->p', bra_b.conj(), ket_a[nao:])
             rho_m[1,:] += rba.real
             rho_m[2,:] -= rba.imag
+        print(rho_m.dtype)
     else:
         raa = cp.einsum('ip,ip->p', bra_a.conj(), ket_a[:nao])
         rba = cp.einsum('ip,ip->p', bra_b.conj(), ket_a[nao:])
@@ -443,6 +482,8 @@ def _contract_rho_m(bra, ket, hermi=0, bra_eq_ket=False):
 
 class NumInt2C(lib.StreamObject, numint.LibXCMixin):
     '''Numerical integration methods for 2-component basis (used by GKS)'''
+    _keys = {'gdftopt'}
+    gdftopt      = None
 
     # collinear schemes:
     #   'col' (collinear, by default)
@@ -456,6 +497,12 @@ class NumInt2C(lib.StreamObject, numint.LibXCMixin):
     eval_ao = staticmethod(numint.eval_ao)
     eval_rho = staticmethod(eval_rho)
 
+    def build(self, mol, coords):
+        self.gdftopt = numint._GDFTOpt.from_mol(mol)
+        self.grid_blksize = None
+        self.non0ao_idx = {}
+        return self
+
     def eval_rho1(self, mol, ao, dm, screen_index=None, xctype='LDA', hermi=0,
                   with_lapl=True, cutoff=None, ao_cutoff=None, pair_mask=None,
                   verbose=None):
@@ -478,19 +525,6 @@ def eval_rho2(self, mol, ao, mo_coeff, mo_occ, non0tab=None, xctype='LDA',
             rho = self.eval_rho(mol, ao, dm, non0tab, xctype, hermi, with_lapl, verbose)
             return rho
 
-        #if mo_coeff.dtype == cp.double:
-        #    nao = ao.shape[-1]
-        #    assert nao * 2 == mo_coeff.shape[0]
-        #    mo_aR = mo_coeff[:nao]
-        #    mo_bR = mo_coeff[nao:]
-        #    rho  = numint.eval_rho2(mol, ao, mo_aR, mo_occ, non0tab, xctype, with_lapl, verbose)
-        #    rho += numint.eval_rho2(mol, ao, mo_bR, mo_occ, non0tab, xctype, with_lapl, verbose)
-        #else:
-        #    dm = cp.dot(mo_coeff * mo_occ, mo_coeff.conj().T)
-        #    hermi = 1
-        #    rho = self.eval_rho(mol, dm, ao, dm, non0tab, xctype, hermi, with_lapl, verbose)
-        #mx = my = mz = None
-        #return rho, mx, my, mz
         raise NotImplementedError(self.collinear)
 
     def cache_xc_kernel(self, mol, grids, xc_code, mo_coeff, mo_occ, spin=0,
@@ -498,92 +532,13 @@ def cache_xc_kernel(self, mol, grids, xc_code, mo_coeff, mo_occ, spin=0,
         '''Compute the 0th order density, Vxc and fxc.  They can be used in TDDFT,
         DFT hessian module etc.
         '''
-        xctype = self._xc_type(xc_code)
-        if xctype in ('GGA', 'MGGA'):
-            ao_deriv = 1
-        else:
-            ao_deriv = 0
-        n2c = mo_coeff.shape[0]
-        nao = n2c // 2
-        with_lapl = numint.MGGA_DENSITY_LAPL
-
-        if self.collinear[0] in ('m', 'n'):  # mcol or ncol
-            rho = []
-            for ao, mask, weight, coords \
-                    in self.block_loop(mol, grids, nao, ao_deriv, max_memory):
-                rho.append(self.eval_rho2(mol, ao, mo_coeff, mo_occ, mask, xctype,
-                                          with_lapl))
-            rho = cp.concatenate(rho,axis=-1)
-            assert rho.dtype == cp.double
-            if self.collinear[0] == 'm':  # mcol
-                eval_xc = self.mcfun_eval_xc_adapter(xc_code)
-            else:
-                eval_xc = self.eval_xc_eff
-            vxc, fxc = eval_xc(xc_code, rho, deriv=2, xctype=xctype)[1:3]
-        else:
-            # rhoa and rhob must be real
-            dm = cp.dot(mo_coeff * mo_occ, mo_coeff.conj().T)
-            dm_a = dm[:nao,:nao].real.copy('C')
-            dm_b = dm[nao:,nao:].real.copy('C')
-            ni = self._to_numint1c()
-            hermi = 1
-            rhoa = []
-            rhob = []
-            for ao, mask, weight, coords \
-                    in ni.block_loop(mol, grids, nao, ao_deriv, max_memory):
-                # rhoa and rhob must be real
-                rhoa.append(ni.eval_rho(mol, ao, dm_a, mask, xctype, hermi, with_lapl))
-                rhob.append(ni.eval_rho(mol, ao, dm_b, mask, xctype, hermi, with_lapl))
-            rho = cp.stack([cp.concatenate(rhoa,axis=-1), cp.concatenate(rhob,axis=-1)])
-            assert rho.dtype == cp.double
-            vxc, fxc = ni.eval_xc_eff(xc_code, rho, deriv=2, xctype=xctype)[1:3]
-        return rho, vxc, fxc
+        raise NotImplementedError("Kxc calculation is not supported.")
 
     def cache_xc_kernel1(self, mol, grids, xc_code, dm, spin=0, max_memory=2000):
         '''Compute the 0th order density, Vxc and fxc.  They can be used in TDDFT,
         DFT hessian module etc.
         '''
-        xctype = self._xc_type(xc_code)
-        if xctype in ('GGA', 'MGGA'):
-            ao_deriv = 1
-        else:
-            ao_deriv = 0
-        n2c = dm.shape[0]
-        nao = n2c // 2
-
-        if self.collinear[0] in ('m', 'n'):  # mcol or ncol
-            hermi = 1 # rho must be real. We need to assume dm hermitian
-            with_lapl = False
-            rho = []
-            for ao, mask, weight, coords \
-                    in self.block_loop(mol, grids, nao, ao_deriv, max_memory):
-                rho.append(self.eval_rho1(mol, ao, dm, mask, xctype, hermi, with_lapl))
-            rho = cp.concatenate(rho,axis=-1)
-            assert rho.dtype == cp.double
-            if self.collinear[0] == 'm':  # mcol
-                eval_xc = self.mcfun_eval_xc_adapter(xc_code)
-            else:
-                eval_xc = self.eval_xc_eff
-            vxc, fxc = eval_xc(xc_code, rho, deriv=2, xctype=xctype)[1:3]
-        else:
-            # rhoa and rhob must be real. We need to assume dm hermitian
-            hermi = 1
-            # TODO: test if dm[:nao,:nao].imag == 0
-            dm_a = dm[:nao,:nao].real.copy('C')
-            dm_b = dm[nao:,nao:].real.copy('C')
-            ni = self._to_numint1c()
-            with_lapl = True
-            rhoa = []
-            rhob = []
-            for ao, mask, weight, coords \
-                    in ni.block_loop(mol, grids, nao, ao_deriv, max_memory):
-                # rhoa and rhob must be real
-                rhoa.append(ni.eval_rho(mol, ao, dm_a, mask, xctype, hermi, with_lapl))
-                rhob.append(ni.eval_rho(mol, ao, dm_b, mask, xctype, hermi, with_lapl))
-            rho = cp.stack([cp.concatenate(rhoa,axis=-1), cp.concatenate(rhob,axis=-1)])
-            assert rho.dtype == cp.double
-            vxc, fxc = ni.eval_xc_eff(xc_code, rho, deriv=2, xctype=xctype)[1:3]
-        return rho, vxc, fxc
+        raise NotImplementedError("Kxc calculation is not supported.")
 
     def get_rho(self, mol, dm, grids, max_memory=2000):
         '''Density in real space
@@ -600,9 +555,19 @@ def get_rho(self, mol, dm, grids, max_memory=2000):
     @lib.with_doc(numint.nr_rks.__doc__)
     def nr_vxc(self, mol, grids, xc_code, dms, spin=0, relativity=0, hermi=1,
                max_memory=2000, verbose=None):
+        if not isinstance(dms, cp.ndarray):
+            dms = cp.asarray(dms)
         if self.collinear[0] in ('m', 'n'):  # mcol or ncol
+            opt = getattr(self, 'gdftopt', None)
+            if opt is None:
+                self.build(mol, grids.coords)
+                opt = self.gdftopt
+                assert dms.ndim == 2
+                dms = cp.asarray(dms)
+                dms = opt.sort_orbitals_2c(dms, axis=[0,1])
             n, exc, vmat = self._gks_mcol_vxc(mol, grids, xc_code, dms, relativity,
                                               hermi, max_memory, verbose)
+            vmat = opt.unsort_orbitals_2c(vmat, axis=[0,1])
         else:
             nao = dms.shape[-1] // 2
             # ground state density is always real
@@ -628,8 +593,10 @@ def nr_fxc(self, mol, grids, xc_code, dm0, dms, spin=0, relativity=0, hermi=0,
                rho0=None, vxc=None, fxc=None, max_memory=2000, verbose=None):
         raise NotImplementedError('non-collinear fxc')
     get_fxc = nr_gks_fxc = nr_fxc
-
-    eval_xc_eff = _eval_xc_eff
+    
+    def _init_xcfuns(self, xc_code, spin=0):
+        return numint._init_xcfuns(xc_code, spin)
+    eval_xc_eff = numint.eval_xc_eff
     mcfun_eval_xc_adapter = mcfun_eval_xc_adapter
 
     block_loop = numint.NumInt.block_loop
diff --git a/gpu4pyscf/dft/tests/test_numint2c.py b/gpu4pyscf/dft/tests/test_numint2c.py
index 374e10773..5588e6266 100644
--- a/gpu4pyscf/dft/tests/test_numint2c.py
+++ b/gpu4pyscf/dft/tests/test_numint2c.py
@@ -34,7 +34,7 @@ def setUpModule():
         basis = 'ccpvdz',
         charge = 1,
         spin = 1,  # = 2S = spin_up - spin_down
-        output = '/dev/null'
+        # output = '/dev/null'
         )
 
     np.random.seed(2)
@@ -65,45 +65,72 @@ def tearDownModule():
 
 class KnownValues(unittest.TestCase):
 
-    def test_eval_rho(self):
-        np.random.seed(1)
-        dm = np.random.random(dm0.shape)
-        ni_gpu = NumInt2C()
-        ni_cpu = pyscf_numint2c()
-        for xctype in ('LDA', 'GGA', 'MGGA'):
-            deriv = 1
-            if xctype == 'LDA':
-                deriv = 0
-            ao_gpu = ni_gpu.eval_ao(mol, grids_gpu.coords, deriv=deriv, transpose=False)
-            ao_cpu = ni_cpu.eval_ao(mol, grids_cpu.coords, deriv=deriv)
+    # def test_eval_rho(self):
+    #     np.random.seed(1)
+    #     dm = np.random.random(dm0.shape)
+    #     ni_gpu = NumInt2C()
+    #     ni_cpu = pyscf_numint2c()
+    #     for xctype in ('LDA', 'GGA', 'MGGA'):
+    #         deriv = 1
+    #         if xctype == 'LDA':
+    #             deriv = 0
+    #         ao_gpu = ni_gpu.eval_ao(mol, grids_gpu.coords, deriv=deriv, transpose=False)
+    #         ao_cpu = ni_cpu.eval_ao(mol, grids_cpu.coords, deriv=deriv)
             
-            rho = ni_gpu.eval_rho(mol, ao_gpu, dm, xctype=xctype, hermi=0, with_lapl=False)
-            ref = ni_cpu.eval_rho(mol, ao_cpu, dm, xctype=xctype, hermi=0, with_lapl=False)
-            self.assertAlmostEqual(abs(rho[...,:grids_cpu.size].get() - ref).max(), 0, 10)
+    #         rho = ni_gpu.eval_rho(mol, ao_gpu, dm, xctype=xctype, hermi=0, with_lapl=False)
+    #         ref = ni_cpu.eval_rho(mol, ao_cpu, dm, xctype=xctype, hermi=0, with_lapl=False)
+    #         self.assertAlmostEqual(abs(rho[...,:grids_cpu.size].get() - ref).max(), 0, 10)
 
-            rho = ni_gpu.eval_rho(mol, ao_gpu, dm0, xctype=xctype, hermi=1, with_lapl=False)
-            ref = ni_cpu.eval_rho(mol, ao_cpu, dm0, xctype=xctype, hermi=1, with_lapl=False)
-            self.assertAlmostEqual(abs(rho[...,:grids_cpu.size].get() - ref).max(), 0, 10)
+    #         rho = ni_gpu.eval_rho(mol, ao_gpu, dm0, xctype=xctype, hermi=1, with_lapl=False)
+    #         ref = ni_cpu.eval_rho(mol, ao_cpu, dm0, xctype=xctype, hermi=1, with_lapl=False)
+    #         self.assertAlmostEqual(abs(rho[...,:grids_cpu.size].get() - ref).max(), 0, 10)
 
-    def test_eval_rho2(self):
-        np.random.seed(1)
-        mo_coeff_test = np.random.random(mo_coeff.shape)
+    # def test_eval_rho2(self):
+    #     np.random.seed(1)
+    #     mo_coeff_test = np.random.random(mo_coeff.shape)
+    #     ni_gpu = NumInt2C()
+    #     ni_gpu.collinear='m'
+    #     ni_cpu = pyscf_numint2c()
+    #     ni_cpu.collinear='m'
+    #     for xctype in ('LDA', 'GGA', 'MGGA'):
+    #         deriv = 1
+    #         if xctype == 'LDA':
+    #             deriv = 0
+    #         ao_gpu = ni_gpu.eval_ao(mol, grids_gpu.coords, deriv=deriv, transpose=False)
+    #         ao_cpu = ni_cpu.eval_ao(mol, grids_cpu.coords, deriv=deriv)
+            
+    #         rho = ni_gpu.eval_rho2(mol, ao_gpu, mo_coeff_test, mo_occ, xctype=xctype, with_lapl=False)
+    #         ref = ni_cpu.eval_rho2(mol, ao_cpu, mo_coeff_test, mo_occ, xctype=xctype, with_lapl=False)
+    #         self.assertAlmostEqual(abs(rho[...,:grids_cpu.size].get() - ref).max(), 0, 10)
+
+    def test_eval_xc_eff(self):
         ni_gpu = NumInt2C()
         ni_gpu.collinear='m'
         ni_cpu = pyscf_numint2c()
         ni_cpu.collinear='m'
-        for xctype in ('LDA', 'GGA', 'MGGA'):
-            deriv = 1
-            if xctype == 'LDA':
-                deriv = 0
-            ao_gpu = ni_gpu.eval_ao(mol, grids_gpu.coords, deriv=deriv, transpose=False)
-            ao_cpu = ni_cpu.eval_ao(mol, grids_cpu.coords, deriv=deriv)
-            
-            rho = ni_gpu.eval_rho2(mol, ao_gpu, mo_coeff_test, mo_occ, xctype=xctype, with_lapl=False)
-            ref = ni_cpu.eval_rho2(mol, ao_cpu, mo_coeff_test, mo_occ, xctype=xctype, with_lapl=False)
-            self.assertAlmostEqual(abs(rho[...,:grids_cpu.size].get() - ref).max(), 0, 10)
-
+        np.random.seed(1)
+        dm = dm0*1.0
+        print(dm.shape, mol.nao)
+        print(dm.dtype)
+        for xc_code in (MGGA_M06, ):
+            # xctype = ni_gpu._xc_type(xc_code)
+            # deriv = 1
+            # if xctype == 'LDA':
+            #     deriv = 0
+            # ao_gpu = ni_gpu.eval_ao(mol, grids_gpu.coords, deriv=deriv, transpose=False)
+            # rho = ni_gpu.eval_rho(mol, ao_gpu, dm, xctype=xctype, hermi=0, with_lapl=False)
+            # eval_xc_eff_gpu = ni_gpu.mcfun_eval_xc_adapter(xc_code)
+            n_gpu, exc_gpu, vmat_gpu = ni_gpu.nr_vxc(mol, grids_gpu, xc_code, dm)
+            n_cpu, exc_cpu, vmat_cpu = ni_cpu.nr_vxc(mol, grids_cpu, xc_code, dm)
+            print("n_gpu", n_gpu)
+            print("n_cpu", n_cpu)
+            print("exc_gpu", exc_gpu)
+            print("exc_cpu", exc_cpu)
+            print("vmat_gpu", vmat_gpu)
+            print("vmat_cpu", vmat_cpu)
+            print()
 
 if __name__ == "__main__":
     print("Full Tests for dft numint2c")
     unittest.main()
+

From e3d749b53de0cd8fc727e90219ecf6ddeece4df2 Mon Sep 17 00:00:00 2001
From: "puzhichen.996" <puzhichen.996@bytedance.com>
Date: Tue, 4 Nov 2025 13:44:43 +0800
Subject: [PATCH 09/21] finish debug numint2c

---
 gpu4pyscf/dft/mcfun_gpu.py           |  24 ------
 gpu4pyscf/dft/numint2c.py            |  84 ++------------------
 gpu4pyscf/dft/tests/test_numint2c.py | 110 ++++++++++++++-------------
 3 files changed, 63 insertions(+), 155 deletions(-)

diff --git a/gpu4pyscf/dft/mcfun_gpu.py b/gpu4pyscf/dft/mcfun_gpu.py
index c243fd1c1..ed010311c 100644
--- a/gpu4pyscf/dft/mcfun_gpu.py
+++ b/gpu4pyscf/dft/mcfun_gpu.py
@@ -378,27 +378,3 @@ def _project_spin_paxis(rho_tm, sgridz=None):
             rho_ts = rho_ts.reshape(2, nvar, ngrids * nsg)
     return rho_ts
 
-def _project_spin_paxis2(rho_tm, sgridz=None):
-    # ToDo: be written into the function _project_spin_paxis().
-    # Because use mz rather than |mz| here
-    '''Projects spins onto the principal axis'''
-    rho = rho_tm[0]
-    mz = rho_tm[1]
-
-    if sgridz is None:
-        rho_ts = cp.stack([rho, mz])
-    else:
-        ngrids = rho.shape[-1]
-        nsg = sgridz.shape[0]
-        if rho_tm.ndim == 2:
-            rho_ts = cp.empty((2, ngrids, nsg))
-            rho_ts[0] = rho[:,cp.newaxis]
-            rho_ts[1] = mz[:,cp.newaxis] * sgridz
-            rho_ts = rho_ts.reshape(2, ngrids * nsg)
-        else:
-            nvar = rho_tm.shape[1]
-            rho_ts = cp.empty((2, nvar, ngrids, nsg))
-            rho_ts[0] = rho[:,:,cp.newaxis]
-            rho_ts[1] = mz[:,:,cp.newaxis] * sgridz
-            rho_ts = rho_ts.reshape(2, nvar, ngrids * nsg)
-    return rho_ts
diff --git a/gpu4pyscf/dft/numint2c.py b/gpu4pyscf/dft/numint2c.py
index 9fb9580fc..c4183fcf8 100644
--- a/gpu4pyscf/dft/numint2c.py
+++ b/gpu4pyscf/dft/numint2c.py
@@ -29,8 +29,6 @@
 def eval_rho(mol, ao, dm, non0tab=None, xctype='LDA', hermi=0,
              with_lapl=True, verbose=None):
     nao = ao.shape[-2]
-    print("in eval_rho")
-    print(ao.shape, dm.shape, nao, ao.shape)
     assert dm.ndim == 2 and nao * 2 == dm.shape[0]
     if not isinstance(dm, cp.ndarray):
         dm = cp.asarray(dm)
@@ -41,12 +39,9 @@ def eval_rho(mol, ao, dm, non0tab=None, xctype='LDA', hermi=0,
     ao_loc = mol.ao_loc
 
     if xctype == 'LDA':
-        print("debug 0", hermi)
         c0a = _dot_ao_dm(mol, ao, dm[:nao], non0tab, shls_slice, ao_loc)
         c0b = _dot_ao_dm(mol, ao, dm[nao:], non0tab, shls_slice, ao_loc)
         rho_m = _contract_rho_m((ao, ao), (c0a, c0b), hermi, True)
-        print(rho_m.shape)
-        print(rho_m)
     elif xctype == 'GGA':
         # first 4 ~ (rho, m), second 4 ~ (0th order, dx, dy, dz)
         if hermi:
@@ -126,14 +121,13 @@ def _gks_mcol_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=0,
             eval_xc = ni.mcfun_eval_xc_adapter(xc_code)
         else:
             raise NotImplementedError('locally-collinear vxc is not implemented')
-            eval_xc = ni.eval_xc_eff
 
         for ao, mask, weight, coords \
                 in ni.block_loop(_sorted_mol, grids, nao, ao_deriv, max_memory):
-            rho = eval_rho(_sorted_mol, ao, dms, non0tab=None, xctype=xctype, hermi=hermi,
+            mask_2c = np.concatenate([mask, mask + nao])
+            dm_mask = dms[mask_2c[:,None],mask_2c]
+            rho = eval_rho(_sorted_mol, ao, dm_mask, non0tab=None, xctype=xctype, hermi=hermi,
                     with_lapl=False, verbose=None)
-            print(rho.shape)
-            print(rho)
             exc, vxc = eval_xc(xc_code, rho, deriv=1, xctype=xctype)[:2]
             if xctype == 'LDA':
                 den = rho[0] * weight
@@ -141,7 +135,7 @@ def _gks_mcol_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=0,
                 den = rho[0,0] * weight
             nelec += den.sum()
             excsum += cp.dot(den, exc)
-            vmat += fmat(mol, ao, weight, rho, vxc, mask, shls_slice,
+            vmat += fmat(mol, ao, weight, rho, vxc, mask_2c, shls_slice,
                             ao_loc, hermi)
 
     elif xctype == 'HF':
@@ -196,69 +190,10 @@ def _ncol_lda_vxc_mat(mol, ao, weight, rho, vxc, mask, shls_slice, ao_loc, hermi
     mat = cp.block([[mataa, matab], [matba, matbb]])
     return mat
 
-# def _eval_xc_eff(ni, xc_code, rho, deriv=1, omega=None, xctype=None,
-#                  verbose=None, spin=None):
-#     r'''Returns the derivative tensor against the density parameters
-
-#     [[density_a, (nabla_x)_a, (nabla_y)_a, (nabla_z)_a, tau_a],
-#      [density_b, (nabla_x)_b, (nabla_y)_b, (nabla_z)_b, tau_b]].
-
-#     It differs from the eval_xc method in the derivatives of non-local part.
-#     The eval_xc method returns the XC functional derivatives to sigma
-#     (|\nabla \rho|^2)
-
-#     Args:
-#         rho: 2-dimensional or 3-dimensional array
-#             Total density and spin density (and their derivatives if GGA or MGGA
-#             functionals) on grids
-
-#     Kwargs:
-#         deriv: int
-#             derivative orders
-#         omega: float
-#             define the exponent in the attenuated Coulomb for RSH functional
-#     '''
-#     if omega is None: omega = ni.omega
-#     if xctype is None: xctype = ni._xc_type(xc_code)
-#     if ni.collinear[0] == 'c':  # collinear
-#         t = rho[0]
-#         s = rho[3]
-#     elif ni.collinear[0] == 'n':  # ncol
-#         t = rho[0]
-#         m = rho[1:4]
-#         s = lib.norm(m, axis=0)
-#     elif ni.collinear[0] == 'm':  # mcol
-#         # called by mcfun.eval_xc_eff which passes (rho, s) only
-#         t, s = rho
-#     else:
-#         raise RuntimeError(f'Unknown collinear scheme {ni.collinear}')
-
-#     rho = cp.stack([(t + s) * .5, (t - s) * .5])
-#     if xctype == 'MGGA' and rho.shape[1] == 6:
-#         rho = cp.asarray(rho[:,[0,1,2,3,5],:], order='C')
-
-#     assert spin is None or spin == 1
-#     spin = 1
-
-#     out = ni.libxc.eval_xc1(xc_code, rho, spin, deriv, omega)
-#     evfk = [out[0]]
-#     for order in range(1, deriv+1):
-#         evfk.append(xc_deriv.transform_xc(rho, out, xctype, spin, order))
-#     if deriv < 3:
-#         # The return has at least [e, v, f, k] terms
-#         evfk.extend([None] * (3 - deriv))
-#     return evfk
 
 # * Mcfun requires functional derivatives to total-density and spin-density.
 # * Make it a global function than a closure so as to be callable by multiprocessing
 def __mcfun_fn_eval_xc(ni, xc_code, xctype, rho, deriv):
-    evfk = ni.eval_xc_eff(xc_code, rho, deriv=deriv, xctype=xctype)
-    for order in range(1, deriv+1):
-        if evfk[order] is not None:
-            evfk[order] = xc_deriv.ud2ts(evfk[order])
-    return evfk
-
-def __mcfun_fn_eval_xc2(ni, xc_code, xctype, rho, deriv):
     t, s = rho
     if not isinstance(t, cp.ndarray):
         t = cp.asarray(t)
@@ -277,7 +212,7 @@ def mcfun_eval_xc_adapter(ni, xc_code):
     '''Wrapper to generate the eval_xc function required by mcfun'''
 
     xctype = ni._xc_type(xc_code)
-    fn_eval_xc = functools.partial(__mcfun_fn_eval_xc2, ni, xc_code, xctype)
+    fn_eval_xc = functools.partial(__mcfun_fn_eval_xc, ni, xc_code, xctype)
     def eval_xc_eff(xc_code, rho, deriv=1, omega=None, xctype=None,
                     verbose=None, spin=None):
         return mcfun_gpu.eval_xc_eff(
@@ -453,8 +388,6 @@ def _contract_rho_m(bra, ket, hermi=0, bra_eq_ket=False):
         rbb = cp.einsum('ip,ip->p', bra_b.real, ket_b[nao:].real)
         rbb+= cp.einsum('ip,ip->p', bra_b.imag, ket_b[nao:].imag)
         rho_m = cp.empty((4, ngrids))
-        print("debug")
-        print(rho_m.dtype)
         rho_m[0,:] = raa + rbb     # rho
         rho_m[1,:] = rab.real      # mx
         rho_m[2,:] = rab.imag      # my
@@ -466,7 +399,6 @@ def _contract_rho_m(bra, ket, hermi=0, bra_eq_ket=False):
             rba = cp.einsum('ip,ip->p', bra_b.conj(), ket_a[nao:])
             rho_m[1,:] += rba.real
             rho_m[2,:] -= rba.imag
-        print(rho_m.dtype)
     else:
         raa = cp.einsum('ip,ip->p', bra_a.conj(), ket_a[:nao])
         rba = cp.einsum('ip,ip->p', bra_b.conj(), ket_a[nao:])
@@ -534,12 +466,6 @@ def cache_xc_kernel(self, mol, grids, xc_code, mo_coeff, mo_occ, spin=0,
         '''
         raise NotImplementedError("Kxc calculation is not supported.")
 
-    def cache_xc_kernel1(self, mol, grids, xc_code, dm, spin=0, max_memory=2000):
-        '''Compute the 0th order density, Vxc and fxc.  They can be used in TDDFT,
-        DFT hessian module etc.
-        '''
-        raise NotImplementedError("Kxc calculation is not supported.")
-
     def get_rho(self, mol, dm, grids, max_memory=2000):
         '''Density in real space
         '''
diff --git a/gpu4pyscf/dft/tests/test_numint2c.py b/gpu4pyscf/dft/tests/test_numint2c.py
index 5588e6266..4f41fe54d 100644
--- a/gpu4pyscf/dft/tests/test_numint2c.py
+++ b/gpu4pyscf/dft/tests/test_numint2c.py
@@ -18,9 +18,11 @@
 import cupy
 from pyscf import lib, scf
 from pyscf.dft.numint2c import NumInt2C as pyscf_numint2c
+from pyscf.dft.numint import NumInt as pyscf_numint
 from gpu4pyscf.dft import Grids
 from gpu4pyscf.dft import numint2c
 from gpu4pyscf.dft.numint2c import NumInt2C
+from gpu4pyscf.dft.numint import NumInt
 from gpu4pyscf import dft
 from gpu4pyscf.dft import gen_grid
 
@@ -65,43 +67,59 @@ def tearDownModule():
 
 class KnownValues(unittest.TestCase):
 
-    # def test_eval_rho(self):
-    #     np.random.seed(1)
-    #     dm = np.random.random(dm0.shape)
-    #     ni_gpu = NumInt2C()
-    #     ni_cpu = pyscf_numint2c()
-    #     for xctype in ('LDA', 'GGA', 'MGGA'):
-    #         deriv = 1
-    #         if xctype == 'LDA':
-    #             deriv = 0
-    #         ao_gpu = ni_gpu.eval_ao(mol, grids_gpu.coords, deriv=deriv, transpose=False)
-    #         ao_cpu = ni_cpu.eval_ao(mol, grids_cpu.coords, deriv=deriv)
+    def test_eval_rho(self):
+        np.random.seed(1)
+        dm = np.random.random(dm0.shape)
+        ni_gpu = NumInt2C()
+        ni_cpu = pyscf_numint2c()
+        for xctype in ('LDA', 'GGA', 'MGGA'):
+            deriv = 1
+            if xctype == 'LDA':
+                deriv = 0
+            ao_gpu = ni_gpu.eval_ao(mol, grids_gpu.coords, deriv=deriv, transpose=False)
+            ao_cpu = ni_cpu.eval_ao(mol, grids_cpu.coords, deriv=deriv)
             
-    #         rho = ni_gpu.eval_rho(mol, ao_gpu, dm, xctype=xctype, hermi=0, with_lapl=False)
-    #         ref = ni_cpu.eval_rho(mol, ao_cpu, dm, xctype=xctype, hermi=0, with_lapl=False)
-    #         self.assertAlmostEqual(abs(rho[...,:grids_cpu.size].get() - ref).max(), 0, 10)
-
-    #         rho = ni_gpu.eval_rho(mol, ao_gpu, dm0, xctype=xctype, hermi=1, with_lapl=False)
-    #         ref = ni_cpu.eval_rho(mol, ao_cpu, dm0, xctype=xctype, hermi=1, with_lapl=False)
-    #         self.assertAlmostEqual(abs(rho[...,:grids_cpu.size].get() - ref).max(), 0, 10)
-
-    # def test_eval_rho2(self):
-    #     np.random.seed(1)
-    #     mo_coeff_test = np.random.random(mo_coeff.shape)
-    #     ni_gpu = NumInt2C()
-    #     ni_gpu.collinear='m'
-    #     ni_cpu = pyscf_numint2c()
-    #     ni_cpu.collinear='m'
-    #     for xctype in ('LDA', 'GGA', 'MGGA'):
-    #         deriv = 1
-    #         if xctype == 'LDA':
-    #             deriv = 0
-    #         ao_gpu = ni_gpu.eval_ao(mol, grids_gpu.coords, deriv=deriv, transpose=False)
-    #         ao_cpu = ni_cpu.eval_ao(mol, grids_cpu.coords, deriv=deriv)
+            rho = ni_gpu.eval_rho(mol, ao_gpu, dm, xctype=xctype, hermi=0, with_lapl=False)
+            ref = ni_cpu.eval_rho(mol, ao_cpu, dm, xctype=xctype, hermi=0, with_lapl=False)
+            self.assertAlmostEqual(abs(rho[...,:grids_cpu.size].get() - ref).max(), 0, 10)
+
+            rho = ni_gpu.eval_rho(mol, ao_gpu, dm0, xctype=xctype, hermi=1, with_lapl=False)
+            ref = ni_cpu.eval_rho(mol, ao_cpu, dm0, xctype=xctype, hermi=1, with_lapl=False)
+            self.assertAlmostEqual(abs(rho[...,:grids_cpu.size].get() - ref).max(), 0, 10)
+
+    def test_eval_rho2(self):
+        np.random.seed(1)
+        mo_coeff_test = np.random.random(mo_coeff.shape)
+        ni_gpu = NumInt2C()
+        ni_gpu.collinear='m'
+        ni_cpu = pyscf_numint2c()
+        ni_cpu.collinear='m'
+        for xctype in ('LDA', 'GGA', 'MGGA'):
+            deriv = 1
+            if xctype == 'LDA':
+                deriv = 0
+            ao_gpu = ni_gpu.eval_ao(mol, grids_gpu.coords, deriv=deriv, transpose=False)
+            ao_cpu = ni_cpu.eval_ao(mol, grids_cpu.coords, deriv=deriv)
             
-    #         rho = ni_gpu.eval_rho2(mol, ao_gpu, mo_coeff_test, mo_occ, xctype=xctype, with_lapl=False)
-    #         ref = ni_cpu.eval_rho2(mol, ao_cpu, mo_coeff_test, mo_occ, xctype=xctype, with_lapl=False)
-    #         self.assertAlmostEqual(abs(rho[...,:grids_cpu.size].get() - ref).max(), 0, 10)
+            rho = ni_gpu.eval_rho2(mol, ao_gpu, mo_coeff_test, mo_occ, xctype=xctype, with_lapl=False)
+            ref = ni_cpu.eval_rho2(mol, ao_cpu, mo_coeff_test, mo_occ, xctype=xctype, with_lapl=False)
+            self.assertAlmostEqual(abs(rho[...,:grids_cpu.size].get() - ref).max(), 0, 10)
+
+    def test_get_rho(self):
+        ni_gpu = NumInt2C()
+        ni_gpu.collinear='m'
+
+        np.random.seed(1)
+        ni_gpu_1c = NumInt()
+        dm_test = np.random.random(dm0.shape)
+        dm_test = dm_test + dm_test.T
+        n2c = dm_test.shape[0]
+        nao = n2c//2
+        dm_1c_test = dm_test[:nao,:nao] + dm_test[nao:,nao:]
+        rho_gpu = ni_gpu.get_rho(mol, dm_test, grids_gpu)
+        rho_1c_gpu = ni_gpu_1c.get_rho(mol, dm_1c_test, grids_gpu)
+        self.assertAlmostEqual(abs(rho_gpu.get() - rho_1c_gpu.get()).max(), 0, 10)
+
 
     def test_eval_xc_eff(self):
         ni_gpu = NumInt2C()
@@ -110,25 +128,13 @@ def test_eval_xc_eff(self):
         ni_cpu.collinear='m'
         np.random.seed(1)
         dm = dm0*1.0
-        print(dm.shape, mol.nao)
-        print(dm.dtype)
-        for xc_code in (MGGA_M06, ):
-            # xctype = ni_gpu._xc_type(xc_code)
-            # deriv = 1
-            # if xctype == 'LDA':
-            #     deriv = 0
-            # ao_gpu = ni_gpu.eval_ao(mol, grids_gpu.coords, deriv=deriv, transpose=False)
-            # rho = ni_gpu.eval_rho(mol, ao_gpu, dm, xctype=xctype, hermi=0, with_lapl=False)
-            # eval_xc_eff_gpu = ni_gpu.mcfun_eval_xc_adapter(xc_code)
+        for xc_code in (LDA, GGA_PBE, MGGA_M06):
+            ni_gpu.gdftopt = None
             n_gpu, exc_gpu, vmat_gpu = ni_gpu.nr_vxc(mol, grids_gpu, xc_code, dm)
             n_cpu, exc_cpu, vmat_cpu = ni_cpu.nr_vxc(mol, grids_cpu, xc_code, dm)
-            print("n_gpu", n_gpu)
-            print("n_cpu", n_cpu)
-            print("exc_gpu", exc_gpu)
-            print("exc_cpu", exc_cpu)
-            print("vmat_gpu", vmat_gpu)
-            print("vmat_cpu", vmat_cpu)
-            print()
+            self.assertAlmostEqual(abs(n_gpu.get() - n_cpu).max(), 0, 10)
+            self.assertAlmostEqual(abs(exc_gpu.get() - exc_cpu).max(), 0, 10)
+            self.assertAlmostEqual(abs(vmat_gpu.get() - vmat_cpu).max(), 0, 10)
 
 if __name__ == "__main__":
     print("Full Tests for dft numint2c")

From c26e035c21114d119d12e80f57699e2fda23b96d Mon Sep 17 00:00:00 2001
From: "puzhichen.996" <puzhichen.996@bytedance.com>
Date: Wed, 5 Nov 2025 13:58:13 +0800
Subject: [PATCH 10/21] finish numint2c

---
 gpu4pyscf/dft/numint2c.py            |  38 +-------
 gpu4pyscf/dft/tests/test_numint2c.py | 127 +++++++++++++++++++++++++--
 2 files changed, 123 insertions(+), 42 deletions(-)

diff --git a/gpu4pyscf/dft/numint2c.py b/gpu4pyscf/dft/numint2c.py
index c4183fcf8..00c86dfae 100644
--- a/gpu4pyscf/dft/numint2c.py
+++ b/gpu4pyscf/dft/numint2c.py
@@ -135,7 +135,7 @@ def _gks_mcol_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=0,
                 den = rho[0,0] * weight
             nelec += den.sum()
             excsum += cp.dot(den, exc)
-            vmat += fmat(mol, ao, weight, rho, vxc, mask_2c, shls_slice,
+            vmat += fmat(mol, ao, weight, rho, vxc, mask, shls_slice,
                             ao_loc, hermi)
 
     elif xctype == 'HF':
@@ -155,40 +155,7 @@ def _gks_mcol_fxc(ni, mol, grids, xc_code, dm0, dms, relativity=0, hermi=0,
 def _ncol_lda_vxc_mat(mol, ao, weight, rho, vxc, mask, shls_slice, ao_loc, hermi):
     '''Vxc matrix of non-collinear LDA'''
     # NOTE vxc in u/d representation
-    r, mx, my, mz = rho
-    vxc = xc_deriv.ud2ts(vxc)
-    vr, vs = vxc[:,0]
-    s = lib.norm(rho[1:4], axis=0)
-
-    wv = weight * vr
-    with cp.errstate(divide='ignore',invalid='ignore'):
-        ws = vs * weight / s
-    ws[s < 1e-20] = 0
-
-    # * .5 because of v+v.conj().T in r_vxc
-    if hermi:
-        wv *= .5
-        ws *= .5
-    aow = None
-    aow = _scale_ao(ao, ws*mx, out=aow)  # Mx
-    tmpx = _dot_ao_ao(mol, ao, aow, mask, shls_slice, ao_loc)
-    aow = _scale_ao(ao, ws*my, out=aow)  # My
-    tmpy = _dot_ao_ao(mol, ao, aow, mask, shls_slice, ao_loc)
-    if hermi:
-        # conj(mx+my*1j) == mx-my*1j, tmpx and tmpy should be real
-        matba = (tmpx + tmpx.T) + (tmpy + tmpy.T) * 1j
-        matab = cp.zeros_like(matba)
-    else:
-        # conj(mx+my*1j) != mx-my*1j, tmpx and tmpy should be complex
-        matba = tmpx + tmpy * 1j
-        matab = tmpx - tmpy * 1j
-    tmpx = tmpy = None
-    aow = _scale_ao(ao, wv+ws*mz, out=aow)  # Mz
-    mataa = _dot_ao_ao(mol, ao, aow, mask, shls_slice, ao_loc)
-    aow = _scale_ao(ao, wv-ws*mz, out=aow)  # Mz
-    matbb = _dot_ao_ao(mol, ao, aow, mask, shls_slice, ao_loc)
-    mat = cp.block([[mataa, matab], [matba, matbb]])
-    return mat
+    raise NotImplementedError('non-collinear lda vxc mat')
 
 
 # * Mcfun requires functional derivatives to total-density and spin-density.
@@ -526,6 +493,7 @@ def _init_xcfuns(self, xc_code, spin=0):
     mcfun_eval_xc_adapter = mcfun_eval_xc_adapter
 
     block_loop = numint.NumInt.block_loop
+    reset = numint.NumInt.reset
 
     def _to_numint1c(self):
         '''Converts to the associated class to handle collinear systems'''
diff --git a/gpu4pyscf/dft/tests/test_numint2c.py b/gpu4pyscf/dft/tests/test_numint2c.py
index 4f41fe54d..7a1e11656 100644
--- a/gpu4pyscf/dft/tests/test_numint2c.py
+++ b/gpu4pyscf/dft/tests/test_numint2c.py
@@ -21,6 +21,8 @@
 from pyscf.dft.numint import NumInt as pyscf_numint
 from gpu4pyscf.dft import Grids
 from gpu4pyscf.dft import numint2c
+from gpu4pyscf.dft import numint
+from pyscf.dft import numint2c as pyscf_numint2c_file
 from gpu4pyscf.dft.numint2c import NumInt2C
 from gpu4pyscf.dft.numint import NumInt
 from gpu4pyscf import dft
@@ -69,7 +71,8 @@ class KnownValues(unittest.TestCase):
 
     def test_eval_rho(self):
         np.random.seed(1)
-        dm = np.random.random(dm0.shape)
+        dm = np.random.random(dm0.shape) + np.random.random(dm0.shape)*1.0j
+        dm = dm + dm.conj().T
         ni_gpu = NumInt2C()
         ni_cpu = pyscf_numint2c()
         for xctype in ('LDA', 'GGA', 'MGGA'):
@@ -89,7 +92,7 @@ def test_eval_rho(self):
 
     def test_eval_rho2(self):
         np.random.seed(1)
-        mo_coeff_test = np.random.random(mo_coeff.shape)
+        mo_coeff_test = np.random.random(mo_coeff.shape) + np.random.random(mo_coeff.shape)*1.0j
         ni_gpu = NumInt2C()
         ni_gpu.collinear='m'
         ni_cpu = pyscf_numint2c()
@@ -111,13 +114,14 @@ def test_get_rho(self):
 
         np.random.seed(1)
         ni_gpu_1c = NumInt()
-        dm_test = np.random.random(dm0.shape)
-        dm_test = dm_test + dm_test.T
+        dm_test = np.random.random(dm0.shape) + np.random.random(dm0.shape)*1.0j
+        dm_test = dm_test + dm_test.T.conj()
+
         n2c = dm_test.shape[0]
         nao = n2c//2
         dm_1c_test = dm_test[:nao,:nao] + dm_test[nao:,nao:]
         rho_gpu = ni_gpu.get_rho(mol, dm_test, grids_gpu)
-        rho_1c_gpu = ni_gpu_1c.get_rho(mol, dm_1c_test, grids_gpu)
+        rho_1c_gpu = ni_gpu_1c.get_rho(mol, dm_1c_test.real, grids_gpu)
         self.assertAlmostEqual(abs(rho_gpu.get() - rho_1c_gpu.get()).max(), 0, 10)
 
 
@@ -127,8 +131,8 @@ def test_eval_xc_eff(self):
         ni_cpu = pyscf_numint2c()
         ni_cpu.collinear='m'
         np.random.seed(1)
-        dm = dm0*1.0
-        for xc_code in (LDA, GGA_PBE, MGGA_M06):
+        dm = dm0*1.0 + 0.0j
+        for xc_code in (LDA, ):
             ni_gpu.gdftopt = None
             n_gpu, exc_gpu, vmat_gpu = ni_gpu.nr_vxc(mol, grids_gpu, xc_code, dm)
             n_cpu, exc_cpu, vmat_cpu = ni_cpu.nr_vxc(mol, grids_cpu, xc_code, dm)
@@ -136,6 +140,115 @@ def test_eval_xc_eff(self):
             self.assertAlmostEqual(abs(exc_gpu.get() - exc_cpu).max(), 0, 10)
             self.assertAlmostEqual(abs(vmat_gpu.get() - vmat_cpu).max(), 0, 10)
 
+    def test_mcol_lda_vxc_mat(self):
+        xc_code = 'lda,'
+
+        nao = mol.nao
+        n2c = nao * 2
+        ao_loc = mol.ao_loc
+        np.random.seed(12)
+        dm = np.random.rand(n2c, n2c) * .001 + np.random.rand(n2c, n2c) * .0001j
+        dm += np.eye(n2c)
+        dm = dm + dm.T
+        ngrids = 8
+        coords = np.random.rand(ngrids,3)
+        weight = np.random.rand(ngrids)
+
+        ao = numint.eval_ao(mol, coords, deriv=0, transpose=False)
+        rho = numint2c.eval_rho(mol, ao, dm, xctype='LDA', hermi=1, with_lapl=False)
+        ni_CPU = pyscf_numint2c()
+        ni_GPU = NumInt2C()
+        ni_GPU.collinear = 'mcol'
+        ni_CPU.collinear = 'mcol'
+        eval_xc_cpu = ni_CPU.mcfun_eval_xc_adapter(xc_code)
+        eval_xc_GPU = ni_GPU.mcfun_eval_xc_adapter(xc_code)
+        vxc_cpu = eval_xc_cpu(xc_code, rho.get(), deriv=1)[1]
+        vxc_gpu = eval_xc_GPU(xc_code, rho, deriv=1, xctype='LDA')[1]
+        mask = np.ones((8, mol.nbas), dtype=np.uint8)
+        mask_gpu = cupy.array([i for i in range(mol.nbas)])
+
+        shls_slice = (0, mol.nbas)
+        v0_cpu = pyscf_numint2c_file._mcol_lda_vxc_mat(mol, ao.transpose(1,0).get(), weight, rho.get(), vxc_cpu.copy(), mask, shls_slice, ao_loc, 0)
+        v0_gpu = numint2c._mcol_lda_vxc_mat(mol, ao, cupy.asarray(weight), rho, vxc_gpu.copy(), mask_gpu, shls_slice, ao_loc, 0)
+        v1_gpu = numint2c._mcol_lda_vxc_mat(mol, ao, cupy.asarray(weight), rho, vxc_gpu.copy(), mask_gpu, shls_slice, ao_loc, 1)
+        v1_gpu = v1_gpu + v1_gpu.conj().T
+        
+        self.assertAlmostEqual(abs(v0_gpu.get() - v1_gpu.get()).max(), 0, 13)
+        self.assertAlmostEqual(abs(v0_gpu.get() - v0_cpu).max(), 0, 13)
+
+    def test_mcol_gga_vxc_mat(self):
+        xc_code = 'pbe,'
+
+        nao = mol.nao
+        n2c = nao * 2
+        ao_loc = mol.ao_loc
+        np.random.seed(12)
+        dm = np.random.rand(n2c, n2c) * .001 + np.random.rand(n2c, n2c) * .001j
+        dm += np.eye(n2c)
+        dm = dm + dm.T
+        ngrids = 8
+        coords = np.random.rand(ngrids,3)
+        weight = np.random.rand(ngrids)
+
+        ao = numint.eval_ao(mol, coords, deriv=1, transpose=False)
+        rho = numint2c.eval_rho(mol, ao, dm, xctype='GGA', hermi=1, with_lapl=False)
+        ni_CPU = pyscf_numint2c()
+        ni_GPU = NumInt2C()
+        ni_GPU.collinear = 'mcol'
+        ni_CPU.collinear = 'mcol'
+        eval_xc_cpu = ni_CPU.mcfun_eval_xc_adapter(xc_code)
+        eval_xc_GPU = ni_GPU.mcfun_eval_xc_adapter(xc_code)
+        vxc_cpu = eval_xc_cpu(xc_code, rho.get(), deriv=1)[1]
+        vxc_gpu = eval_xc_GPU(xc_code, rho, deriv=1, xctype='GGA')[1]
+        mask = np.ones((8, mol.nbas), dtype=np.uint8)
+        mask_gpu = cupy.array([i for i in range(mol.nbas)])
+
+        shls_slice = (0, mol.nbas)
+        v0_cpu = pyscf_numint2c_file._mcol_gga_vxc_mat(mol, ao.transpose(0,2,1).get(), weight, rho.get(), vxc_cpu.copy(), mask, shls_slice, ao_loc, 0)
+        v0_gpu = numint2c._mcol_gga_vxc_mat(mol, ao, cupy.asarray(weight), rho, vxc_gpu.copy(), mask_gpu, shls_slice, ao_loc, 0)
+        v1_gpu = numint2c._mcol_gga_vxc_mat(mol, ao, cupy.asarray(weight), rho, vxc_gpu.copy(), mask_gpu, shls_slice, ao_loc, 1)
+        v1_gpu = v1_gpu + v1_gpu.conj().T
+        
+        self.assertAlmostEqual(abs(v0_gpu.get() - v1_gpu.get()).max(), 0, 13)
+        self.assertAlmostEqual(abs(v0_gpu.get() - v0_cpu).max(), 0, 13)
+
+    def test_mcol_mgga_vxc_mat(self):
+        xc_code = 'tpss'
+
+        nao = mol.nao
+        n2c = nao * 2
+        ao_loc = mol.ao_loc
+        np.random.seed(12)
+        dm = np.random.rand(n2c, n2c) * .001
+        dm += np.eye(n2c)
+        dm = dm + dm.T
+        ngrids = 8
+        coords = np.random.rand(ngrids,3)
+        weight = np.random.rand(ngrids)
+
+        ao = numint.eval_ao(mol, coords, deriv=1, transpose=False)
+        rho = numint2c.eval_rho(mol, ao, dm, xctype='MGGA', hermi=1, with_lapl=False)
+        ni_CPU = pyscf_numint2c()
+        ni_GPU = NumInt2C()
+        ni_GPU.collinear = 'mcol'
+        ni_CPU.collinear = 'mcol'
+        eval_xc_cpu = ni_CPU.mcfun_eval_xc_adapter(xc_code)
+        eval_xc_GPU = ni_GPU.mcfun_eval_xc_adapter(xc_code)
+        vxc_cpu = eval_xc_cpu(xc_code, rho.get(), deriv=1)[1]
+        vxc_gpu = eval_xc_GPU(xc_code, rho, deriv=1, xctype='MGGA')[1]
+        mask = np.ones((8, mol.nbas), dtype=np.uint8)
+        mask_gpu = cupy.array([i for i in range(mol.nbas)])
+
+        shls_slice = (0, mol.nbas)
+        v0_cpu = pyscf_numint2c_file._mcol_mgga_vxc_mat(mol, ao.transpose(0,2,1).get(), weight, rho.get(), vxc_cpu.copy(), mask, shls_slice, ao_loc, 0)
+        v0_gpu = numint2c._mcol_mgga_vxc_mat(mol, ao, cupy.asarray(weight), rho, vxc_gpu.copy(), mask_gpu, shls_slice, ao_loc, 0)
+        v1_gpu = numint2c._mcol_mgga_vxc_mat(mol, ao, cupy.asarray(weight), rho, vxc_gpu.copy(), mask_gpu, shls_slice, ao_loc, 1)
+        v1_gpu = v1_gpu + v1_gpu.conj().T
+        
+        self.assertAlmostEqual(abs(v0_gpu.get() - v1_gpu.get()).max(), 0, 13)
+        self.assertAlmostEqual(abs(v0_gpu.get() - v0_cpu).max(), 0, 13)
+
+
 if __name__ == "__main__":
     print("Full Tests for dft numint2c")
     unittest.main()

From fce5bafa5511bf18c02e93932bab7cdd3d49bdfb Mon Sep 17 00:00:00 2001
From: "puzhichen.996" <puzhichen.996@bytedance.com>
Date: Wed, 5 Nov 2025 14:01:07 +0800
Subject: [PATCH 11/21] change some codes of GKS

---
 gpu4pyscf/dft/gks.py | 32 +++++++++++++++++++++++++++++---
 1 file changed, 29 insertions(+), 3 deletions(-)

diff --git a/gpu4pyscf/dft/gks.py b/gpu4pyscf/dft/gks.py
index 36b3f1ae6..018cf6293 100644
--- a/gpu4pyscf/dft/gks.py
+++ b/gpu4pyscf/dft/gks.py
@@ -15,7 +15,7 @@
 import cupy as cp
 from pyscf import lib
 from pyscf.dft import gks
-from gpu4pyscf.dft import numint
+from gpu4pyscf.dft import numint2c
 from gpu4pyscf.dft import rks
 from gpu4pyscf.scf.ghf import GHF
 from gpu4pyscf.lib import logger
@@ -129,16 +129,42 @@ def get_veff(ks, mol=None, dm=None, dm_last=0, vhf_last=0, hermi=1):
     return vxc
 
 
-class GKS(gks.GKS, GHF):
+class GKS(rks.KohnShamDFT, GHF):
     from gpu4pyscf.lib.utils import to_cpu, to_gpu, device
 
     def __init__(self, mol, xc='LDA,VWN'):
+        raise NotImplementedError("GKS is not implemented")
         GHF.__init__(self, mol)
         rks.KohnShamDFT.__init__(self, xc)
-        # self._numint = NumInt2C()
+        self._numint = numint2c.NumInt2C()
+
+    def dump_flags(self, verbose=None):
+        GHF.dump_flags(self, verbose)
+        rks.KohnShamDFT.dump_flags(self, verbose)
+        logger.info(self, 'collinear = %s', self._numint.collinear)
+        if self._numint.collinear[0] == 'm':
+            logger.info(self, 'mcfun spin_samples = %s', self._numint.spin_samples)
+            logger.info(self, 'mcfun collinear_thrd = %s', self._numint.collinear_thrd)
+            logger.info(self, 'mcfun collinear_samples = %s', self._numint.collinear_samples)
+        return self
+
+    @property
+    def collinear(self):
+        return self._numint.collinear
+    @collinear.setter
+    def collinear(self, val):
+        self._numint.collinear = val
+
+    @property
+    def spin_samples(self):
+        return self._numint.spin_samples
+    @spin_samples.setter
+    def spin_samples(self, val):
+        self._numint.spin_samples = val
     
     get_veff = get_veff
     reset = rks.RKS.reset
     energy_elec = rks.RKS.energy_elec
     nuc_grad_method = NotImplemented
     to_hf = NotImplemented
+

From a402df08e3dac6d6a4cbd3ed3654e93b733f0276 Mon Sep 17 00:00:00 2001
From: "puzhichen.996" <puzhichen.996@bytedance.com>
Date: Wed, 5 Nov 2025 14:02:08 +0800
Subject: [PATCH 12/21] change some typos

---
 gpu4pyscf/dft/numint2c.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gpu4pyscf/dft/numint2c.py b/gpu4pyscf/dft/numint2c.py
index 00c86dfae..fceca2286 100644
--- a/gpu4pyscf/dft/numint2c.py
+++ b/gpu4pyscf/dft/numint2c.py
@@ -21,7 +21,7 @@
 import cupy as cp
 from pyscf import lib
 from gpu4pyscf.dft import numint, mcfun_gpu
-from gpu4pyscf.dft.numint import _dot_ao_dm, _dot_ao_ao, _scale_ao, _tau_dot
+from gpu4pyscf.dft.numint import _dot_ao_dm, _dot_ao_ao, _scale_ao
 from gpu4pyscf.dft import xc_deriv
 from pyscf import __config__
 

From 739f594e92a32e27387266695a5c027b4556311e Mon Sep 17 00:00:00 2001
From: "puzhichen.996" <puzhichen.996@bytedance.com>
Date: Wed, 5 Nov 2025 14:30:14 +0800
Subject: [PATCH 13/21] fix a bug

---
 gpu4pyscf/dft/tests/test_numint2c.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gpu4pyscf/dft/tests/test_numint2c.py b/gpu4pyscf/dft/tests/test_numint2c.py
index 7a1e11656..25e5514b1 100644
--- a/gpu4pyscf/dft/tests/test_numint2c.py
+++ b/gpu4pyscf/dft/tests/test_numint2c.py
@@ -38,7 +38,7 @@ def setUpModule():
         basis = 'ccpvdz',
         charge = 1,
         spin = 1,  # = 2S = spin_up - spin_down
-        # output = '/dev/null'
+        output = '/dev/null'
         )
 
     np.random.seed(2)

From 97bb2cf461bd2ef1ae6af1dc410acb84736ce26a Mon Sep 17 00:00:00 2001
From: "puzhichen.996" <puzhichen.996@bytedance.com>
Date: Wed, 5 Nov 2025 16:50:07 +0800
Subject: [PATCH 14/21] add ghf support for complex

---
 gpu4pyscf/scf/ghf.py            | 34 ++++++++----
 gpu4pyscf/scf/hf.py             |  4 ++
 gpu4pyscf/scf/tests/test_ghf.py | 92 +++++++++++++++++++++++++++++++--
 requirements.txt                |  3 +-
 setup.py                        |  1 +
 5 files changed, 119 insertions(+), 15 deletions(-)

diff --git a/gpu4pyscf/scf/ghf.py b/gpu4pyscf/scf/ghf.py
index cd8336d7f..8e8b3efee 100644
--- a/gpu4pyscf/scf/ghf.py
+++ b/gpu4pyscf/scf/ghf.py
@@ -85,6 +85,7 @@ def get_jk(self, mol=None, dm=None, hermi=0, with_j=True, with_k=True,
         return vj, vk
 
     def get_j(self, mol=None, dm=None, hermi=1, omega=None):
+        assert hermi == 1, 'hermi must be 1'
         dm = asarray(dm)
         dm_shape = dm.shape
         nso = dm.shape[-1]
@@ -92,8 +93,8 @@ def get_j(self, mol=None, dm=None, hermi=1, omega=None):
         dm = dm.reshape(-1,nso,nso)
         n_dm = dm.shape[0]
         dm = dm[:,:nao,:nao] + dm[:,nao:,nao:]
-        jtmp = hf.SCF.get_j(self, mol, dm, hermi, omega)
-        vj = cp.zeros((n_dm,nso,nso))
+        jtmp = hf.SCF.get_j(self, mol, dm.real, hermi, omega)
+        vj = cp.zeros((n_dm,nso,nso), dtype=dm.dtype)
         vj[:,:nao,:nao] = vj[:,nao:,nao:] = jtmp
         return vj.reshape(dm_shape)
 
@@ -108,14 +109,27 @@ def get_k(self, mol=None, dm=None, hermi=1, omega=None):
         dmbb = dm[:,nao:,nao:]
         dmab = dm[:,:nao,nao:]
         dmba = dm[:,nao:,:nao]
-        dm = cp.vstack((dmaa, dmbb, dmab, dmba))
-        ktmp = super().get_k(mol, dm, hermi=0, omega=omega)
-        ktmp = ktmp.reshape(4,n_dm,nao,nao)
-        vk = cp.zeros((n_dm,nso,nso), dm.dtype)
-        vk[:,:nao,:nao] = ktmp[0]
-        vk[:,nao:,nao:] = ktmp[1]
-        vk[:,:nao,nao:] = ktmp[2]
-        vk[:,nao:,:nao] = ktmp[3]
+        if dm.dtype == cp.complex128:
+            dm_real = cp.vstack((dmaa.real, dmbb.real, dmab.real, dmba.real))
+            ktmp_real = super().get_k(mol, dm_real, hermi=0, omega=omega)
+            ktmp_real = ktmp_real.reshape(4,n_dm,nao,nao)
+            dm_imag = cp.vstack((dmaa.imag, dmbb.imag, dmab.imag, dmba.imag))
+            ktmp_imag = super().get_k(mol, dm_imag, hermi=0, omega=omega)
+            ktmp_imag = ktmp_imag.reshape(4,n_dm,nao,nao)
+            vk = cp.zeros((n_dm,nso,nso), dm.dtype)
+            vk[:,:nao,:nao] = ktmp_real[0] + 1j*ktmp_imag[0]
+            vk[:,nao:,nao:] = ktmp_real[1] + 1j*ktmp_imag[1]
+            vk[:,:nao,nao:] = ktmp_real[2] + 1j*ktmp_imag[2]
+            vk[:,nao:,:nao] = ktmp_real[3] + 1j*ktmp_imag[3]
+        else:
+            dm = cp.vstack((dmaa, dmbb, dmab, dmba))
+            ktmp = super().get_k(mol, dm, hermi=0, omega=omega)
+            ktmp = ktmp.reshape(4,n_dm,nao,nao)
+            vk = cp.zeros((n_dm,nso,nso), dm.dtype)
+            vk[:,:nao,:nao] = ktmp[0]
+            vk[:,nao:,nao:] = ktmp[1]
+            vk[:,:nao,nao:] = ktmp[2]
+            vk[:,nao:,:nao] = ktmp[3]
         return vk.reshape(dm_shape)
 
     def get_veff(mf, mol=None, dm=None, dm_last=None, vhf_last=None, hermi=1):
diff --git a/gpu4pyscf/scf/hf.py b/gpu4pyscf/scf/hf.py
index f384f78dd..fbce65008 100644
--- a/gpu4pyscf/scf/hf.py
+++ b/gpu4pyscf/scf/hf.py
@@ -145,6 +145,8 @@ def get_fock(mf, h1e=None, s1e=None, vhf=None, dm=None, cycle=-1, diis=None,
     if dm is None: dm = mf.make_rdm1()
     s1e = cupy.asarray(s1e)
     dm = cupy.asarray(dm)
+    if dm.dtype == cupy.complex128:
+        s1e = s1e.astype(cupy.complex128)
     if diis_start_cycle is None:
         diis_start_cycle = mf.diis_start_cycle
     if level_shift_factor is None:
@@ -661,6 +663,8 @@ def eig(self, fock, s):
         x = None
         if hasattr(self, 'overlap_canonical_decomposed_x') and self.overlap_canonical_decomposed_x is not None:
             x = cupy.asarray(self.overlap_canonical_decomposed_x)
+        if fock.dtype == cupy.complex128:
+            s = s.astype(cupy.complex128)
         if x is None:
             mo_energy, mo_coeff = eigh(fock, s)
             return mo_energy, mo_coeff
diff --git a/gpu4pyscf/scf/tests/test_ghf.py b/gpu4pyscf/scf/tests/test_ghf.py
index 3bea9800a..b8dda3305 100644
--- a/gpu4pyscf/scf/tests/test_ghf.py
+++ b/gpu4pyscf/scf/tests/test_ghf.py
@@ -14,19 +14,103 @@
 
 import unittest
 import pyscf
+import numpy
+import cupy as cp
+
+def setUpModule():
+    global mol, mol1
+    mol = pyscf.gto.Mole()
+    mol.atom = '''
+        O    0    0    0
+        H    0.   -0.757   0.587
+        H    0.   0.757    0.587'''
+    mol.spin = None
+    mol.basis = 'cc-pvdz'
+    mol.verbose = 7
+    mol.output = '/dev/null'
+    mol.build()
+
+    mol1 = pyscf.gto.M(
+        verbose = 0,
+        atom = '''
+            O    0    0    0
+            H    0.   -0.757   0.587
+            H    0.   0.757    0.587''',
+        charge = 1,
+        spin = 1,
+        basis = 'cc-pvdz',
+        output = '/dev/null')
+
+def tearDownModule():
+    global mol, mol1
+    mol.stdout.close()
+    mol1.stdout.close()
+    del mol, mol1
 
 class KnownValues(unittest.TestCase):
     def test_ghf_scf(self):
-        mol = pyscf.M(atom='''
-O     0    0        0
-H     0    -0.757   0.587
-H     0    0.757    0.587''', basis = 'cc-pvdz')
         mf = mol.GHF().to_gpu()
         assert mf.device == 'gpu'
         e_tot = mf.kernel()
         e_ref = mf.to_cpu().kernel()
         assert abs(e_tot - e_ref) < 1e-5
 
+    def test_ghf_scf(self):
+        mf = mol.GHF().to_gpu()
+        assert mf.device == 'gpu'
+        e_tot = mf.kernel()
+        dm1 = mf.make_rdm1()
+        numpy.random.seed(1)
+        n2c = mol.nao_nr() * 2
+        dm_perturb = numpy.random.random((n2c,n2c)) + 1j*numpy.random.random((n2c,n2c))
+        dm_perturb = dm_perturb + dm_perturb.T.conj()
+        dm_perturb = cp.asarray(dm_perturb)
+        dm_1 = dm1 + dm_perturb*0.001
+        e_ref = mf.to_cpu().kernel(dm_1.get())
+        e_tot = mf.kernel(dm_1)
+        assert abs(e_tot - e_ref) < 1e-5
+
+    def test_get_jk_complex(self):
+        mf = mol.GHF().to_gpu()
+        nao = mol.nao_nr()*2
+        numpy.random.seed(1)
+        d1 = numpy.random.random((nao,nao)) + 1j*numpy.random.random((nao,nao))
+        d = d1 + d1.T.conj()
+        d_real = d.real
+        d_imag = d.imag
+        vj_gpu = mf.get_j(mol, d_real)
+        vk_gpu = mf.get_k(mol, d)
+
+        mf_cpu = mf.to_cpu()
+        vj_cpu = mf_cpu.get_j(mol, d_real)
+        vk_cpu = mf_cpu.get_k(mol, d)
+
+        assert numpy.allclose(vj_gpu, vj_cpu)
+        assert numpy.allclose(vk_gpu, vk_cpu)
+
+    def test_get_jk_real(self):
+        mf = mol.GHF().to_gpu()
+
+        nao = mol.nao_nr()*2
+        numpy.random.seed(1)
+        d1 = numpy.random.random((nao,nao))
+        d = d1 + d1.T.conj()
+        d_real = d.real
+        d_imag = d.imag
+        vj_gpu = mf.get_j(mol, d_real)
+        vk_gpu = mf.get_k(mol, d)
+        print("vj_gpu", vj_gpu)
+        print("vk_gpu", vk_gpu)
+
+        mf_cpu = mf.to_cpu()
+        vj_cpu = mf_cpu.get_j(mol, d_real)
+        vk_cpu = mf_cpu.get_k(mol, d)
+        print("vj_cpu", vj_cpu)
+        print("vk_cpu", vk_cpu)
+
+        assert numpy.allclose(vj_gpu, vj_cpu)
+        assert numpy.allclose(vk_gpu, vk_cpu)
+
     #def test_ghf_x2c(self):
     #    pass
 
diff --git a/requirements.txt b/requirements.txt
index 70660747d..6ef95eab3 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,4 +4,5 @@ cupy-cuda12x==13.4.1
 pyscf==2.8.0
 basis-set-exchange==0.11
 pyscf-dispersion==1.3.0
-geometric==1.1.0
\ No newline at end of file
+geometric==1.1.0
+MCfun==0.2.5
\ No newline at end of file
diff --git a/setup.py b/setup.py
index 111e4f3c2..2070dfd4b 100755
--- a/setup.py
+++ b/setup.py
@@ -139,5 +139,6 @@ def initialize_with_default_plat_name(self):
         f'cupy-cuda{CUDA_VERSION}>=13.0,!=13.4.0', # Due to expm in cupyx.scipy.linalg and cutensor 2.0
         'geometric',
         f'gpu4pyscf-libxc-cuda{CUDA_VERSION}==0.5',
+        'MCfun==0.2.5',
     ]
 )

From dab3064b88e7efcda440a4babc583aa288ef93d8 Mon Sep 17 00:00:00 2001
From: "puzhichen.996" <puzhichen.996@bytedance.com>
Date: Wed, 5 Nov 2025 16:53:23 +0800
Subject: [PATCH 15/21] fix some typos

---
 gpu4pyscf/scf/tests/test_ghf.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/gpu4pyscf/scf/tests/test_ghf.py b/gpu4pyscf/scf/tests/test_ghf.py
index b8dda3305..2a9fce3b6 100644
--- a/gpu4pyscf/scf/tests/test_ghf.py
+++ b/gpu4pyscf/scf/tests/test_ghf.py
@@ -55,7 +55,7 @@ def test_ghf_scf(self):
         e_ref = mf.to_cpu().kernel()
         assert abs(e_tot - e_ref) < 1e-5
 
-    def test_ghf_scf(self):
+    def test_ghf_scf_complex_dm(self):
         mf = mol.GHF().to_gpu()
         assert mf.device == 'gpu'
         e_tot = mf.kernel()
@@ -77,7 +77,6 @@ def test_get_jk_complex(self):
         d1 = numpy.random.random((nao,nao)) + 1j*numpy.random.random((nao,nao))
         d = d1 + d1.T.conj()
         d_real = d.real
-        d_imag = d.imag
         vj_gpu = mf.get_j(mol, d_real)
         vk_gpu = mf.get_k(mol, d)
 
@@ -96,7 +95,6 @@ def test_get_jk_real(self):
         d1 = numpy.random.random((nao,nao))
         d = d1 + d1.T.conj()
         d_real = d.real
-        d_imag = d.imag
         vj_gpu = mf.get_j(mol, d_real)
         vk_gpu = mf.get_k(mol, d)
         print("vj_gpu", vj_gpu)

From eb660f9611cca50015fdb750e2b182398e5f1de5 Mon Sep 17 00:00:00 2001
From: "puzhichen.996" <puzhichen.996@bytedance.com>
Date: Thu, 6 Nov 2025 16:44:26 +0800
Subject: [PATCH 16/21] finish GKS tests

---
 gpu4pyscf/dft/gks.py                 |  13 ++-
 gpu4pyscf/dft/numint2c.py            |  33 +++---
 gpu4pyscf/dft/tests/test_gks.py      | 162 ++++++++++++++++++++++++---
 gpu4pyscf/dft/tests/test_numint2c.py |   4 +-
 gpu4pyscf/scf/ghf.py                 |  11 +-
 gpu4pyscf/scf/tests/test_ghf.py      |  17 ++-
 6 files changed, 198 insertions(+), 42 deletions(-)

diff --git a/gpu4pyscf/dft/gks.py b/gpu4pyscf/dft/gks.py
index 018cf6293..bd9259a6f 100644
--- a/gpu4pyscf/dft/gks.py
+++ b/gpu4pyscf/dft/gks.py
@@ -19,6 +19,7 @@
 from gpu4pyscf.dft import rks
 from gpu4pyscf.scf.ghf import GHF
 from gpu4pyscf.lib import logger
+from gpu4pyscf.lib import utils
 from gpu4pyscf.lib.cupy_helper import tag_array
 
 
@@ -65,6 +66,8 @@ def get_veff(ks, mol=None, dm=None, dm_last=0, vhf_last=0, hermi=1):
     else:
         max_memory = ks.max_memory - lib.current_memory()[0]
         ni = ks._numint
+        if ni.collinear[0].lower() != 'm':
+            raise NotImplementedError('Only multi-colinear GKS is implemented')
         n, exc, vxc = ni.get_vxc(mol, ks.grids, ks.xc, dm,
                                  hermi=hermi, max_memory=max_memory)
         logger.debug(ks, 'nelec by numeric integration = %s', n)
@@ -130,10 +133,10 @@ def get_veff(ks, mol=None, dm=None, dm_last=0, vhf_last=0, hermi=1):
 
 
 class GKS(rks.KohnShamDFT, GHF):
-    from gpu4pyscf.lib.utils import to_cpu, to_gpu, device
+    to_gpu = utils.to_gpu
+    device = utils.device
 
     def __init__(self, mol, xc='LDA,VWN'):
-        raise NotImplementedError("GKS is not implemented")
         GHF.__init__(self, mol)
         rks.KohnShamDFT.__init__(self, xc)
         self._numint = numint2c.NumInt2C()
@@ -167,4 +170,8 @@ def spin_samples(self, val):
     energy_elec = rks.RKS.energy_elec
     nuc_grad_method = NotImplemented
     to_hf = NotImplemented
-
+    
+    def to_cpu(self):
+        mf = gks.GKS(self.mol)
+        utils.to_cpu(self, out=mf)
+        return mf
diff --git a/gpu4pyscf/dft/numint2c.py b/gpu4pyscf/dft/numint2c.py
index fceca2286..cdc78e087 100644
--- a/gpu4pyscf/dft/numint2c.py
+++ b/gpu4pyscf/dft/numint2c.py
@@ -20,9 +20,11 @@
 import numpy as np
 import cupy as cp
 from pyscf import lib
+from pyscf.dft import numint2c
 from gpu4pyscf.dft import numint, mcfun_gpu
 from gpu4pyscf.dft.numint import _dot_ao_dm, _dot_ao_ao, _scale_ao
 from gpu4pyscf.dft import xc_deriv
+from gpu4pyscf.lib import utils
 from pyscf import __config__
 
 
@@ -382,6 +384,9 @@ def _contract_rho_m(bra, ket, hermi=0, bra_eq_ket=False):
 class NumInt2C(lib.StreamObject, numint.LibXCMixin):
     '''Numerical integration methods for 2-component basis (used by GKS)'''
     _keys = {'gdftopt'}
+    to_gpu = utils.to_gpu
+    device = utils.device
+
     gdftopt      = None
 
     # collinear schemes:
@@ -446,33 +451,23 @@ def get_rho(self, mol, dm, grids, max_memory=2000):
     _gks_mcol_fxc = _gks_mcol_fxc
 
     @lib.with_doc(numint.nr_rks.__doc__)
-    def nr_vxc(self, mol, grids, xc_code, dms, spin=0, relativity=0, hermi=1,
+    def nr_vxc(self, mol, grids, xc_code, dms, relativity=0, hermi=1,
                max_memory=2000, verbose=None):
         if not isinstance(dms, cp.ndarray):
             dms = cp.asarray(dms)
-        if self.collinear[0] in ('m', 'n'):  # mcol or ncol
+        if self.collinear[0] in ('m',):  # mcol or ncol
             opt = getattr(self, 'gdftopt', None)
             if opt is None:
                 self.build(mol, grids.coords)
                 opt = self.gdftopt
-                assert dms.ndim == 2
-                dms = cp.asarray(dms)
-                dms = opt.sort_orbitals_2c(dms, axis=[0,1])
+            assert dms.ndim == 2
+            dms = cp.asarray(dms)
+            dms = opt.sort_orbitals_2c(dms, axis=[0,1])
             n, exc, vmat = self._gks_mcol_vxc(mol, grids, xc_code, dms, relativity,
                                               hermi, max_memory, verbose)
             vmat = opt.unsort_orbitals_2c(vmat, axis=[0,1])
         else:
-            nao = dms.shape[-1] // 2
-            # ground state density is always real
-            dm_a = dms[...,:nao,:nao].real.copy('C')
-            dm_b = dms[...,nao:,nao:].real.copy('C')
-            dm1 = (dm_a, dm_b)
-            ni = self._to_numint1c()
-            n, exc, v = ni.nr_uks(mol, grids, xc_code, dm1, relativity,
-                                  hermi, max_memory, verbose)
-            vmat = cp.zeros_like(dms)
-            vmat[...,:nao,:nao] = v[0]
-            vmat[...,nao:,nao:] = v[1]
+            raise NotImplementedError("Locally collinear and collinear is not implemented")
         return n.sum(), exc, vmat
     get_vxc = nr_gks_vxc = nr_vxc
 
@@ -497,4 +492,8 @@ def _init_xcfuns(self, xc_code, spin=0):
 
     def _to_numint1c(self):
         '''Converts to the associated class to handle collinear systems'''
-        return self.view(numint.NumInt)
\ No newline at end of file
+        return self.view(numint.NumInt)
+    
+    def to_cpu(self):
+        ni = numint2c.NumInt2C()
+        return ni
\ No newline at end of file
diff --git a/gpu4pyscf/dft/tests/test_gks.py b/gpu4pyscf/dft/tests/test_gks.py
index 6a5e12d83..d9e420b45 100644
--- a/gpu4pyscf/dft/tests/test_gks.py
+++ b/gpu4pyscf/dft/tests/test_gks.py
@@ -12,17 +12,153 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import pickle
-import numpy as np
 import unittest
-import pyscf
-from gpu4pyscf.dft import rks
-
-atom = '''
-O       0.0000000000    -0.0000000000     0.1174000000
-H      -0.7570000000    -0.0000000000    -0.4696000000
-H       0.7570000000     0.0000000000    -0.4696000000
-'''
-bas='def2-tzvpp'
-grids_level = 5
-nlcgrids_level = 2
\ No newline at end of file
+from pyscf import gto
+from pyscf import lib
+from gpu4pyscf.dft import gks
+from pyscf.dft import gks as gks_cpu
+
+
+def setUpModule():
+    global mol, mol1
+    mol = gto.Mole()
+    mol.atom = '''
+        O    0    0    0
+        H    0.   -0.757   0.587
+        H    0.   0.757    0.587'''
+    mol.spin = None
+    mol.basis = 'sto3g'
+    mol.output = '/dev/null'
+    mol.build()
+
+    mol1 = gto.M(
+        atom = '''
+            O    0    0    0
+            H    0.   -0.757   0.587
+            H    0.   0.757    0.587''',
+        charge = 1,
+        spin = 1,
+        basis = 'sto3g',
+        output = '/dev/null'
+        )
+
+
+def tearDownModule():
+    global mol, mol1
+    mol.stdout.close()
+    mol1.stdout.close()
+    del mol, mol1
+
+
+class KnownValues(unittest.TestCase):
+    def test_mcol_gks_lda(self):
+        mf_cpu = gks_cpu.GKS(mol)
+        mf_cpu.xc = 'lda,'
+        mf_cpu.collinear = 'mcol'
+        mf_cpu._numint.spin_samples = 6
+        eks4_cpu = mf_cpu.kernel()
+
+        mf_gpu = gks.GKS(mol)
+        mf_gpu.xc = 'lda,'
+        mf_gpu.collinear = 'mcol'
+        mf_gpu._numint.spin_samples = 6
+        eks4_gpu = mf_gpu.kernel()
+        self.assertAlmostEqual(eks4_gpu, eks4_cpu)
+        self.assertAlmostEqual(eks4_gpu, -74.0600297733097, 6)
+        self.assertAlmostEqual(lib.fp(mf_gpu.mo_energy.get()), lib.fp(mf_cpu.mo_energy), 5)
+        self.assertAlmostEqual(lib.fp(mf_gpu.mo_energy.get()), -26.421986983504258, 5)
+
+        mf_cpu = gks_cpu.GKS(mol1)
+        mf_cpu.xc = 'lda,vwn'
+        mf_cpu.collinear = 'mcol'
+        mf_cpu._numint.spin_samples = 50
+        eks4_cpu = mf_cpu.kernel()
+
+        mf_gpu = gks.GKS(mol1)
+        mf_gpu.xc = 'lda,vwn'
+        mf_gpu.collinear = 'mcol'
+        mf_gpu._numint.spin_samples = 50
+        eks4_gpu = mf_gpu.kernel()
+        self.assertAlmostEqual(eks4_gpu, eks4_cpu, 6)
+        self.assertAlmostEqual(eks4_gpu, -74.3741809222222, 6)
+        self.assertAlmostEqual(lib.fp(mf_gpu.mo_energy.get()), lib.fp(mf_cpu.mo_energy), 5)
+        self.assertAlmostEqual(lib.fp(mf_gpu.mo_energy.get()), -27.63368769213053, 5)
+
+    def test_mcol_gks_gga(self):
+        mf_cpu = gks_cpu.GKS(mol)
+        mf_cpu.xc = 'pbe'
+        mf_cpu.collinear = 'mcol'
+        mf_cpu._numint.spin_samples = 6
+        eks4_cpu = mf_cpu.kernel()
+
+        mf_gpu = gks.GKS(mol)
+        mf_gpu.xc = 'pbe'
+        mf_gpu.collinear = 'mcol'
+        mf_gpu._numint.spin_samples = 6
+        eks4_gpu = mf_gpu.kernel()
+        self.assertAlmostEqual(eks4_gpu, eks4_cpu)
+        self.assertAlmostEqual(eks4_gpu, -75.2256398121708, 6)
+        self.assertAlmostEqual(lib.fp(mf_gpu.mo_energy.get()), lib.fp(mf_cpu.mo_energy), 5)
+        self.assertAlmostEqual(lib.fp(mf_gpu.mo_energy.get()), -26.81184613393452, 5)
+
+        mf_gpu = gks.GKS(mol1)
+        mf_gpu.xc = 'pbe'
+        mf_gpu.collinear = 'mcol'
+        mf_gpu._numint.spin_samples = 50
+        eks4_gpu = mf_gpu.kernel()
+        self.assertAlmostEqual(eks4_gpu, -74.869954771937, 6) # pyscf result
+        self.assertAlmostEqual(lib.fp(mf_gpu.mo_energy.get()), -27.92164954706164, 5) # pyscf result
+
+    def test_mcol_gks_hyb(self):
+        mf_gpu = gks.GKS(mol)
+        mf_gpu.xc = 'b3lyp'
+        mf_gpu.collinear = 'mcol'
+        mf_gpu._numint.spin_samples = 6
+        eks4_gpu = mf_gpu.kernel()
+        self.assertAlmostEqual(eks4_gpu, -75.312587317089, 6) # pyscf result
+        self.assertAlmostEqual(lib.fp(mf_gpu.mo_energy.get()), -27.2469582128507, 5) # pyscf result
+
+        mf_gpu = gks.GKS(mol1)
+        mf_gpu.xc = 'b3lyp'
+        mf_gpu.collinear = 'mcol'
+        mf_gpu._numint.spin_samples = 50
+        eks4_gpu = mf_gpu.kernel()
+        self.assertAlmostEqual(eks4_gpu, -74.9528036305753, 6) # pyscf result
+        self.assertAlmostEqual(lib.fp(mf_gpu.mo_energy.get()), -28.49145406025193, 5) # pyscf result
+
+    def test_mcol_gks_mgga(self):
+        mf_gpu = gks.GKS(mol)
+        mf_gpu.xc = 'm06l'
+        mf_gpu.collinear = 'mcol'
+        mf_gpu._numint.spin_samples = 6
+        eks4_gpu = mf_gpu.kernel()
+        self.assertAlmostEqual(eks4_gpu, -75.3053691716776, 6) # pyscf result
+        self.assertAlmostEqual(lib.fp(mf_gpu.mo_energy.get()), -27.03099891671804, 5) # pyscf result
+
+        mf_gpu = gks.GKS(mol1)
+        mf_gpu.xc = 'm06l'
+        mf_gpu.collinear = 'mcol'
+        mf_gpu._numint.spin_samples = 50
+        eks4_gpu = mf_gpu.kernel()
+        self.assertAlmostEqual(eks4_gpu, -74.9468853267496, 6) # pyscf result
+        self.assertAlmostEqual(lib.fp(mf_gpu.mo_energy.get()), -28.188215296679516, 5) # pyscf result
+
+    def test_to_cpu(self):
+        mf_gpu = gks.GKS(mol1)
+        mf_gpu.xc = 'lda,vwn'
+        mf_gpu.collinear = 'mcol'
+        mf_gpu._numint.spin_samples = 50
+        eks4_gpu = mf_gpu.kernel()
+
+        mf_cpu = mf_gpu.to_cpu()
+        eks4_cpu = mf_cpu.kernel()
+
+        self.assertAlmostEqual(eks4_gpu, eks4_cpu, 6)
+        self.assertAlmostEqual(eks4_gpu, -74.3741809222222, 6)
+        self.assertAlmostEqual(lib.fp(mf_gpu.mo_energy.get()), lib.fp(mf_cpu.mo_energy), 5)
+        self.assertAlmostEqual(lib.fp(mf_gpu.mo_energy.get()), -27.63368769213053, 5)
+
+
+if __name__ == "__main__":
+    print("Test GKS")
+    unittest.main()
diff --git a/gpu4pyscf/dft/tests/test_numint2c.py b/gpu4pyscf/dft/tests/test_numint2c.py
index 25e5514b1..e9a69b337 100644
--- a/gpu4pyscf/dft/tests/test_numint2c.py
+++ b/gpu4pyscf/dft/tests/test_numint2c.py
@@ -124,14 +124,14 @@ def test_get_rho(self):
         rho_1c_gpu = ni_gpu_1c.get_rho(mol, dm_1c_test.real, grids_gpu)
         self.assertAlmostEqual(abs(rho_gpu.get() - rho_1c_gpu.get()).max(), 0, 10)
 
-
     def test_eval_xc_eff(self):
         ni_gpu = NumInt2C()
         ni_gpu.collinear='m'
         ni_cpu = pyscf_numint2c()
         ni_cpu.collinear='m'
         np.random.seed(1)
-        dm = dm0*1.0 + 0.0j
+        dm = dm0*1.0 + dm0 * 0.1j
+        dm = dm + dm.T.conj()
         for xc_code in (LDA, ):
             ni_gpu.gdftopt = None
             n_gpu, exc_gpu, vmat_gpu = ni_gpu.nr_vxc(mol, grids_gpu, xc_code, dm)
diff --git a/gpu4pyscf/scf/ghf.py b/gpu4pyscf/scf/ghf.py
index 8e8b3efee..a2fcc5613 100644
--- a/gpu4pyscf/scf/ghf.py
+++ b/gpu4pyscf/scf/ghf.py
@@ -22,6 +22,15 @@
 from gpu4pyscf.lib.cupy_helper import asarray, return_cupy_array
 from gpu4pyscf.lib import utils
 
+def _from_rhf_init_dm(dma, breaksym=True):
+    dma = dma * .5
+    dm = block_diag(dma, dma)
+    if breaksym:
+        nao = dma.shape[0]
+        idx, idy = cp.diag_indices(nao)
+        dm[idx+nao,idy] = dm[idx,idy+nao] = dma.diagonal() * .05
+    return dm
+
 class GHF(hf.SCF):
     to_gpu = utils.to_gpu
     device = utils.device
@@ -55,7 +64,7 @@ class GHF(hf.SCF):
 
     def get_init_guess(self, mol=None, key='minao', **kwargs):
         dma = hf.RHF.get_init_guess(self, mol, key, **kwargs)
-        return block_diag(dma, dma)
+        return _from_rhf_init_dm(dma)
 
     def get_hcore(self, mol=None):
         if mol is None: mol = self.mol
diff --git a/gpu4pyscf/scf/tests/test_ghf.py b/gpu4pyscf/scf/tests/test_ghf.py
index 2a9fce3b6..f65fe15bc 100644
--- a/gpu4pyscf/scf/tests/test_ghf.py
+++ b/gpu4pyscf/scf/tests/test_ghf.py
@@ -17,6 +17,7 @@
 import numpy
 import cupy as cp
 
+
 def setUpModule():
     global mol, mol1
     mol = pyscf.gto.Mole()
@@ -41,6 +42,7 @@ def setUpModule():
         basis = 'cc-pvdz',
         output = '/dev/null')
 
+
 def tearDownModule():
     global mol, mol1
     mol.stdout.close()
@@ -97,20 +99,23 @@ def test_get_jk_real(self):
         d_real = d.real
         vj_gpu = mf.get_j(mol, d_real)
         vk_gpu = mf.get_k(mol, d)
-        print("vj_gpu", vj_gpu)
-        print("vk_gpu", vk_gpu)
 
         mf_cpu = mf.to_cpu()
         vj_cpu = mf_cpu.get_j(mol, d_real)
         vk_cpu = mf_cpu.get_k(mol, d)
-        print("vj_cpu", vj_cpu)
-        print("vk_cpu", vk_cpu)
 
         assert numpy.allclose(vj_gpu, vj_cpu)
         assert numpy.allclose(vk_gpu, vk_cpu)
 
-    #def test_ghf_x2c(self):
-    #    pass
+    def test_to_cpu(self):
+        mf = mol.GHF().to_gpu()
+        assert mf.device == 'gpu'
+        e_tot = mf.kernel()
+        mf = mf.to_cpu()
+        assert mf.device == 'cpu'
+        e_ref = mf.kernel()
+        assert abs(e_tot - e_ref) < 1e-5
+
 
 if __name__ == "__main__":
     print("Full Tests for ghf")

From 0d37aee245b68b2f8b87803b2de8184d7a4bac2a Mon Sep 17 00:00:00 2001
From: "puzhichen.996" <puzhichen.996@bytedance.com>
Date: Fri, 7 Nov 2025 10:28:29 +0800
Subject: [PATCH 17/21] fix a bug

---
 gpu4pyscf/scf/tests/test_ghf.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gpu4pyscf/scf/tests/test_ghf.py b/gpu4pyscf/scf/tests/test_ghf.py
index f65fe15bc..d82b77e2e 100644
--- a/gpu4pyscf/scf/tests/test_ghf.py
+++ b/gpu4pyscf/scf/tests/test_ghf.py
@@ -112,7 +112,7 @@ def test_to_cpu(self):
         assert mf.device == 'gpu'
         e_tot = mf.kernel()
         mf = mf.to_cpu()
-        assert mf.device == 'cpu'
+        assert getattr(mf, 'device', None) is None
         e_ref = mf.kernel()
         assert abs(e_tot - e_ref) < 1e-5
 

From 5c36b0539be32c778b5e398e5eb5f54d926642c0 Mon Sep 17 00:00:00 2001
From: "puzhichen.996" <puzhichen.996@bytedance.com>
Date: Fri, 7 Nov 2025 11:27:03 +0800
Subject: [PATCH 18/21] change the tests

---
 .github/workflows/unittest.yml       |   2 +-
 gpu4pyscf/dft/tests/test_gks.py      |  29 +----
 gpu4pyscf/dft/tests/test_numint2c.py | 168 ++++++++++++++++++++++++---
 requirements.txt                     |   3 +-
 setup.py                             |   1 -
 5 files changed, 162 insertions(+), 41 deletions(-)

diff --git a/.github/workflows/unittest.yml b/.github/workflows/unittest.yml
index bae786e4b..3f93144da 100644
--- a/.github/workflows/unittest.yml
+++ b/.github/workflows/unittest.yml
@@ -33,7 +33,7 @@ jobs:
           -v $GITHUB_WORKSPACE:/workspace \
           -v ~/.cache/pip:/root/.cache/pip \
           pyscf/gpu4pyscf-devel:pyscf-2.8 \
-          /bin/bash -c "cd /workspace && pip3 install -r requirements.txt && source build.sh && pytest -m 'not slow and not benchmark' --cov=/workspace --durations=50 && rm -rf .pytest_cache"
+          /bin/bash -c "cd /workspace && pip3 install -r requirements.txt && pip3 install MCfun && source build.sh && pytest -m 'not slow and not benchmark' --cov=/workspace --durations=50 && rm -rf .pytest_cache"
 
   multi-gpu:
     runs-on: [self-hosted, Linux, X64, 2T4]
diff --git a/gpu4pyscf/dft/tests/test_gks.py b/gpu4pyscf/dft/tests/test_gks.py
index d9e420b45..74a999f93 100644
--- a/gpu4pyscf/dft/tests/test_gks.py
+++ b/gpu4pyscf/dft/tests/test_gks.py
@@ -17,6 +17,10 @@
 from pyscf import lib
 from gpu4pyscf.dft import gks
 from pyscf.dft import gks as gks_cpu
+try:
+    import mcfun
+except ImportError:
+    mcfun = None
 
 
 def setUpModule():
@@ -52,53 +56,31 @@ def tearDownModule():
 
 class KnownValues(unittest.TestCase):
     def test_mcol_gks_lda(self):
-        mf_cpu = gks_cpu.GKS(mol)
-        mf_cpu.xc = 'lda,'
-        mf_cpu.collinear = 'mcol'
-        mf_cpu._numint.spin_samples = 6
-        eks4_cpu = mf_cpu.kernel()
-
+        
         mf_gpu = gks.GKS(mol)
         mf_gpu.xc = 'lda,'
         mf_gpu.collinear = 'mcol'
         mf_gpu._numint.spin_samples = 6
         eks4_gpu = mf_gpu.kernel()
-        self.assertAlmostEqual(eks4_gpu, eks4_cpu)
         self.assertAlmostEqual(eks4_gpu, -74.0600297733097, 6)
-        self.assertAlmostEqual(lib.fp(mf_gpu.mo_energy.get()), lib.fp(mf_cpu.mo_energy), 5)
         self.assertAlmostEqual(lib.fp(mf_gpu.mo_energy.get()), -26.421986983504258, 5)
 
-        mf_cpu = gks_cpu.GKS(mol1)
-        mf_cpu.xc = 'lda,vwn'
-        mf_cpu.collinear = 'mcol'
-        mf_cpu._numint.spin_samples = 50
-        eks4_cpu = mf_cpu.kernel()
-
         mf_gpu = gks.GKS(mol1)
         mf_gpu.xc = 'lda,vwn'
         mf_gpu.collinear = 'mcol'
         mf_gpu._numint.spin_samples = 50
         eks4_gpu = mf_gpu.kernel()
-        self.assertAlmostEqual(eks4_gpu, eks4_cpu, 6)
         self.assertAlmostEqual(eks4_gpu, -74.3741809222222, 6)
-        self.assertAlmostEqual(lib.fp(mf_gpu.mo_energy.get()), lib.fp(mf_cpu.mo_energy), 5)
         self.assertAlmostEqual(lib.fp(mf_gpu.mo_energy.get()), -27.63368769213053, 5)
 
     def test_mcol_gks_gga(self):
-        mf_cpu = gks_cpu.GKS(mol)
-        mf_cpu.xc = 'pbe'
-        mf_cpu.collinear = 'mcol'
-        mf_cpu._numint.spin_samples = 6
-        eks4_cpu = mf_cpu.kernel()
 
         mf_gpu = gks.GKS(mol)
         mf_gpu.xc = 'pbe'
         mf_gpu.collinear = 'mcol'
         mf_gpu._numint.spin_samples = 6
         eks4_gpu = mf_gpu.kernel()
-        self.assertAlmostEqual(eks4_gpu, eks4_cpu)
         self.assertAlmostEqual(eks4_gpu, -75.2256398121708, 6)
-        self.assertAlmostEqual(lib.fp(mf_gpu.mo_energy.get()), lib.fp(mf_cpu.mo_energy), 5)
         self.assertAlmostEqual(lib.fp(mf_gpu.mo_energy.get()), -26.81184613393452, 5)
 
         mf_gpu = gks.GKS(mol1)
@@ -143,6 +125,7 @@ def test_mcol_gks_mgga(self):
         self.assertAlmostEqual(eks4_gpu, -74.9468853267496, 6) # pyscf result
         self.assertAlmostEqual(lib.fp(mf_gpu.mo_energy.get()), -28.188215296679516, 5) # pyscf result
 
+    @unittest.skipIf(mcfun is None, "mcfun library not found.")
     def test_to_cpu(self):
         mf_gpu = gks.GKS(mol1)
         mf_gpu.xc = 'lda,vwn'
diff --git a/gpu4pyscf/dft/tests/test_numint2c.py b/gpu4pyscf/dft/tests/test_numint2c.py
index e9a69b337..c73cedbd5 100644
--- a/gpu4pyscf/dft/tests/test_numint2c.py
+++ b/gpu4pyscf/dft/tests/test_numint2c.py
@@ -27,6 +27,10 @@
 from gpu4pyscf.dft.numint import NumInt
 from gpu4pyscf import dft
 from gpu4pyscf.dft import gen_grid
+try:
+    import mcfun
+except ImportError:
+    mcfun = None
 
 def setUpModule():
     global mol, grids_cpu, grids_gpu, dm, dm0, dm1, mo_occ, mo_coeff
@@ -124,6 +128,7 @@ def test_get_rho(self):
         rho_1c_gpu = ni_gpu_1c.get_rho(mol, dm_1c_test.real, grids_gpu)
         self.assertAlmostEqual(abs(rho_gpu.get() - rho_1c_gpu.get()).max(), 0, 10)
 
+    @unittest.skipIf(mcfun is None, "mcfun library not found.")
     def test_eval_xc_eff(self):
         ni_gpu = NumInt2C()
         ni_gpu.collinear='m'
@@ -132,14 +137,39 @@ def test_eval_xc_eff(self):
         np.random.seed(1)
         dm = dm0*1.0 + dm0 * 0.1j
         dm = dm + dm.T.conj()
-        for xc_code in (LDA, ):
-            ni_gpu.gdftopt = None
+        for xc_code in (LDA, GGA_PBE, MGGA_M06):
             n_gpu, exc_gpu, vmat_gpu = ni_gpu.nr_vxc(mol, grids_gpu, xc_code, dm)
             n_cpu, exc_cpu, vmat_cpu = ni_cpu.nr_vxc(mol, grids_cpu, xc_code, dm)
             self.assertAlmostEqual(abs(n_gpu.get() - n_cpu).max(), 0, 10)
             self.assertAlmostEqual(abs(exc_gpu.get() - exc_cpu).max(), 0, 10)
             self.assertAlmostEqual(abs(vmat_gpu.get() - vmat_cpu).max(), 0, 10)
 
+    def test_eval_xc_eff_fp(self):
+        ni_gpu = NumInt2C()
+        ni_gpu.collinear='m'
+        np.random.seed(1)
+        dm = dm0*1.0 + dm0 * 0.1j
+        dm = dm + dm.T.conj()
+
+        n_gpu, exc_gpu, vmat_gpu = ni_gpu.nr_vxc(mol, grids_gpu, LDA, dm)
+        self.assertAlmostEqual(abs(n_gpu.get() - 17.9999262659497).max(), 0, 10)
+        self.assertAlmostEqual(abs(exc_gpu.get() - -1.310501342423071).max(), 0, 10)
+        self.assertAlmostEqual(abs(lib.fp(vmat_gpu.get()) 
+            - (-0.20448306536588537-6.75460139752253e-21j)).max(), 0, 10)
+
+        n_gpu, exc_gpu, vmat_gpu = ni_gpu.nr_vxc(mol, grids_gpu, GGA_PBE, dm)
+        self.assertAlmostEqual(abs(n_gpu.get() - 17.9999262659497).max(), 0, 10)
+        self.assertAlmostEqual(abs(exc_gpu.get() - -0.7237150857425112).max(), 0, 10)
+        self.assertAlmostEqual(abs(lib.fp(vmat_gpu.get()) 
+            - (-0.05446425800187435-4.486282070082083e-21j)).max(), 0, 10)
+
+        n_gpu, exc_gpu, vmat_gpu = ni_gpu.nr_vxc(mol, grids_gpu, MGGA_M06, dm)
+        self.assertAlmostEqual(abs(n_gpu.get() - 17.9999262659497).max(), 0, 10)
+        self.assertAlmostEqual(abs(exc_gpu.get() - -0.7703982586705045).max(), 0, 10)
+        self.assertAlmostEqual(abs(lib.fp(vmat_gpu.get()) 
+            - (-0.18688247306409317+7.50400133342109e-20j)).max(), 0, 10)
+
+    @unittest.skipIf(mcfun is None, "mcfun library not found.")
     def test_mcol_lda_vxc_mat(self):
         xc_code = 'lda,'
 
@@ -168,14 +198,51 @@ def test_mcol_lda_vxc_mat(self):
         mask_gpu = cupy.array([i for i in range(mol.nbas)])
 
         shls_slice = (0, mol.nbas)
-        v0_cpu = pyscf_numint2c_file._mcol_lda_vxc_mat(mol, ao.transpose(1,0).get(), weight, rho.get(), vxc_cpu.copy(), mask, shls_slice, ao_loc, 0)
-        v0_gpu = numint2c._mcol_lda_vxc_mat(mol, ao, cupy.asarray(weight), rho, vxc_gpu.copy(), mask_gpu, shls_slice, ao_loc, 0)
-        v1_gpu = numint2c._mcol_lda_vxc_mat(mol, ao, cupy.asarray(weight), rho, vxc_gpu.copy(), mask_gpu, shls_slice, ao_loc, 1)
+        v0_cpu = pyscf_numint2c_file._mcol_lda_vxc_mat(mol, ao.transpose(1,0).get(), weight, 
+            rho.get(), vxc_cpu.copy(), mask, shls_slice, ao_loc, 0)
+        v0_gpu = numint2c._mcol_lda_vxc_mat(mol, ao, cupy.asarray(weight), rho, vxc_gpu.copy(), 
+            mask_gpu, shls_slice, ao_loc, 0)
+        v1_gpu = numint2c._mcol_lda_vxc_mat(mol, ao, cupy.asarray(weight), rho, vxc_gpu.copy(), 
+            mask_gpu, shls_slice, ao_loc, 1)
         v1_gpu = v1_gpu + v1_gpu.conj().T
-        
         self.assertAlmostEqual(abs(v0_gpu.get() - v1_gpu.get()).max(), 0, 13)
         self.assertAlmostEqual(abs(v0_gpu.get() - v0_cpu).max(), 0, 13)
 
+    def test_mcol_lda_vxc_mat_fp(self):
+        xc_code = 'lda,'
+
+        nao = mol.nao
+        n2c = nao * 2
+        ao_loc = mol.ao_loc
+        np.random.seed(12)
+        dm = np.random.rand(n2c, n2c) * .001 + np.random.rand(n2c, n2c) * .0001j
+        dm += np.eye(n2c)
+        dm = dm + dm.T
+        ngrids = 8
+        coords = np.random.rand(ngrids,3)
+        weight = np.random.rand(ngrids)
+
+        ao = numint.eval_ao(mol, coords, deriv=0, transpose=False)
+        rho = numint2c.eval_rho(mol, ao, dm, xctype='LDA', hermi=1, with_lapl=False)
+        ni_GPU = NumInt2C()
+        ni_GPU.collinear = 'mcol'
+        eval_xc_GPU = ni_GPU.mcfun_eval_xc_adapter(xc_code)
+        vxc_gpu = eval_xc_GPU(xc_code, rho, deriv=1, xctype='LDA')[1]
+        mask = np.ones((8, mol.nbas), dtype=np.uint8)
+        mask_gpu = cupy.array([i for i in range(mol.nbas)])
+
+        shls_slice = (0, mol.nbas)
+        v0_gpu = numint2c._mcol_lda_vxc_mat(mol, ao, cupy.asarray(weight), rho, vxc_gpu.copy(), 
+            mask_gpu, shls_slice, ao_loc, 0)
+        v1_gpu = numint2c._mcol_lda_vxc_mat(mol, ao, cupy.asarray(weight), rho, vxc_gpu.copy(), 
+            mask_gpu, shls_slice, ao_loc, 1)
+        v1_gpu = v1_gpu + v1_gpu.conj().T
+        self.assertAlmostEqual(abs(lib.fp(v0_gpu.get()) - 
+            (-9.596802282363786+0.000531850990220307j)).max(), 0, 13)
+        self.assertAlmostEqual(abs(lib.fp(v1_gpu.get()) - 
+            (-9.596802282363786+0.000531850990220307j)).max(), 0, 13)
+
+    @unittest.skipIf(mcfun is None, "mcfun library not found.")
     def test_mcol_gga_vxc_mat(self):
         xc_code = 'pbe,'
 
@@ -204,14 +271,51 @@ def test_mcol_gga_vxc_mat(self):
         mask_gpu = cupy.array([i for i in range(mol.nbas)])
 
         shls_slice = (0, mol.nbas)
-        v0_cpu = pyscf_numint2c_file._mcol_gga_vxc_mat(mol, ao.transpose(0,2,1).get(), weight, rho.get(), vxc_cpu.copy(), mask, shls_slice, ao_loc, 0)
-        v0_gpu = numint2c._mcol_gga_vxc_mat(mol, ao, cupy.asarray(weight), rho, vxc_gpu.copy(), mask_gpu, shls_slice, ao_loc, 0)
-        v1_gpu = numint2c._mcol_gga_vxc_mat(mol, ao, cupy.asarray(weight), rho, vxc_gpu.copy(), mask_gpu, shls_slice, ao_loc, 1)
+        v0_cpu = pyscf_numint2c_file._mcol_gga_vxc_mat(mol, ao.transpose(0,2,1).get(), weight, 
+            rho.get(), vxc_cpu.copy(), mask, shls_slice, ao_loc, 0)
+        v0_gpu = numint2c._mcol_gga_vxc_mat(mol, ao, cupy.asarray(weight), rho, vxc_gpu.copy(), 
+            mask_gpu, shls_slice, ao_loc, 0)
+        v1_gpu = numint2c._mcol_gga_vxc_mat(mol, ao, cupy.asarray(weight), rho, vxc_gpu.copy(), 
+            mask_gpu, shls_slice, ao_loc, 1)
         v1_gpu = v1_gpu + v1_gpu.conj().T
-        
         self.assertAlmostEqual(abs(v0_gpu.get() - v1_gpu.get()).max(), 0, 13)
         self.assertAlmostEqual(abs(v0_gpu.get() - v0_cpu).max(), 0, 13)
 
+    def test_mcol_gga_vxc_mat_fp(self):
+        xc_code = 'pbe,'
+
+        nao = mol.nao
+        n2c = nao * 2
+        ao_loc = mol.ao_loc
+        np.random.seed(12)
+        dm = np.random.rand(n2c, n2c) * .001 + np.random.rand(n2c, n2c) * .001j
+        dm += np.eye(n2c)
+        dm = dm + dm.T
+        ngrids = 8
+        coords = np.random.rand(ngrids,3)
+        weight = np.random.rand(ngrids)
+
+        ao = numint.eval_ao(mol, coords, deriv=1, transpose=False)
+        rho = numint2c.eval_rho(mol, ao, dm, xctype='GGA', hermi=1, with_lapl=False)
+        ni_GPU = NumInt2C()
+        ni_GPU.collinear = 'mcol'
+        eval_xc_GPU = ni_GPU.mcfun_eval_xc_adapter(xc_code)
+        vxc_gpu = eval_xc_GPU(xc_code, rho, deriv=1, xctype='GGA')[1]
+        mask = np.ones((8, mol.nbas), dtype=np.uint8)
+        mask_gpu = cupy.array([i for i in range(mol.nbas)])
+
+        shls_slice = (0, mol.nbas)
+        v0_gpu = numint2c._mcol_gga_vxc_mat(mol, ao, cupy.asarray(weight), rho, vxc_gpu.copy(), 
+            mask_gpu, shls_slice, ao_loc, 0)
+        v1_gpu = numint2c._mcol_gga_vxc_mat(mol, ao, cupy.asarray(weight), rho, vxc_gpu.copy(), 
+            mask_gpu, shls_slice, ao_loc, 1)
+        v1_gpu = v1_gpu + v1_gpu.conj().T
+        self.assertAlmostEqual(abs(lib.fp(v1_gpu.get()) - 
+            (-9.624252836509262+0.005273283360578891j)).max(), 0, 13)
+        self.assertAlmostEqual(abs(lib.fp(v0_gpu.get()) - 
+            (-9.624252836509262+0.005273283360578891j)).max(), 0, 13)
+
+    @unittest.skipIf(mcfun is None, "mcfun library not found.")
     def test_mcol_mgga_vxc_mat(self):
         xc_code = 'tpss'
 
@@ -240,14 +344,50 @@ def test_mcol_mgga_vxc_mat(self):
         mask_gpu = cupy.array([i for i in range(mol.nbas)])
 
         shls_slice = (0, mol.nbas)
-        v0_cpu = pyscf_numint2c_file._mcol_mgga_vxc_mat(mol, ao.transpose(0,2,1).get(), weight, rho.get(), vxc_cpu.copy(), mask, shls_slice, ao_loc, 0)
-        v0_gpu = numint2c._mcol_mgga_vxc_mat(mol, ao, cupy.asarray(weight), rho, vxc_gpu.copy(), mask_gpu, shls_slice, ao_loc, 0)
-        v1_gpu = numint2c._mcol_mgga_vxc_mat(mol, ao, cupy.asarray(weight), rho, vxc_gpu.copy(), mask_gpu, shls_slice, ao_loc, 1)
+        v0_cpu = pyscf_numint2c_file._mcol_mgga_vxc_mat(mol, ao.transpose(0,2,1).get(), weight, 
+            rho.get(), vxc_cpu.copy(), mask, shls_slice, ao_loc, 0)
+        v0_gpu = numint2c._mcol_mgga_vxc_mat(mol, ao, cupy.asarray(weight), rho, vxc_gpu.copy(), 
+            mask_gpu, shls_slice, ao_loc, 0)
+        v1_gpu = numint2c._mcol_mgga_vxc_mat(mol, ao, cupy.asarray(weight), rho, vxc_gpu.copy(), 
+            mask_gpu, shls_slice, ao_loc, 1)
         v1_gpu = v1_gpu + v1_gpu.conj().T
-        
         self.assertAlmostEqual(abs(v0_gpu.get() - v1_gpu.get()).max(), 0, 13)
         self.assertAlmostEqual(abs(v0_gpu.get() - v0_cpu).max(), 0, 13)
 
+    def test_mcol_mgga_vxc_mat_fp(self):
+        xc_code = 'tpss'
+
+        nao = mol.nao
+        n2c = nao * 2
+        ao_loc = mol.ao_loc
+        np.random.seed(12)
+        dm = np.random.rand(n2c, n2c) * .001
+        dm += np.eye(n2c)
+        dm = dm + dm.T
+        ngrids = 8
+        coords = np.random.rand(ngrids,3)
+        weight = np.random.rand(ngrids)
+
+        ao = numint.eval_ao(mol, coords, deriv=1, transpose=False)
+        rho = numint2c.eval_rho(mol, ao, dm, xctype='MGGA', hermi=1, with_lapl=False)
+        ni_GPU = NumInt2C()
+        ni_GPU.collinear = 'mcol'
+        eval_xc_GPU = ni_GPU.mcfun_eval_xc_adapter(xc_code)
+        vxc_gpu = eval_xc_GPU(xc_code, rho, deriv=1, xctype='MGGA')[1]
+        mask = np.ones((8, mol.nbas), dtype=np.uint8)
+        mask_gpu = cupy.array([i for i in range(mol.nbas)])
+
+        shls_slice = (0, mol.nbas)
+        v0_gpu = numint2c._mcol_mgga_vxc_mat(mol, ao, cupy.asarray(weight), rho, vxc_gpu.copy(), 
+            mask_gpu, shls_slice, ao_loc, 0)
+        v1_gpu = numint2c._mcol_mgga_vxc_mat(mol, ao, cupy.asarray(weight), rho, vxc_gpu.copy(), 
+            mask_gpu, shls_slice, ao_loc, 1)
+        v1_gpu = v1_gpu + v1_gpu.conj().T
+        self.assertAlmostEqual(abs(lib.fp(v1_gpu.get()) 
+            - (-11.359687631112195+6.13828986953566e-22j)).max(), 0, 13)
+        self.assertAlmostEqual(abs(lib.fp(v0_gpu.get()) 
+            - (-11.359687631112195+6.13828986953566e-22j)).max(), 0, 13)
+
 
 if __name__ == "__main__":
     print("Full Tests for dft numint2c")
diff --git a/requirements.txt b/requirements.txt
index 6ef95eab3..70660747d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,5 +4,4 @@ cupy-cuda12x==13.4.1
 pyscf==2.8.0
 basis-set-exchange==0.11
 pyscf-dispersion==1.3.0
-geometric==1.1.0
-MCfun==0.2.5
\ No newline at end of file
+geometric==1.1.0
\ No newline at end of file
diff --git a/setup.py b/setup.py
index 2070dfd4b..111e4f3c2 100755
--- a/setup.py
+++ b/setup.py
@@ -139,6 +139,5 @@ def initialize_with_default_plat_name(self):
         f'cupy-cuda{CUDA_VERSION}>=13.0,!=13.4.0', # Due to expm in cupyx.scipy.linalg and cutensor 2.0
         'geometric',
         f'gpu4pyscf-libxc-cuda{CUDA_VERSION}==0.5',
-        'MCfun==0.2.5',
     ]
 )

From 44e1ba8a3a186445d1461f7e9daeb9cd812a7e1d Mon Sep 17 00:00:00 2001
From: "puzhichen.996" <puzhichen.996@bytedance.com>
Date: Fri, 7 Nov 2025 11:30:08 +0800
Subject: [PATCH 19/21] fix some typos

---
 gpu4pyscf/dft/tests/test_numint2c.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/gpu4pyscf/dft/tests/test_numint2c.py b/gpu4pyscf/dft/tests/test_numint2c.py
index c73cedbd5..4eb5ff678 100644
--- a/gpu4pyscf/dft/tests/test_numint2c.py
+++ b/gpu4pyscf/dft/tests/test_numint2c.py
@@ -228,7 +228,6 @@ def test_mcol_lda_vxc_mat_fp(self):
         ni_GPU.collinear = 'mcol'
         eval_xc_GPU = ni_GPU.mcfun_eval_xc_adapter(xc_code)
         vxc_gpu = eval_xc_GPU(xc_code, rho, deriv=1, xctype='LDA')[1]
-        mask = np.ones((8, mol.nbas), dtype=np.uint8)
         mask_gpu = cupy.array([i for i in range(mol.nbas)])
 
         shls_slice = (0, mol.nbas)
@@ -301,7 +300,6 @@ def test_mcol_gga_vxc_mat_fp(self):
         ni_GPU.collinear = 'mcol'
         eval_xc_GPU = ni_GPU.mcfun_eval_xc_adapter(xc_code)
         vxc_gpu = eval_xc_GPU(xc_code, rho, deriv=1, xctype='GGA')[1]
-        mask = np.ones((8, mol.nbas), dtype=np.uint8)
         mask_gpu = cupy.array([i for i in range(mol.nbas)])
 
         shls_slice = (0, mol.nbas)
@@ -374,7 +372,6 @@ def test_mcol_mgga_vxc_mat_fp(self):
         ni_GPU.collinear = 'mcol'
         eval_xc_GPU = ni_GPU.mcfun_eval_xc_adapter(xc_code)
         vxc_gpu = eval_xc_GPU(xc_code, rho, deriv=1, xctype='MGGA')[1]
-        mask = np.ones((8, mol.nbas), dtype=np.uint8)
         mask_gpu = cupy.array([i for i in range(mol.nbas)])
 
         shls_slice = (0, mol.nbas)

From 2684db5e0bcb8f82a64f47bedad04f6d6048fbfc Mon Sep 17 00:00:00 2001
From: "puzhichen.996" <puzhichen.996@bytedance.com>
Date: Fri, 14 Nov 2025 10:48:13 +0800
Subject: [PATCH 20/21] change the codes as review comments

---
 gpu4pyscf/dft/gks.py                 |  37 +++--
 gpu4pyscf/dft/numint2c.py            |  81 ++++++++++-
 gpu4pyscf/dft/tests/test_gks.py      |  16 +++
 gpu4pyscf/dft/tests/test_numint2c.py | 201 ++++++++++-----------------
 gpu4pyscf/scf/diis.py                |   2 +
 gpu4pyscf/scf/hf.py                  |   2 -
 6 files changed, 187 insertions(+), 152 deletions(-)

diff --git a/gpu4pyscf/dft/gks.py b/gpu4pyscf/dft/gks.py
index bd9259a6f..cb37c146b 100644
--- a/gpu4pyscf/dft/gks.py
+++ b/gpu4pyscf/dft/gks.py
@@ -84,47 +84,46 @@ def get_veff(ks, mol=None, dm=None, dm_last=0, vhf_last=0, hermi=1):
             logger.debug(ks, 'nelec with nlc grids = %s', n)
         t0 = logger.timer(ks, 'vxc', *t0)
 
-    incremental_jk = (ks._eri is None and ks.direct_scf and
-                      getattr(vhf_last, 'vj', None) is not None)
-    if incremental_jk:
-        _dm = cp.asarray(dm) - cp.asarray(dm_last)
-    else:
-        _dm = dm
+    
+    dm_orig = cp.asarray(dm)
+    vj_last = getattr(vhf_last, 'vj', None)
+    if vj_last is not None:
+        dm = cp.asarray(dm) - cp.asarray(dm_last)
     if not ni.libxc.is_hybrid_xc(ks.xc):
         vk = None
-        vj = ks.get_j(mol, _dm, hermi)
-        if incremental_jk:
-            vj += vhf_last.vj
+        vj = ks.get_j(mol, dm, hermi)
+        if vj_last is not None:
+            vj += vj_last
         vxc += vj
     else:
         omega, alpha, hyb = ni.rsh_and_hybrid_coeff(ks.xc, spin=mol.spin)
         if omega == 0:
-            vj, vk = ks.get_jk(mol, _dm, hermi)
+            vj, vk = ks.get_jk(mol, dm, hermi)
             vk *= hyb
         elif alpha == 0: # LR=0, only SR exchange
-            vj = ks.get_j(mol, _dm, hermi)
-            vk = ks.get_k(mol, _dm, hermi, omega=-omega)
+            vj = ks.get_j(mol, dm, hermi)
+            vk = ks.get_k(mol, dm, hermi, omega=-omega)
             vk *= hyb
         elif hyb == 0: # SR=0, only LR exchange
-            vj = ks.get_j(mol, _dm, hermi)
-            vk = ks.get_k(mol, _dm, hermi, omega=omega)
+            vj = ks.get_j(mol, dm, hermi)
+            vk = ks.get_k(mol, dm, hermi, omega=omega)
             vk *= alpha
         else: # SR and LR exchange with different ratios
-            vj, vk = ks.get_jk(mol, _dm, hermi)
+            vj, vk = ks.get_jk(mol, dm, hermi)
             vk *= hyb
-            vklr = ks.get_k(mol, _dm, hermi, omega=omega)
+            vklr = ks.get_k(mol, dm, hermi, omega=omega)
             vklr *= (alpha - hyb)
             vk += vklr
-        if incremental_jk:
+        if vj_last is not None:
             vj += vhf_last.vj
             vk += vhf_last.vk
         vxc += vj - vk
 
         if ground_state:
-            exc -= cp.einsum('ij,ji', dm, vk).real * .5
+            exc -= cp.einsum('ij,ji', dm_orig, vk).real * .5
 
     if ground_state:
-        ecoul = cp.einsum('ij,ji', dm, vj).real * .5
+        ecoul = cp.einsum('ij,ji', dm_orig, vj).real * .5
     else:
         ecoul = None
 
diff --git a/gpu4pyscf/dft/numint2c.py b/gpu4pyscf/dft/numint2c.py
index cdc78e087..9b5b5e9a3 100644
--- a/gpu4pyscf/dft/numint2c.py
+++ b/gpu4pyscf/dft/numint2c.py
@@ -402,7 +402,7 @@ class NumInt2C(lib.StreamObject, numint.LibXCMixin):
     eval_rho = staticmethod(eval_rho)
 
     def build(self, mol, coords):
-        self.gdftopt = numint._GDFTOpt.from_mol(mol)
+        self.gdftopt = _GDFTOpt2C.from_mol(mol)
         self.grid_blksize = None
         self.non0ao_idx = {}
         return self
@@ -462,10 +462,10 @@ def nr_vxc(self, mol, grids, xc_code, dms, relativity=0, hermi=1,
                 opt = self.gdftopt
             assert dms.ndim == 2
             dms = cp.asarray(dms)
-            dms = opt.sort_orbitals_2c(dms, axis=[0,1])
+            dms = opt.sort_orbitals(dms, axis=[0,1])
             n, exc, vmat = self._gks_mcol_vxc(mol, grids, xc_code, dms, relativity,
                                               hermi, max_memory, verbose)
-            vmat = opt.unsort_orbitals_2c(vmat, axis=[0,1])
+            vmat = opt.unsort_orbitals(vmat, axis=[0,1])
         else:
             raise NotImplementedError("Locally collinear and collinear is not implemented")
         return n.sum(), exc, vmat
@@ -496,4 +496,77 @@ def _to_numint1c(self):
     
     def to_cpu(self):
         ni = numint2c.NumInt2C()
-        return ni
\ No newline at end of file
+        return ni
+    
+
+class _GDFTOpt2C(numint._GDFTOpt):
+
+    def sort_orbitals(self, mat, axis=[]):
+        ''' Transform given axis of a 2-component matrix (GKS) into sorted AO
+        
+        This assumes the axes specified in 'axis' have a dimension of 2*nao,
+        representing alpha (:nao) and beta (nao:) components. Both components
+        are sorted using the same AO sorting index.
+        '''
+        idx = self._ao_idx
+        nao = len(idx)
+        
+        # Create the 2-component sorting index:
+        # [sorted_alpha_indices, sorted_beta_indices]
+        # E.g., if idx = [0, 2, 1] (nao=3),
+        # idx_2c = [0, 2, 1, 3+0, 3+2, 3+1] = [0, 2, 1, 3, 5, 4]
+        # We use numpy to build the index, consistent with self._ao_idx
+        idx_2c = np.concatenate([idx, idx + nao])
+
+        shape_ones = (1,) * mat.ndim
+        fancy_index = []
+        for dim, n in enumerate(mat.shape):
+            if dim in axis:
+                # Check if the dimension matches the 2-component size
+                if n != 2 * nao:
+                    raise ValueError(f"Axis {dim} has dimension {n}, expected {2*nao} for 2-component sorting")
+                indices = idx_2c
+            else:
+                # Use cp.arange for non-sorted axes, as in the original sort_orbitals
+                indices = cp.arange(n)
+            
+            idx_shape = shape_ones[:dim] + (-1,) + shape_ones[dim+1:]
+            fancy_index.append(indices.reshape(idx_shape))
+            
+        # Perform the sorting using advanced indexing
+        return mat[tuple(fancy_index)]
+
+    def unsort_orbitals(self, sorted_mat, axis=[], out=None):
+        ''' Transform given axis of a 2-component matrix from sorted AO to original AO
+        
+        This assumes the axes specified in 'axis' have a dimension of 2*nao.
+        This is the inverse operation of sort_orbitals_2c.
+        '''
+        idx = self._ao_idx
+        nao = len(idx)
+        
+        # The 2-component index is created identically to sort_orbitals_2c
+        idx_2c = np.concatenate([idx, idx + nao])
+
+        shape_ones = (1,) * sorted_mat.ndim
+        fancy_index = []
+        for dim, n in enumerate(sorted_mat.shape):
+            if dim in axis:
+                # Check if the dimension matches the 2-component size
+                if n != 2 * nao:
+                    raise ValueError(f"Axis {dim} has dimension {n}, expected {2*nao} for 2-component unsorting")
+                indices = idx_2c
+            else:
+                indices = cp.arange(n)
+            
+            idx_shape = shape_ones[:dim] + (-1,) + shape_ones[dim+1:]
+            fancy_index.append(indices.reshape(idx_shape))
+            
+        if out is None:
+            out = cp.empty_like(sorted_mat)
+            
+        # Perform the unsorting assignment
+        out[tuple(fancy_index)] = sorted_mat
+        return out
+    
+    
\ No newline at end of file
diff --git a/gpu4pyscf/dft/tests/test_gks.py b/gpu4pyscf/dft/tests/test_gks.py
index 74a999f93..9bce0f950 100644
--- a/gpu4pyscf/dft/tests/test_gks.py
+++ b/gpu4pyscf/dft/tests/test_gks.py
@@ -141,6 +141,22 @@ def test_to_cpu(self):
         self.assertAlmostEqual(lib.fp(mf_gpu.mo_energy.get()), lib.fp(mf_cpu.mo_energy), 5)
         self.assertAlmostEqual(lib.fp(mf_gpu.mo_energy.get()), -27.63368769213053, 5)
 
+    @unittest.skipIf(mcfun is None, "mcfun library not found.")
+    def test_to_gpu(self):
+        mf_cpu = gks_cpu.GKS(mol1)
+        mf_cpu.xc = 'lda,vwn'
+        mf_cpu.collinear = 'mcol'
+        mf_cpu._numint.spin_samples = 50
+        eks4_cpu = mf_cpu.kernel()
+
+        mf_gpu = mf_cpu.to_gpu()
+        eks4_gpu = mf_cpu.kernel()
+
+        self.assertAlmostEqual(eks4_gpu, eks4_cpu, 6)
+        self.assertAlmostEqual(eks4_gpu, -74.3741809222222, 6)
+        self.assertAlmostEqual(lib.fp(mf_gpu.mo_energy.get()), lib.fp(mf_cpu.mo_energy), 5)
+        self.assertAlmostEqual(lib.fp(mf_gpu.mo_energy.get()), -27.63368769213053, 5)
+
 
 if __name__ == "__main__":
     print("Test GKS")
diff --git a/gpu4pyscf/dft/tests/test_numint2c.py b/gpu4pyscf/dft/tests/test_numint2c.py
index 4eb5ff678..e2bb23bda 100644
--- a/gpu4pyscf/dft/tests/test_numint2c.py
+++ b/gpu4pyscf/dft/tests/test_numint2c.py
@@ -169,7 +169,6 @@ def test_eval_xc_eff_fp(self):
         self.assertAlmostEqual(abs(lib.fp(vmat_gpu.get()) 
             - (-0.18688247306409317+7.50400133342109e-20j)).max(), 0, 10)
 
-    @unittest.skipIf(mcfun is None, "mcfun library not found.")
     def test_mcol_lda_vxc_mat(self):
         xc_code = 'lda,'
 
@@ -179,69 +178,51 @@ def test_mcol_lda_vxc_mat(self):
         np.random.seed(12)
         dm = np.random.rand(n2c, n2c) * .001 + np.random.rand(n2c, n2c) * .0001j
         dm += np.eye(n2c)
-        dm = dm + dm.T
+        dm = dm + dm.T.conj()
         ngrids = 8
         coords = np.random.rand(ngrids,3)
         weight = np.random.rand(ngrids)
 
         ao = numint.eval_ao(mol, coords, deriv=0, transpose=False)
         rho = numint2c.eval_rho(mol, ao, dm, xctype='LDA', hermi=1, with_lapl=False)
-        ni_CPU = pyscf_numint2c()
+        
+        # --- GPU Calculations (Always Run) ---
         ni_GPU = NumInt2C()
         ni_GPU.collinear = 'mcol'
-        ni_CPU.collinear = 'mcol'
-        eval_xc_cpu = ni_CPU.mcfun_eval_xc_adapter(xc_code)
         eval_xc_GPU = ni_GPU.mcfun_eval_xc_adapter(xc_code)
-        vxc_cpu = eval_xc_cpu(xc_code, rho.get(), deriv=1)[1]
         vxc_gpu = eval_xc_GPU(xc_code, rho, deriv=1, xctype='LDA')[1]
-        mask = np.ones((8, mol.nbas), dtype=np.uint8)
-        mask_gpu = cupy.array([i for i in range(mol.nbas)])
-
+        mask_gpu = cupy.arange(mol.nbas)
         shls_slice = (0, mol.nbas)
-        v0_cpu = pyscf_numint2c_file._mcol_lda_vxc_mat(mol, ao.transpose(1,0).get(), weight, 
-            rho.get(), vxc_cpu.copy(), mask, shls_slice, ao_loc, 0)
+        
         v0_gpu = numint2c._mcol_lda_vxc_mat(mol, ao, cupy.asarray(weight), rho, vxc_gpu.copy(), 
             mask_gpu, shls_slice, ao_loc, 0)
         v1_gpu = numint2c._mcol_lda_vxc_mat(mol, ao, cupy.asarray(weight), rho, vxc_gpu.copy(), 
             mask_gpu, shls_slice, ao_loc, 1)
         v1_gpu = v1_gpu + v1_gpu.conj().T
-        self.assertAlmostEqual(abs(v0_gpu.get() - v1_gpu.get()).max(), 0, 13)
-        self.assertAlmostEqual(abs(v0_gpu.get() - v0_cpu).max(), 0, 13)
 
-    def test_mcol_lda_vxc_mat_fp(self):
-        xc_code = 'lda,'
-
-        nao = mol.nao
-        n2c = nao * 2
-        ao_loc = mol.ao_loc
-        np.random.seed(12)
-        dm = np.random.rand(n2c, n2c) * .001 + np.random.rand(n2c, n2c) * .0001j
-        dm += np.eye(n2c)
-        dm = dm + dm.T
-        ngrids = 8
-        coords = np.random.rand(ngrids,3)
-        weight = np.random.rand(ngrids)
-
-        ao = numint.eval_ao(mol, coords, deriv=0, transpose=False)
-        rho = numint2c.eval_rho(mol, ao, dm, xctype='LDA', hermi=1, with_lapl=False)
-        ni_GPU = NumInt2C()
-        ni_GPU.collinear = 'mcol'
-        eval_xc_GPU = ni_GPU.mcfun_eval_xc_adapter(xc_code)
-        vxc_gpu = eval_xc_GPU(xc_code, rho, deriv=1, xctype='LDA')[1]
-        mask_gpu = cupy.array([i for i in range(mol.nbas)])
-
-        shls_slice = (0, mol.nbas)
-        v0_gpu = numint2c._mcol_lda_vxc_mat(mol, ao, cupy.asarray(weight), rho, vxc_gpu.copy(), 
-            mask_gpu, shls_slice, ao_loc, 0)
-        v1_gpu = numint2c._mcol_lda_vxc_mat(mol, ao, cupy.asarray(weight), rho, vxc_gpu.copy(), 
-            mask_gpu, shls_slice, ao_loc, 1)
-        v1_gpu = v1_gpu + v1_gpu.conj().T
+        # --- Assertions (Always Run) ---
+        # Fingerprint checks (from _fp test)
         self.assertAlmostEqual(abs(lib.fp(v0_gpu.get()) - 
-            (-9.596802282363786+0.000531850990220307j)).max(), 0, 13)
+            (-9.596802359283691+2.969010922568367e-05j)).max(), 0, 13)
         self.assertAlmostEqual(abs(lib.fp(v1_gpu.get()) - 
-            (-9.596802282363786+0.000531850990220307j)).max(), 0, 13)
+            (-9.596802359283691+2.969010922568367e-05j)).max(), 0, 13)
+        # Internal consistency check
+        self.assertAlmostEqual(abs(v0_gpu.get() - v1_gpu.get()).max(), 0, 13)
+
+        # --- CPU Comparison (Conditional) ---
+        if mcfun is not None:
+            ni_CPU = pyscf_numint2c()
+            ni_CPU.collinear = 'mcol'
+            eval_xc_cpu = ni_CPU.mcfun_eval_xc_adapter(xc_code)
+            vxc_cpu = eval_xc_cpu(xc_code, rho.get(), deriv=1)[1]
+            mask = np.ones((8, mol.nbas), dtype=np.uint8)
+            
+            v0_cpu = pyscf_numint2c_file._mcol_lda_vxc_mat(mol, ao.transpose(1,0).get(), weight, 
+                rho.get(), vxc_cpu.copy(), mask, shls_slice, ao_loc, 0)
+            
+            # CPU vs GPU check
+            self.assertAlmostEqual(abs(v0_gpu.get() - v0_cpu).max(), 0, 13)
 
-    @unittest.skipIf(mcfun is None, "mcfun library not found.")
     def test_mcol_gga_vxc_mat(self):
         xc_code = 'pbe,'
 
@@ -251,108 +232,52 @@ def test_mcol_gga_vxc_mat(self):
         np.random.seed(12)
         dm = np.random.rand(n2c, n2c) * .001 + np.random.rand(n2c, n2c) * .001j
         dm += np.eye(n2c)
-        dm = dm + dm.T
+        dm = dm + dm.T.conj()
         ngrids = 8
         coords = np.random.rand(ngrids,3)
         weight = np.random.rand(ngrids)
 
         ao = numint.eval_ao(mol, coords, deriv=1, transpose=False)
         rho = numint2c.eval_rho(mol, ao, dm, xctype='GGA', hermi=1, with_lapl=False)
-        ni_CPU = pyscf_numint2c()
-        ni_GPU = NumInt2C()
-        ni_GPU.collinear = 'mcol'
-        ni_CPU.collinear = 'mcol'
-        eval_xc_cpu = ni_CPU.mcfun_eval_xc_adapter(xc_code)
-        eval_xc_GPU = ni_GPU.mcfun_eval_xc_adapter(xc_code)
-        vxc_cpu = eval_xc_cpu(xc_code, rho.get(), deriv=1)[1]
-        vxc_gpu = eval_xc_GPU(xc_code, rho, deriv=1, xctype='GGA')[1]
-        mask = np.ones((8, mol.nbas), dtype=np.uint8)
-        mask_gpu = cupy.array([i for i in range(mol.nbas)])
-
-        shls_slice = (0, mol.nbas)
-        v0_cpu = pyscf_numint2c_file._mcol_gga_vxc_mat(mol, ao.transpose(0,2,1).get(), weight, 
-            rho.get(), vxc_cpu.copy(), mask, shls_slice, ao_loc, 0)
-        v0_gpu = numint2c._mcol_gga_vxc_mat(mol, ao, cupy.asarray(weight), rho, vxc_gpu.copy(), 
-            mask_gpu, shls_slice, ao_loc, 0)
-        v1_gpu = numint2c._mcol_gga_vxc_mat(mol, ao, cupy.asarray(weight), rho, vxc_gpu.copy(), 
-            mask_gpu, shls_slice, ao_loc, 1)
-        v1_gpu = v1_gpu + v1_gpu.conj().T
-        self.assertAlmostEqual(abs(v0_gpu.get() - v1_gpu.get()).max(), 0, 13)
-        self.assertAlmostEqual(abs(v0_gpu.get() - v0_cpu).max(), 0, 13)
 
-    def test_mcol_gga_vxc_mat_fp(self):
-        xc_code = 'pbe,'
-
-        nao = mol.nao
-        n2c = nao * 2
-        ao_loc = mol.ao_loc
-        np.random.seed(12)
-        dm = np.random.rand(n2c, n2c) * .001 + np.random.rand(n2c, n2c) * .001j
-        dm += np.eye(n2c)
-        dm = dm + dm.T
-        ngrids = 8
-        coords = np.random.rand(ngrids,3)
-        weight = np.random.rand(ngrids)
-
-        ao = numint.eval_ao(mol, coords, deriv=1, transpose=False)
-        rho = numint2c.eval_rho(mol, ao, dm, xctype='GGA', hermi=1, with_lapl=False)
+        # --- GPU Calculations (Always Run) ---
         ni_GPU = NumInt2C()
         ni_GPU.collinear = 'mcol'
         eval_xc_GPU = ni_GPU.mcfun_eval_xc_adapter(xc_code)
         vxc_gpu = eval_xc_GPU(xc_code, rho, deriv=1, xctype='GGA')[1]
-        mask_gpu = cupy.array([i for i in range(mol.nbas)])
-
+        mask_gpu = cupy.arange(mol.nbas)
         shls_slice = (0, mol.nbas)
+        
         v0_gpu = numint2c._mcol_gga_vxc_mat(mol, ao, cupy.asarray(weight), rho, vxc_gpu.copy(), 
             mask_gpu, shls_slice, ao_loc, 0)
         v1_gpu = numint2c._mcol_gga_vxc_mat(mol, ao, cupy.asarray(weight), rho, vxc_gpu.copy(), 
             mask_gpu, shls_slice, ao_loc, 1)
         v1_gpu = v1_gpu + v1_gpu.conj().T
+
+        # --- Assertions (Always Run) ---
+        # Fingerprint checks (from _fp test)
         self.assertAlmostEqual(abs(lib.fp(v1_gpu.get()) - 
-            (-9.624252836509262+0.005273283360578891j)).max(), 0, 13)
+            (-9.624260408900755+0.0003122100947141664j)).max(), 0, 13)
         self.assertAlmostEqual(abs(lib.fp(v0_gpu.get()) - 
-            (-9.624252836509262+0.005273283360578891j)).max(), 0, 13)
-
-    @unittest.skipIf(mcfun is None, "mcfun library not found.")
-    def test_mcol_mgga_vxc_mat(self):
-        xc_code = 'tpss'
-
-        nao = mol.nao
-        n2c = nao * 2
-        ao_loc = mol.ao_loc
-        np.random.seed(12)
-        dm = np.random.rand(n2c, n2c) * .001
-        dm += np.eye(n2c)
-        dm = dm + dm.T
-        ngrids = 8
-        coords = np.random.rand(ngrids,3)
-        weight = np.random.rand(ngrids)
+            (-9.624260408900755+0.0003122100947141664j)).max(), 0, 13)
+        # Internal consistency check
+        self.assertAlmostEqual(abs(v0_gpu.get() - v1_gpu.get()).max(), 0, 13)
 
-        ao = numint.eval_ao(mol, coords, deriv=1, transpose=False)
-        rho = numint2c.eval_rho(mol, ao, dm, xctype='MGGA', hermi=1, with_lapl=False)
-        ni_CPU = pyscf_numint2c()
-        ni_GPU = NumInt2C()
-        ni_GPU.collinear = 'mcol'
-        ni_CPU.collinear = 'mcol'
-        eval_xc_cpu = ni_CPU.mcfun_eval_xc_adapter(xc_code)
-        eval_xc_GPU = ni_GPU.mcfun_eval_xc_adapter(xc_code)
-        vxc_cpu = eval_xc_cpu(xc_code, rho.get(), deriv=1)[1]
-        vxc_gpu = eval_xc_GPU(xc_code, rho, deriv=1, xctype='MGGA')[1]
-        mask = np.ones((8, mol.nbas), dtype=np.uint8)
-        mask_gpu = cupy.array([i for i in range(mol.nbas)])
+        # --- CPU Comparison (Conditional) ---
+        if mcfun is not None:
+            ni_CPU = pyscf_numint2c()
+            ni_CPU.collinear = 'mcol'
+            eval_xc_cpu = ni_CPU.mcfun_eval_xc_adapter(xc_code)
+            vxc_cpu = eval_xc_cpu(xc_code, rho.get(), deriv=1)[1]
+            mask = np.ones((8, mol.nbas), dtype=np.uint8)
 
-        shls_slice = (0, mol.nbas)
-        v0_cpu = pyscf_numint2c_file._mcol_mgga_vxc_mat(mol, ao.transpose(0,2,1).get(), weight, 
-            rho.get(), vxc_cpu.copy(), mask, shls_slice, ao_loc, 0)
-        v0_gpu = numint2c._mcol_mgga_vxc_mat(mol, ao, cupy.asarray(weight), rho, vxc_gpu.copy(), 
-            mask_gpu, shls_slice, ao_loc, 0)
-        v1_gpu = numint2c._mcol_mgga_vxc_mat(mol, ao, cupy.asarray(weight), rho, vxc_gpu.copy(), 
-            mask_gpu, shls_slice, ao_loc, 1)
-        v1_gpu = v1_gpu + v1_gpu.conj().T
-        self.assertAlmostEqual(abs(v0_gpu.get() - v1_gpu.get()).max(), 0, 13)
-        self.assertAlmostEqual(abs(v0_gpu.get() - v0_cpu).max(), 0, 13)
+            v0_cpu = pyscf_numint2c_file._mcol_gga_vxc_mat(mol, ao.transpose(0,2,1).get(), weight, 
+                rho.get(), vxc_cpu.copy(), mask, shls_slice, ao_loc, 0)
+            
+            # CPU vs GPU check
+            self.assertAlmostEqual(abs(v0_gpu.get() - v0_cpu).max(), 0, 13)
 
-    def test_mcol_mgga_vxc_mat_fp(self):
+    def test_mcol_mgga_vxc_mat(self):
         xc_code = 'tpss'
 
         nao = mol.nao
@@ -361,29 +286,51 @@ def test_mcol_mgga_vxc_mat_fp(self):
         np.random.seed(12)
         dm = np.random.rand(n2c, n2c) * .001
         dm += np.eye(n2c)
-        dm = dm + dm.T
+        dm = dm + dm.T.conj()
         ngrids = 8
         coords = np.random.rand(ngrids,3)
         weight = np.random.rand(ngrids)
 
         ao = numint.eval_ao(mol, coords, deriv=1, transpose=False)
         rho = numint2c.eval_rho(mol, ao, dm, xctype='MGGA', hermi=1, with_lapl=False)
+
+        # --- GPU Calculations (Always Run) ---
         ni_GPU = NumInt2C()
         ni_GPU.collinear = 'mcol'
         eval_xc_GPU = ni_GPU.mcfun_eval_xc_adapter(xc_code)
         vxc_gpu = eval_xc_GPU(xc_code, rho, deriv=1, xctype='MGGA')[1]
-        mask_gpu = cupy.array([i for i in range(mol.nbas)])
-
+        mask_gpu = cupy.arange(mol.nbas)
         shls_slice = (0, mol.nbas)
+        
         v0_gpu = numint2c._mcol_mgga_vxc_mat(mol, ao, cupy.asarray(weight), rho, vxc_gpu.copy(), 
             mask_gpu, shls_slice, ao_loc, 0)
         v1_gpu = numint2c._mcol_mgga_vxc_mat(mol, ao, cupy.asarray(weight), rho, vxc_gpu.copy(), 
             mask_gpu, shls_slice, ao_loc, 1)
         v1_gpu = v1_gpu + v1_gpu.conj().T
+
+        # --- Assertions (Always Run) ---
+        # Fingerprint checks (from _fp test) <-- UPDATED
         self.assertAlmostEqual(abs(lib.fp(v1_gpu.get()) 
             - (-11.359687631112195+6.13828986953566e-22j)).max(), 0, 13)
         self.assertAlmostEqual(abs(lib.fp(v0_gpu.get()) 
             - (-11.359687631112195+6.13828986953566e-22j)).max(), 0, 13)
+        
+        # Internal consistency check
+        self.assertAlmostEqual(abs(v0_gpu.get() - v1_gpu.get()).max(), 0, 13)
+        
+        # --- CPU Comparison (Conditional) ---
+        if mcfun is not None:
+            ni_CPU = pyscf_numint2c()
+            ni_CPU.collinear = 'mcol'
+            eval_xc_cpu = ni_CPU.mcfun_eval_xc_adapter(xc_code)
+            vxc_cpu = eval_xc_cpu(xc_code, rho.get(), deriv=1)[1]
+            mask = np.ones((8, mol.nbas), dtype=np.uint8)
+
+            v0_cpu = pyscf_numint2c_file._mcol_mgga_vxc_mat(mol, ao.transpose(0,2,1).get(), weight, 
+                rho.get(), vxc_cpu.copy(), mask, shls_slice, ao_loc, 0)
+            
+            # CPU vs GPU check
+            self.assertAlmostEqual(abs(v0_gpu.get() - v0_cpu).max(), 0, 13)
 
 
 if __name__ == "__main__":
diff --git a/gpu4pyscf/scf/diis.py b/gpu4pyscf/scf/diis.py
index 570690600..091cd01d1 100644
--- a/gpu4pyscf/scf/diis.py
+++ b/gpu4pyscf/scf/diis.py
@@ -48,6 +48,8 @@ def __init__(self, mf=None, filename=None):
         self.space = 8
 
     def update(self, s, d, f, *args, **kwargs):
+        if d.dtype == cp.complex128:
+            s = s.astype(cp.complex128)
         errvec = self._sdf_err_vec(s, d, f)
         if self.incore is None:
             mem_avail = get_avail_mem()
diff --git a/gpu4pyscf/scf/hf.py b/gpu4pyscf/scf/hf.py
index fbce65008..ab49acfa5 100644
--- a/gpu4pyscf/scf/hf.py
+++ b/gpu4pyscf/scf/hf.py
@@ -145,8 +145,6 @@ def get_fock(mf, h1e=None, s1e=None, vhf=None, dm=None, cycle=-1, diis=None,
     if dm is None: dm = mf.make_rdm1()
     s1e = cupy.asarray(s1e)
     dm = cupy.asarray(dm)
-    if dm.dtype == cupy.complex128:
-        s1e = s1e.astype(cupy.complex128)
     if diis_start_cycle is None:
         diis_start_cycle = mf.diis_start_cycle
     if level_shift_factor is None:

From 856eca1a90c1fe79ce51cb0c76e957ad3ab2f6bb Mon Sep 17 00:00:00 2001
From: "puzhichen.996" <puzhichen.996@bytedance.com>
Date: Fri, 14 Nov 2025 15:35:11 +0800
Subject: [PATCH 21/21] remove the to_gpu test

---
 gpu4pyscf/dft/numint.py         | 69 +--------------------------------
 gpu4pyscf/dft/tests/test_gks.py |  2 +-
 2 files changed, 2 insertions(+), 69 deletions(-)

diff --git a/gpu4pyscf/dft/numint.py b/gpu4pyscf/dft/numint.py
index 4e0345294..9bdc1e935 100644
--- a/gpu4pyscf/dft/numint.py
+++ b/gpu4pyscf/dft/numint.py
@@ -2428,74 +2428,7 @@ def unsort_orbitals(self, sorted_mat, axis=[], out=None):
             out = cupy.empty_like(sorted_mat)
         out[tuple(fancy_index)] = sorted_mat
         return out
-
-    def sort_orbitals_2c(self, mat, axis=[]):
-        ''' Transform given axis of a 2-component matrix (GKS) into sorted AO
-        
-        This assumes the axes specified in 'axis' have a dimension of 2*nao,
-        representing alpha (:nao) and beta (nao:) components. Both components
-        are sorted using the same AO sorting index.
-        '''
-        idx = self._ao_idx
-        nao = len(idx)
-        
-        # Create the 2-component sorting index:
-        # [sorted_alpha_indices, sorted_beta_indices]
-        # E.g., if idx = [0, 2, 1] (nao=3),
-        # idx_2c = [0, 2, 1, 3+0, 3+2, 3+1] = [0, 2, 1, 3, 5, 4]
-        # We use numpy to build the index, consistent with self._ao_idx
-        idx_2c = np.concatenate([idx, idx + nao])
-
-        shape_ones = (1,) * mat.ndim
-        fancy_index = []
-        for dim, n in enumerate(mat.shape):
-            if dim in axis:
-                # Check if the dimension matches the 2-component size
-                if n != 2 * nao:
-                    raise ValueError(f"Axis {dim} has dimension {n}, expected {2*nao} for 2-component sorting")
-                indices = idx_2c
-            else:
-                # Use cupy.arange for non-sorted axes, as in the original sort_orbitals
-                indices = cupy.arange(n)
-            
-            idx_shape = shape_ones[:dim] + (-1,) + shape_ones[dim+1:]
-            fancy_index.append(indices.reshape(idx_shape))
-            
-        # Perform the sorting using advanced indexing
-        return mat[tuple(fancy_index)]
-
-    def unsort_orbitals_2c(self, sorted_mat, axis=[], out=None):
-        ''' Transform given axis of a 2-component matrix from sorted AO to original AO
-        
-        This assumes the axes specified in 'axis' have a dimension of 2*nao.
-        This is the inverse operation of sort_orbitals_2c.
-        '''
-        idx = self._ao_idx
-        nao = len(idx)
-        
-        # The 2-component index is created identically to sort_orbitals_2c
-        idx_2c = np.concatenate([idx, idx + nao])
-
-        shape_ones = (1,) * sorted_mat.ndim
-        fancy_index = []
-        for dim, n in enumerate(sorted_mat.shape):
-            if dim in axis:
-                # Check if the dimension matches the 2-component size
-                if n != 2 * nao:
-                    raise ValueError(f"Axis {dim} has dimension {n}, expected {2*nao} for 2-component unsorting")
-                indices = idx_2c
-            else:
-                indices = cupy.arange(n)
-            
-            idx_shape = shape_ones[:dim] + (-1,) + shape_ones[dim+1:]
-            fancy_index.append(indices.reshape(idx_shape))
-            
-        if out is None:
-            out = cupy.empty_like(sorted_mat)
-            
-        # Perform the unsorting assignment
-        out[tuple(fancy_index)] = sorted_mat
-        return out
+    
 
 class GTOValEnvVars(ctypes.Structure):
     _fields_ = [
diff --git a/gpu4pyscf/dft/tests/test_gks.py b/gpu4pyscf/dft/tests/test_gks.py
index 9bce0f950..d64b1f13a 100644
--- a/gpu4pyscf/dft/tests/test_gks.py
+++ b/gpu4pyscf/dft/tests/test_gks.py
@@ -141,7 +141,7 @@ def test_to_cpu(self):
         self.assertAlmostEqual(lib.fp(mf_gpu.mo_energy.get()), lib.fp(mf_cpu.mo_energy), 5)
         self.assertAlmostEqual(lib.fp(mf_gpu.mo_energy.get()), -27.63368769213053, 5)
 
-    @unittest.skipIf(mcfun is None, "mcfun library not found.")
+    @unittest.skip("NumInt2C in PySCF has no to_gpu method.")
     def test_to_gpu(self):
         mf_cpu = gks_cpu.GKS(mol1)
         mf_cpu.xc = 'lda,vwn'