diff --git a/gpu4pyscf/dft/tests/test_libxc.py b/gpu4pyscf/dft/tests/test_libxc.py
index c13dba133..9edc9f1cc 100644
--- a/gpu4pyscf/dft/tests/test_libxc.py
+++ b/gpu4pyscf/dft/tests/test_libxc.py
@@ -20,6 +20,7 @@
 from gpu4pyscf.dft.numint import NumInt as numint_gpu
 from pyscf.dft.numint import NumInt as numint_cpu
 import cupy
+import os
 
 def setUpModule():
     global mol, dm1, dm0
@@ -51,7 +52,7 @@ def _diff(dat, ref):
     return np.min((abs(d/(ref+1e-300)), abs(d)), axis=0)
 
 class KnownValues(unittest.TestCase):
-    def _check_xc(self, xc, spin=0, fxc_tol=1e-10, kxc_tol=1e-10):
+    def _check_xc(self, xc, spin=0, deriv=2, fxc_tol=1e-10, kxc_tol=1e-10):
         ni_cpu = numint_cpu()
         ni_gpu = numint_gpu()
         xctype = ni_cpu._xc_type(xc)
@@ -66,8 +67,15 @@ def _check_xc(self, xc, spin=0, fxc_tol=1e-10, kxc_tol=1e-10):
         if spin != 0:
             rho = (rho, rho)
 
-        exc_cpu, vxc_cpu, fxc_cpu, kxc_cpu = ni_cpu.eval_xc_eff(xc, rho, deriv=2, xctype=xctype)
-        exc_gpu, vxc_gpu, fxc_gpu, kxc_gpu = ni_gpu.eval_xc_eff(xc, cupy.array(rho), deriv=2, xctype=xctype)
+        exc_cpu, vxc_cpu, fxc_cpu, kxc_cpu = ni_cpu.eval_xc_eff(xc, rho, deriv=deriv, xctype=xctype)
+        exc_gpu, vxc_gpu, fxc_gpu, kxc_gpu = ni_gpu.eval_xc_eff(xc, cupy.array(rho), deriv=deriv, xctype=xctype)
+
+        print(f"{xc} {spin} exc", _diff(exc_gpu[:,0].get(), exc_cpu).max())
+        print(f"{xc} {spin} vxc", _diff(vxc_gpu.get(), vxc_cpu).max())
+        if fxc_gpu is not None:
+            print(f"{xc} {spin} fxc", _diff(fxc_gpu.get(), fxc_cpu).max())
+        if kxc_gpu is not None:
+            print(f"{xc} {spin} kxc", _diff(kxc_gpu.get(), kxc_cpu).max())
 
         assert _diff(exc_gpu[:,0].get(), exc_cpu).max() < 1e-10
         assert _diff(vxc_gpu.get(), vxc_cpu).max() < 1e-10
@@ -77,15 +85,32 @@ def _check_xc(self, xc, spin=0, fxc_tol=1e-10, kxc_tol=1e-10):
             assert _diff(kxc_gpu.get(), kxc_cpu).max() < kxc_tol
 
     def test_LDA(self):
-        self._check_xc('LDA_C_VWN')
+        whether_use_gpu = os.environ.get('LIBXC_ON_GPU', '0') == '1'
+        if whether_use_gpu:
+            deriv = 3
+            print("test LDA with deriv 3")
+        else:
+            deriv = 2
+            print("test LDA with deriv 2")
+        self._check_xc('LDA_C_VWN', deriv=deriv)
 
     def test_GGA(self):
-        self._check_xc('HYB_GGA_XC_B3LYP')
-        self._check_xc('GGA_X_B88', fxc_tol=1e-10)
-        self._check_xc('GGA_C_PBE', fxc_tol=1e-4)
+        whether_use_gpu = os.environ.get('LIBXC_ON_GPU', '0') == '1'
+        if whether_use_gpu:
+            deriv = 3
+        else:
+            deriv = 2
+        self._check_xc('HYB_GGA_XC_B3LYP', deriv=deriv, kxc_tol=1e-9)
+        self._check_xc('GGA_X_B88', fxc_tol=1e-10, deriv=deriv, kxc_tol=1e-8)
+        self._check_xc('GGA_C_PBE', fxc_tol=1e-4, deriv=deriv, kxc_tol=3e2)
 
     def test_mGGA(self):
-        self._check_xc('MGGA_C_M06', fxc_tol=1e-4)
+        whether_use_gpu = os.environ.get('LIBXC_ON_GPU', '0') == '1'
+        if whether_use_gpu:
+            deriv = 3
+        else:
+            deriv = 2
+        self._check_xc('MGGA_C_M06', fxc_tol=1e-4, deriv=deriv, kxc_tol=1e-2)
 
     def test_u_LDA(self):
         self._check_xc('LDA_C_VWN', spin=1)
@@ -101,4 +126,4 @@ def test_u_mGGA(self):
 
 if __name__ == "__main__":
     print("Full Tests for xc fun")
-    unittest.main()
+    unittest.main()
\ No newline at end of file
diff --git a/gpu4pyscf/grad/tdrks.py b/gpu4pyscf/grad/tdrks.py
index 2d5265c5c..5ea9ba73f 100644
--- a/gpu4pyscf/grad/tdrks.py
+++ b/gpu4pyscf/grad/tdrks.py
@@ -27,6 +27,7 @@
 from gpu4pyscf.grad import rks as rks_grad
 from gpu4pyscf.grad import tdrhf
 from gpu4pyscf import tdscf
+import os
 
 
 #
@@ -362,13 +363,16 @@ def _contract_xc_kernel(td_grad, xc_code, dmvo, dmoo=None,
             rho = ni.eval_rho2(_sorted_mol, ao0, mo_coeff_mask, mo_occ, mask, xctype, with_lapl=False)
             # quick fix
             if deriv > 2:
-                ni_cpu = numint_cpu()
-                # TODO: If the libxc is stablized, this should be gpulized
-                # vxc, fxc, kxc = ni.eval_xc_eff(xc_code, rho, deriv, xctype=xctype)[1:]
-                vxc, fxc, kxc = ni_cpu.eval_xc_eff(xc_code, rho.get(), deriv, xctype=xctype)[1:]
-                if isinstance(vxc,np.ndarray): vxc = cp.asarray(vxc)
-                if isinstance(fxc,np.ndarray): fxc = cp.asarray(fxc)
-                if isinstance(kxc,np.ndarray): kxc = cp.asarray(kxc)
+                whether_use_gpu = os.environ.get('LIBXC_ON_GPU', '0') == '1'
+                if not whether_use_gpu:
+                    ni_cpu = numint_cpu()
+                    # TODO: If the libxc is stablized, this should be gpulized
+                    vxc, fxc, kxc = ni_cpu.eval_xc_eff(xc_code, rho.get(), deriv, xctype=xctype)[1:]
+                    if isinstance(vxc,np.ndarray): vxc = cp.asarray(vxc)
+                    if isinstance(fxc,np.ndarray): fxc = cp.asarray(fxc)
+                    if isinstance(kxc,np.ndarray): kxc = cp.asarray(kxc)
+                else:
+                    vxc, fxc, kxc = ni.eval_xc_eff(xc_code, rho, deriv, xctype=xctype)[1:]
             else:
                 vxc, fxc, kxc = ni.eval_xc_eff(xc_code, rho, deriv, xctype=xctype)[1:]
             dmvo_mask = dmvo[mask[:, None], mask]
@@ -423,14 +427,15 @@ def _contract_xc_kernel(td_grad, xc_code, dmvo, dmoo=None,
             rho *= 0.5
             rho = cp.repeat(rho[cp.newaxis], 2, axis=0)
             # quick fix
-            if deriv > 2:
-                ni_cpu = numint_cpu()
-                vxc, fxc, kxc = ni_cpu.eval_xc_eff(xc_code, rho.get(), deriv, xctype=xctype)[1:]
-                if isinstance(vxc,np.ndarray): vxc = cp.asarray(vxc)
-                if isinstance(fxc,np.ndarray): fxc = cp.asarray(fxc)
-                if isinstance(kxc,np.ndarray): kxc = cp.asarray(kxc)
-            else:
-                vxc, fxc, kxc = ni.eval_xc_eff(xc_code, rho, deriv, xctype=xctype)[1:]
+            # if deriv > 2:
+            #     ni_cpu = numint_cpu()
+            #     vxc, fxc, kxc = ni_cpu.eval_xc_eff(xc_code, rho.get(), deriv, xctype=xctype)[1:]
+            #     if isinstance(vxc,np.ndarray): vxc = cp.asarray(vxc)
+            #     if isinstance(fxc,np.ndarray): fxc = cp.asarray(fxc)
+            #     if isinstance(kxc,np.ndarray): kxc = cp.asarray(kxc)
+            # else:
+            #     vxc, fxc, kxc = ni.eval_xc_eff(xc_code, rho, deriv, xctype=xctype)[1:]
+            vxc, fxc, kxc = ni.eval_xc_eff(xc_code, rho, deriv, xctype=xctype)[1:]
             # fxc_t couples triplet excitation amplitudes
             # 1/2 int (tia - tIA) fxc (tjb - tJB) = tia fxc_t tjb
             fxc_t = fxc[:, :, 0] - fxc[:, :, 1]
diff --git a/gpu4pyscf/grad/tduks.py b/gpu4pyscf/grad/tduks.py
index 1e74cd7d7..00ce46da6 100644
--- a/gpu4pyscf/grad/tduks.py
+++ b/gpu4pyscf/grad/tduks.py
@@ -26,6 +26,7 @@
 from gpu4pyscf.grad import tdrks
 from gpu4pyscf.scf import ucphf
 from gpu4pyscf import tdscf
+import os
 
 
 #
@@ -442,13 +443,17 @@ def _contract_xc_kernel(td_grad, xc_code, dmvo, dmoo=None, with_vxc=True, with_k
             ni.eval_rho2(_sorted_mol, ao0, mo_coeff_mask_a, mo_occ[0], mask, xctype,with_lapl=False),
             ni.eval_rho2(_sorted_mol, ao0, mo_coeff_mask_b, mo_occ[1], mask, xctype, with_lapl=False)))
         if deriv > 2:
-            ni_cpu = numint_cpu()
-            # TODO: If the libxc is stablized, this should be gpulized
-            # vxc, fxc, kxc = ni.eval_xc_eff(xc_code, rho, deriv, xctype=xctype)[1:]
-            vxc, fxc, kxc = ni_cpu.eval_xc_eff(xc_code, rho.get(), deriv, xctype=xctype)[1:]
-            if isinstance(vxc, np.ndarray): vxc = cp.asarray(vxc)
-            if isinstance(fxc, np.ndarray): fxc = cp.asarray(fxc)
-            if isinstance(kxc, np.ndarray): kxc = cp.asarray(kxc)
+            whether_use_gpu = os.environ.get('LIBXC_ON_GPU', '0') == '1'
+            if not whether_use_gpu:
+                ni_cpu = numint_cpu()
+                # TODO: If the libxc is stablized, this should be gpulized
+                # vxc, fxc, kxc = ni.eval_xc_eff(xc_code, rho, deriv, xctype=xctype)[1:]
+                vxc, fxc, kxc = ni_cpu.eval_xc_eff(xc_code, rho.get(), deriv, xctype=xctype)[1:]
+                if isinstance(vxc, np.ndarray): vxc = cp.asarray(vxc)
+                if isinstance(fxc, np.ndarray): fxc = cp.asarray(fxc)
+                if isinstance(kxc, np.ndarray): kxc = cp.asarray(kxc)
+            else:
+                vxc, fxc, kxc = ni.eval_xc_eff(xc_code, rho, deriv, xctype=xctype)[1:]
         else:
             vxc, fxc, kxc = ni.eval_xc_eff(xc_code, rho, deriv, xctype=xctype)[1:]
         dmvo_mask_a = dmvo[0, mask[:, None], mask]