diff --git a/gpu4pyscf/grad/tdrhf.py b/gpu4pyscf/grad/tdrhf.py
index 870e45a6e..a62363f6a 100644
--- a/gpu4pyscf/grad/tdrhf.py
+++ b/gpu4pyscf/grad/tdrhf.py
@@ -15,6 +15,7 @@
 
 from functools import reduce
 import cupy as cp
+import numpy as np
 from pyscf import lib, gto
 from gpu4pyscf.lib import logger
 from gpu4pyscf.grad import rhf as rhf_grad
@@ -24,6 +25,45 @@
 from pyscf import __config__
 from gpu4pyscf.lib import utils
 from gpu4pyscf import tdscf
+from scipy.optimize import linear_sum_assignment
+
+def match_and_reorder_mos(s12_ao, mo_coeff_b, mo_coeff, threshold=0.4):
+    if mo_coeff_b.shape != mo_coeff.shape:
+        raise ValueError("Mo coeff b and mo coeff must have the same shape.")
+    if s12_ao.shape[0] != s12_ao.shape[1] or s12_ao.shape[0] != mo_coeff_b.shape[0]:
+        raise ValueError("S12 ao must be a square matrix with the same shape as mo coeff b.")
+    mo_overlap_matrix = mo_coeff_b.T @ s12_ao @ mo_coeff
+    abs_mo_overlap = cp.abs(mo_overlap_matrix)
+    cost_matrix = -abs_mo_overlap
+    below_threshold_mask = abs_mo_overlap < threshold
+    infinity_cost = mo_coeff_b.shape[1] + 1
+    cost_matrix[below_threshold_mask] = infinity_cost
+    
+    row_ind, col_ind = linear_sum_assignment(cost_matrix.get())
+
+    matching_indices = col_ind
+    
+    mo2_reordered = mo_coeff[:, matching_indices]
+
+    final_chosen_overlaps = abs_mo_overlap[row_ind, col_ind]
+    invalid_matches_mask = final_chosen_overlaps < threshold
+    
+    if cp.any(invalid_matches_mask):
+        num_invalid = cp.sum(invalid_matches_mask)
+        print(
+            f"{num_invalid} orbital below threshold {threshold}."
+            "This may indicate significant changes in the properties of these orbitals between the two structures."
+        )
+        invalid_indices = cp.where(invalid_matches_mask)[0]
+        for idx in invalid_indices:
+            print(f"Warning: reference coeff #{idx}'s best match is {final_chosen_overlaps[idx]:.4f} (below threshold {threshold})")
+    s_mo_new = mo_coeff_b.T @ s12_ao @ mo2_reordered
+    sign_array = cp.ones(s_mo_new.shape[-1])
+    for i in range(s_mo_new.shape[-1]):
+        if s_mo_new[i,i] < 0.0:
+            mo2_reordered[:,i] *= -1
+            sign_array[i] = -1
+    return mo2_reordered, matching_indices, sign_array
 
 
 def grad_elec(td_grad, x_y, singlet=True, atmlst=None, verbose=logger.INFO):
@@ -236,6 +276,38 @@ def as_scanner(td_grad, state=1):
                          (TDSCF_GradScanner, td_grad.__class__), name)
 
 
+def check_flip(mol0, mol1, mo_coeff0, mo_coeff1, xy0, xy1, state, nocc):
+
+    nao = mol0.nao
+    nvir = nao - nocc
+
+    s = gto.intor_cross('int1e_ovlp', mol0, mol1)
+    s = cp.asarray(s)
+    mo1_reordered, matching_indices, sign_array = \
+        match_and_reorder_mos(s, mo_coeff0, mo_coeff1, threshold=0.4)
+    mo1_reordered = cp.asarray(mo1_reordered)
+    x0 = xy0[state-1][0]
+    nstates = len(xy0)
+    s_state_list = []
+    for istate in range(nstates):
+        x1 = xy1[istate][0]
+        s_state = 0
+        for i in range(nocc):
+            for a in range(nvir):
+                for j in range(nocc):
+                    for b in range(nvir):
+                        mo_coeff0_tmp = mo_coeff0[:,:nocc]*1.0
+                        mo_coeff1_tmp = mo1_reordered[:,:nocc]*1.0
+                        mo_coeff0_tmp[:,i] = mo_coeff0[:,a+nocc]
+                        mo_coeff1_tmp[:,j] = mo1_reordered[:,b+nocc]
+                        s_mo = mo_coeff0_tmp.T @ s @ mo_coeff1_tmp
+                        s_state += cp.linalg.det(s_mo)\
+                            *x0[i,a]*x1[j,b]*2
+        s_state_list.append(float(s_state))
+    s_state_list = np.asarray(s_state_list)
+    return np.argmax(np.abs(s_state_list))+1, s_state_list
+
+
 class TDSCF_GradScanner(lib.GradScanner):
     _keys = {'e_tot'}
 
@@ -245,12 +317,15 @@ def __init__(self, g, state):
             self.state = state
 
     def __call__(self, mol_or_geom, state=None, **kwargs):
+        mol0 = self.mol.copy()
         if isinstance(mol_or_geom, gto.MoleBase):
             assert mol_or_geom.__class__ == gto.Mole
             mol = mol_or_geom
         else:
             mol = self.mol.set_geom_(mol_or_geom, inplace=False)
         self.reset(mol)
+        mo_coeff0 = self.base._scf.mo_coeff
+        xy0 = self.base.xy
 
         if state is None:
             state = self.state
@@ -261,17 +336,25 @@ def __call__(self, mol_or_geom, state=None, **kwargs):
         assert td_scanner.device == 'gpu'
         assert self.device == 'gpu'
         td_scanner(mol)
-        # TODO: Check root flip.  Maybe avoid the initial guess in TDHF otherwise
-        # large error may be found in the excited states amplitudes
-        de = self.kernel(state=state, **kwargs)
-        e_tot = self.e_tot[state-1]
+        mo_coeff1 = self.base._scf.mo_coeff
+        xy1 = self.base.xy
+        mo_occ = cp.asarray(self.base._scf.mo_occ)
+        nocc = int((mo_occ > 0).sum())
+
+        flip_state, s_state_list = check_flip(mol0, mol, mo_coeff0, mo_coeff1, xy0, xy1, state, nocc)
+        if flip_state != state:
+            logger.warn(self, 'Root %d is flipped to %d', state, flip_state)
+            logger.warn(self, f'S_state_list = {s_state_list}')
+            self.state = flip_state
+        de = self.kernel(state=flip_state, **kwargs)
+        e_tot = self.e_tot[flip_state-1]
         return e_tot, de
 
     @property
     def converged(self):
         td_scanner = self.base
         return all((td_scanner._scf.converged,
-                    td_scanner.converged[self.state]))
+                    td_scanner.converged[self.state-1]))
 
 
 class Gradients(rhf_grad.GradientsBase):
diff --git a/gpu4pyscf/grad/tests/test_tddft_opt.py b/gpu4pyscf/grad/tests/test_tddft_opt.py
index a1faa5193..7b6a0cf19 100644
--- a/gpu4pyscf/grad/tests/test_tddft_opt.py
+++ b/gpu4pyscf/grad/tests/test_tddft_opt.py
@@ -23,18 +23,24 @@
 from packaging import version
 
 atom = """
-O       0.0000000000     0.0000000000     0.0000000000
-H       0.0000000000    -0.7570000000     0.5870000000
-H       0.0000000000     0.7570000000     0.5870000000
+H 0.000000 0.923274 1.238289
+H 0.000000 -0.923274 1.238289
+H 0.000000 0.923274 -1.238289
+H 0.000000 -0.923274 -1.238289
+C 0.000000 0.000000 0.668188
+C 0.000000 0.000000 -0.668188
 """
 
 atom_near_conv = """
-O  -0.000000  -0.000000   0.391241
-H  -0.000000  -1.283134   0.391326
-H  -0.000000   1.283134   0.391326
+H  0.000000   0.889149   1.396667
+H -0.000000  -0.889149   1.396667
+H -0.000000   0.889149  -1.396667
+H -0.000000  -0.889149  -1.396667
+C  0.000000   0.000000   0.696208
+C  0.000000   0.000000  -0.696208
 """
 
-bas0 = "631g"
+bas0 = "sto-3g"
 
 def setUpModule():
     global mol, mol_near_conv
@@ -55,7 +61,7 @@ def test_opt_rhf_tda(self):
         mf = scf.RHF(mol).to_gpu()
         mf.kernel()
         assert mf.converged
-        td = mf.TDA().set(nstates=3)
+        td = mf.TDA().set(nstates=5)
         td.kernel()
 
         # TODO: store CPU results for comparison
@@ -69,19 +75,26 @@ def test_opt_rks_tda(self):
         mf = dft.RKS(mol, xc='b3lyp').to_gpu()
         mf.kernel()
         assert mf.converged
-        td = mf.TDA().set(nstates=3)
+        td = mf.TDA().set(nstates=5)
         td.kernel()
         # TODO: store CPU results for comparison
-        td_cpu = td.to_cpu()
         mol_gpu = optimize(td)
-        mol_cpu = optimize(td_cpu)
-        assert np.linalg.norm(mol_gpu.atom_coords() - mol_cpu.atom_coords()) < 1e-4
+
+        mff = dft.RKS(mol_gpu, xc='b3lyp').to_gpu()
+        mff.kernel()
+        assert mff.converged
+        tdf = mff.TDA().set(nstates=5)
+        tdf.kernel()[0]
+        assert bool(np.all(tdf.converged))
+        excited_gradf = tdf.nuc_grad_method()
+        excited_gradf.kernel()
+        assert np.linalg.norm(excited_gradf.de) < 2.0e-4
 
     def test_opt_rks_tda_pcm_1(self):
         mf = dft.RKS(mol_near_conv, xc='b3lyp').PCM().to_gpu()
         mf.kernel()
         assert mf.converged
-        td = mf.TDA(equilibrium_solvation=True).set(nstates=3)
+        td = mf.TDA(equilibrium_solvation=True).set(nstates=5)
         td.kernel()
         mol_gpu = optimize(td)
 
@@ -99,7 +112,7 @@ def test_opt_rks_tda_pcm_2(self):
         mf = dft.RKS(mol_near_conv, xc='b3lyp').PCM().to_gpu()
         mf.kernel()
         assert mf.converged
-        td = mf.TDA(equilibrium_solvation=True).set(nstates=3)
+        td = mf.TDA(equilibrium_solvation=True).set(nstates=5)
         td.kernel()
 
         excited_grad = td.nuc_grad_method().as_scanner(state=1)