deepmodeling · AsymmetryChou · Aug 23, 2024 · Aug 23, 2024 · Aug 23, 2024 · Aug 23, 2024
diff --git a/.gitignore b/.gitignore
@@ -157,3 +157,10 @@ cython_debug/
 #  and can be added to the global gitignore or merged into this file.  For a more nuclear
 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
+
+
+# test outputs, example notebooks, and local scratch files
+test/data/siesta/siesta_out/*
+example/siesta_io/siesta_io.ipynb
+CLAUDE.md
+playground/*
diff --git a/dftio/__main__.py b/dftio/__main__.py
@@ -153,6 +153,12 @@ def main_parser() -> argparse.ArgumentParser:
         default=0,
         help="The initial band index for eigenvalues to save.(0-band_index_min) bands will be ignored!"
     )
+    parser_parse.add_argument(
+        "-energy",
+        "--energy",
+        action="store_true",
+        help="Whether to parse the total energy (Etot) from DFT output",
+    )
 
     parser_band = subparsers.add_parser(
         "band",

diff --git a/dftio/data/_keys.py b/dftio/data/_keys.py
@@ -103,6 +103,7 @@
 
 PER_ATOM_ENERGY_KEY: Final[str] = "atomic_energy"
 TOTAL_ENERGY_KEY: Final[str] = "total_energy"
+UNCONVERGED_FRAME_INDICES_KEY: Final[str] = "unconverged_frames"
 FORCE_KEY: Final[str] = "forces"
 PARTIAL_FORCE_KEY: Final[str] = "partial_forces"
 STRESS_KEY: Final[str] = "stress"

diff --git a/dftio/io/abacus/abacus_parser.py b/dftio/io/abacus/abacus_parser.py
diff --git a/dftio/io/parse.py b/dftio/io/parse.py
@@ -211,13 +211,13 @@ def check_blocks(self, idx, hamiltonian: bool=False, overlap: bool=False, densit
 
         return True
 
-    def write(self, idx, outroot, format, eigenvalue, hamiltonian, overlap, density_matrix, band_index_min, **kwargs):
+    def write(self, idx, outroot, format, eigenvalue, hamiltonian, overlap, density_matrix, band_index_min, energy=False, **kwargs):
         if format == "hdf5":
-            self.write_hdf5(idx=idx, outroot=outroot, eigenvalue=eigenvalue, hamiltonian=hamiltonian, overlap=overlap, density_matrix=density_matrix,band_index_min=band_index_min)
+            self.write_hdf5(idx=idx, outroot=outroot, eigenvalue=eigenvalue, hamiltonian=hamiltonian, overlap=overlap, density_matrix=density_matrix,band_index_min=band_index_min, energy=energy)
         elif format in ["dat", "ase"]:
-            self.write_dat(idx=idx, outroot=outroot, fmt=format, eigenvalue=eigenvalue, hamiltonian=hamiltonian, overlap=overlap, density_matrix=density_matrix,band_index_min=band_index_min)
+            self.write_dat(idx=idx, outroot=outroot, fmt=format, eigenvalue=eigenvalue, hamiltonian=hamiltonian, overlap=overlap, density_matrix=density_matrix,band_index_min=band_index_min, energy=energy)
         elif format == "lmdb":
-            self.write_lmdb(idx=idx, outroot=outroot, eigenvalue=eigenvalue, hamiltonian=hamiltonian, overlap=overlap, density_matrix=density_matrix,band_index_min=band_index_min)
+            self.write_lmdb(idx=idx, outroot=outroot, eigenvalue=eigenvalue, hamiltonian=hamiltonian, overlap=overlap, density_matrix=density_matrix,band_index_min=band_index_min, energy=energy)
         else:
             raise NotImplementedError(f"Format: {format} is not implemented!")
 
@@ -242,10 +242,10 @@ def write_struct(self, structure, out_dir, fmt='dat'):
         else:
             raise NotImplementedError(f"Format: {fmt} is not implemented!")
 
-    def write_dat(self, idx, outroot, fmt='dat', eigenvalue=False, hamiltonian=False, overlap=False, density_matrix=False, band_index_min=0):
+    def write_dat(self, idx, outroot, fmt='dat', eigenvalue=False, hamiltonian=False, overlap=False, density_matrix=False, band_index_min=0, energy=False):
         # write structure
         os.makedirs(outroot, exist_ok=True)
-       
+
         structure = self.get_structure(idx)
 
         out_dir = os.path.join(outroot, self.formula(idx=idx)+".{}".format(idx))
@@ -255,7 +255,7 @@ def write_dat(self, idx, outroot, fmt='dat', eigenvalue=False, hamiltonian=False
         # np.savetxt(os.path.join(out_dir, "positions.dat"), structure[_keys.POSITIONS_KEY].reshape(-1, 3))
         # np.savetxt(os.path.join(out_dir, "atomic_numbers.dat"), structure[_keys.ATOMIC_NUMBERS_KEY], fmt='%d')
         # np.savetxt(os.path.join(out_dir, "pbc.dat"), structure[_keys.PBC_KEY])
-        
+
         # write structure
         self.write_struct(structure, out_dir, fmt=fmt)
 
@@ -266,6 +266,26 @@ def write_dat(self, idx, outroot, fmt='dat', eigenvalue=False, hamiltonian=False
             np.save(os.path.join(out_dir, "kpoints.npy"), eigstatus[_keys.KPOINT_KEY])
             np.save(os.path.join(out_dir, "eigenvalues.npy"), eigstatus[_keys.ENERGY_EIGENVALUE_KEY])
 
+        # write energy
+        if energy:
+            if hasattr(self, 'get_etot'):
+                energy_data = self.get_etot(idx)
+                if energy_data is not None:
+                    np.savetxt(os.path.join(out_dir, "total_energy.dat"), energy_data[_keys.TOTAL_ENERGY_KEY])
+
+                    # Write unconverged frame indices if present
+                    if _keys.UNCONVERGED_FRAME_INDICES_KEY in energy_data:
+                        unconverged_indices = energy_data[_keys.UNCONVERGED_FRAME_INDICES_KEY]
+                        if len(unconverged_indices) > 0:
+                            with open(os.path.join(out_dir, "unconverged_frames.dat"), 'w') as f:
+                                f.write("# Frame indices that did not converge during MD/RELAX\n")
+                                for idx_frame in unconverged_indices:
+                                    f.write(f"{idx_frame}\n")
+                else:
+                    log.warning(f"Failed to extract energy for structure {idx}")
+            else:
+                log.warning(f"Parser does not implement get_etot method")
+
         # write blocks
         if any([hamiltonian is not None, overlap is not None, density_matrix is not None]) and any([hamiltonian, overlap, density_matrix]):
             with open(os.path.join(out_dir, "basis.dat"), 'w') as f:
@@ -279,34 +299,55 @@ def write_dat(self, idx, outroot, fmt='dat', eigenvalue=False, hamiltonian=False
                         for key_str, value in ham[i].items():
                             default_group.create_dataset(key_str, data=value)
             del ham
-            
+
             if overlap:
                 with h5py.File(os.path.join(out_dir, "overlaps.h5"), 'w') as fid:
                     for i in range(len(ovp)):
                         default_group = fid.create_group(str(i))
                         for key_str, value in ovp[i].items():
                             default_group.create_dataset(key_str, data=value)
             del ovp
-            
+
             if density_matrix:
                 with h5py.File(os.path.join(out_dir, "density_matrices.h5"), 'w') as fid:
                     for i in range(len(dm)):
                         default_group = fid.create_group(str(i))
                         for key_str, value in dm[i].items():
                             default_group.create_dataset(key_str, data=value)
-            
+
             del dm
 
         return True
 
-    def write_lmdb(self, idx, outroot, eigenvalue: bool=False, hamiltonian: bool=False, overlap: bool=False, density_matrix: bool=False,band_index_min=0):
+    def write_lmdb(self, idx, outroot, eigenvalue: bool=False, hamiltonian: bool=False, overlap: bool=False, density_matrix: bool=False,band_index_min=0, energy: bool=False):
         os.makedirs(outroot, exist_ok=True)
         out_dir = os.path.join(outroot, "data.{}.lmdb".format(os.getpid()))
         structure = self.get_structure(idx)
         if any([hamiltonian, overlap, density_matrix]):
             ham, ovp, dm = self.get_blocks(idx, hamiltonian, overlap, density_matrix)
         if eigenvalue:
             eigstatus = self.get_eigenvalue(idx=idx, band_index_min=band_index_min)
+        if energy:
+            if hasattr(self, 'get_etot'):
+                energy_data = self.get_etot(idx)
+            else:
+                energy_data = None
+                log.warning(f"Parser does not implement get_etot method")
+
+        # Build frame index mapping for energy data
+        # If there are unconverged frames, energy array will be shorter than n_frames
+        energy_frame_mapping = None
+        if energy and energy_data is not None:
+            unconverged_indices = energy_data.get(_keys.UNCONVERGED_FRAME_INDICES_KEY, [])
+            if len(unconverged_indices) > 0:
+                # Build mapping: structure_frame_idx -> energy_array_idx
+                energy_frame_mapping = {}
+                energy_idx = 0
+                n_frames_total = structure[_keys.POSITIONS_KEY].shape[0]
+                for frame_idx in range(n_frames_total):
+                    if frame_idx not in unconverged_indices:
+                        energy_frame_mapping[frame_idx] = energy_idx
+                        energy_idx += 1
 
         n_frames = structure[_keys.POSITIONS_KEY].shape[0]
         lmdb_env = lmdb.open(out_dir, map_size=1048576000000, lock=True)
@@ -321,6 +362,23 @@ def write_lmdb(self, idx, outroot, eigenvalue: bool=False, hamiltonian: bool=Fal
                 data_dict[_keys.ENERGY_EIGENVALUE_KEY] = eigstatus[_keys.ENERGY_EIGENVALUE_KEY][nf]
                 data_dict[_keys.KPOINT_KEY] = eigstatus[_keys.KPOINT_KEY]
 
+            if energy and energy_data is not None:
+                # For a single structure (SCF/NSCF), the array under TOTAL_ENERGY_KEY has shape (1,).
+                # For trajectories (MD/RELAX), it has shape (nframes,), or fewer entries if some frames did not converge.
+                if energy_data[_keys.TOTAL_ENERGY_KEY].shape[0] == 1:
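+                    # Single structure case. NOTE(review): a trajectory with exactly one converged frame also has length 1 and would take this branch, bypassing energy_frame_mapping — confirm.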
+                    data_dict[_keys.TOTAL_ENERGY_KEY] = energy_data[_keys.TOTAL_ENERGY_KEY][0]
+                else:
+                    # Trajectory case - use mapping if unconverged frames exist
+                    if energy_frame_mapping is not None:
+                        if nf in energy_frame_mapping:
+                            energy_idx = energy_frame_mapping[nf]
+                            data_dict[_keys.TOTAL_ENERGY_KEY] = energy_data[_keys.TOTAL_ENERGY_KEY][energy_idx]
+                        # else: skip energy for unconverged frames (don't add to data_dict)
+                    else:
+                        # No unconverged frames, direct indexing
+                        data_dict[_keys.TOTAL_ENERGY_KEY] = energy_data[_keys.TOTAL_ENERGY_KEY][nf]
+
             if hamiltonian:
                 data_dict["hamiltonian"] = ham[nf]
             if overlap: