From df383c71ef2027a9f4c0e384c0a8559a269af5a6 Mon Sep 17 00:00:00 2001 From: AkiTheMemeGod Date: Sat, 25 Oct 2025 14:43:39 +0530 Subject: [PATCH 1/7] ENH: Add benchmark for memory usage in pandas DataFrame creation --- asv_bench/benchmarks/bench_snapshot_memory.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100644 asv_bench/benchmarks/bench_snapshot_memory.py diff --git a/asv_bench/benchmarks/bench_snapshot_memory.py b/asv_bench/benchmarks/bench_snapshot_memory.py new file mode 100644 index 0000000000000..544f9e5e47ad0 --- /dev/null +++ b/asv_bench/benchmarks/bench_snapshot_memory.py @@ -0,0 +1,17 @@ +import pandas as pd +import numpy as np +import tracemalloc + +def make_df(nrows=1_000_000): + return pd.DataFrame({ + "a": np.random.randint(0, 100, size=nrows), + "b": np.random.random(size=nrows), + "c": np.random.choice(list("abcdefghijklmnopqrstuvwxyz"), size=nrows) + }) + +df = make_df(200_000) +tracemalloc.start() +snap = df.snapshot("bench") +snap_shot = tracemalloc.take_snapshot() +top_stats = snap_shot.statistics('lineno') +print("Top memory stats:", top_stats[:3]) From 021ea8d5d65b1924a89b2462b260cf29bf1fee2c Mon Sep 17 00:00:00 2001 From: AkiTheMemeGod Date: Sat, 25 Oct 2025 14:43:45 +0530 Subject: [PATCH 2/7] ENH: Implement DataFrame snapshot store for versioning support --- pandas/core/frame_versioning.py | 69 +++++++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) create mode 100644 pandas/core/frame_versioning.py diff --git a/pandas/core/frame_versioning.py b/pandas/core/frame_versioning.py new file mode 100644 index 0000000000000..7fd6c71f75cf0 --- /dev/null +++ b/pandas/core/frame_versioning.py @@ -0,0 +1,69 @@ +# pandas/core/frame_versioning.py +from __future__ import annotations + +from dataclasses import dataclass +from datetime import datetime +import uuid +from typing import Dict, Optional + +import pandas as pd + + +def _generate_snapshot_id(name: Optional[str] = None) -> str: + if name: + return name + ts = datetime.utcnow().strftime("%Y%m%dT%H%M%S%fZ") + uid = uuid.uuid4().hex[:8] + return f"{ts}-{uid}" + + +@dataclass +class SnapshotMeta: + name: str + created_at: datetime + + +class DataFrameSnapshotStore: + """ + Per-DataFrame snapshot store. + Stores deep copies of DataFrames (safe, simple). + """ + + def __init__(self) -> None: + # snapshot_id -> DataFrame + self._snapshots: Dict[str, pd.DataFrame] = {} + self._meta: Dict[str, SnapshotMeta] = {} + + def snapshot(self, df: pd.DataFrame, name: Optional[str] = None) -> str: + sid = _generate_snapshot_id(name) + # deep copy for safety + self._snapshots[sid] = df.copy(deep=True) + self._meta[sid] = SnapshotMeta(name=sid, created_at=datetime.utcnow()) + return sid + + def restore(self, name: str) -> pd.DataFrame: + if name not in self._snapshots: + raise KeyError(f"Snapshot not found: {name}") + # return a deep copy so modifications don't change stored snapshot + return self._snapshots[name].copy(deep=True) + + def list(self) -> list[str]: + return list(self._snapshots.keys()) + + def drop(self, name: str) -> None: + if name not in self._snapshots: + raise KeyError(f"Snapshot not found: {name}") + del self._snapshots[name] + del self._meta[name] + + def clear(self) -> None: + self._snapshots.clear() + self._meta.clear() + + def info(self, name: Optional[str] = None) -> dict: + if name: + if name not in self._meta: + raise KeyError(f"Snapshot not found: {name}") + meta = self._meta[name] + return {"name": meta.name, "created_at": meta.created_at.isoformat()} + return {"count": len(self._snapshots), "snapshots": [m.name for m in self._meta.values()]} From d75e570ef70e4794498f3bdd660a2ae207dd1388 Mon Sep 17 00:00:00 2001 From: AkiTheMemeGod Date: Sat, 25 Oct 2025 14:43:49 +0530 Subject: [PATCH 3/7] ENH: Add snapshot functionality to DataFrame for versioning support --- pandas/core/frame.py | 99 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 99 insertions(+) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 00c9cb9316ab7..db17d67cefddc 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -34,6 +34,7 @@ Self, cast, overload, + Optional, ) import warnings @@ -269,6 +270,8 @@ from pandas.io.formats.style import Styler + from pandas.core.frame_versioning import DataFrameSnapshotStore + # --------------------------------------------------------------------- # Docstring templates @@ -14353,6 +14356,93 @@ def values(self) -> np.ndarray: ['monkey', nan, None]], dtype=object) """ return self._mgr.as_array() + + def snapshot(self, name: Optional[str] = None) -> str: + """ + Create a named snapshot of this DataFrame and return the snapshot id. + + Parameters + ---------- + name : str, optional + Optional snapshot name. If not provided a timestamped id is returned. + + Returns + ------- + str + Snapshot id. + """ + store = _ensure_snapshot_store(self) + return store.snapshot(self, name=name) + + def restore(self, name: str, inplace: bool = False): + """ + Restore a previously created snapshot. + + Parameters + ---------- + name : str + Snapshot id returned by :meth:`DataFrame.snapshot`. + inplace : bool, default False + If True, mutate this DataFrame to match the snapshot. Otherwise return + a restored copy. + + Returns + ------- + DataFrame or None + Restored DataFrame when ``inplace=False``, otherwise None. + """ + store = getattr(self, "_version_snapshots", None) + if store is None: + raise KeyError(f"No snapshots present for this DataFrame (requested: {name})") + restored = store.restore(name) + if inplace: + # Replace internal state. Using _mgr replacement is more correct than __dict__ update. + # Many pandas internals use the attribute _mgr for BlockManager. Use it cautiously. + try: + # pandas >= 1.x use _mgr (BlockManager); adapt if different in your branch. + object.__setattr__(self, "_mgr", restored._mgr) + # also copy other key attrs + object.__setattr__(self, "axes", restored.axes) + object.__setattr__(self, "_item_cache", restored._item_cache) + except Exception: + # fallback: shallow __dict__ update (less safe) + self.__dict__.update(restored.__dict__) + return None + return restored + + def list_snapshots(self) -> list[str]: + """ + List snapshot ids for this DataFrame. + """ + store = getattr(self, "_version_snapshots", None) + return store.list() if store is not None else [] + + def drop_snapshot(self, name: str) -> None: + """ + Drop a snapshot by id. + """ + store = getattr(self, "_version_snapshots", None) + if store is None: + raise KeyError(f"No snapshots present for this DataFrame (requested drop: {name})") + store.drop(name) + + def clear_snapshots(self) -> None: + """ + Clear all snapshots for this DataFrame. + """ + store = getattr(self, "_version_snapshots", None) + if store is not None: + store.clear() + + def snapshot_info(self, name: Optional[str] = None) -> dict: + """ + Return metadata for all snapshots or a single snapshot. + """ + store = getattr(self, "_version_snapshots", None) + if store is None: + return {"count": 0, "snapshots": []} + return store.info(name) + def _from_nested_dict( @@ -14390,3 +14480,12 @@ def _reindex_for_setitem( "incompatible index of inserted column with frame index" ) from err return reindexed_value, None + +def _ensure_snapshot_store(self) -> DataFrameSnapshotStore: + # attach a per-instance store to DataFrame + store = getattr(self, "_version_snapshots", None) + if store is None: + store = DataFrameSnapshotStore() + # attach to object + object.__setattr__(self, "_version_snapshots", store) + return store From d50c42094dd1b5d8f86a6d51f5655a300a2eac1d Mon Sep 17 00:00:00 2001 From: AkiTheMemeGod Date: Sat, 25 Oct 2025 14:43:52 +0530 Subject: [PATCH 4/7] ENH: Add unit tests for DataFrame snapshot and restore functionality --- pandas/tests/frame/test_versioning.py | 54 +++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) create mode 100644 pandas/tests/frame/test_versioning.py diff --git a/pandas/tests/frame/test_versioning.py b/pandas/tests/frame/test_versioning.py new file mode 100644 index 0000000000000..e8dfd5aa7a425 --- /dev/null +++ b/pandas/tests/frame/test_versioning.py @@ -0,0 +1,54 @@ +# pandas/tests/frame/test_versioning.py +import pandas as pd +import pytest + + +def test_snapshot_and_restore_returns_dataframe(): + df = pd.DataFrame({"x": [1, 2, 3]}) + sid = df.snapshot("t1") + assert sid in df.list_snapshots() + df.loc[0, "x"] = 99 + restored = df.restore(sid) + assert list(restored["x"]) == [1, 2, 3] + + +def test_restore_inplace_mutates_dataframe(): + df = pd.DataFrame({"x": [1, 2, 3]}) + sid = df.snapshot("t2") + df.loc[1, "x"] = 999 + df.restore(sid, inplace=True) + assert list(df["x"]) == [1, 2, 3] + + +def test_drop_and_clear_behaviour(): + df = pd.DataFrame({"a": [1, 2]}) + sid1 = df.snapshot("s1") + sid2 = df.snapshot("s2") + assert set(df.list_snapshots()) == {sid1, sid2} + df.drop_snapshot(sid1) + assert sid1 not in df.list_snapshots() + df.clear_snapshots() + assert df.list_snapshots() == [] + + +def test_snapshot_on_empty_dataframe(): + df = pd.DataFrame() + sid = df.snapshot() + df.loc[0, "a"] = 1 + restored = df.restore(sid) + assert restored.empty + + +def test_copy_does_not_inherit_snapshots(): + df = pd.DataFrame({"a": [1, 2, 3]}) + sid = df.snapshot("orig") + df2 = df.copy() + # design decision: copies do not copy snapshots + assert df2.list_snapshots() == [] + assert sid in df.list_snapshots() + + +def test_missing_snapshot_raises(): + df = pd.DataFrame({"x": [1]}) + with pytest.raises(KeyError): + df.restore("no-such-snapshot") From ee251a3a399ecbe90754d4bb2330ffb1675aa042 Mon Sep 17 00:00:00 2001 From: AkiTheMemeGod Date: Sat, 25 Oct 2025 15:01:50 +0530 Subject: [PATCH 5/7] ENH: Improve type hinting for DataFrame snapshot methods --- pandas/core/frame_versioning.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/pandas/core/frame_versioning.py b/pandas/core/frame_versioning.py index 7fd6c71f75cf0..6a63b347b2b16 100644 --- a/pandas/core/frame_versioning.py +++ b/pandas/core/frame_versioning.py @@ -1,12 +1,12 @@ -# pandas/core/frame_versioning.py from __future__ import annotations from dataclasses import dataclass from datetime import datetime import uuid -from typing import Dict, Optional +from typing import Dict, Optional, TYPE_CHECKING -import pandas as pd +if TYPE_CHECKING: + import pandas as pd # only used for type hints def _generate_snapshot_id(name: Optional[str] = None) -> str: @@ -31,17 +31,17 @@ class DataFrameSnapshotStore: def __init__(self) -> None: # snapshot_id -> DataFrame - self._snapshots: Dict[str, pd.DataFrame] = {} + self._snapshots: Dict[str, "pd.DataFrame"] = {} self._meta: Dict[str, SnapshotMeta] = {} - def snapshot(self, df: pd.DataFrame, name: Optional[str] = None) -> str: + def snapshot(self, df: "pd.DataFrame", name: Optional[str] = None) -> str: sid = _generate_snapshot_id(name) # deep copy for safety self._snapshots[sid] = df.copy(deep=True) self._meta[sid] = SnapshotMeta(name=sid, created_at=datetime.utcnow()) return sid - def restore(self, name: str) -> pd.DataFrame: + def restore(self, name: str) -> "pd.DataFrame": if name not in self._snapshots: raise KeyError(f"Snapshot not found: {name}") # return a deep copy so modifications don't change stored snapshot @@ -66,4 +66,5 @@ def info(self, name: Optional[str] = None) -> dict: raise KeyError(f"Snapshot not found: {name}") meta = self._meta[name] return {"name": meta.name, "created_at": meta.created_at.isoformat()} - return {"count": len(self._snapshots), "snapshots": [m.name for m in self._meta.values()]} + return {"count": len(self._snapshots), + "snapshots": [m.name for m in self._meta.values()]} From 3dd4dce1ea2c20e8ce342ae31a53ba33764c4c03 Mon Sep 17 00:00:00 2001 From: AkiTheMemeGod Date: Sat, 25 Oct 2025 15:01:56 +0530 Subject: [PATCH 6/7] ENH: Refactor KeyError messages for DataFrame snapshot methods for improved readability --- pandas/core/frame.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index db17d67cefddc..492669583fe12 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -270,7 +270,7 @@ from pandas.io.formats.style import Styler - from pandas.core.frame_versioning import DataFrameSnapshotStore +from pandas.core.frame_versioning import DataFrameSnapshotStore # --------------------------------------------------------------------- # Docstring templates @@ -14393,13 +14393,10 @@ def restore(self, name: str, inplace: bool = False): """ store = getattr(self, "_version_snapshots", None) if store is None: - raise KeyError(f"No snapshots present for this DataFrame (requested: {name})") + raise KeyError(f"No snapshots present for this DataFrame(requested:{name})") restored = store.restore(name) if inplace: - # Replace internal state. Using _mgr replacement is more correct than __dict__ update. - # Many pandas internals use the attribute _mgr for BlockManager. Use it cautiously. try: - # pandas >= 1.x use _mgr (BlockManager); adapt if different in your branch. object.__setattr__(self, "_mgr", restored._mgr) # also copy other key attrs object.__setattr__(self, "axes", restored.axes) @@ -14423,7 +14420,7 @@ def drop_snapshot(self, name: str) -> None: """ store = getattr(self, "_version_snapshots", None) if store is None: - raise KeyError(f"No snapshots present for this DataFrame (requested drop: {name})") + raise KeyError(f"No snapshots present for this DataFrame(requested drop:{name})") store.drop(name) def clear_snapshots(self) -> None: From 3884e667ff256fdbc91d09bd4712cddcde9209cf Mon Sep 17 00:00:00 2001 From: AkiTheMemeGod Date: Sat, 25 Oct 2025 15:02:00 +0530 Subject: [PATCH 7/7] DEL: Remove benchmark script for DataFrame snapshot memory testing --- asv_bench/benchmarks/bench_snapshot_memory.py | 17 ----------------- 1 file changed, 17 deletions(-) delete mode 100644 asv_bench/benchmarks/bench_snapshot_memory.py diff --git a/asv_bench/benchmarks/bench_snapshot_memory.py b/asv_bench/benchmarks/bench_snapshot_memory.py deleted file mode 100644 index 544f9e5e47ad0..0000000000000 --- a/asv_bench/benchmarks/bench_snapshot_memory.py +++ /dev/null @@ -1,17 +0,0 @@ -import pandas as pd -import numpy as np -import tracemalloc - -def make_df(nrows=1_000_000): - return pd.DataFrame({ - "a": np.random.randint(0, 100, size=nrows), - "b": np.random.random(size=nrows), - "c": np.random.choice(list("abcdefghijklmnopqrstuvwxyz"), size=nrows) - }) - -df = make_df(200_000) -tracemalloc.start() -snap = df.snapshot("bench") -snap_shot = tracemalloc.take_snapshot() -top_stats = snap_shot.statistics('lineno') -print("Top memory stats:", top_stats[:3])