diff --git a/asv_bench/benchmarks/bench_snapshot_memory.py b/asv_bench/benchmarks/bench_snapshot_memory.py
new file mode 100644
index 0000000000000..544f9e5e47ad0
--- /dev/null
+++ b/asv_bench/benchmarks/bench_snapshot_memory.py
@@ -0,0 +1,34 @@
+"""Benchmarks for ``DataFrame.snapshot``."""
+
+import string
+
+import numpy as np
+
+import pandas as pd
+
+
+def make_df(nrows=1_000_000):
+    """Build a frame with one int, one float and one string column."""
+    # Fixed seed so every run benchmarks identical data.
+    rng = np.random.default_rng(0)
+    return pd.DataFrame(
+        {
+            "a": rng.integers(0, 100, size=nrows),
+            "b": rng.random(size=nrows),
+            "c": rng.choice(list(string.ascii_lowercase), size=nrows),
+        }
+    )
+
+
+class SnapshotMemory:
+    """Cost of taking a single snapshot of a 200,000-row frame."""
+
+    def setup(self):
+        # Built by asv before each benchmark method is run.
+        self.df = make_df(200_000)
+
+    def time_snapshot(self):
+        self.df.snapshot("bench")
+
+    def peakmem_snapshot(self):
+        self.df.snapshot("bench")
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 00c9cb9316ab7..db17d67cefddc 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -269,6 +269,8 @@
 
     from pandas.io.formats.style import Styler
 
+    from pandas.core.frame_versioning import DataFrameSnapshotStore
+
 # ---------------------------------------------------------------------
 # Docstring templates
 
@@ -14353,6 +14355,116 @@ def values(self) -> np.ndarray:
                ['monkey', nan, None]], dtype=object)
         """
         return self._mgr.as_array()
+
+    def snapshot(self, name: str | None = None) -> str:
+        """
+        Create a named snapshot of this DataFrame and return the snapshot id.
+
+        Parameters
+        ----------
+        name : str, optional
+            Snapshot name. If not provided, a timestamped id is generated.
+            Reusing an existing name replaces that snapshot.
+
+        Returns
+        -------
+        str
+            Snapshot id.
+        """
+        store = _ensure_snapshot_store(self)
+        return store.snapshot(self, name=name)
+
+    def restore(self, name: str, inplace: bool = False) -> DataFrame | None:
+        """
+        Restore a previously created snapshot.
+
+        Parameters
+        ----------
+        name : str
+            Snapshot id returned by :meth:`DataFrame.snapshot`.
+        inplace : bool, default False
+            If True, mutate this DataFrame to match the snapshot. Otherwise
+            return a restored copy.
+
+        Returns
+        -------
+        DataFrame or None
+            Restored DataFrame when ``inplace=False``, otherwise None.
+
+        Raises
+        ------
+        KeyError
+            If this DataFrame has no snapshots or ``name`` is not among them.
+        """
+        store = getattr(self, "_version_snapshots", None)
+        if store is None:
+            raise KeyError(
+                f"No snapshots present for this DataFrame (requested: {name})"
+            )
+        restored = store.restore(name)
+        if not inplace:
+            return restored
+        # ``restored`` is a fresh deep copy owned by this call, so its data can
+        # be adopted as-is. ``axes`` is a read-only property and is not assigned.
+        self._update_inplace(restored)
+        return None
+
+    def list_snapshots(self) -> list[str]:
+        """
+        List snapshot ids for this DataFrame.
+
+        Returns
+        -------
+        list of str
+            Snapshot ids; empty when no snapshot has been taken.
+        """
+        store = getattr(self, "_version_snapshots", None)
+        return store.list() if store is not None else []
+
+    def drop_snapshot(self, name: str) -> None:
+        """
+        Drop a snapshot by id.
+
+        Raises
+        ------
+        KeyError
+            If this DataFrame has no snapshots or ``name`` is not among them.
+        """
+        store = getattr(self, "_version_snapshots", None)
+        if store is None:
+            raise KeyError(
+                f"No snapshots present for this DataFrame (requested drop: {name})"
+            )
+        store.drop(name)
+
+    def clear_snapshots(self) -> None:
+        """
+        Clear all snapshots for this DataFrame.
+        """
+        store = getattr(self, "_version_snapshots", None)
+        if store is not None:
+            store.clear()
+
+    def snapshot_info(self, name: str | None = None) -> dict:
+        """
+        Return metadata for all snapshots or a single snapshot.
+
+        Parameters
+        ----------
+        name : str, optional
+            Snapshot id. If omitted, a summary of all snapshots is returned.
+
+        Returns
+        -------
+        dict
+            ``{"name", "created_at"}`` for a single snapshot, otherwise
+            ``{"count", "snapshots"}``. The empty summary is also returned,
+            regardless of ``name``, if no snapshot was ever taken.
+        """
+        store = getattr(self, "_version_snapshots", None)
+        if store is None:
+            return {"count": 0, "snapshots": []}
+        return store.info(name)
 
 
 def _from_nested_dict(
@@ -14390,3 +14502,20 @@ def _reindex_for_setitem(
                 "incompatible index of inserted column with frame index"
             ) from err
     return reindexed_value, None
+
+
+def _ensure_snapshot_store(self) -> DataFrameSnapshotStore:
+    """
+    Return the snapshot store attached to ``self``, creating it on first use.
+    """
+    store = getattr(self, "_version_snapshots", None)
+    if store is None:
+        # Imported at call time: the import near the top of this module is
+        # indented into a guarded block (presumably ``if TYPE_CHECKING:``), so
+        # the name may not exist at runtime.
+        from pandas.core.frame_versioning import DataFrameSnapshotStore
+
+        store = DataFrameSnapshotStore()
+        # object.__setattr__ sidesteps DataFrame's custom attribute handling.
+        object.__setattr__(self, "_version_snapshots", store)
+    return store
diff --git a/pandas/core/frame_versioning.py b/pandas/core/frame_versioning.py
new file mode 100644
index 0000000000000..7fd6c71f75cf0
--- /dev/null
+++ b/pandas/core/frame_versioning.py
@@ -0,0 +1,109 @@
+"""
+Snapshot storage backing ``DataFrame.snapshot`` and the related methods.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from datetime import (
+    datetime,
+    timezone,
+)
+from typing import TYPE_CHECKING
+import uuid
+
+if TYPE_CHECKING:
+    from pandas import DataFrame
+
+
+def _utcnow() -> datetime:
+    """Return the current time as a timezone-aware UTC datetime."""
+    # datetime.utcnow() is deprecated since Python 3.12 and returns a naive value.
+    return datetime.now(timezone.utc)
+
+
+def _generate_snapshot_id(name: str | None = None) -> str:
+    """Return ``name`` if non-empty, else a unique timestamp-based id."""
+    if name:
+        return name
+    ts = _utcnow().strftime("%Y%m%dT%H%M%S%fZ")
+    uid = uuid.uuid4().hex[:8]
+    return f"{ts}-{uid}"
+
+
+@dataclass
+class SnapshotMeta:
+    """Metadata recorded for one snapshot."""
+
+    # Snapshot id (the user-supplied name or a generated id).
+    name: str
+    # Time at which the snapshot was stored.
+    created_at: datetime
+
+
+class DataFrameSnapshotStore:
+    """
+    Per-DataFrame snapshot store.
+
+    Stores deep copies of DataFrames (safe, simple).
+    """
+
+    def __init__(self) -> None:
+        # snapshot id -> stored deep copy
+        self._snapshots: dict[str, DataFrame] = {}
+        # snapshot id -> metadata; kept in step with ``_snapshots``
+        self._meta: dict[str, SnapshotMeta] = {}
+
+    def snapshot(self, df: DataFrame, name: str | None = None) -> str:
+        """
+        Store a deep copy of ``df`` and return its snapshot id.
+
+        An existing snapshot with the same id is replaced.
+        """
+        sid = _generate_snapshot_id(name)
+        self._snapshots[sid] = df.copy(deep=True)
+        self._meta[sid] = SnapshotMeta(name=sid, created_at=_utcnow())
+        return sid
+
+    def restore(self, name: str) -> DataFrame:
+        """
+        Return a deep copy of the snapshot ``name``.
+
+        Raises KeyError if there is no such snapshot.
+        """
+        if name not in self._snapshots:
+            raise KeyError(f"Snapshot not found: {name}")
+        # Hand out a copy so callers cannot modify the stored snapshot.
+        return self._snapshots[name].copy(deep=True)
+
+    def list(self) -> list[str]:
+        """Return all snapshot ids."""
+        return list(self._snapshots)
+
+    def drop(self, name: str) -> None:
+        """Remove the snapshot ``name``; raise KeyError if it does not exist."""
+        if name not in self._snapshots:
+            raise KeyError(f"Snapshot not found: {name}")
+        del self._snapshots[name]
+        del self._meta[name]
+
+    def clear(self) -> None:
+        """Remove every snapshot."""
+        self._snapshots.clear()
+        self._meta.clear()
+
+    def info(self, name: str | None = None) -> dict:
+        """
+        Return metadata for one snapshot, or a summary when ``name`` is empty.
+
+        Raises KeyError if ``name`` is given but unknown.
+        """
+        if name:
+            if name not in self._meta:
+                raise KeyError(f"Snapshot not found: {name}")
+            meta = self._meta[name]
+            return {"name": meta.name, "created_at": meta.created_at.isoformat()}
+        return {
+            "count": len(self._snapshots),
+            "snapshots": [meta.name for meta in self._meta.values()],
+        }
diff --git a/pandas/tests/frame/test_versioning.py b/pandas/tests/frame/test_versioning.py
new file mode 100644
index 0000000000000..e8dfd5aa7a425
--- /dev/null
+++ b/pandas/tests/frame/test_versioning.py
@@ -0,0 +1,78 @@
+import pytest
+
+import pandas as pd
+
+
+def test_snapshot_and_restore_returns_dataframe():
+    df = pd.DataFrame({"x": [1, 2, 3]})
+    sid = df.snapshot("t1")
+    assert sid in df.list_snapshots()
+    df.loc[0, "x"] = 99
+    restored = df.restore(sid)
+    assert list(restored["x"]) == [1, 2, 3]
+    # a non-inplace restore leaves the live frame alone
+    assert list(df["x"]) == [99, 2, 3]
+
+
+def test_restore_inplace_mutates_dataframe():
+    df = pd.DataFrame({"x": [1, 2, 3]})
+    sid = df.snapshot("t2")
+    df.loc[1, "x"] = 999
+    assert df.restore(sid, inplace=True) is None
+    assert list(df["x"]) == [1, 2, 3]
+
+
+def test_restore_inplace_keeps_snapshots():
+    df = pd.DataFrame({"x": [1, 2, 3]})
+    sid = df.snapshot("keep")
+    df["y"] = 0
+    df.restore(sid, inplace=True)
+    assert list(df.columns) == ["x"]
+    assert df.list_snapshots() == [sid]
+
+
+def test_drop_and_clear_behaviour():
+    df = pd.DataFrame({"a": [1, 2]})
+    sid1 = df.snapshot("s1")
+    sid2 = df.snapshot("s2")
+    assert set(df.list_snapshots()) == {sid1, sid2}
+    df.drop_snapshot(sid1)
+    assert sid1 not in df.list_snapshots()
+    df.clear_snapshots()
+    assert df.list_snapshots() == []
+
+
+def test_snapshot_on_empty_dataframe():
+    df = pd.DataFrame()
+    sid = df.snapshot()
+    df.loc[0, "a"] = 1
+    restored = df.restore(sid)
+    assert restored.empty
+
+
+def test_copy_does_not_inherit_snapshots():
+    df = pd.DataFrame({"a": [1, 2, 3]})
+    sid = df.snapshot("orig")
+    df2 = df.copy()
+    # design decision: copies do not copy snapshots
+    assert df2.list_snapshots() == []
+    assert sid in df.list_snapshots()
+
+
+def test_snapshot_info():
+    df = pd.DataFrame({"x": [1]})
+    assert df.snapshot_info() == {"count": 0, "snapshots": []}
+    sid = df.snapshot("s")
+    assert df.snapshot_info() == {"count": 1, "snapshots": [sid]}
+    assert df.snapshot_info(sid)["name"] == sid
+    with pytest.raises(KeyError, match="Snapshot not found"):
+        df.snapshot_info("missing")
+
+
+def test_missing_snapshot_raises():
+    df = pd.DataFrame({"x": [1]})
+    with pytest.raises(KeyError, match="No snapshots present"):
+        df.restore("no-such-snapshot")
+    df.snapshot("present")
+    with pytest.raises(KeyError, match="Snapshot not found"):
+        df.restore("no-such-snapshot")