Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions asv_bench/benchmarks/bench_snapshot_memory.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
import pandas as pd
import numpy as np
import tracemalloc

def make_df(nrows=1_000_000, seed=None):
    """
    Build a three-column DataFrame of random data for the benchmark.

    Parameters
    ----------
    nrows : int, default 1_000_000
        Number of rows to generate.
    seed : int, optional
        Seed for the random generator. Pass a fixed value to get the same
        frame on every run; the default draws fresh entropy each call.

    Returns
    -------
    DataFrame
        Column ``a`` holds integers in ``[0, 100)``, ``b`` floats in
        ``[0, 1)`` and ``c`` single lowercase ASCII letters.
    """
    # One local generator keeps the three columns reproducible together
    # without touching NumPy's global random state.
    rng = np.random.default_rng(seed)
    letters = list("abcdefghijklmnopqrstuvwxyz")
    return pd.DataFrame(
        {
            "a": rng.integers(0, 100, size=nrows),
            "b": rng.random(size=nrows),
            "c": rng.choice(letters, size=nrows),
        }
    )

# Build the frame before tracing starts so its own allocations are not
# part of the trace.
df = make_df(200_000)
tracemalloc.start()
try:
    snap = df.snapshot("bench")
    snap_shot = tracemalloc.take_snapshot()
finally:
    # Tracing adds overhead to every allocation; switch it off again even
    # if taking the snapshot fails.
    tracemalloc.stop()
# Group traced allocations by source line and report the first three entries.
top_stats = snap_shot.statistics('lineno')
print("Top memory stats:", top_stats[:3])
99 changes: 99 additions & 0 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
Self,
cast,
overload,
Optional,
)
import warnings

Expand Down Expand Up @@ -269,6 +270,8 @@

from pandas.io.formats.style import Styler

from pandas.core.frame_versioning import DataFrameSnapshotStore

# ---------------------------------------------------------------------
# Docstring templates

Expand Down Expand Up @@ -14353,6 +14356,93 @@ def values(self) -> np.ndarray:
['monkey', nan, None]], dtype=object)
"""
return self._mgr.as_array()

def snapshot(self, name: Optional[str] = None) -> str:
    """
    Record the current state of this DataFrame under a snapshot id.

    Parameters
    ----------
    name : str, optional
        Id to store the snapshot under. When omitted, the snapshot store
        chooses the id.

    Returns
    -------
    str
        Id of the stored snapshot, as reported by the store.
    """
    # The per-instance store is created on first use.
    return _ensure_snapshot_store(self).snapshot(self, name=name)

def restore(self, name: str, inplace: bool = False) -> DataFrame | None:
    """
    Restore a previously created snapshot.

    Parameters
    ----------
    name : str
        Snapshot id returned by :meth:`DataFrame.snapshot`.
    inplace : bool, default False
        If True, mutate this DataFrame to match the snapshot. Otherwise return
        a restored copy.

    Returns
    -------
    DataFrame or None
        Restored DataFrame when ``inplace=False``, otherwise None.

    Raises
    ------
    KeyError
        If no snapshot store is attached to this DataFrame. Lookup errors
        raised by the store for an unknown ``name`` propagate unchanged.
    """
    store = getattr(self, "_version_snapshots", None)
    if store is None:
        raise KeyError(f"No snapshots present for this DataFrame (requested: {name})")
    restored = store.restore(name)
    if not inplace:
        return restored
    # Adopt the restored frame's internals through the same hook pandas'
    # own inplace operations use. Assigning ``axes`` directly cannot work
    # (it is a read-only property), and a blanket ``except`` around
    # attribute surgery would hide genuine failures.
    self._update_inplace(restored)
    return None

def list_snapshots(self) -> list[str]:
    """
    Return the ids of the snapshots held for this DataFrame.

    Returns
    -------
    list of str
        Ids reported by the snapshot store, or an empty list when no store
        is attached to this object.
    """
    store = getattr(self, "_version_snapshots", None)
    if store is None:
        return []
    return store.list()

def drop_snapshot(self, name: str) -> None:
    """
    Remove one snapshot from this DataFrame's snapshot store.

    Parameters
    ----------
    name : str
        Id of the snapshot to remove.

    Raises
    ------
    KeyError
        If no snapshot store is attached to this object. Errors raised by
        the store itself propagate unchanged.
    """
    store = getattr(self, "_version_snapshots", None)
    if store is not None:
        store.drop(name)
        return
    raise KeyError(f"No snapshots present for this DataFrame (requested drop: {name})")

def clear_snapshots(self) -> None:
    """
    Discard every snapshot held for this DataFrame.

    Does nothing when no snapshot store is attached to this object.
    """
    store = getattr(self, "_version_snapshots", None)
    if store is None:
        return
    store.clear()

def snapshot_info(self, name: Optional[str] = None) -> dict:
    """
    Return snapshot metadata as reported by the snapshot store.

    Parameters
    ----------
    name : str, optional
        Snapshot id to look up. Passed through to the store unchanged.

    Returns
    -------
    dict
        Whatever the store's ``info`` returns. When no store is attached,
        an empty summary ``{"count": 0, "snapshots": []}`` is returned
        regardless of ``name``.
    """
    store = getattr(self, "_version_snapshots", None)
    if store is not None:
        return store.info(name)
    return {"count": 0, "snapshots": []}



def _from_nested_dict(
Expand Down Expand Up @@ -14390,3 +14480,12 @@ def _reindex_for_setitem(
"incompatible index of inserted column with frame index"
) from err
return reindexed_value, None

def _ensure_snapshot_store(self) -> DataFrameSnapshotStore:
    """
    Return the snapshot store attached to ``self``, creating it if absent.

    The store lives on the instance under the ``_version_snapshots``
    attribute.
    """
    existing = getattr(self, "_version_snapshots", None)
    if existing is not None:
        return existing
    created = DataFrameSnapshotStore()
    # object.__setattr__ sets the attribute without going through any
    # __setattr__ override on the instance's class.
    object.__setattr__(self, "_version_snapshots", created)
    return created
69 changes: 69 additions & 0 deletions pandas/core/frame_versioning.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
# pandas/core/frame_versioning.py
from __future__ import annotations

from dataclasses import dataclass
from datetime import (
    datetime,
    timezone,
)
from typing import Dict, Optional
import uuid

import pandas as pd


def _generate_snapshot_id(name: Optional[str] = None) -> str:
if name:
return name
ts = datetime.utcnow().strftime("%Y%m%dT%H%M%S%fZ")
uid = uuid.uuid4().hex[:8]
return f"{ts}-{uid}"


@dataclass
class SnapshotMeta:
    """
    Bookkeeping record kept for a stored snapshot.
    """

    # Id of the snapshot this record belongs to.
    name: str
    # Timestamp recorded for the snapshot.
    created_at: datetime


class DataFrameSnapshotStore:
    """
    Per-DataFrame snapshot store.

    Each snapshot is kept as a deep copy of the frame, and every restore
    hands out another deep copy, so neither later edits to the live frame
    nor edits to a restored frame can alter what is stored.
    """

    def __init__(self) -> None:
        # snapshot id -> deep copy of the frame taken at snapshot time
        self._snapshots: Dict[str, pd.DataFrame] = {}
        # snapshot id -> bookkeeping record; written and removed together
        # with the matching ``_snapshots`` entry
        self._meta: Dict[str, SnapshotMeta] = {}

    def _require(self, name: str) -> None:
        """Raise ``KeyError`` unless a snapshot called ``name`` is stored."""
        if name not in self._snapshots:
            raise KeyError(f"Snapshot not found: {name}")

    def snapshot(self, df: pd.DataFrame, name: Optional[str] = None) -> str:
        """
        Store a deep copy of ``df`` and return the id it was stored under.

        Storing under an id that is already in use replaces the earlier
        snapshot and its metadata.
        """
        sid = _generate_snapshot_id(name)
        # Deep copy so later edits to ``df`` cannot reach the stored data.
        self._snapshots[sid] = df.copy(deep=True)
        # Timezone-aware UTC; datetime.utcnow() is deprecated since
        # Python 3.12 and returns a naive value.
        self._meta[sid] = SnapshotMeta(name=sid, created_at=datetime.now(timezone.utc))
        return sid

    def restore(self, name: str) -> pd.DataFrame:
        """
        Return a deep copy of the snapshot stored under ``name``.

        Raises
        ------
        KeyError
            If no snapshot with that id is stored.
        """
        self._require(name)
        # Copy again so the caller can modify the result freely.
        return self._snapshots[name].copy(deep=True)

    def list(self) -> list[str]:
        """Return the stored snapshot ids in insertion order."""
        return list(self._snapshots)

    def drop(self, name: str) -> None:
        """
        Remove the snapshot stored under ``name`` together with its metadata.

        Raises
        ------
        KeyError
            If no snapshot with that id is stored.
        """
        self._require(name)
        del self._snapshots[name]
        del self._meta[name]

    def clear(self) -> None:
        """Remove every snapshot and all metadata."""
        self._snapshots.clear()
        self._meta.clear()

    def info(self, name: Optional[str] = None) -> dict:
        """
        Describe one snapshot, or summarise the whole store.

        Parameters
        ----------
        name : str, optional
            Snapshot id. When omitted (or empty), a summary is returned.

        Returns
        -------
        dict
            For a single snapshot: ``{"name", "created_at"}`` where
            ``created_at`` is an ISO 8601 string carrying a UTC offset.
            Otherwise ``{"count", "snapshots"}`` with the number of stored
            snapshots and their ids.

        Raises
        ------
        KeyError
            If ``name`` is given but no such snapshot is stored.
        """
        if not name:
            return {
                "count": len(self._snapshots),
                "snapshots": [m.name for m in self._meta.values()],
            }
        self._require(name)
        meta = self._meta[name]
        return {"name": meta.name, "created_at": meta.created_at.isoformat()}
54 changes: 54 additions & 0 deletions pandas/tests/frame/test_versioning.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
# pandas/tests/frame/test_versioning.py
import pandas as pd
import pytest


def test_snapshot_and_restore_returns_dataframe():
    # A default (non-inplace) restore returns the snapshotted values even
    # after the live frame has been edited.
    frame = pd.DataFrame({"x": [1, 2, 3]})
    snapshot_id = frame.snapshot("t1")
    assert snapshot_id in frame.list_snapshots()

    frame.loc[0, "x"] = 99
    result = frame.restore(snapshot_id)
    assert result["x"].tolist() == [1, 2, 3]


def test_restore_inplace_mutates_dataframe():
    # With inplace=True the live frame itself goes back to the snapshot.
    frame = pd.DataFrame({"x": [1, 2, 3]})
    snapshot_id = frame.snapshot("t2")

    frame.loc[1, "x"] = 999
    frame.restore(snapshot_id, inplace=True)

    assert frame["x"].tolist() == [1, 2, 3]


def test_drop_and_clear_behaviour():
    # Dropping removes one id; clearing removes whatever is left.
    frame = pd.DataFrame({"a": [1, 2]})
    first = frame.snapshot("s1")
    second = frame.snapshot("s2")
    assert set(frame.list_snapshots()) == {first, second}

    frame.drop_snapshot(first)
    assert first not in frame.list_snapshots()

    frame.clear_snapshots()
    assert frame.list_snapshots() == []


def test_snapshot_on_empty_dataframe():
    # A snapshot of an empty frame stays empty after the live frame grows.
    frame = pd.DataFrame()
    snapshot_id = frame.snapshot()

    frame.loc[0, "a"] = 1
    result = frame.restore(snapshot_id)

    assert result.empty


def test_copy_does_not_inherit_snapshots():
    # Design decision under test: DataFrame.copy() starts with no snapshots,
    # while the source frame keeps its own.
    original = pd.DataFrame({"a": [1, 2, 3]})
    snapshot_id = original.snapshot("orig")

    duplicate = original.copy()

    assert duplicate.list_snapshots() == []
    assert snapshot_id in original.list_snapshots()


def test_missing_snapshot_raises():
    # Restoring on a frame that never took a snapshot must fail with the
    # dedicated "no snapshots" error. Matching on the message keeps an
    # unrelated KeyError from satisfying the test.
    df = pd.DataFrame({"x": [1]})
    with pytest.raises(KeyError, match="No snapshots present"):
        df.restore("no-such-snapshot")
Loading