Skip to content

Commit 6d0095a

Browse files
committed
Provide a new object for copy routines that require information about their destination.
1 parent ee29930 commit 6d0095a

File tree

1 file changed

+235
-0
lines changed

1 file changed

+235
-0
lines changed
Lines changed: 235 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,235 @@
1+
2+
import weakref
3+
4+
import numpy as np
5+
import pandas as pd
6+
7+
import pytest
8+
9+
from pandas.testing import assert_frame_equal
10+
11+
12+
class StatsSummary(dict):
    """
    Cache of simple statistics for the numeric data of a pandas object.

    The mapping takes one of two layouts, chosen by the owner handed to
    ``__init__``:

    * owner exposing ``columns`` (a DataFrame): one entry per column whose
      dtype kind is ``"i"`` or ``"f"``; each value is a nested
      ``StatsSummary`` built from that column.
    * any other NDFrame (a Series): one entry per statistic returned by
      :meth:`stats`.

    Only a weak reference to the owner is stored, so the cache never keeps
    its owner alive.

    On deepcopy, the summary is rebuilt from the *destination* NDFrame when
    one is present in ``memo``, so the copy belongs to and reflects the new
    object.
    """

    def __init__(self, owner, *, cols=None):
        """
        Build the summary for ``owner``.

        Parameters
        ----------
        owner : pandas NDFrame
            Object the statistics are computed from and weakly bound to.
        cols : optional
            Accepted for signature compatibility; not used.
        """
        assert isinstance(owner, pd.core.generic.NDFrame), (
            f"owner must be a pandas NDFrame, got {type(owner).__name__}"
        )
        super().__init__()
        self._owner_ref = weakref.ref(owner)

        if hasattr(owner, "columns"):
            # Frame layout. Branching on the attribute (not on the number of
            # columns) keeps a column-less frame from being summarised as if
            # it were a Series.
            for column in owner.columns:
                values = owner[column]
                if values.dtype.kind in "if":
                    self[column] = type(self)(values)
        else:
            # Series layout: one scalar per statistic.
            for name, function in self.stats().items():
                self[name] = function(owner)

    @classmethod
    def stats(cls):
        """Return a mapping of statistic name -> callable taking a Series."""
        return {
            "cummin": lambda series: series.cummin().sum(),
            "cummax": lambda series: series.cummax().sum(),
            "kurtosis": lambda series: series.kurt(),
            "median": lambda series: series.median(),
        }

    @classmethod
    def gauge(cls, obj, columns):
        """
        Compute reference statistics as plain nested dicts.

        Parameters
        ----------
        obj : object indexable by the given column keys
        columns : iterable of (key, dtype) pairs
            Only the key is used; the dtype element is ignored.

        Returns
        -------
        dict
            ``{key: {statistic name: value}}`` for every key in ``columns``.
        """
        return {
            column: {
                name: function(obj[column])
                for name, function in cls.stats().items()
            }
            for column, _dtype in columns
        }

    @property
    def owner(self):
        """Return the bound NDFrame, or None once it has been collected."""
        return self._owner_ref()

    def __eq__(self, other) -> bool:
        """
        Compare entry by entry against another dict.

        Returns ``NotImplemented`` for non-dict operands and False when the
        key sets differ, instead of raising. Values are compared with
        ``==``, so NaN statistics never compare equal.
        """
        if not isinstance(other, dict):
            return NotImplemented
        if self.keys() != other.keys():
            return False
        return all(self[key] == other[key] for key in self)

    def __deepcopy__(self, memo):
        """
        Rebuild the summary for the copy's owner.

        The first NDFrame found among ``memo.values()`` is taken as the
        destination (NOTE(review): assumes the copy routine injects
        ``{id(dest): dest}`` into ``memo`` -- confirm against the caller).
        Without one, the current owner is reused so that a plain
        ``copy.deepcopy`` does not fail.

        Returns
        -------
        StatsSummary or None
            None when no owner is available, or when the target is a Series
            whose dtype kind is neither integer nor float.
        """
        target = next(
            (v for v in memo.values() if isinstance(v, pd.core.generic.NDFrame)),
            None,
        )
        if target is None:
            target = self.owner
        if target is None:
            return None
        if hasattr(target, "select_dtypes") or target.dtype.kind in "if":
            return type(self)(target)
        return None
60+
61+
62+
class FrozenHeadTail(dict):
    """
    Preview helper holding snapshots of the first and last two rows.

    The mapping has two entries, ``"head"`` and ``"tail"``, each a DataFrame
    built from the owner's ``values`` and ``index``. Only a weak reference
    to the owner is stored.

    On deepcopy, the preview is rebuilt from the destination NDFrame when
    one is present in ``memo``, so that it corresponds to the new object
    (e.g., after concat).
    """

    def __init__(self, owner, *, cols=None):
        """
        Build the preview for ``owner``.

        Parameters
        ----------
        owner : pandas NDFrame
            Object the snapshots are taken from and weakly bound to.
        cols : optional
            Accepted for signature compatibility; not used.
        """
        assert isinstance(owner, pd.core.generic.NDFrame), (
            f"owner must be a pandas NDFrame, got {type(owner).__name__}"
        )
        self._owner_ref = weakref.ref(owner)
        super().__init__(
            {name: function(owner) for name, function in self.stats().items()}
        )

    @property
    def owner(self):
        """Return the bound NDFrame, or None once it has been collected."""
        return self._owner_ref()

    @staticmethod
    def _snapshot(obj, rows):
        """Return the ``rows`` slice of ``obj`` as a new DataFrame."""
        # A Series has no ``columns``; its name becomes the single label.
        labels = list(getattr(obj, "columns", [])) or [obj.name]
        return pd.DataFrame(obj.values[rows], columns=labels, index=obj.index[rows])

    @classmethod
    def stats(cls):
        """Return a mapping of snapshot name -> callable taking an NDFrame."""
        return {
            "head": lambda x: cls._snapshot(x, slice(None, 2)),
            "tail": lambda x: cls._snapshot(x, slice(-2, None)),
        }

    def __eq__(self, other) -> bool:
        """
        Compare the snapshots frame by frame with ``assert_frame_equal``.

        Returns ``NotImplemented`` for non-dict operands and False when the
        key sets differ or any pair of frames is reported as different.
        """
        if not isinstance(other, dict):
            return NotImplemented
        if self.keys() != other.keys():
            return False
        try:
            for key in self:
                assert_frame_equal(self[key], other[key])
        except AssertionError:
            # Only a reported mismatch means "not equal"; interrupts and
            # other errors are no longer swallowed.
            return False
        return True

    def __deepcopy__(self, memo):
        """
        Rebuild the preview for the copy's owner.

        The first NDFrame found among ``memo.values()`` is taken as the
        destination (NOTE(review): assumes the copy routine injects
        ``{id(dest): dest}`` into ``memo`` -- confirm against the caller).
        Without one, the current owner is reused; None is returned when that
        owner no longer exists.
        """
        target = next(
            (v for v in memo.values() if isinstance(v, pd.core.generic.NDFrame)),
            None,
        )
        if target is None:
            target = self.owner
        if target is None:
            return None
        return type(self)(target)
100+
101+
102+
def _assert_summary_rebound(out):
    """Check that ``out.attrs["summary"]`` is bound to ``out`` and matches it."""
    summ = out.attrs.get("summary")
    numeric = [
        (column, dtype) for column, dtype in out.dtypes.items() if dtype.kind in "if"
    ]
    gage = StatsSummary.gauge(out, numeric)

    assert isinstance(summ, StatsSummary)

    # The cache should now belong to the *new* DataFrame
    assert summ.owner is out
    # Each numeric column's Series carries the matching per-column summary
    assert all(
        out[column].attrs["summary"] == out.attrs["summary"][column]
        for column in gage
    )
    # And stats reflect the destination (numeric subset only)
    assert summ == gage


def test_attrs_stats_summary_binds_to_destination_on_copy():
    """A StatsSummary in ``attrs`` follows every object derived from the frame."""
    # Sample data with mixed dtypes: two float columns and one string column
    dset = np.arange(8, dtype=float)
    np.random.shuffle(dset)
    labels = np.array(["waldo", "fred", "plugh", "thud"]).repeat(len(dset) // 4)
    df = pd.DataFrame({"foo": dset, "bar": dset * 2, "qux": labels})

    df.attrs["summary"] = StatsSummary(df)

    # Copy triggered by row slicing
    _assert_summary_rebound(df.iloc[: len(df) // 2])

    # Copy triggered by column selection
    _assert_summary_rebound(df[["foo", "qux"]])

    # Copy triggered by DataFrame concat
    left = df.iloc[len(df) // 4 :].copy(deep=True)
    right = df.iloc[len(df) // 4 :].copy(deep=True)
    _assert_summary_rebound(pd.concat([left, right]))

    # Arithmetic operations on DataFrame: flip the sign of random rows.
    # randint's upper bound is exclusive, so this draws from {0, 1} exactly
    # like the deprecated random_integers(0, 1, n) did.
    signs = np.random.randint(0, 2, len(df)) * 2 - 1
    _assert_summary_rebound(df[["foo", "bar"]].multiply(signs, axis=0))
179+
180+
181+
def test_attrs_stats_summary_works_for_series_too():
    """A StatsSummary built directly on a Series binds to it and matches it."""
    # Sample data with mixed dtypes
    values = np.arange(8, dtype=float)
    np.random.shuffle(values)
    labels = np.array(["waldo", "fred", "plugh", "thud"]).repeat(len(values) // 4)
    frame = pd.DataFrame({"foo": values, "bar": values * 2, "qux": labels})
    frame.attrs["summary"] = StatsSummary(frame)

    # Build a Series from two overlapping slices of one column
    column = frame["bar"]
    first_half = column.iloc[: len(column) // 2]
    last_three_quarters = column.iloc[len(column) // 4 :]
    out = pd.concat([first_half, last_three_quarters])

    summ = StatsSummary(out)
    out.attrs["summary"] = summ
    # gauge() indexes the Series with Ellipsis; unwrap that single entry
    reference = StatsSummary.gauge(out, [(Ellipsis, column.dtype)])
    gage = reference[Ellipsis]

    assert isinstance(summ, StatsSummary)

    # The cache should belong to the new Series
    assert summ.owner is out
    # And its stats reflect that Series
    assert summ == gage
204+
205+
206+
def test_attrs_headtail_probe_rebinds_on_concat_have_same_attrs():
    """A FrozenHeadTail in ``attrs`` is rebuilt for the result of a concat."""
    # Sample data: four float columns derived from one shuffled range
    base = np.arange(8, dtype=float)
    np.random.shuffle(base)
    df = pd.DataFrame(
        {"foo": base * 2, "bar": base * 4, "baz": base * 8, "qux": base * 16}
    )

    df.attrs["preview"] = FrozenHeadTail(df)

    # Both inputs derive from df, so they carry the same attrs
    # -> triggers the have_same_attrs=True branch
    fred = df.copy(deep=True)
    thud = df.iloc[[-2, -1, 0, 1]].sort_index()

    out = pd.concat([fred, thud], ignore_index=True)

    pr = out.attrs.get("preview")
    assert isinstance(pr, FrozenHeadTail)

    # The preview should be tied to the concatenated destination and reflect it
    assert pr.owner is out
    first_rows = out.iloc[:2]
    last_rows = out.iloc[-2:]
    assert_frame_equal(pr["head"], first_rows)
    assert_frame_equal(pr["tail"], last_rows)
229+
230+
231+
def test_attrs_empty_remains_empty_on_deepcopy():
    """Deep-copying a frame with empty ``attrs`` yields empty, independent attrs."""
    df = pd.DataFrame({"a": [1, 2]})
    assert df.attrs == {}

    out = df.copy(deep=True)
    assert out.attrs == {}

    # Equality with {} alone would also pass if both frames shared one dict,
    # so check that the copy owns its mapping and cannot leak into the source.
    assert out.attrs is not df.attrs
    out.attrs["marker"] = True
    assert df.attrs == {}

0 commit comments

Comments
 (0)