1+
2+ import weakref
3+
4+ import numpy as np
5+ import pandas as pd
6+
7+ import pytest
8+
9+ from pandas .testing import assert_frame_equal
10+
11+
class StatsSummary(dict):
    """
    Cache of simple statistics for the numeric data of a pandas object.

    Built from a DataFrame, it maps every column whose dtype kind is signed
    integer or float to a nested ``StatsSummary`` of that column.  Built from
    an object without a ``columns`` attribute (a Series), it maps every
    statistic name from :meth:`stats` to its value for that object.  Only a
    weak reference to the owner is kept.

    On deepcopy, the summary is rebuilt from the first NDFrame found in the
    ``memo`` mapping, so the copy belongs to and reflects that destination
    object.  When ``memo`` holds no NDFrame, the cached entries are copied as
    they are and the copy keeps pointing at the original owner.
    """

    # dtype kinds summarised: signed integer ("i") and float ("f").
    _NUMERIC_KINDS = "if"

    def __init__(self, owner, *, cols=None):
        """
        Compute and store the statistics of ``owner``.

        Parameters
        ----------
        owner : pandas NDFrame
            Object to summarise; referenced weakly.
        cols : optional
            Accepted for interface compatibility; not used.

        Raises
        ------
        TypeError
            If ``owner`` is not a pandas NDFrame.
        """
        if not isinstance(owner, pd.core.generic.NDFrame):
            raise TypeError(
                f"owner must be a pandas NDFrame, got {type(owner).__name__}"
            )
        self._owner_ref = weakref.ref(owner)

        columns = getattr(owner, "columns", None)
        if columns is None:
            # No columns: one entry per statistic, computed on the owner.
            entries = {
                name: function(owner)
                for name, function in self.stats().items()
            }
        else:
            # Columns present (possibly none): one nested summary per
            # numeric column.
            entries = {
                column: type(self)(owner[column])
                for column in columns
                if owner[column].dtype.kind in self._NUMERIC_KINDS
            }
        super().__init__(entries)

    @classmethod
    def stats(cls):
        """Return the mapping of statistic name to one-argument callable."""
        return {
            "cummin": lambda series: series.cummin().sum(),
            "cummax": lambda series: series.cummax().sum(),
            "kurtosis": lambda series: series.kurt(),
            "median": lambda series: series.median(),
        }

    @classmethod
    def gauge(cls, obj, columns):
        """
        Compute the expected statistics as plain nested dicts.

        Parameters
        ----------
        obj : object indexable by the given column keys
        columns : iterable of ``(column, dtype)`` pairs
            Only the column key is used; the dtype is ignored.

        Returns
        -------
        dict
            ``{column: {stat_name: value}}`` for every listed column.
        """
        functions = cls.stats()
        return {
            column: {
                name: function(obj[column])
                for name, function in functions.items()
            }
            for column, _dtype in columns
        }

    @property
    def owner(self):
        """The summarised object, or ``None`` once it has been collected."""
        return self._owner_ref()

    def __eq__(self, other) -> bool:
        """
        Compare entry by entry against another dict.

        Returns ``NotImplemented`` for non-dict operands and ``False`` when
        the key sets differ; otherwise every stored value must compare equal
        to the value under the same key in ``other``.
        """
        if not isinstance(other, dict):
            return NotImplemented
        if self.keys() != other.keys():
            return False
        return all(self[key] == other[key] for key in self)

    def _detached_copy(self):
        """Copy the cached entries and owner reference without recomputing."""
        # NOTE(review): nothing is recomputed from the owner here; rebuilding
        # would run pandas operations on an owner that may still carry this
        # summary in its attrs and could re-enter deepcopy -- confirm before
        # changing.
        clone = type(self).__new__(type(self))
        clone._owner_ref = self._owner_ref
        for key, value in self.items():
            if isinstance(value, StatsSummary):
                value = value._detached_copy()
            dict.__setitem__(clone, key, value)
        return clone

    def __deepcopy__(self, memo):
        """
        Rebind the summary to the destination NDFrame found in ``memo``.

        Per the original note, the pandas patch under test is expected to
        inject ``{id(dest): dest}`` into ``memo``.

        Returns
        -------
        StatsSummary or None
            A summary rebuilt from the destination when it is a DataFrame or
            a numeric Series; ``None`` for a non-numeric Series; a detached
            copy of the current entries when ``memo`` holds no NDFrame.
        """
        destination = next(
            (
                candidate
                for candidate in memo.values()
                if isinstance(candidate, pd.core.generic.NDFrame)
            ),
            None,
        )
        if destination is None:
            return self._detached_copy()
        if (
            hasattr(destination, "select_dtypes")
            or destination.dtype.kind in self._NUMERIC_KINDS
        ):
            return type(self)(destination)
        return None
60+
61+
class FrozenHeadTail(dict):
    """
    Preview helper storing snapshots of the first and last two rows.

    The snapshots are DataFrames kept under the keys ``"head"`` and
    ``"tail"``; only a weak reference to the owner is held.

    On deepcopy, the preview is rebuilt from the first NDFrame found in the
    ``memo`` mapping, so that it corresponds to the new object (e.g. after
    concat).  When ``memo`` holds no NDFrame, the stored snapshots are copied
    as they are and the copy keeps pointing at the original owner.
    """

    def __init__(self, owner, *, cols=None):
        """
        Take the head and tail snapshots of ``owner``.

        Parameters
        ----------
        owner : pandas NDFrame
            Object to preview; referenced weakly.
        cols : optional
            Accepted for interface compatibility; not used.

        Raises
        ------
        TypeError
            If ``owner`` is not a pandas NDFrame.
        """
        if not isinstance(owner, pd.core.generic.NDFrame):
            raise TypeError(
                f"owner must be a pandas NDFrame, got {type(owner).__name__}"
            )
        self._owner_ref = weakref.ref(owner)
        super().__init__(
            {name: function(owner) for name, function in self.stats().items()}
        )

    @property
    def owner(self):
        """The previewed object, or ``None`` once it has been collected."""
        return self._owner_ref()

    @staticmethod
    def _snapshot(obj, rows):
        """Build a DataFrame from the ``rows`` slice of ``obj``."""
        columns = getattr(obj, "columns", None)
        # An object without columns (a Series) contributes a single column
        # labelled with its name.
        labels = [obj.name] if columns is None else list(columns)
        # NOTE(review): the frame is assembled from ``.values`` instead of
        # ``.iloc``; presumably so that no attrs propagation is triggered
        # while a preview is being built -- confirm before changing.
        return pd.DataFrame(obj.values[rows], columns=labels, index=obj.index[rows])

    @classmethod
    def stats(cls):
        """Return the mapping of snapshot name to one-argument callable."""
        return {
            "head": lambda x: cls._snapshot(x, slice(None, 2)),
            "tail": lambda x: cls._snapshot(x, slice(-2, None)),
        }

    def __eq__(self, other) -> bool:
        """
        Compare the stored snapshots with those of another dict.

        Returns ``NotImplemented`` for non-dict operands and ``False`` when
        the key sets differ or any pair of snapshots fails
        ``assert_frame_equal``.
        """
        if not isinstance(other, dict):
            return NotImplemented
        if self.keys() != other.keys():
            return False
        try:
            for key in self:
                assert_frame_equal(self[key], other[key])
        except (AssertionError, TypeError, ValueError):
            # A failed comparison means "not equal"; interpreter-level
            # signals such as KeyboardInterrupt are left to propagate.
            return False
        return True

    def __deepcopy__(self, memo):
        """
        Rebind the preview to the destination NDFrame found in ``memo``.

        Returns
        -------
        FrozenHeadTail
            A preview rebuilt from the destination, or a detached copy of the
            stored snapshots when ``memo`` holds no NDFrame.
        """
        destination = next(
            (
                candidate
                for candidate in memo.values()
                if isinstance(candidate, pd.core.generic.NDFrame)
            ),
            None,
        )
        if destination is None:
            # No destination to rebuild from: copy the snapshots themselves
            # and keep the original owner reference.
            clone = type(self).__new__(type(self))
            clone._owner_ref = self._owner_ref
            for key, frame in self.items():
                dict.__setitem__(clone, key, frame.copy())
            return clone
        return type(self)(destination)
100+
101+
def _assert_summary_rebound(out):
    """Assert that ``out.attrs["summary"]`` was rebuilt for ``out`` itself."""
    summ = out.attrs.get("summary")
    numeric = [
        (column, dtype)
        for column, dtype in out.dtypes.items()
        if dtype.kind in "if"
    ]
    gage = StatsSummary.gauge(out, numeric)

    assert isinstance(summ, StatsSummary)

    # The cache should now belong to the *new* DataFrame.
    assert summ.owner is out
    # Each numeric column Series carries the summary stored for that column.
    assert all(out[column].attrs["summary"] == summ[column] for column in gage)
    # And the stats reflect the destination's numeric columns.
    assert summ == gage


def test_attrs_stats_summary_binds_to_destination_on_copy():
    """The summary in ``attrs`` follows the result of frame operations."""
    # Sample data with mixed dtypes: two float columns and one string column.
    dset = np.arange(8, dtype=float)
    np.random.shuffle(dset)
    labels = np.array(["waldo", "fred", "plugh", "thud"]).repeat(len(dset) // 4)
    df = pd.DataFrame({"foo": dset, "bar": dset * 2, "qux": labels})

    df.attrs["summary"] = StatsSummary(df)

    # Copy triggered by row slicing.
    _assert_summary_rebound(df.iloc[: len(df) // 2])

    # Copy triggered by column selection.
    _assert_summary_rebound(df[["foo", "qux"]])

    # Copy triggered by DataFrame concat.
    left = df.iloc[len(df) // 4 :].copy(deep=True)
    right = df.iloc[len(df) // 4 :].copy(deep=True)
    _assert_summary_rebound(pd.concat([left, right]))

    # Arithmetic operation on a DataFrame: multiply every row by a random
    # sign.  ``randint``'s upper bound is exclusive, so the draw is 0 or 1
    # and ``* 2 - 1`` maps it to -1 or +1.
    numeric = df[["foo", "bar"]]
    signs = np.random.randint(0, 2, len(numeric)) * 2 - 1
    _assert_summary_rebound(numeric.multiply(signs, axis=0))
179+
180+
def test_attrs_stats_summary_works_for_series_too():
    """A StatsSummary built directly on a Series holds that Series' stats."""
    # Sample data with mixed dtypes.
    values = np.arange(8, dtype=float)
    np.random.shuffle(values)
    frame = pd.DataFrame(
        {
            "foo": values,
            "bar": values * 2,
            "qux": np.array(["waldo", "fred", "plugh", "thud"]).repeat(
                len(values) // 4
            ),
        }
    )
    frame.attrs["summary"] = StatsSummary(frame)

    # Build a Series from two overlapping slices of one column.
    column = frame["bar"]
    first_half = column.iloc[: len(column) // 2]
    from_quarter = column.iloc[len(column) // 4 :]
    out = pd.concat([first_half, from_quarter])

    summary = StatsSummary(out)
    out.attrs["summary"] = summary
    # ``gauge`` indexes its first argument with each key; Ellipsis is the key
    # used here for the Series.
    expected = StatsSummary.gauge(out, [(Ellipsis, column.dtype)])[Ellipsis]

    assert isinstance(summary, StatsSummary)

    # The cache belongs to the concatenated Series.
    assert summary.owner is out
    # And its stats match those computed directly from that Series.
    assert summary == expected
204+
205+
def test_attrs_headtail_probe_rebinds_on_concat_have_same_attrs():
    """The preview in ``attrs`` is rebuilt for the result of ``pd.concat``."""
    # Sample data: four float columns derived from one shuffled range.
    base = np.arange(8, dtype=float)
    np.random.shuffle(base)
    frame = pd.DataFrame(
        {"foo": base * 2, "bar": base * 4, "baz": base * 8, "qux": base * 16}
    )

    frame.attrs["preview"] = FrozenHeadTail(frame)

    # One input is a full copy; the other holds the first two and last two
    # rows of the frame in index order.
    # NOTE(review): per the original comment this is meant to trigger the
    # have_same_attrs=True branch of concat -- confirm against pandas.
    full_copy = frame.copy(deep=True)
    edge_rows = frame.iloc[[-2, -1, 0, 1]].sort_index()

    combined = pd.concat([full_copy, edge_rows], ignore_index=True)

    preview = combined.attrs.get("preview")
    assert isinstance(preview, FrozenHeadTail)

    # The preview should be tied to the concatenated destination and
    # reflect its first and last two rows.
    assert preview.owner is combined
    assert_frame_equal(preview["head"], combined.iloc[:2])
    assert_frame_equal(preview["tail"], combined.iloc[-2:])
229+
230+
def test_attrs_empty_remains_empty_on_deepcopy():
    """A frame without attrs still has empty attrs after a deep copy."""
    source = pd.DataFrame({"a": [1, 2]})
    assert source.attrs == {}

    duplicate = source.copy(deep=True)
    assert duplicate.attrs == {}
0 commit comments