From 61aba594149640a910f27ab4f16d2071a34617cb Mon Sep 17 00:00:00 2001 From: dangreb Date: Sat, 16 Aug 2025 04:44:01 -0300 Subject: [PATCH 1/8] Provide new object for copy routines requiring info. on its destination. --- pandas/core/generic.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 9840af15f1249..01f7ae5a0ea7e 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6112,7 +6112,10 @@ def __finalize__(self, other, method: str | None = None, **kwargs) -> Self: # impact if attrs are not used; i.e. attrs is an empty dict. # One could make the deepcopy unconditionally, but a deepcopy # of an empty dict is 50x more expensive than the empty check. - self.attrs = deepcopy(other.attrs) + # We provide the new dataset via the deepcopy memo to properly + # supply eventual attribute copy routines requiring information + # from its destination + self.attrs = deepcopy(other.attrs, memo={object:self}) self.flags.allows_duplicate_labels = ( self.flags.allows_duplicate_labels and other.flags.allows_duplicate_labels From 5f0627215aa4a74544e4de2d5d34153ae0a758b9 Mon Sep 17 00:00:00 2001 From: dangreb Date: Sat, 16 Aug 2025 05:13:36 -0300 Subject: [PATCH 2/8] Provide new object for copy routines requiring info. on its destination. --- pandas/core/generic.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 01f7ae5a0ea7e..f00b120c2d186 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6115,7 +6115,7 @@ def __finalize__(self, other, method: str | None = None, **kwargs) -> Self: # We provide the new dataset via the deepcopy memo to properly # supply eventual attribute copy routines requiring information # from its destination - self.attrs = deepcopy(other.attrs, memo={object:self}) + self.attrs = deepcopy(other.attrs, memo={object(): self}) self.flags.allows_duplicate_labels = ( self.flags.allows_duplicate_labels and other.flags.allows_duplicate_labels @@ -6133,7 +6133,7 @@ def __finalize__(self, other, method: str | None = None, **kwargs) -> Self: attrs = objs[0].attrs have_same_attrs = all(obj.attrs == attrs for obj in objs[1:]) if have_same_attrs: - self.attrs = deepcopy(attrs) + self.attrs = deepcopy(attrs, memo={object(): self}) allows_duplicate_labels = all(x.flags.allows_duplicate_labels for x in objs) self.flags.allows_duplicate_labels = allows_duplicate_labels From daacf51c2c79916aec403cbc2626127fc1713979 Mon Sep 17 00:00:00 2001 From: dangreb Date: Sat, 16 Aug 2025 05:58:09 -0300 Subject: [PATCH 3/8] Provide new object for copy routines requiring info. on its destination. --- pandas/core/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index f00b120c2d186..6c8de8a1e2ba4 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6114,7 +6114,7 @@ def __finalize__(self, other, method: str | None = None, **kwargs) -> Self: # of an empty dict is 50x more expensive than the empty check. # We provide the new dataset via the deepcopy memo to properly # supply eventual attribute copy routines requiring information - # from its destination + # from its destination. self.attrs = deepcopy(other.attrs, memo={object(): self}) self.flags.allows_duplicate_labels = ( self.flags.allows_duplicate_labels From ee29930887aeeef2c9409115f4fc786796204e33 Mon Sep 17 00:00:00 2001 From: dangreb Date: Sat, 16 Aug 2025 06:36:22 -0300 Subject: [PATCH 4/8] Provide new object for copy routines requiring info. on its destination. --- pandas/core/generic.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 6c8de8a1e2ba4..1c87137a1b4fb 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6115,7 +6115,7 @@ def __finalize__(self, other, method: str | None = None, **kwargs) -> Self: # We provide the new dataset via the deepcopy memo to properly # supply eventual attribute copy routines requiring information # from its destination. - self.attrs = deepcopy(other.attrs, memo={object(): self}) + self.attrs = deepcopy(other.attrs, memo={id(self): self}) self.flags.allows_duplicate_labels = ( self.flags.allows_duplicate_labels and other.flags.allows_duplicate_labels @@ -6133,7 +6133,7 @@ def __finalize__(self, other, method: str | None = None, **kwargs) -> Self: attrs = objs[0].attrs have_same_attrs = all(obj.attrs == attrs for obj in objs[1:]) if have_same_attrs: - self.attrs = deepcopy(attrs, memo={object(): self}) + self.attrs = deepcopy(attrs, memo={id(self): self}) allows_duplicate_labels = all(x.flags.allows_duplicate_labels for x in objs) self.flags.allows_duplicate_labels = allows_duplicate_labels From 6d0095ab22325728f7bc72e8f9bf1cc834e11536 Mon Sep 17 00:00:00 2001 From: dangreb Date: Wed, 20 Aug 2025 20:01:43 -0300 Subject: [PATCH 5/8] Provide new object for copy routines requiring info. on its destination. --- .../test_attrs_deepcopy_destination.py | 235 ++++++++++++++++++ 1 file changed, 235 insertions(+) create mode 100644 pandas/tests/generic/test_attrs_deepcopy_destination.py diff --git a/pandas/tests/generic/test_attrs_deepcopy_destination.py b/pandas/tests/generic/test_attrs_deepcopy_destination.py new file mode 100644 index 0000000000000..365db013b913a --- /dev/null +++ b/pandas/tests/generic/test_attrs_deepcopy_destination.py @@ -0,0 +1,235 @@ + +import weakref + +import numpy as np +import pandas as pd + +import pytest + +from pandas.testing import assert_frame_equal + + +class StatsSummary(dict): + """ + A lightweight, production-plausible cache object that stores simple stats + for numeric columns and keeps a weakref to its owning NDFrame. + + On deepcopy, it should bind to the *destination* NDFrame (if provided in memo) + and rebuild its stats from that destination, so the cache belongs to and + reflects the new object. + """ + + def __init__(self, owner, *, cols=None): + import pandas as pd + assert isinstance(owner, pd.core.generic.NDFrame) + self._owner_ref = weakref.ref(owner) + super(StatsSummary, self).__init__(dict((column, type(self)(owner[column])) for column in (list(getattr(owner, "columns", {})) or super(StatsSummary, self).__init__( + (name, function(owner)) for name, function in self.stats().items() + ) or {}) if owner[column].dtype.kind in "if")) + pass + + @classmethod + def stats(cls): + return dict( + cummin=lambda series: series.cummin().sum(), + cummax=lambda series: series.cummax().sum(), + kurtosis=lambda series: series.kurt(), + median=lambda series:series.median(), + ) + + @classmethod + def gauge(cls, obj, columns): + return dict(((column,dict([[name, function(obj[column])] for name, function in cls.stats().items()])) for column,dtyp in columns)) + + @property + def owner(self): + return self._owner_ref() + + def __eq__(self, other) -> bool: + outs = all(self[column] == other[column] for column in self) + return outs + + def __deepcopy__(self, memo): + import pandas as pd + # Find destination NDFrame in memo. The patch injects {id(dest): dest}. + new_owner = next( + (v for v in memo.values() if isinstance(v, pd.core.generic.NDFrame)), + None, + ) + return type(self)(new_owner) if hasattr(new_owner, "select_dtypes") or new_owner.dtype.kind in "if" else None + + +class FrozenHeadTail(dict): + """ + A preview helper that remembers first/last row 'snapshots' cheaply. + On deepcopy, it should rebuild from the destination NDFrame, so that the + preview corresponds to the new object (e.g., after concat). + """ + + def __init__(self, owner, *, cols=None): + import pandas as pd + assert isinstance(owner, pd.core.generic.NDFrame) + self._owner_ref = weakref.ref(owner) + super(FrozenHeadTail, self).__init__(dict((name, function(self.owner)) for name, function in self.stats().items())) + pass + + @property + def owner(self): + return self._owner_ref() + + @classmethod + def stats(cls): + return dict( + head=lambda x:pd.DataFrame(x.values[:2], columns=list(getattr(x,"columns",[])) or [x.name], index=x.index[:2]), + tail=lambda x:pd.DataFrame(x.values[-2:], columns=list(getattr(x,"columns",[])) or [x.name], index=x.index[-2:]), + ) + + def __eq__(self, other) -> bool: + try: + [assert_frame_equal(self[column], other[column]) for column in self] + return True + except: + return False + + def __deepcopy__(self, memo): + new_owner = next( + (v for v in memo.values() if isinstance(v, pd.core.generic.NDFrame)), + None, + ) + return type(self)(new_owner) + + +def test_attrs_stats_summary_binds_to_destination_on_copy(): + # Sample Data + dset = np.arange(8,dtype=float) + np.random.shuffle(dset) + + df = pd.DataFrame({"foo": dset, "bar": dset*2, "qux": np.array(["waldo","fred","plugh","thud"]).repeat(len(dset)//4)}) # mixed dtypes + + df.attrs["summary"] = StatsSummary(df) + + # -------------------------------------- + # Copy triggered by panel Y axis slicing + # -------------------------------------- + out = df.iloc[:len(df)//2] + summ = out.attrs.get("summary") + gage = StatsSummary.gauge(out, list(filter(lambda x:x[-1].kind in "if", out.dtypes.to_dict().items()))) + + assert isinstance(summ, StatsSummary) + + # The cache should now belong to the *new* DataFrame + assert summ.owner is out + # pandas.DataFrame propagate to its pandas.Series correspondingly + assert all([out[column].attrs["summary"] == out.attrs["summary"][column] for column in list(gage)]) + # And stats reflect the destination (shape matches numeric subset) + assert summ == gage + + # ----------------------------------- + # Copy triggered by columns selection + # ----------------------------------- + out = df[["foo","qux"]] + summ = out.attrs.get("summary") + gage = StatsSummary.gauge(out, list(filter(lambda x:x[-1].kind in "if", out.dtypes.to_dict().items()))) + + assert isinstance(summ, StatsSummary) + + # The cache should now belong to the *new* DataFrame + assert summ.owner is out + # pandas.DataFrame propagate to its pandas.Series correspondingly + assert all([out[column].attrs["summary"] == out.attrs["summary"][column] for column in list(gage)]) + # And stats reflect the destination (shape matches numeric subset) + assert summ == gage + + # ---------------------------------- + # Copy triggered by DataFrame concat + # ---------------------------------- + left = df.iloc[len(df)//4:].copy(deep=True) + right = df.iloc[len(df)//4:].copy(deep=True) + out = pd.concat([left,right]) + + summ = out.attrs.get("summary") + gage = StatsSummary.gauge(out, list(filter(lambda x:x[-1].kind in "if", out.dtypes.to_dict().items()))) + + assert isinstance(summ, StatsSummary) + + # The cache should now belong to the *new* DataFrame + assert summ.owner is out + # pandas.DataFrame propagate to its pandas.Series correspondingly + assert all([out[column].attrs["summary"] == out.attrs["summary"][column] for column in list(gage)]) + # And stats reflect the destination (shape matches numeric subset) + assert summ == gage + + # ----------------------------------- + # Arithemetic operations on DataFrame + # ----------------------------------- + out = df[["foo","bar"]] + out = out.multiply(np.random.random_integers(0, 1, len(out))*np.lib.stride_tricks.as_strided(np.asarray(2, dtype=np.int8), shape=(len(out),), strides=(0,))-1, axis=0) + + summ = out.attrs.get("summary") + gage = StatsSummary.gauge(out, list(filter(lambda x:x[-1].kind in "if", out.dtypes.to_dict().items()))) + + assert isinstance(summ, StatsSummary) + + # The cache should now belong to the *new* DataFrame + assert summ.owner is out + # pandas.DataFrame propagate to its pandas.Series correspondingly + assert all([out[column].attrs["summary"] == out.attrs["summary"][column] for column in list(gage)]) + # And stats reflect the destination (shape matches numeric subset) + assert summ == gage + + +def test_attrs_stats_summary_works_for_series_too(): + # Sample Data + dset = np.arange(8,dtype=float) + np.random.shuffle(dset) + + df = pd.DataFrame({"foo": dset, "bar": dset*2, "qux": np.array(["waldo","fred","plugh","thud"]).repeat(len(dset)//4)}) # mixed dtypes + df.attrs["summary"] = StatsSummary(df) + + # ------------------------------------------ + # Directly to pandas.Series, complex slicing + # ------------------------------------------ + sr = df["bar"] + out = pd.concat([sr.iloc[:len(sr)//2],sr.iloc[len(sr)//4:]]) + + summ = out.attrs["summary"] = StatsSummary(out) + gage = StatsSummary.gauge(out, [(Ellipsis, sr.dtype)])[...] + + assert isinstance(summ, StatsSummary) + + # The cache should now belong to the *new* DataFrame + assert summ.owner is out + # And stats reflect the destination (shape matches numeric subset) + assert summ == gage + + +def test_attrs_headtail_probe_rebinds_on_concat_have_same_attrs(): + # Sample Data + dset = np.arange(8,dtype=float) + np.random.shuffle(dset) + df = pd.DataFrame(dict(foo=dset*2, bar=dset*4, baz=dset*8, qux=dset*16)) + + df.attrs["preview"] = FrozenHeadTail(df) + + # same attrs object on both inputs -> triggers have_same_attrs=True branch + fred = df.copy(deep=True) + thud = df.iloc[list(range(-2,2))].sort_index() + + out = pd.concat([fred, thud], ignore_index=True) + + pr = out.attrs.get("preview") + assert isinstance(pr, FrozenHeadTail) + + # The preview should be tied to the concatenated destination and reflect it + assert pr.owner is out + pass + assert_frame_equal(pr["head"], out.iloc[:2]) + assert_frame_equal(pr["tail"], out.iloc[-2:]) + pass + + +def test_attrs_empty_remains_empty_on_deepcopy(): + df = pd.DataFrame({"a": [1, 2]}) + assert df.attrs == {} + out = df.copy(deep=True) + assert out.attrs == {} \ No newline at end of file From 1ca0f5e96141824e196d0361a841728e7af16d81 Mon Sep 17 00:00:00 2001 From: dangreb Date: Wed, 20 Aug 2025 20:04:28 -0300 Subject: [PATCH 6/8] Provide new object for copy routines requiring info. on its destination. --- .../test_attrs_deepcopy_destination.py | 161 ++++++++++++++---- 1 file changed, 125 insertions(+), 36 deletions(-) diff --git a/pandas/tests/generic/test_attrs_deepcopy_destination.py b/pandas/tests/generic/test_attrs_deepcopy_destination.py index 365db013b913a..2d11435be3765 100644 --- a/pandas/tests/generic/test_attrs_deepcopy_destination.py +++ b/pandas/tests/generic/test_attrs_deepcopy_destination.py @@ -1,4 +1,3 @@ - import weakref import numpy as np @@ -21,11 +20,23 @@ class StatsSummary(dict): def __init__(self, owner, *, cols=None): import pandas as pd + assert isinstance(owner, pd.core.generic.NDFrame) self._owner_ref = weakref.ref(owner) - super(StatsSummary, self).__init__(dict((column, type(self)(owner[column])) for column in (list(getattr(owner, "columns", {})) or super(StatsSummary, self).__init__( - (name, function(owner)) for name, function in self.stats().items() - ) or {}) if owner[column].dtype.kind in "if")) + super(StatsSummary, self).__init__( + dict( + (column, type(self)(owner[column])) + for column in ( + list(getattr(owner, "columns", {})) + or super(StatsSummary, self).__init__( + (name, function(owner)) + for name, function in self.stats().items() + ) + or {} + ) + if owner[column].dtype.kind in "if" + ) + ) pass @classmethod @@ -34,12 +45,25 @@ def stats(cls): cummin=lambda series: series.cummin().sum(), cummax=lambda series: series.cummax().sum(), kurtosis=lambda series: series.kurt(), - median=lambda series:series.median(), + median=lambda series: series.median(), ) @classmethod def gauge(cls, obj, columns): - return dict(((column,dict([[name, function(obj[column])] for name, function in cls.stats().items()])) for column,dtyp in columns)) + return dict( + ( + ( + column, + dict( + [ + [name, function(obj[column])] + for name, function in cls.stats().items() + ] + ), + ) + for column, dtyp in columns + ) + ) @property def owner(self): @@ -51,12 +75,17 @@ def __eq__(self, other) -> bool: def __deepcopy__(self, memo): import pandas as pd + # Find destination NDFrame in memo. The patch injects {id(dest): dest}. - new_owner = next( + new_owner = next( (v for v in memo.values() if isinstance(v, pd.core.generic.NDFrame)), None, ) - return type(self)(new_owner) if hasattr(new_owner, "select_dtypes") or new_owner.dtype.kind in "if" else None + return ( + type(self)(new_owner) + if hasattr(new_owner, "select_dtypes") or new_owner.dtype.kind in "if" + else None + ) class FrozenHeadTail(dict): @@ -68,9 +97,14 @@ class FrozenHeadTail(dict): def __init__(self, owner, *, cols=None): import pandas as pd + assert isinstance(owner, pd.core.generic.NDFrame) self._owner_ref = weakref.ref(owner) - super(FrozenHeadTail, self).__init__(dict((name, function(self.owner)) for name, function in self.stats().items())) + super(FrozenHeadTail, self).__init__( + dict( + (name, function(self.owner)) for name, function in self.stats().items() + ) + ) pass @property @@ -80,8 +114,16 @@ def owner(self): @classmethod def stats(cls): return dict( - head=lambda x:pd.DataFrame(x.values[:2], columns=list(getattr(x,"columns",[])) or [x.name], index=x.index[:2]), - tail=lambda x:pd.DataFrame(x.values[-2:], columns=list(getattr(x,"columns",[])) or [x.name], index=x.index[-2:]), + head=lambda x: pd.DataFrame( + x.values[:2], + columns=list(getattr(x, "columns", [])) or [x.name], + index=x.index[:2], + ), + tail=lambda x: pd.DataFrame( + x.values[-2:], + columns=list(getattr(x, "columns", [])) or [x.name], + index=x.index[-2:], + ), ) def __eq__(self, other) -> bool: @@ -92,7 +134,7 @@ def __eq__(self, other) -> bool: return False def __deepcopy__(self, memo): - new_owner = next( + new_owner = next( (v for v in memo.values() if isinstance(v, pd.core.generic.NDFrame)), None, ) @@ -101,96 +143,143 @@ def __deepcopy__(self, memo): def test_attrs_stats_summary_binds_to_destination_on_copy(): # Sample Data - dset = np.arange(8,dtype=float) + dset = np.arange(8, dtype=float) np.random.shuffle(dset) - df = pd.DataFrame({"foo": dset, "bar": dset*2, "qux": np.array(["waldo","fred","plugh","thud"]).repeat(len(dset)//4)}) # mixed dtypes + df = pd.DataFrame( + { + "foo": dset, + "bar": dset * 2, + "qux": np.array(["waldo", "fred", "plugh", "thud"]).repeat(len(dset) // 4), + } + ) # mixed dtypes df.attrs["summary"] = StatsSummary(df) # -------------------------------------- # Copy triggered by panel Y axis slicing # -------------------------------------- - out = df.iloc[:len(df)//2] + out = df.iloc[: len(df) // 2] summ = out.attrs.get("summary") - gage = StatsSummary.gauge(out, list(filter(lambda x:x[-1].kind in "if", out.dtypes.to_dict().items()))) + gage = StatsSummary.gauge( + out, list(filter(lambda x: x[-1].kind in "if", out.dtypes.to_dict().items())) + ) assert isinstance(summ, StatsSummary) # The cache should now belong to the *new* DataFrame assert summ.owner is out # pandas.DataFrame propagate to its pandas.Series correspondingly - assert all([out[column].attrs["summary"] == out.attrs["summary"][column] for column in list(gage)]) + assert all( + [ + out[column].attrs["summary"] == out.attrs["summary"][column] + for column in list(gage) + ] + ) # And stats reflect the destination (shape matches numeric subset) assert summ == gage # ----------------------------------- # Copy triggered by columns selection # ----------------------------------- - out = df[["foo","qux"]] + out = df[["foo", "qux"]] summ = out.attrs.get("summary") - gage = StatsSummary.gauge(out, list(filter(lambda x:x[-1].kind in "if", out.dtypes.to_dict().items()))) + gage = StatsSummary.gauge( + out, list(filter(lambda x: x[-1].kind in "if", out.dtypes.to_dict().items())) + ) assert isinstance(summ, StatsSummary) # The cache should now belong to the *new* DataFrame assert summ.owner is out # pandas.DataFrame propagate to its pandas.Series correspondingly - assert all([out[column].attrs["summary"] == out.attrs["summary"][column] for column in list(gage)]) + assert all( + [ + out[column].attrs["summary"] == out.attrs["summary"][column] + for column in list(gage) + ] + ) # And stats reflect the destination (shape matches numeric subset) assert summ == gage # ---------------------------------- # Copy triggered by DataFrame concat # ---------------------------------- - left = df.iloc[len(df)//4:].copy(deep=True) - right = df.iloc[len(df)//4:].copy(deep=True) - out = pd.concat([left,right]) + left = df.iloc[len(df) // 4 :].copy(deep=True) + right = df.iloc[len(df) // 4 :].copy(deep=True) + out = pd.concat([left, right]) summ = out.attrs.get("summary") - gage = StatsSummary.gauge(out, list(filter(lambda x:x[-1].kind in "if", out.dtypes.to_dict().items()))) + gage = StatsSummary.gauge( + out, list(filter(lambda x: x[-1].kind in "if", out.dtypes.to_dict().items())) + ) assert isinstance(summ, StatsSummary) # The cache should now belong to the *new* DataFrame assert summ.owner is out # pandas.DataFrame propagate to its pandas.Series correspondingly - assert all([out[column].attrs["summary"] == out.attrs["summary"][column] for column in list(gage)]) + assert all( + [ + out[column].attrs["summary"] == out.attrs["summary"][column] + for column in list(gage) + ] + ) # And stats reflect the destination (shape matches numeric subset) assert summ == gage # ----------------------------------- # Arithemetic operations on DataFrame # ----------------------------------- - out = df[["foo","bar"]] - out = out.multiply(np.random.random_integers(0, 1, len(out))*np.lib.stride_tricks.as_strided(np.asarray(2, dtype=np.int8), shape=(len(out),), strides=(0,))-1, axis=0) + out = df[["foo", "bar"]] + out = out.multiply( + np.random.random_integers(0, 1, len(out)) + * np.lib.stride_tricks.as_strided( + np.asarray(2, dtype=np.int8), shape=(len(out),), strides=(0,) + ) + - 1, + axis=0, + ) summ = out.attrs.get("summary") - gage = StatsSummary.gauge(out, list(filter(lambda x:x[-1].kind in "if", out.dtypes.to_dict().items()))) + gage = StatsSummary.gauge( + out, list(filter(lambda x: x[-1].kind in "if", out.dtypes.to_dict().items())) + ) assert isinstance(summ, StatsSummary) # The cache should now belong to the *new* DataFrame assert summ.owner is out # pandas.DataFrame propagate to its pandas.Series correspondingly - assert all([out[column].attrs["summary"] == out.attrs["summary"][column] for column in list(gage)]) + assert all( + [ + out[column].attrs["summary"] == out.attrs["summary"][column] + for column in list(gage) + ] + ) # And stats reflect the destination (shape matches numeric subset) assert summ == gage def test_attrs_stats_summary_works_for_series_too(): # Sample Data - dset = np.arange(8,dtype=float) + dset = np.arange(8, dtype=float) np.random.shuffle(dset) - df = pd.DataFrame({"foo": dset, "bar": dset*2, "qux": np.array(["waldo","fred","plugh","thud"]).repeat(len(dset)//4)}) # mixed dtypes + df = pd.DataFrame( + { + "foo": dset, + "bar": dset * 2, + "qux": np.array(["waldo", "fred", "plugh", "thud"]).repeat(len(dset) // 4), + } + ) # mixed dtypes df.attrs["summary"] = StatsSummary(df) # ------------------------------------------ # Directly to pandas.Series, complex slicing # ------------------------------------------ sr = df["bar"] - out = pd.concat([sr.iloc[:len(sr)//2],sr.iloc[len(sr)//4:]]) + out = pd.concat([sr.iloc[: len(sr) // 2], sr.iloc[len(sr) // 4 :]]) summ = out.attrs["summary"] = StatsSummary(out) gage = StatsSummary.gauge(out, [(Ellipsis, sr.dtype)])[...] @@ -205,15 +294,15 @@ def test_attrs_stats_summary_works_for_series_too(): def test_attrs_headtail_probe_rebinds_on_concat_have_same_attrs(): # Sample Data - dset = np.arange(8,dtype=float) + dset = np.arange(8, dtype=float) np.random.shuffle(dset) - df = pd.DataFrame(dict(foo=dset*2, bar=dset*4, baz=dset*8, qux=dset*16)) + df = pd.DataFrame(dict(foo=dset * 2, bar=dset * 4, baz=dset * 8, qux=dset * 16)) df.attrs["preview"] = FrozenHeadTail(df) # same attrs object on both inputs -> triggers have_same_attrs=True branch fred = df.copy(deep=True) - thud = df.iloc[list(range(-2,2))].sort_index() + thud = df.iloc[list(range(-2, 2))].sort_index() out = pd.concat([fred, thud], ignore_index=True) @@ -232,4 +321,4 @@ def test_attrs_empty_remains_empty_on_deepcopy(): df = pd.DataFrame({"a": [1, 2]}) assert df.attrs == {} out = df.copy(deep=True) - assert out.attrs == {} \ No newline at end of file + assert out.attrs == {} From eae57d0d2addcf5b67000d2bb3573441fb1684fe Mon Sep 17 00:00:00 2001 From: dangreb Date: Wed, 20 Aug 2025 20:17:08 -0300 Subject: [PATCH 7/8] Provide new object for copy routines requiring info. on its destination. --- .../test_attrs_deepcopy_destination.py | 52 ++++++++----------- 1 file changed, 22 insertions(+), 30 deletions(-) diff --git a/pandas/tests/generic/test_attrs_deepcopy_destination.py b/pandas/tests/generic/test_attrs_deepcopy_destination.py index 2d11435be3765..853544e2dcfc3 100644 --- a/pandas/tests/generic/test_attrs_deepcopy_destination.py +++ b/pandas/tests/generic/test_attrs_deepcopy_destination.py @@ -24,7 +24,7 @@ def __init__(self, owner, *, cols=None): assert isinstance(owner, pd.core.generic.NDFrame) self._owner_ref = weakref.ref(owner) super(StatsSummary, self).__init__( - dict( + { (column, type(self)(owner[column])) for column in ( list(getattr(owner, "columns", {})) @@ -35,35 +35,27 @@ def __init__(self, owner, *, cols=None): or {} ) if owner[column].dtype.kind in "if" - ) + } ) pass @classmethod def stats(cls): - return dict( - cummin=lambda series: series.cummin().sum(), - cummax=lambda series: series.cummax().sum(), - kurtosis=lambda series: series.kurt(), - median=lambda series: series.median(), - ) + return { + "cummin": lambda series: series.cummin().sum(), + "cummax": lambda series: series.cummax().sum(), + "kurtosis": lambda series: series.kurt(), + "median": lambda series: series.median(), + } @classmethod def gauge(cls, obj, columns): - return dict( - ( - ( - column, - dict( - [ - [name, function(obj[column])] - for name, function in cls.stats().items() - ] - ), - ) - for column, dtyp in columns - ) - ) + return { + column: { + name: function(obj[column]) for name, function in cls.stats().items() + } + for column, dtyp in columns + } @property def owner(self): @@ -101,9 +93,7 @@ def __init__(self, owner, *, cols=None): assert isinstance(owner, pd.core.generic.NDFrame) self._owner_ref = weakref.ref(owner) super(FrozenHeadTail, self).__init__( - dict( - (name, function(self.owner)) for name, function in self.stats().items() - ) + {name: function(self.owner) for name, function in self.stats().items()} ) pass @@ -113,18 +103,18 @@ def owner(self): @classmethod def stats(cls): - return dict( - head=lambda x: pd.DataFrame( + return { + "head": lambda x: pd.DataFrame( x.values[:2], columns=list(getattr(x, "columns", [])) or [x.name], index=x.index[:2], ), - tail=lambda x: pd.DataFrame( + "tail": lambda x: pd.DataFrame( x.values[-2:], columns=list(getattr(x, "columns", [])) or [x.name], index=x.index[-2:], ), - ) + } def __eq__(self, other) -> bool: try: @@ -296,7 +286,9 @@ def test_attrs_headtail_probe_rebinds_on_concat_have_same_attrs(): # Sample Data dset = np.arange(8, dtype=float) np.random.shuffle(dset) - df = pd.DataFrame(dict(foo=dset * 2, bar=dset * 4, baz=dset * 8, qux=dset * 16)) + df = pd.DataFrame( + {"foo": dset * 2, "bar": dset * 4, "baz": dset * 8, "qux": dset * 16} + ) df.attrs["preview"] = FrozenHeadTail(df) From 91f27cf1ff25e787d6b2a588a45ff7d237d30bbf Mon Sep 17 00:00:00 2001 From: dangreb Date: Wed, 20 Aug 2025 20:24:31 -0300 Subject: [PATCH 8/8] Provide new object for copy routines requiring info. on its destination. --- .../test_attrs_deepcopy_destination.py | 316 ------------------ 1 file changed, 316 deletions(-) delete mode 100644 pandas/tests/generic/test_attrs_deepcopy_destination.py diff --git a/pandas/tests/generic/test_attrs_deepcopy_destination.py b/pandas/tests/generic/test_attrs_deepcopy_destination.py deleted file mode 100644 index 853544e2dcfc3..0000000000000 --- a/pandas/tests/generic/test_attrs_deepcopy_destination.py +++ /dev/null @@ -1,316 +0,0 @@ -import weakref - -import numpy as np -import pandas as pd - -import pytest - -from pandas.testing import assert_frame_equal - - -class StatsSummary(dict): - """ - A lightweight, production-plausible cache object that stores simple stats - for numeric columns and keeps a weakref to its owning NDFrame. - - On deepcopy, it should bind to the *destination* NDFrame (if provided in memo) - and rebuild its stats from that destination, so the cache belongs to and - reflects the new object. - """ - - def __init__(self, owner, *, cols=None): - import pandas as pd - - assert isinstance(owner, pd.core.generic.NDFrame) - self._owner_ref = weakref.ref(owner) - super(StatsSummary, self).__init__( - { - (column, type(self)(owner[column])) - for column in ( - list(getattr(owner, "columns", {})) - or super(StatsSummary, self).__init__( - (name, function(owner)) - for name, function in self.stats().items() - ) - or {} - ) - if owner[column].dtype.kind in "if" - } - ) - pass - - @classmethod - def stats(cls): - return { - "cummin": lambda series: series.cummin().sum(), - "cummax": lambda series: series.cummax().sum(), - "kurtosis": lambda series: series.kurt(), - "median": lambda series: series.median(), - } - - @classmethod - def gauge(cls, obj, columns): - return { - column: { - name: function(obj[column]) for name, function in cls.stats().items() - } - for column, dtyp in columns - } - - @property - def owner(self): - return self._owner_ref() - - def __eq__(self, other) -> bool: - outs = all(self[column] == other[column] for column in self) - return outs - - def __deepcopy__(self, memo): - import pandas as pd - - # Find destination NDFrame in memo. The patch injects {id(dest): dest}. - new_owner = next( - (v for v in memo.values() if isinstance(v, pd.core.generic.NDFrame)), - None, - ) - return ( - type(self)(new_owner) - if hasattr(new_owner, "select_dtypes") or new_owner.dtype.kind in "if" - else None - ) - - -class FrozenHeadTail(dict): - """ - A preview helper that remembers first/last row 'snapshots' cheaply. - On deepcopy, it should rebuild from the destination NDFrame, so that the - preview corresponds to the new object (e.g., after concat). - """ - - def __init__(self, owner, *, cols=None): - import pandas as pd - - assert isinstance(owner, pd.core.generic.NDFrame) - self._owner_ref = weakref.ref(owner) - super(FrozenHeadTail, self).__init__( - {name: function(self.owner) for name, function in self.stats().items()} - ) - pass - - @property - def owner(self): - return self._owner_ref() - - @classmethod - def stats(cls): - return { - "head": lambda x: pd.DataFrame( - x.values[:2], - columns=list(getattr(x, "columns", [])) or [x.name], - index=x.index[:2], - ), - "tail": lambda x: pd.DataFrame( - x.values[-2:], - columns=list(getattr(x, "columns", [])) or [x.name], - index=x.index[-2:], - ), - } - - def __eq__(self, other) -> bool: - try: - [assert_frame_equal(self[column], other[column]) for column in self] - return True - except: - return False - - def __deepcopy__(self, memo): - new_owner = next( - (v for v in memo.values() if isinstance(v, pd.core.generic.NDFrame)), - None, - ) - return type(self)(new_owner) - - -def test_attrs_stats_summary_binds_to_destination_on_copy(): - # Sample Data - dset = np.arange(8, dtype=float) - np.random.shuffle(dset) - - df = pd.DataFrame( - { - "foo": dset, - "bar": dset * 2, - "qux": np.array(["waldo", "fred", "plugh", "thud"]).repeat(len(dset) // 4), - } - ) # mixed dtypes - - df.attrs["summary"] = StatsSummary(df) - - # -------------------------------------- - # Copy triggered by panel Y axis slicing - # -------------------------------------- - out = df.iloc[: len(df) // 2] - summ = out.attrs.get("summary") - gage = StatsSummary.gauge( - out, list(filter(lambda x: x[-1].kind in "if", out.dtypes.to_dict().items())) - ) - - assert isinstance(summ, StatsSummary) - - # The cache should now belong to the *new* DataFrame - assert summ.owner is out - # pandas.DataFrame propagate to its pandas.Series correspondingly - assert all( - [ - out[column].attrs["summary"] == out.attrs["summary"][column] - for column in list(gage) - ] - ) - # And stats reflect the destination (shape matches numeric subset) - assert summ == gage - - # ----------------------------------- - # Copy triggered by columns selection - # ----------------------------------- - out = df[["foo", "qux"]] - summ = out.attrs.get("summary") - gage = StatsSummary.gauge( - out, list(filter(lambda x: x[-1].kind in "if", out.dtypes.to_dict().items())) - ) - - assert isinstance(summ, StatsSummary) - - # The cache should now belong to the *new* DataFrame - assert summ.owner is out - # pandas.DataFrame propagate to its pandas.Series correspondingly - assert all( - [ - out[column].attrs["summary"] == out.attrs["summary"][column] - for column in list(gage) - ] - ) - # And stats reflect the destination (shape matches numeric subset) - assert summ == gage - - # ---------------------------------- - # Copy triggered by DataFrame concat - # ---------------------------------- - left = df.iloc[len(df) // 4 :].copy(deep=True) - right = df.iloc[len(df) // 4 :].copy(deep=True) - out = pd.concat([left, right]) - - summ = out.attrs.get("summary") - gage = StatsSummary.gauge( - out, list(filter(lambda x: x[-1].kind in "if", out.dtypes.to_dict().items())) - ) - - assert isinstance(summ, StatsSummary) - - # The cache should now belong to the *new* DataFrame - assert summ.owner is out - # pandas.DataFrame propagate to its pandas.Series correspondingly - assert all( - [ - out[column].attrs["summary"] == out.attrs["summary"][column] - for column in list(gage) - ] - ) - # And stats reflect the destination (shape matches numeric subset) - assert summ == gage - - # ----------------------------------- - # Arithemetic operations on DataFrame - # ----------------------------------- - out = df[["foo", "bar"]] - out = out.multiply( - np.random.random_integers(0, 1, len(out)) - * np.lib.stride_tricks.as_strided( - np.asarray(2, dtype=np.int8), shape=(len(out),), strides=(0,) - ) - - 1, - axis=0, - ) - - summ = out.attrs.get("summary") - gage = StatsSummary.gauge( - out, list(filter(lambda x: x[-1].kind in "if", out.dtypes.to_dict().items())) - ) - - assert isinstance(summ, StatsSummary) - - # The cache should now belong to the *new* DataFrame - assert summ.owner is out - # pandas.DataFrame propagate to its pandas.Series correspondingly - assert all( - [ - out[column].attrs["summary"] == out.attrs["summary"][column] - for column in list(gage) - ] - ) - # And stats reflect the destination (shape matches numeric subset) - assert summ == gage - - -def test_attrs_stats_summary_works_for_series_too(): - # Sample Data - dset = np.arange(8, dtype=float) - np.random.shuffle(dset) - - df = pd.DataFrame( - { - "foo": dset, - "bar": dset * 2, - "qux": np.array(["waldo", "fred", "plugh", "thud"]).repeat(len(dset) // 4), - } - ) # mixed dtypes - df.attrs["summary"] = StatsSummary(df) - - # ------------------------------------------ - # Directly to pandas.Series, complex slicing - # ------------------------------------------ - sr = df["bar"] - out = pd.concat([sr.iloc[: len(sr) // 2], sr.iloc[len(sr) // 4 :]]) - - summ = out.attrs["summary"] = StatsSummary(out) - gage = StatsSummary.gauge(out, [(Ellipsis, sr.dtype)])[...] - - assert isinstance(summ, StatsSummary) - - # The cache should now belong to the *new* DataFrame - assert summ.owner is out - # And stats reflect the destination (shape matches numeric subset) - assert summ == gage - - -def test_attrs_headtail_probe_rebinds_on_concat_have_same_attrs(): - # Sample Data - dset = np.arange(8, dtype=float) - np.random.shuffle(dset) - df = pd.DataFrame( - {"foo": dset * 2, "bar": dset * 4, "baz": dset * 8, "qux": dset * 16} - ) - - df.attrs["preview"] = FrozenHeadTail(df) - - # same attrs object on both inputs -> triggers have_same_attrs=True branch - fred = df.copy(deep=True) - thud = df.iloc[list(range(-2, 2))].sort_index() - - out = pd.concat([fred, thud], ignore_index=True) - - pr = out.attrs.get("preview") - assert isinstance(pr, FrozenHeadTail) - - # The preview should be tied to the concatenated destination and reflect it - assert pr.owner is out - pass - assert_frame_equal(pr["head"], out.iloc[:2]) - assert_frame_equal(pr["tail"], out.iloc[-2:]) - pass - - -def test_attrs_empty_remains_empty_on_deepcopy(): - df = pd.DataFrame({"a": [1, 2]}) - assert df.attrs == {} - out = df.copy(deep=True) - assert out.attrs == {}