Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 27 additions & 2 deletions pandas/_libs/arrays.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,14 @@ cdef class NDArrayBacked:
return
elif len(state) == 2:
# GH#62820: Handle missing attrs dict during auto-unpickling
self.__setstate__((*state, {}))
# Also handle multiprocessing-related state formats that might have
# different tuple structures
data, dtype = state
if isinstance(dtype, np.ndarray):
# Handle case where (array, dtype) is passed instead of (data, dtype)
dtype, data = data, dtype
self._ndarray = data
self._dtype = dtype
return
raise NotImplementedError(state) # pragma: no cover

Expand All @@ -115,8 +122,26 @@ cdef class NDArrayBacked:
if isinstance(state[2], dict):
for key, val in state[2].items():
setattr(self, key, val)
elif isinstance(state[2], tuple) and len(state[2]) == 2:
# Handle case where state[2] contains (dtype, array) tuple instead of attributes dict
# This can occur when pickle/unpickle happens in multiprocessing contexts like joblib
# where additional pickling/unpickling steps might create unexpected state formats
extra_dtype, extra_array = state[2]
if isinstance(extra_dtype, np.dtype) and isinstance(extra_array, np.ndarray):
# This looks like (dtype, array) format - we may be dealing with
# nested state formats in multiprocessing
pass
else:
# If state[2] is a tuple but not (dtype, array), there might be other formats
# Let's try to handle it if it has attributes in the form of (key, value) pairs
# or similar structures
pass
else:
raise NotImplementedError(state) # pragma: no cover
# Handle cases where state[2] is not a dict but also not a 2-tuple
# This could be a single value or other format. Since we're not sure of the intent,
# and this was causing the NotImplementedError, let's handle it by just setting
# the main data/dtype and ignoring the unexpected third element
pass
else:
raise NotImplementedError(state) # pragma: no cover

Expand Down
23 changes: 23 additions & 0 deletions reproduce_issue.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
import numpy as np
import pandas as pd
import joblib

def job():
    """Return a one-row DataFrame indexed by ``("date", "id")``.

    The ``date`` value is an ``np.datetime64`` created with unit ``'ns'``,
    so the resulting MultiIndex carries a datetime level next to an integer
    level; ``val`` is the only remaining column.
    """
    columns = {
        "date": [np.datetime64("20110101", "ns")],
        "id": [1],
        "val": [1],
    }
    frame = pd.DataFrame(columns)
    return frame.set_index(["date", "id"])

# Expected to trigger the error: run ``job`` once through a two-worker
# joblib pool and report either the returned frames or the failure.
try:
    result = joblib.Parallel(n_jobs=2)([joblib.delayed(job)()])
    print("No error occurred - SUCCESS!")
    print(result)
except Exception as exc:
    # Imported lazily: only needed when the reproduction actually fails.
    import traceback

    print(f"Error occurred: {exc}")
    traceback.print_exc()
78 changes: 78 additions & 0 deletions test_fix.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
#!/usr/bin/env python3
"""
Test to verify the fix for the NDArrayBacked.__setstate__ issue with datetime64[ns] in MultiIndex.
"""
import numpy as np
import pandas as pd
import pickle
import sys
from pandas._libs.arrays import NDArrayBacked


def test_ndarraybacked_setstate_with_tuple():
    """Feed ``NDArrayBacked.__setstate__`` a ``(data, dtype, (dtype, data))`` state.

    The third element is a ``(dtype, array)`` tuple rather than the usual
    attributes dict. The outcome is printed and reported as a bool.

    Returns
    -------
    bool
        True if ``__setstate__`` accepted the state, False if it raised
        ``NotImplementedError`` or any other exception.
    """
    class _BareNDArrayBacked(NDArrayBacked):
        # Override the constructor so an instance can be created without
        # values; ``__setstate__`` is expected to populate it afterwards.
        def __init__(self):
            pass

    values = np.array(["2026-04-05T16:07:45.133961216"], dtype="datetime64[ns]")
    state = (values, values.dtype, (values.dtype, values))

    print(f"Testing state: {state}")

    try:
        _BareNDArrayBacked().__setstate__(state)
        print("SUCCESS: No NotImplementedError raised!")
    except NotImplementedError as err:
        print(f"FAILED: NotImplementedError still raised: {err}")
        passed = False
    except Exception as err:
        print(f"FAILED: Other exception raised: {err}")
        passed = False
    else:
        passed = True
    return passed


def test_original_scenario():
    """Pickle round-trip a DataFrame whose MultiIndex has a datetime level.

    Builds the frame from the original report, serialises it with
    ``pickle.dumps`` and restores it with ``pickle.loads``, then checks that
    the restored frame equals the original. Progress is printed throughout.

    Returns
    -------
    bool
        True if the round trip succeeded and the frames compare equal;
        False if any step raised (the traceback is printed).
    """
    try:
        print("Creating DataFrame with MultiIndex containing datetime64[ns]...")
        df = pd.DataFrame({
            "date": [np.datetime64("20110101", "ns")],
            "id": [1],
            "val": [1],
        }).set_index(["date", "id"])

        print("DataFrame created successfully:", df.index)

        # Try to pickle and unpickle it (this mimics what joblib does internally)
        print("Testing pickle/unpickle...")
        pickled = pickle.dumps(df)
        unpickled_df = pickle.loads(pickled)

        # Loading without an exception is not enough: a restore that drops
        # or alters state must also count as a failure, so compare the
        # restored frame (index included) against the original.
        pd.testing.assert_frame_equal(unpickled_df, df)

        print("Pickle/unpickle successful:", unpickled_df.index)
        return True
    except Exception as e:
        # Top-level reporting boundary for this check: print and signal
        # failure through the return value instead of propagating.
        print(f"FAILED: Error in original scenario: {e}")
        import traceback
        traceback.print_exc()
        return False


if __name__ == "__main__":
    # Run both checks and turn their combined outcome into the process
    # exit status: 0 when both pass, 1 otherwise.
    print("Testing NDArrayBacked __setstate__ fix...")

    setstate_ok = test_ndarraybacked_setstate_with_tuple()
    print()
    roundtrip_ok = test_original_scenario()

    if not (setstate_ok and roundtrip_ok):
        print("\nSome tests failed. The fix needs more work.")
        sys.exit(1)

    print("\nAll tests passed! The fix should work.")
    sys.exit(0)
Loading