Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 27 additions & 1 deletion src/datasets/fingerprint.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,33 @@ class _TempCacheDir:
"""

def __init__(self):
self.name = tempfile.mkdtemp(prefix=config.TEMP_CACHE_DIR_PREFIX)
# Check if TMPDIR is set and handle the case where it doesn't exist
tmpdir = os.environ.get("TMPDIR")
# Normalize the path to handle any path resolution issues
if tmpdir:
tmpdir = os.path.normpath(tmpdir)
if not os.path.exists(tmpdir):
# Auto-create the directory if it doesn't exist
# This prevents tempfile from silently falling back to /tmp
try:
os.makedirs(tmpdir, exist_ok=True)
logger.info(f"Created TMPDIR directory: {tmpdir}")
except OSError as e:
raise OSError(
f"TMPDIR is set to '{tmpdir}' but the directory does not exist and could not be created: {e}. "
"Please create it manually or unset TMPDIR to fall back to the default temporary directory."
) from e
# If tmpdir exists, verify it's actually a directory and writable
elif not os.path.isdir(tmpdir):
raise OSError(
f"TMPDIR is set to '{tmpdir}' but it is not a directory. "
"Please point TMPDIR to a writable directory or unset it to fall back to the default temporary directory."
)

# Explicitly pass the directory to mkdtemp to ensure TMPDIR is respected
# This works even if tempfile.gettempdir() was already called and cached
# Pass dir=None if tmpdir is None to use default temp directory
self.name = tempfile.mkdtemp(prefix=config.TEMP_CACHE_DIR_PREFIX, dir=tmpdir)
self._finalizer = weakref.finalize(self, self._cleanup)

def _cleanup(self):
Expand Down
137 changes: 132 additions & 5 deletions tests/test_fingerprint.py
Original file line number Diff line number Diff line change
Expand Up @@ -407,12 +407,139 @@ def test_fingerprint_in_multiprocessing():
data = {"a": [0, 1, 2]}
dataset = DatasetChild(InMemoryTable.from_pydict(data))
expected_fingerprint = dataset.func1()._fingerprint
assert expected_fingerprint == dataset.func1()._fingerprint
assert expected_fingerprint != dataset.func2()._fingerprint
with Pool(2) as pool:
fingerprints = pool.map(
lambda _: DatasetChild(InMemoryTable.from_pydict(data)).func1()._fingerprint, range(10)
)
assert all(f == expected_fingerprint for f in fingerprints)

with Pool(2) as p:
assert expected_fingerprint == p.apply_async(dataset.func1).get()._fingerprint
assert expected_fingerprint != p.apply_async(dataset.func2).get()._fingerprint

def test_temp_cache_dir_with_tmpdir_nonexistent(tmp_path, caplog):
"""Test that _TempCacheDir creates TMPDIR if it doesn't exist."""
import os

# Set TMPDIR to a non-existent directory
tmpdir_path = tmp_path / "custom_tmpdir"
assert not tmpdir_path.exists(), "TMPDIR should not exist initially"

# Save original TMPDIR and set new one
original_tmpdir = os.environ.get("TMPDIR")
try:
os.environ["TMPDIR"] = str(tmpdir_path)

# Clear any existing temp cache dir to force recreation
import datasets.fingerprint

datasets.fingerprint._TEMP_DIR_FOR_TEMP_CACHE_FILES = None

# Import and test _TempCacheDir directly
from datasets.fingerprint import _TempCacheDir

with caplog.at_level("INFO", logger="datasets.fingerprint"):
temp_cache = _TempCacheDir()
cache_dir = temp_cache.name

# The key test: verify the cache directory is within the TMPDIR we set
# This proves that TMPDIR was respected and the directory was created
tmpdir_path_str = str(tmpdir_path)
assert cache_dir.startswith(tmpdir_path_str), (
f"Cache dir {cache_dir} should be in TMPDIR {tmpdir_path_str}. TMPDIR env var: {os.environ.get('TMPDIR')}"
)
# Verify the directory was created
assert tmpdir_path.exists(), (
f"TMPDIR directory {tmpdir_path} should have been created. Cache dir is: {cache_dir}"
)
# Verify logging
assert f"Created TMPDIR directory: {tmpdir_path_str}" in caplog.text

# Cleanup
temp_cache.cleanup()
finally:
# Restore original TMPDIR
if original_tmpdir is not None:
os.environ["TMPDIR"] = original_tmpdir
elif "TMPDIR" in os.environ:
del os.environ["TMPDIR"]


def test_temp_cache_dir_with_tmpdir_existing(tmp_path, monkeypatch):
"""Test that _TempCacheDir works correctly when TMPDIR exists."""
from datasets.fingerprint import get_temporary_cache_files_directory

# Set TMPDIR to an existing directory
tmpdir_path = tmp_path / "existing_tmpdir"
tmpdir_path.mkdir()
monkeypatch.setenv("TMPDIR", str(tmpdir_path))

# Clear any existing temp cache dir
import datasets.fingerprint

datasets.fingerprint._TEMP_DIR_FOR_TEMP_CACHE_FILES = None

cache_dir = get_temporary_cache_files_directory()

# Verify the cache directory is within the TMPDIR
assert cache_dir.startswith(str(tmpdir_path)), f"Cache dir {cache_dir} should be in TMPDIR {tmpdir_path}"


def test_temp_cache_dir_without_tmpdir(monkeypatch):
"""Test that _TempCacheDir works correctly when TMPDIR is not set."""
from datasets.fingerprint import get_temporary_cache_files_directory

# Remove TMPDIR if it exists
monkeypatch.delenv("TMPDIR", raising=False)

# Clear any existing temp cache dir
import datasets.fingerprint

datasets.fingerprint._TEMP_DIR_FOR_TEMP_CACHE_FILES = None

cache_dir = get_temporary_cache_files_directory()

# Verify it uses the default temp directory
from tempfile import gettempdir

default_temp = gettempdir()
assert cache_dir.startswith(default_temp), f"Cache dir {cache_dir} should be in default temp {default_temp}"


def test_temp_cache_dir_tmpdir_creation_failure(tmp_path, monkeypatch, caplog):
"""Test that _TempCacheDir raises if TMPDIR cannot be created."""
from unittest.mock import patch

from datasets.fingerprint import _TempCacheDir

# Set TMPDIR to a path that will fail to create (e.g., invalid permissions)
# Use a path that's likely to fail on creation
tmpdir_path = tmp_path / "nonexistent" / "nested" / "path"
monkeypatch.setenv("TMPDIR", str(tmpdir_path))

# Mock os.makedirs to raise an error
with patch("datasets.fingerprint.os.makedirs", side_effect=OSError("Permission denied")):
with pytest.raises(OSError) as excinfo:
_TempCacheDir()

# Verify the error message gives clear context about TMPDIR
msg = str(excinfo.value)
assert "TMPDIR is set to" in msg
assert "could not be created" in msg


def test_temp_cache_dir_tmpdir_not_directory(tmp_path, monkeypatch):
"""Test that _TempCacheDir raises if TMPDIR points to a non-directory."""
from datasets.fingerprint import _TempCacheDir

# Create a regular file and point TMPDIR to it
file_path = tmp_path / "not_a_dir"
file_path.write_text("not a directory")
monkeypatch.setenv("TMPDIR", str(file_path))

with pytest.raises(OSError) as excinfo:
_TempCacheDir()

msg = str(excinfo.value)
assert "TMPDIR is set to" in msg
assert "is not a directory" in msg


def test_fingerprint_when_transform_version_changes():
Expand Down