Skip to content
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 29 additions & 1 deletion src/datasets/fingerprint.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,35 @@ class _TempCacheDir:
"""

def __init__(self):
self.name = tempfile.mkdtemp(prefix=config.TEMP_CACHE_DIR_PREFIX)
# Check if TMPDIR is set and handle the case where it doesn't exist
tmpdir = os.environ.get("TMPDIR")
# Normalize the path to handle any path resolution issues
if tmpdir:
tmpdir = os.path.normpath(tmpdir)
if not os.path.exists(tmpdir):
# Auto-create the directory if it doesn't exist
# This prevents tempfile from silently falling back to /tmp
try:
os.makedirs(tmpdir, exist_ok=True)
logger.info(f"Created TMPDIR directory: {tmpdir}")
except OSError as e:
logger.warning(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if TMPDIR is considered as part of the API then this and the one below ideally should re-raise an exception, while giving additional context.

Warnings are very easy to miss in complex applications where there are already dozens of warnings multiplied by multiple gpu processes.

Copy link
Author

@ada-ggf25 ada-ggf25 Dec 1, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if TMPDIR is considered as part of the API then this and the one below ideally should re-raise an exception, while giving additional context.

Warnings are very easy to miss in complex applications where there are already dozens of warnings multiplied by multiple gpu processes.

I agree that if we treat TMPDIR as part of the API, a silent warning is too easy to miss in complex multi‑GPU runs. I’ll update _TempCacheDir.__init__ so that when TMPDIR is set but cannot be created/used, the code re‑raises an OSError with a clearer, actionable message (e.g. “TMPDIR is set to ‘…’ but the directory could not be created, please create it manually or unset TMPDIR”). I’ll also adjust the corresponding test to expect this behaviour.

I’ll open a PR update with those changes, please let me know if you’d prefer a different error type or wording.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done! You can check the new PR here #7891. I hope it helps!

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm confused - why was there a need for a new PR? Just in case the maintainers would prefer the original instead?

f"TMPDIR is set to '{tmpdir}' but the directory does not exist and could not be created: {e}. "
f"Falling back to default temporary directory."
)
tmpdir = None
# If tmpdir exists, verify it's actually a directory and writable
elif not os.path.isdir(tmpdir):
logger.warning(
f"TMPDIR is set to '{tmpdir}' but it is not a directory. "
f"Falling back to default temporary directory."
)
tmpdir = None

# Explicitly pass the directory to mkdtemp to ensure TMPDIR is respected
# This works even if tempfile.gettempdir() was already called and cached
# Pass dir=None if tmpdir is None to use default temp directory
self.name = tempfile.mkdtemp(prefix=config.TEMP_CACHE_DIR_PREFIX, dir=tmpdir)
self._finalizer = weakref.finalize(self, self._cleanup)

def _cleanup(self):
Expand Down
129 changes: 124 additions & 5 deletions tests/test_fingerprint.py
Original file line number Diff line number Diff line change
Expand Up @@ -407,12 +407,131 @@ def test_fingerprint_in_multiprocessing():
data = {"a": [0, 1, 2]}
dataset = DatasetChild(InMemoryTable.from_pydict(data))
expected_fingerprint = dataset.func1()._fingerprint
assert expected_fingerprint == dataset.func1()._fingerprint
assert expected_fingerprint != dataset.func2()._fingerprint
with Pool(2) as pool:
fingerprints = pool.map(
lambda _: DatasetChild(InMemoryTable.from_pydict(data)).func1()._fingerprint, range(10)
)
assert all(f == expected_fingerprint for f in fingerprints)

with Pool(2) as p:
assert expected_fingerprint == p.apply_async(dataset.func1).get()._fingerprint
assert expected_fingerprint != p.apply_async(dataset.func2).get()._fingerprint

def test_temp_cache_dir_with_tmpdir_nonexistent(tmp_path, caplog):
"""Test that _TempCacheDir creates TMPDIR if it doesn't exist."""
import os

# Set TMPDIR to a non-existent directory
tmpdir_path = tmp_path / "custom_tmpdir"
assert not tmpdir_path.exists(), "TMPDIR should not exist initially"

# Save original TMPDIR and set new one
original_tmpdir = os.environ.get("TMPDIR")
try:
os.environ["TMPDIR"] = str(tmpdir_path)

# Clear any existing temp cache dir to force recreation
import datasets.fingerprint

datasets.fingerprint._TEMP_DIR_FOR_TEMP_CACHE_FILES = None

# Import and test _TempCacheDir directly
from datasets.fingerprint import _TempCacheDir

with caplog.at_level("INFO", logger="datasets.fingerprint"):
temp_cache = _TempCacheDir()
cache_dir = temp_cache.name

# The key test: verify the cache directory is within the TMPDIR we set
# This proves that TMPDIR was respected and the directory was created
tmpdir_path_str = str(tmpdir_path)
assert cache_dir.startswith(tmpdir_path_str), (
f"Cache dir {cache_dir} should be in TMPDIR {tmpdir_path_str}. TMPDIR env var: {os.environ.get('TMPDIR')}"
)
# Verify the directory was created
assert tmpdir_path.exists(), (
f"TMPDIR directory {tmpdir_path} should have been created. Cache dir is: {cache_dir}"
)
# Verify logging
assert f"Created TMPDIR directory: {tmpdir_path_str}" in caplog.text

# Cleanup
temp_cache.cleanup()
finally:
# Restore original TMPDIR
if original_tmpdir is not None:
os.environ["TMPDIR"] = original_tmpdir
elif "TMPDIR" in os.environ:
del os.environ["TMPDIR"]


def test_temp_cache_dir_with_tmpdir_existing(tmp_path, monkeypatch):
"""Test that _TempCacheDir works correctly when TMPDIR exists."""
from datasets.fingerprint import get_temporary_cache_files_directory

# Set TMPDIR to an existing directory
tmpdir_path = tmp_path / "existing_tmpdir"
tmpdir_path.mkdir()
monkeypatch.setenv("TMPDIR", str(tmpdir_path))

# Clear any existing temp cache dir
import datasets.fingerprint

datasets.fingerprint._TEMP_DIR_FOR_TEMP_CACHE_FILES = None

cache_dir = get_temporary_cache_files_directory()

# Verify the cache directory is within the TMPDIR
assert cache_dir.startswith(str(tmpdir_path)), f"Cache dir {cache_dir} should be in TMPDIR {tmpdir_path}"


def test_temp_cache_dir_without_tmpdir(monkeypatch):
"""Test that _TempCacheDir works correctly when TMPDIR is not set."""
from datasets.fingerprint import get_temporary_cache_files_directory

# Remove TMPDIR if it exists
monkeypatch.delenv("TMPDIR", raising=False)

# Clear any existing temp cache dir
import datasets.fingerprint

datasets.fingerprint._TEMP_DIR_FOR_TEMP_CACHE_FILES = None

cache_dir = get_temporary_cache_files_directory()

# Verify it uses the default temp directory
from tempfile import gettempdir

default_temp = gettempdir()
assert cache_dir.startswith(default_temp), f"Cache dir {cache_dir} should be in default temp {default_temp}"


def test_temp_cache_dir_tmpdir_creation_failure(tmp_path, monkeypatch, caplog):
"""Test that _TempCacheDir handles failure to create TMPDIR gracefully."""
from unittest.mock import patch

from datasets.fingerprint import get_temporary_cache_files_directory

# Set TMPDIR to a path that will fail to create (e.g., invalid permissions)
# Use a path that's likely to fail on creation
tmpdir_path = tmp_path / "nonexistent" / "nested" / "path"
monkeypatch.setenv("TMPDIR", str(tmpdir_path))

# Clear any existing temp cache dir
import datasets.fingerprint

datasets.fingerprint._TEMP_DIR_FOR_TEMP_CACHE_FILES = None

# Mock os.makedirs to raise an error
with patch("datasets.fingerprint.os.makedirs", side_effect=OSError("Permission denied")):
with caplog.at_level("WARNING", logger="datasets.fingerprint"):
cache_dir = get_temporary_cache_files_directory()

# Verify warning was logged
assert "TMPDIR is set to" in caplog.text
assert "could not be created" in caplog.text
# Verify it falls back to default temp directory
from tempfile import gettempdir

default_temp = gettempdir()
assert cache_dir.startswith(default_temp), f"Cache dir {cache_dir} should fall back to default temp {default_temp}"


def test_fingerprint_when_transform_version_changes():
Expand Down