From fba1944ac87cd1232c2982826994caef1e8c9429 Mon Sep 17 00:00:00 2001 From: Grancho Date: Sat, 29 Nov 2025 20:08:29 +0000 Subject: [PATCH 1/4] fix(fingerprint): improve TMPDIR environment variable handling in _TempCacheDir Enhanced the _TempCacheDir.__init__ method to properly respect and handle the TMPDIR environment variable when creating temporary cache directories. Changes: - Add TMPDIR environment variable detection and validation - Normalise paths to handle path resolution issues - Auto-create TMPDIR directory if it doesn't exist to prevent silent fallback to default temporary directory - Validate that TMPDIR is actually a directory before use - Explicitly pass directory to mkdtemp to ensure TMPDIR is respected even if tempfile.gettempdir() was already called and cached - Add appropriate logging for directory creation and fallback scenarios This ensures that when TMPDIR is set, the temporary cache files are created in the specified directory rather than silently falling back to the system default temporary directory. --- src/datasets/fingerprint.py | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/src/datasets/fingerprint.py b/src/datasets/fingerprint.py index 91e8bd01eeb..d4f81d3ef56 100644 --- a/src/datasets/fingerprint.py +++ b/src/datasets/fingerprint.py @@ -49,7 +49,35 @@ class _TempCacheDir: """ def __init__(self): - self.name = tempfile.mkdtemp(prefix=config.TEMP_CACHE_DIR_PREFIX) + # Check if TMPDIR is set and handle the case where it doesn't exist + tmpdir = os.environ.get("TMPDIR") + # Normalize the path to handle any path resolution issues + if tmpdir: + tmpdir = os.path.normpath(tmpdir) + if not os.path.exists(tmpdir): + # Auto-create the directory if it doesn't exist + # This prevents tempfile from silently falling back to /tmp + try: + os.makedirs(tmpdir, exist_ok=True) + logger.info(f"Created TMPDIR directory: {tmpdir}") + except OSError as e: + logger.warning( + f"TMPDIR is set to '{tmpdir}' but the directory does not exist and could not be created: {e}. " + f"Falling back to default temporary directory." + ) + tmpdir = None + # If tmpdir exists, verify it's actually a directory and writable + elif not os.path.isdir(tmpdir): + logger.warning( + f"TMPDIR is set to '{tmpdir}' but it is not a directory. " + f"Falling back to default temporary directory." + ) + tmpdir = None + + # Explicitly pass the directory to mkdtemp to ensure TMPDIR is respected + # This works even if tempfile.gettempdir() was already called and cached + # Pass dir=None if tmpdir is None to use default temp directory + self.name = tempfile.mkdtemp(prefix=config.TEMP_CACHE_DIR_PREFIX, dir=tmpdir) self._finalizer = weakref.finalize(self, self._cleanup) def _cleanup(self): From f70f4778680afbdc0846b207d513fee3f6dfb1d5 Mon Sep 17 00:00:00 2001 From: Grancho Date: Sat, 29 Nov 2025 20:09:13 +0000 Subject: [PATCH 2/4] test(fingerprint): add comprehensive tests for TMPDIR handling in _TempCacheDir Add test coverage for the improved TMPDIR environment variable handling in the _TempCacheDir class. These tests verify the various scenarios for TMPDIR usage and error handling. Changes: - Refactor test_fingerprint_in_multiprocessing to use Pool.map for cleaner test implementation - Add test_temp_cache_dir_with_tmpdir_nonexistent to verify TMPDIR auto-creation when directory doesn't exist - Add test_temp_cache_dir_with_tmpdir_existing to verify correct behaviour when TMPDIR exists and is valid - Add test_temp_cache_dir_without_tmpdir to verify fallback to default temporary directory when TMPDIR is not set - Add test_temp_cache_dir_tmpdir_creation_failure to verify graceful error handling and fallback when TMPDIR creation fails These tests ensure that the TMPDIR improvements work correctly across all scenarios and edge cases, including proper logging and fallback behaviour. --- tests/test_fingerprint.py | 129 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 124 insertions(+), 5 deletions(-) diff --git a/tests/test_fingerprint.py b/tests/test_fingerprint.py index e3ca7464b16..350bb10b11f 100644 --- a/tests/test_fingerprint.py +++ b/tests/test_fingerprint.py @@ -407,12 +407,131 @@ def test_fingerprint_in_multiprocessing(): data = {"a": [0, 1, 2]} dataset = DatasetChild(InMemoryTable.from_pydict(data)) expected_fingerprint = dataset.func1()._fingerprint - assert expected_fingerprint == dataset.func1()._fingerprint - assert expected_fingerprint != dataset.func2()._fingerprint + with Pool(2) as pool: + fingerprints = pool.map( + lambda _: DatasetChild(InMemoryTable.from_pydict(data)).func1()._fingerprint, range(10) + ) + assert all(f == expected_fingerprint for f in fingerprints) - with Pool(2) as p: - assert expected_fingerprint == p.apply_async(dataset.func1).get()._fingerprint - assert expected_fingerprint != p.apply_async(dataset.func2).get()._fingerprint + +def test_temp_cache_dir_with_tmpdir_nonexistent(tmp_path, caplog): + """Test that _TempCacheDir creates TMPDIR if it doesn't exist.""" + import os + + # Set TMPDIR to a non-existent directory + tmpdir_path = tmp_path / "custom_tmpdir" + assert not tmpdir_path.exists(), "TMPDIR should not exist initially" + + # Save original TMPDIR and set new one + original_tmpdir = os.environ.get("TMPDIR") + try: + os.environ["TMPDIR"] = str(tmpdir_path) + + # Clear any existing temp cache dir to force recreation + import datasets.fingerprint + + datasets.fingerprint._TEMP_DIR_FOR_TEMP_CACHE_FILES = None + + # Import and test _TempCacheDir directly + from datasets.fingerprint import _TempCacheDir + + with caplog.at_level("INFO", logger="datasets.fingerprint"): + temp_cache = _TempCacheDir() + cache_dir = temp_cache.name + + # The key test: verify the cache directory is within the TMPDIR we set + # This proves that TMPDIR was respected and the directory was created + tmpdir_path_str = str(tmpdir_path) + assert cache_dir.startswith(tmpdir_path_str), ( + f"Cache dir {cache_dir} should be in TMPDIR {tmpdir_path_str}. TMPDIR env var: {os.environ.get('TMPDIR')}" + ) + # Verify the directory was created + assert tmpdir_path.exists(), ( + f"TMPDIR directory {tmpdir_path} should have been created. Cache dir is: {cache_dir}" + ) + # Verify logging + assert f"Created TMPDIR directory: {tmpdir_path_str}" in caplog.text + + # Cleanup + temp_cache.cleanup() + finally: + # Restore original TMPDIR + if original_tmpdir is not None: + os.environ["TMPDIR"] = original_tmpdir + elif "TMPDIR" in os.environ: + del os.environ["TMPDIR"] + + +def test_temp_cache_dir_with_tmpdir_existing(tmp_path, monkeypatch): + """Test that _TempCacheDir works correctly when TMPDIR exists.""" + from datasets.fingerprint import get_temporary_cache_files_directory + + # Set TMPDIR to an existing directory + tmpdir_path = tmp_path / "existing_tmpdir" + tmpdir_path.mkdir() + monkeypatch.setenv("TMPDIR", str(tmpdir_path)) + + # Clear any existing temp cache dir + import datasets.fingerprint + + datasets.fingerprint._TEMP_DIR_FOR_TEMP_CACHE_FILES = None + + cache_dir = get_temporary_cache_files_directory() + + # Verify the cache directory is within the TMPDIR + assert cache_dir.startswith(str(tmpdir_path)), f"Cache dir {cache_dir} should be in TMPDIR {tmpdir_path}" + + +def test_temp_cache_dir_without_tmpdir(monkeypatch): + """Test that _TempCacheDir works correctly when TMPDIR is not set.""" + from datasets.fingerprint import get_temporary_cache_files_directory + + # Remove TMPDIR if it exists + monkeypatch.delenv("TMPDIR", raising=False) + + # Clear any existing temp cache dir + import datasets.fingerprint + + datasets.fingerprint._TEMP_DIR_FOR_TEMP_CACHE_FILES = None + + cache_dir = get_temporary_cache_files_directory() + + # Verify it uses the default temp directory + from tempfile import gettempdir + + default_temp = gettempdir() + assert cache_dir.startswith(default_temp), f"Cache dir {cache_dir} should be in default temp {default_temp}" + + +def test_temp_cache_dir_tmpdir_creation_failure(tmp_path, monkeypatch, caplog): + """Test that _TempCacheDir handles failure to create TMPDIR gracefully.""" + from unittest.mock import patch + + from datasets.fingerprint import get_temporary_cache_files_directory + + # Set TMPDIR to a path that will fail to create (e.g., invalid permissions) + # Use a path that's likely to fail on creation + tmpdir_path = tmp_path / "nonexistent" / "nested" / "path" + monkeypatch.setenv("TMPDIR", str(tmpdir_path)) + + # Clear any existing temp cache dir + import datasets.fingerprint + + datasets.fingerprint._TEMP_DIR_FOR_TEMP_CACHE_FILES = None + + # Mock os.makedirs to raise an error + with patch("datasets.fingerprint.os.makedirs", side_effect=OSError("Permission denied")): + with caplog.at_level("WARNING", logger="datasets.fingerprint"): + cache_dir = get_temporary_cache_files_directory() + + # Verify warning was logged + assert "TMPDIR is set to" in caplog.text + assert "could not be created" in caplog.text + # Verify it falls back to default temp directory + from tempfile import gettempdir + + default_temp = gettempdir() + assert cache_dir.startswith(default_temp), f"Cache dir {cache_dir} should fall back to default temp {default_temp}" def test_fingerprint_when_transform_version_changes(): From fc5a218fbb4084ae9018f551dbaa7e6932331b50 Mon Sep 17 00:00:00 2001 From: Grancho Date: Mon, 1 Dec 2025 15:08:52 +0000 Subject: [PATCH 3/4] test(fingerprint): tighten TMPDIR error-path tests for _TempCacheDir Refine TMPDIR-related failure tests for _TempCacheDir to assert explicit error conditions instead of fallback behaviour. Changes: - Update test_temp_cache_dir_tmpdir_creation_failure to use _TempCacheDir directly and assert that an OSError is raised with a clear TMPDIR context when directory creation fails - Introduce test_temp_cache_dir_tmpdir_not_directory to verify that pointing TMPDIR at a non-directory raises an OSError with an informative error message These tests better match the intended contract of _TempCacheDir by ensuring invalid TMPDIR configurations fail loudly with descriptive messages rather than silently falling back. --- tests/test_fingerprint.py | 40 +++++++++++++++++++++++---------------- 1 file changed, 24 insertions(+), 16 deletions(-) diff --git a/tests/test_fingerprint.py b/tests/test_fingerprint.py index 350bb10b11f..393d0e5137c 100644 --- a/tests/test_fingerprint.py +++ b/tests/test_fingerprint.py @@ -504,34 +504,42 @@ def test_temp_cache_dir_without_tmpdir(monkeypatch): def test_temp_cache_dir_tmpdir_creation_failure(tmp_path, monkeypatch, caplog): - """Test that _TempCacheDir handles failure to create TMPDIR gracefully.""" + """Test that _TempCacheDir raises if TMPDIR cannot be created.""" from unittest.mock import patch - from datasets.fingerprint import get_temporary_cache_files_directory + from datasets.fingerprint import _TempCacheDir # Set TMPDIR to a path that will fail to create (e.g., invalid permissions) # Use a path that's likely to fail on creation tmpdir_path = tmp_path / "nonexistent" / "nested" / "path" monkeypatch.setenv("TMPDIR", str(tmpdir_path)) - # Clear any existing temp cache dir - import datasets.fingerprint - - datasets.fingerprint._TEMP_DIR_FOR_TEMP_CACHE_FILES = None - # Mock os.makedirs to raise an error with patch("datasets.fingerprint.os.makedirs", side_effect=OSError("Permission denied")): - with caplog.at_level("WARNING", logger="datasets.fingerprint"): - cache_dir = get_temporary_cache_files_directory() + with pytest.raises(OSError) as excinfo: + _TempCacheDir() - # Verify warning was logged - assert "TMPDIR is set to" in caplog.text - assert "could not be created" in caplog.text - # Verify it falls back to default temp directory - from tempfile import gettempdir + # Verify the error message gives clear context about TMPDIR + msg = str(excinfo.value) + assert "TMPDIR is set to" in msg + assert "could not be created" in msg - default_temp = gettempdir() - assert cache_dir.startswith(default_temp), f"Cache dir {cache_dir} should fall back to default temp {default_temp}" + +def test_temp_cache_dir_tmpdir_not_directory(tmp_path, monkeypatch): + """Test that _TempCacheDir raises if TMPDIR points to a non-directory.""" + from datasets.fingerprint import _TempCacheDir + + # Create a regular file and point TMPDIR to it + file_path = tmp_path / "not_a_dir" + file_path.write_text("not a directory") + monkeypatch.setenv("TMPDIR", str(file_path)) + + with pytest.raises(OSError) as excinfo: + _TempCacheDir() + + msg = str(excinfo.value) + assert "TMPDIR is set to" in msg + assert "is not a directory" in msg def test_fingerprint_when_transform_version_changes(): From 768dbca7548dd9cb7025c4b1fde4f917cf388e3d Mon Sep 17 00:00:00 2001 From: Grancho Date: Mon, 1 Dec 2025 15:09:31 +0000 Subject: [PATCH 4/4] fix(fingerprint): make TMPDIR misconfiguration in _TempCacheDir fail loudly Tighten TMPDIR handling in _TempCacheDir so that invalid configurations raise clear errors instead of silently falling back to the default temporary directory. Changes: - When TMPDIR points to a non-existent directory, raise an OSError with explicit guidance to create it manually or unset TMPDIR - When TMPDIR points to a non-directory path, raise an OSError with guidance to point TMPDIR to a writable directory or unset it - Remove previous warning-and-fallback behaviour to avoid masking configuration issues This ensures that TMPDIR misconfigurations are surfaced early and clearly, aligning runtime behaviour with the stricter expectations codified in the new tests. --- src/datasets/fingerprint.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/src/datasets/fingerprint.py b/src/datasets/fingerprint.py index d4f81d3ef56..38ee83431a1 100644 --- a/src/datasets/fingerprint.py +++ b/src/datasets/fingerprint.py @@ -61,18 +61,16 @@ def __init__(self): os.makedirs(tmpdir, exist_ok=True) logger.info(f"Created TMPDIR directory: {tmpdir}") except OSError as e: - logger.warning( + raise OSError( f"TMPDIR is set to '{tmpdir}' but the directory does not exist and could not be created: {e}. " - f"Falling back to default temporary directory." - ) - tmpdir = None + "Please create it manually or unset TMPDIR to fall back to the default temporary directory." + ) from e # If tmpdir exists, verify it's actually a directory and writable elif not os.path.isdir(tmpdir): - logger.warning( + raise OSError( f"TMPDIR is set to '{tmpdir}' but it is not a directory. " - f"Falling back to default temporary directory." + "Please point TMPDIR to a writable directory or unset it to fall back to the default temporary directory." ) - tmpdir = None # Explicitly pass the directory to mkdtemp to ensure TMPDIR is respected # This works even if tempfile.gettempdir() was already called and cached