From 0db2e1668db4865af452f0e9a7c958fe9936b376 Mon Sep 17 00:00:00 2001 From: haochengxia Date: Tue, 5 Aug 2025 10:12:42 +0000 Subject: [PATCH 1/4] Feat: Make data load internal --- libcachesim/__init__.py | 3 - libcachesim/_s3_cache.py | 347 ++++++++++++++++++++++++++++++++++++ libcachesim/data_loader.py | 118 ------------ libcachesim/trace_reader.py | 81 ++++++++- src/libCacheSim | 2 +- tests/test_analyzer.py | 11 +- tests/test_data_loader.py | 8 - tests/test_reader.py | 12 +- 8 files changed, 439 insertions(+), 143 deletions(-) create mode 100644 libcachesim/_s3_cache.py delete mode 100644 libcachesim/data_loader.py delete mode 100644 tests/test_data_loader.py diff --git a/libcachesim/__init__.py b/libcachesim/__init__.py index c9fc1e7..86baf3a 100644 --- a/libcachesim/__init__.py +++ b/libcachesim/__init__.py @@ -59,7 +59,6 @@ from .trace_analyzer import TraceAnalyzer from .synthetic_reader import SyntheticReader, create_zipf_requests, create_uniform_requests from .util import Util -from .data_loader import DataLoader __all__ = [ # Core classes @@ -118,8 +117,6 @@ "create_uniform_requests", # Utilities "Util", - # Data loader - "DataLoader", # Metadata "__doc__", "__version__", diff --git a/libcachesim/_s3_cache.py b/libcachesim/_s3_cache.py new file mode 100644 index 0000000..5fa60e3 --- /dev/null +++ b/libcachesim/_s3_cache.py @@ -0,0 +1,347 @@ +"""S3 Bucket data loader with local caching (HuggingFace-style).""" + +from __future__ import annotations + +import hashlib +import logging +import os +import re +import shutil +from pathlib import Path +from typing import Optional, Union +from urllib.parse import urlparse + +logger = logging.getLogger(__name__) + + +class _DataLoader: + """Internal S3 data loader with local caching.""" + + DEFAULT_BUCKET = "cache-datasets" + DEFAULT_CACHE_DIR = Path(os.environ.get("LCS_HUB_CACHE", Path.home() / ".cache/libcachesim/hub")) + + # Characters that are problematic on various filesystems + INVALID_CHARS = set('<>:"|?*\x00') + # Reserved names on Windows + RESERVED_NAMES = { + 'CON', 'PRN', 'AUX', 'NUL', + 'COM1', 'COM2', 'COM3', 'COM4', 'COM5', 'COM6', 'COM7', 'COM8', 'COM9', + 'LPT1', 'LPT2', 'LPT3', 'LPT4', 'LPT5', 'LPT6', 'LPT7', 'LPT8', 'LPT9' + } + + def __init__( + self, bucket_name: str = DEFAULT_BUCKET, cache_dir: Optional[Union[str, Path]] = None, use_auth: bool = False + ): + self.bucket_name = self._validate_bucket_name(bucket_name) + self.cache_dir = Path(cache_dir) if cache_dir else self.DEFAULT_CACHE_DIR + self.use_auth = use_auth + self._s3_client = None + self._ensure_cache_dir() + + def _validate_bucket_name(self, bucket_name: str) -> str: + """Validate S3 bucket name according to AWS rules.""" + if not bucket_name: + raise ValueError("Bucket name cannot be empty") + + if len(bucket_name) < 3 or len(bucket_name) > 63: + raise ValueError("Bucket name must be between 3 and 63 characters") + + if not re.match(r'^[a-z0-9.-]+$', bucket_name): + raise ValueError("Bucket name can only contain lowercase letters, numbers, periods, and hyphens") + + if bucket_name.startswith('.') or bucket_name.endswith('.'): + raise ValueError("Bucket name cannot start or end with a period") + + if bucket_name.startswith('-') or bucket_name.endswith('-'): + raise ValueError("Bucket name cannot start or end with a hyphen") + + if '..' 
in bucket_name: + raise ValueError("Bucket name cannot contain consecutive periods") + + return bucket_name + + def _validate_and_sanitize_key(self, key: str) -> str: + """Validate and sanitize S3 key for safe local filesystem usage.""" + if not key: + raise ValueError("S3 key cannot be empty") + + if len(key) > 1024: # S3 limit is 1024 bytes + raise ValueError("S3 key is too long (max 1024 characters)") + + # Check for path traversal attempts + if '..' in key: + raise ValueError("S3 key cannot contain '..' (path traversal not allowed)") + + if key.startswith('/'): + raise ValueError("S3 key cannot start with '/'") + + # Split key into parts and validate each part + parts = key.split('/') + sanitized_parts = [] + + for part in parts: + if not part: # Empty part (double slash) + continue + + # Check for reserved names (case insensitive) + if part.upper() in self.RESERVED_NAMES: + raise ValueError(f"S3 key contains reserved name: {part}") + + # Check for invalid characters + if any(c in self.INVALID_CHARS for c in part): + raise ValueError(f"S3 key contains invalid characters in part: {part}") + + # Check if part is too long for filesystem + if len(part) > 255: # Most filesystems have 255 char limit per component + raise ValueError(f"S3 key component too long: {part}") + + sanitized_parts.append(part) + + if not sanitized_parts: + raise ValueError("S3 key resulted in empty path after sanitization") + + return '/'.join(sanitized_parts) + + def _ensure_cache_dir(self) -> None: + (self.cache_dir / self.bucket_name).mkdir(parents=True, exist_ok=True) + + def _get_available_disk_space(self, path: Path) -> int: + """Get available disk space in bytes.""" + try: + stat = os.statvfs(path) + return stat.f_bavail * stat.f_frsize + except (OSError, AttributeError): + # Fallback for Windows or other systems + try: + import shutil + return shutil.disk_usage(path).free + except Exception: + logger.warning("Could not determine available disk space") + return float('inf') # Assume unlimited space if we can't check + + @property + def s3_client(self): + if self._s3_client is None: + try: + import boto3 + from botocore.config import Config + from botocore import UNSIGNED + + self._s3_client = boto3.client( + "s3", config=None if self.use_auth else Config(signature_version=UNSIGNED) + ) + except ImportError: + raise ImportError("Install boto3: pip install boto3") + return self._s3_client + + def _cache_path(self, key: str) -> Path: + """Create cache path that mirrors S3 structure after validation.""" + sanitized_key = self._validate_and_sanitize_key(key) + cache_path = self.cache_dir / self.bucket_name / sanitized_key + + # Double-check that the resolved path is still within cache directory + try: + cache_path.resolve().relative_to(self.cache_dir.resolve()) + except ValueError: + raise ValueError(f"S3 key resolves outside cache directory: {key}") + + return cache_path + + def _get_object_size(self, key: str) -> int: + """Get the size of an S3 object without downloading it.""" + try: + response = self.s3_client.head_object(Bucket=self.bucket_name, Key=key) + return response['ContentLength'] + except Exception as e: + logger.warning(f"Could not determine object size for s3://{self.bucket_name}/{key}: {e}") + return 0 + + def _download(self, key: str, dest: Path) -> None: + temp = dest.with_suffix(dest.suffix + ".tmp") + temp.parent.mkdir(parents=True, exist_ok=True) + + try: + # Check available disk space before downloading + object_size = self._get_object_size(key) + if object_size > 0: + available_space = 
self._get_available_disk_space(temp.parent) + if object_size > available_space: + raise RuntimeError( + f"Insufficient disk space. Need {object_size / (1024**3):.2f} GB, " + f"but only {available_space / (1024**3):.2f} GB available" + ) + + logger.info(f"Downloading s3://{self.bucket_name}/{key}") + obj = self.s3_client.get_object(Bucket=self.bucket_name, Key=key) + with open(temp, "wb") as f: + f.write(obj["Body"].read()) + shutil.move(str(temp), str(dest)) + logger.info(f"Saved to: {dest}") + except Exception as e: + if temp.exists(): + temp.unlink() + raise RuntimeError(f"Download failed for s3://{self.bucket_name}/{key}: {e}") + + def load(self, key: str, force: bool = False, mode: str = "rb") -> Union[bytes, str]: + path = self._cache_path(key) + if not path.exists() or force: + self._download(key, path) + with open(path, mode) as f: + return f.read() + + def get_cached_path(self, key: str) -> str: + """Get the local cached file path, downloading if necessary.""" + path = self._cache_path(key) + if not path.exists(): + self._download(key, path) + return str(path) + + def is_cached(self, key: str) -> bool: + try: + return self._cache_path(key).exists() + except ValueError: + return False + + def clear_cache(self, key: Optional[str] = None) -> None: + if key: + try: + path = self._cache_path(key) + if path.exists(): + path.unlink() + logger.info(f"Cleared: {path}") + except ValueError as e: + logger.warning(f"Cannot clear cache for invalid key {key}: {e}") + else: + shutil.rmtree(self.cache_dir, ignore_errors=True) + logger.info(f"Cleared entire cache: {self.cache_dir}") + + def list_cached_files(self) -> list[str]: + if not self.cache_dir.exists(): + return [] + return [str(p) for p in self.cache_dir.rglob("*") if p.is_file() and not p.name.endswith(".tmp")] + + def get_cache_size(self) -> int: + return sum(p.stat().st_size for p in self.cache_dir.rglob("*") if p.is_file()) + + def list_s3_objects(self, prefix: str = "", delimiter: str = "/") -> dict: + """ + List S3 objects and pseudo-folders under a prefix. + + Args: + prefix: The S3 prefix to list under (like folder path) + delimiter: Use "/" to simulate folder structure + + Returns: + A dict with two keys: + - "folders": list of sub-prefixes (folders) + - "files": list of object keys (files) + """ + paginator = self.s3_client.get_paginator("list_objects_v2") + result = {"folders": [], "files": []} + + for page in paginator.paginate(Bucket=self.bucket_name, Prefix=prefix, Delimiter=delimiter): + # CommonPrefixes are like subdirectories + result["folders"].extend(cp["Prefix"] for cp in page.get("CommonPrefixes", [])) + result["files"].extend(obj["Key"] for obj in page.get("Contents", [])) + + return result + + +# Global data loader instance +_data_loader = _DataLoader() + + +def set_cache_dir(cache_dir: Union[str, Path]) -> None: + """ + Set the global cache directory for S3 downloads. + + Args: + cache_dir: Path to the cache directory + + Example: + >>> import libcachesim as lcs + >>> lcs.set_cache_dir("/tmp/my_cache") + """ + global _data_loader + _data_loader = _DataLoader(cache_dir=cache_dir) + + +def get_cache_dir() -> Path: + """ + Get the current cache directory. + + Returns: + Path to the current cache directory + + Example: + >>> import libcachesim as lcs + >>> print(lcs.get_cache_dir()) + /home/user/.cache/libcachesim/hub + """ + return _data_loader.cache_dir + + +def clear_cache(s3_path: Optional[str] = None) -> None: + """ + Clear cached files. 
+ + Args: + s3_path: Specific S3 path to clear, or None to clear all cache + + Example: + >>> import libcachesim as lcs + >>> # Clear specific file + >>> lcs.clear_cache("s3://cache-datasets/trace1.lcs.zst") + >>> # Clear all cache + >>> lcs.clear_cache() + """ + if s3_path and s3_path.startswith("s3://"): + parsed = urlparse(s3_path) + bucket = parsed.netloc + key = parsed.path.lstrip('/') + if bucket == _data_loader.bucket_name: + _data_loader.clear_cache(key) + else: + logger.warning(f"Cannot clear cache for different bucket: {bucket}") + else: + _data_loader.clear_cache(s3_path) + + +def get_cache_size() -> int: + """ + Get total size of cached files in bytes. + + Returns: + Total cache size in bytes + + Example: + >>> import libcachesim as lcs + >>> size_mb = lcs.get_cache_size() / (1024**2) + >>> print(f"Cache size: {size_mb:.2f} MB") + """ + return _data_loader.get_cache_size() + + +def list_cached_files() -> list[str]: + """ + List all cached files. + + Returns: + List of cached file paths + + Example: + >>> import libcachesim as lcs + >>> files = lcs.list_cached_files() + >>> for file in files: + ... print(file) + """ + return _data_loader.list_cached_files() + + +def get_data_loader(bucket_name: str = None) -> _DataLoader: + """Get data loader instance for a specific bucket or the global one.""" + global _data_loader + if bucket_name is None or bucket_name == _data_loader.bucket_name: + return _data_loader + else: + return _DataLoader(bucket_name=bucket_name, cache_dir=_data_loader.cache_dir.parent) \ No newline at end of file diff --git a/libcachesim/data_loader.py b/libcachesim/data_loader.py deleted file mode 100644 index f889364..0000000 --- a/libcachesim/data_loader.py +++ /dev/null @@ -1,118 +0,0 @@ -"""S3 Bucket data loader with local caching (HuggingFace-style).""" - -from __future__ import annotations - -import hashlib -import logging -import shutil -from pathlib import Path -from typing import Optional, Union -from urllib.parse import quote - -logger = logging.getLogger(__name__) - - -class DataLoader: - DEFAULT_BUCKET = "cache-datasets" - DEFAULT_CACHE_DIR = Path.home() / ".cache/libcachesim_hub" - - def __init__( - self, bucket_name: str = DEFAULT_BUCKET, cache_dir: Optional[Union[str, Path]] = None, use_auth: bool = False - ): - self.bucket_name = bucket_name - self.cache_dir = Path(cache_dir) if cache_dir else self.DEFAULT_CACHE_DIR - self.use_auth = use_auth - self._s3_client = None - self._ensure_cache_dir() - - def _ensure_cache_dir(self) -> None: - (self.cache_dir / self.bucket_name).mkdir(parents=True, exist_ok=True) - - @property - def s3_client(self): - if self._s3_client is None: - try: - import boto3 - from botocore.config import Config - from botocore import UNSIGNED - - self._s3_client = boto3.client( - "s3", config=None if self.use_auth else Config(signature_version=UNSIGNED) - ) - except ImportError: - raise ImportError("Install boto3: pip install boto3") - return self._s3_client - - def _cache_path(self, key: str) -> Path: - safe_name = hashlib.sha256(key.encode()).hexdigest()[:16] + "_" + quote(key, safe="") - return self.cache_dir / self.bucket_name / safe_name - - def _download(self, key: str, dest: Path) -> None: - temp = dest.with_suffix(dest.suffix + ".tmp") - temp.parent.mkdir(parents=True, exist_ok=True) - - try: - logger.info(f"Downloading s3://{self.bucket_name}/{key}") - obj = self.s3_client.get_object(Bucket=self.bucket_name, Key=key) - with open(temp, "wb") as f: - f.write(obj["Body"].read()) - shutil.move(str(temp), str(dest)) - 
logger.info(f"Saved to: {dest}") - except Exception as e: - if temp.exists(): - temp.unlink() - raise RuntimeError(f"Download failed for s3://{self.bucket_name}/{key}: {e}") - - def load(self, key: str, force: bool = False, mode: str = "rb") -> Union[bytes, str]: - path = self._cache_path(key) - if not path.exists() or force: - self._download(key, path) - with open(path, mode) as f: - return f.read() - - def is_cached(self, key: str) -> bool: - return self._cache_path(key).exists() - - def get_cache_path(self, key: str) -> Path: - return self._cache_path(key).as_posix() - - def clear_cache(self, key: Optional[str] = None) -> None: - if key: - path = self._cache_path(key) - if path.exists(): - path.unlink() - logger.info(f"Cleared: {path}") - else: - shutil.rmtree(self.cache_dir, ignore_errors=True) - logger.info(f"Cleared entire cache: {self.cache_dir}") - - def list_cached_files(self) -> list[str]: - if not self.cache_dir.exists(): - return [] - return [str(p) for p in self.cache_dir.rglob("*") if p.is_file() and not p.name.endswith(".tmp")] - - def get_cache_size(self) -> int: - return sum(p.stat().st_size for p in self.cache_dir.rglob("*") if p.is_file()) - - def list_s3_objects(self, prefix: str = "", delimiter: str = "/") -> dict: - """ - List S3 objects and pseudo-folders under a prefix. - - Args: - prefix: The S3 prefix to list under (like folder path) - delimiter: Use "/" to simulate folder structure - - Returns: - A dict with two keys: - - "folders": list of sub-prefixes (folders) - - "files": list of object keys (files) - """ - paginator = self.s3_client.get_paginator("list_objects_v2") - result = {"folders": [], "files": []} - - for page in paginator.paginate(Bucket=self.bucket_name, Prefix=prefix, Delimiter=delimiter): - # CommonPrefixes are like subdirectories - result["folders"].extend(cp["Prefix"] for cp in page.get("CommonPrefixes", [])) - result["files"].extend(obj["Key"] for obj in page.get("Contents", [])) - - return result diff --git a/libcachesim/trace_reader.py b/libcachesim/trace_reader.py index d282a68..2f66456 100644 --- a/libcachesim/trace_reader.py +++ b/libcachesim/trace_reader.py @@ -1,12 +1,15 @@ -"""Wrapper of Reader""" +"""Wrapper of Reader with S3 support.""" import logging from typing import overload, Union, Optional from collections.abc import Iterator +from urllib.parse import urlparse from .protocols import ReaderProtocol - from .libcachesim_python import TraceType, SamplerType, Request, ReaderInitParam, Reader, Sampler, ReadDirection +from ._s3_cache import get_data_loader + +logger = logging.getLogger(__name__) class TraceReader(ReaderProtocol): @@ -28,6 +31,10 @@ def __init__( self._reader = trace return + # Handle S3 URIs + if isinstance(trace, str) and trace.startswith("s3://"): + trace = self._resolve_s3_path(trace) + if reader_init_params is None: reader_init_params = ReaderInitParam() @@ -36,6 +43,74 @@ def __init__( self._reader = Reader(trace, trace_type, reader_init_params) + def _validate_s3_uri(self, s3_uri: str) -> tuple[str, str]: + """ + Validate and parse S3 URI. + + Args: + s3_uri: S3 URI like "s3://bucket/key" + + Returns: + Tuple of (bucket, key) + + Raises: + ValueError: If URI is invalid + """ + parsed = urlparse(s3_uri) + + if parsed.scheme != "s3": + raise ValueError(f"Invalid S3 URI scheme. 
Expected 's3', got '{parsed.scheme}': {s3_uri}") + + if not parsed.netloc: + raise ValueError(f"Missing bucket name in S3 URI: {s3_uri}") + + bucket = parsed.netloc + key = parsed.path.lstrip('/') + + if not key: + raise ValueError(f"Missing key (object path) in S3 URI: {s3_uri}") + + # Check for path traversal in the key part only + if '..' in key: + raise ValueError(f"S3 key contains path traversal patterns: {key}") + + # Check for double slashes in the key part (after s3://) + if '//' in key: + raise ValueError(f"S3 key contains double slashes: {key}") + + # Check for backslashes (not valid in URLs) + if '\\' in s3_uri: + raise ValueError(f"S3 URI contains backslashes: {s3_uri}") + + return bucket, key + + def _resolve_s3_path(self, s3_path: str) -> str: + """ + Resolve S3 path to local cached file path. + + Args: + s3_path: S3 URI like "s3://bucket/key" + + Returns: + Local file path + """ + try: + bucket, key = self._validate_s3_uri(s3_path) + except ValueError as e: + raise ValueError(f"Invalid S3 URI: {e}") + + # Get data loader for this bucket + try: + loader = get_data_loader(bucket) + except ValueError as e: + raise ValueError(f"Invalid bucket name '{bucket}': {e}") + + logger.info(f"Resolving S3 path: {s3_path}") + try: + return loader.get_cached_path(key) + except ValueError as e: + raise ValueError(f"Invalid S3 key '{key}': {e}") + @property def n_read_req(self) -> int: return self._reader.n_read_req @@ -221,4 +296,4 @@ def __getitem__(self, index: int) -> Request: self._reader.reset() self._reader.skip_n_req(index) req = Request() - return self._reader.read_one_req(req) + return self._reader.read_one_req(req) \ No newline at end of file diff --git a/src/libCacheSim b/src/libCacheSim index 4a2627d..e8a7194 160000 --- a/src/libCacheSim +++ b/src/libCacheSim @@ -1 +1 @@ -Subproject commit 4a2627d4221cd49c0182cc6aa1ddd82f3006cf83 +Subproject commit e8a7194d861857d4c4481b53cf24a1bfd33df2ad diff --git a/tests/test_analyzer.py b/tests/test_analyzer.py index 8712a8c..6f72fcf 100644 --- a/tests/test_analyzer.py +++ b/tests/test_analyzer.py @@ -1,4 +1,4 @@ -from libcachesim import TraceAnalyzer, TraceReader, DataLoader, AnalysisOption, AnalysisParam +from libcachesim import TraceAnalyzer, TraceReader, AnalysisOption, AnalysisParam import os import pytest @@ -9,13 +9,8 @@ def test_analyzer_common(): """ # Add debugging and error handling - loader = DataLoader() - loader.load("cache_dataset_oracleGeneral/2020_tencentBlock/1K/tencentBlock_1621.oracleGeneral.zst") - file_path = loader.get_cache_path( - "cache_dataset_oracleGeneral/2020_tencentBlock/1K/tencentBlock_1621.oracleGeneral.zst" - ) - - reader = TraceReader(file_path) + URI = "s3://cache-datasets/cache_dataset_oracleGeneral/2007_msr/msr_hm_0.oracleGeneral.zst" + reader = TraceReader(trace=URI) # For this specific small dataset (only 4 objects), configure analysis options more conservatively # to avoid bounds issues with the analysis modules diff --git a/tests/test_data_loader.py b/tests/test_data_loader.py deleted file mode 100644 index 5aba6f5..0000000 --- a/tests/test_data_loader.py +++ /dev/null @@ -1,8 +0,0 @@ -from libcachesim import DataLoader - - -def test_data_loader_common(): - loader = DataLoader() - loader.load("cache_dataset_oracleGeneral/2007_msr/msr_hm_0.oracleGeneral.zst") - path = loader.get_cache_path("cache_dataset_oracleGeneral/2007_msr/msr_hm_0.oracleGeneral.zst") - filles = loader.list_s3_objects("cache_dataset_oracleGeneral/2007_msr/") diff --git a/tests/test_reader.py b/tests/test_reader.py index 
688217a..3af8bbb 100644 --- a/tests/test_reader.py +++ b/tests/test_reader.py @@ -7,8 +7,8 @@ import pytest import tempfile import os -from libcachesim import TraceReader, SyntheticReader, DataLoader -from libcachesim.libcachesim_python import TraceType, SamplerType, Request, ReqOp, ReaderInitParam, Sampler +from libcachesim import TraceReader, SyntheticReader +from libcachesim.libcachesim_python import TraceType, SamplerType, Request, ReaderInitParam, Sampler class TestSyntheticReader: @@ -330,6 +330,14 @@ def test_invalid_sampling_ratio(self): finally: os.unlink(temp_file) + + def test_trace_reader_s3(self): + """Test trace reader with S3""" + URI = "s3://cache-datasets/cache_dataset_oracleGeneral/2007_msr/msr_hm_0.oracleGeneral.zst" + reader = TraceReader(trace=URI) + for req in reader: + assert req.valid == True + break class TestReaderCompatibility: From ef10981f90e7c9c54a015ed69ede9ffcb2a70e71 Mon Sep 17 00:00:00 2001 From: haochengxia Date: Tue, 5 Aug 2025 10:23:19 +0000 Subject: [PATCH 2/4] Update docs for data loading --- README.md | 33 +++++++++--------- docs/src/en/getting_started/quickstart.md | 33 +++++++++--------- examples/basic_usage.py | 41 +++++++++++++---------- 3 files changed, 57 insertions(+), 50 deletions(-) diff --git a/README.md b/README.md index 9996c72..f7a836f 100644 --- a/README.md +++ b/README.md @@ -54,33 +54,34 @@ With libcachesim installed, you can start cache simulation for some eviction alg ```python import libcachesim as lcs -# Step 1: Get one trace from S3 bucket -URI = "cache_dataset_oracleGeneral/2007_msr/msr_hm_0.oracleGeneral.zst" -dl = lcs.DataLoader() -dl.load(URI) - -# Step 2: Open trace and process efficiently +# Step 1: Open a trace hosted on S3 (find more via https://github.com/cacheMon/cache_dataset) +URI = "s3://cache-datasets/cache_dataset_oracleGeneral/2007_msr/msr_hm_0.oracleGeneral.zst" reader = lcs.TraceReader( - trace = dl.get_cache_path(URI), + trace = URI, trace_type = lcs.TraceType.ORACLE_GENERAL_TRACE, reader_init_params = lcs.ReaderInitParam(ignore_obj_size=False) ) -# Step 3: Initialize cache -cache = lcs.S3FIFO(cache_size=1024*1024) +# Step 2: Initialize cache +cache = lcs.S3FIFO( + cache_size=1024*1024, + # Cache specific parameters + small_size_ratio=0.2, + ghost_size_ratio=0.8, + move_to_main_threshold=2, +) -# Step 4: Process entire trace efficiently (C++ backend) -obj_miss_ratio, byte_miss_ratio = cache.process_trace(reader) -print(f"Object miss ratio: {obj_miss_ratio:.4f}, Byte miss ratio: {byte_miss_ratio:.4f}") +# Step 3: Process entire trace efficiently (C++ backend) +req_miss_ratio, byte_miss_ratio = cache.process_trace(reader) +print(f"Request miss ratio: {req_miss_ratio:.4f}, Byte miss ratio: {byte_miss_ratio:.4f}") -# Step 4.1: Process with limited number of requests -cache = lcs.S3FIFO(cache_size=1024*1024) -obj_miss_ratio, byte_miss_ratio = cache.process_trace( +# Step 3.1: Further process the first 1000 requests again +req_miss_ratio, byte_miss_ratio = cache.process_trace( reader, start_req=0, max_req=1000 ) -print(f"Object miss ratio: {obj_miss_ratio:.4f}, Byte miss ratio: {byte_miss_ratio:.4f}") +print(f"Request miss ratio: {req_miss_ratio:.4f}, Byte miss ratio: {byte_miss_ratio:.4f}") ``` ## Plugin System diff --git a/docs/src/en/getting_started/quickstart.md b/docs/src/en/getting_started/quickstart.md index 70bfe22..0871526 100644 --- a/docs/src/en/getting_started/quickstart.md +++ b/docs/src/en/getting_started/quickstart.md @@ -56,33 +56,34 @@ With libcachesim installed, you can start cache simulation for 
some eviction alg ```python import libcachesim as lcs - # Step 1: Get one trace from S3 bucket - URI = "cache_dataset_oracleGeneral/2007_msr/msr_hm_0.oracleGeneral.zst" - dl = lcs.DataLoader() - dl.load(URI) - - # Step 2: Open trace and process efficiently + # Step 1: Open a trace hosted on S3 (find more via https://github.com/cacheMon/cache_dataset) + URI = "s3://cache-datasets/cache_dataset_oracleGeneral/2007_msr/msr_hm_0.oracleGeneral.zst" reader = lcs.TraceReader( - trace = dl.get_cache_path(URI), + trace = URI, trace_type = lcs.TraceType.ORACLE_GENERAL_TRACE, reader_init_params = lcs.ReaderInitParam(ignore_obj_size=False) ) - # Step 3: Initialize cache - cache = lcs.S3FIFO(cache_size=1024*1024) + # Step 2: Initialize cache + cache = lcs.S3FIFO( + cache_size=1024*1024, + # Cache specific parameters + small_size_ratio=0.2, + ghost_size_ratio=0.8, + move_to_main_threshold=2, + ) - # Step 4: Process entire trace efficiently (C++ backend) - obj_miss_ratio, byte_miss_ratio = cache.process_trace(reader) - print(f"Object miss ratio: {obj_miss_ratio:.4f}, Byte miss ratio: {byte_miss_ratio:.4f}") + # Step 3: Process entire trace efficiently (C++ backend) + req_miss_ratio, byte_miss_ratio = cache.process_trace(reader) + print(f"Request miss ratio: {req_miss_ratio:.4f}, Byte miss ratio: {byte_miss_ratio:.4f}") - # Step 4.1: Process with limited number of requests - cache = lcs.S3FIFO(cache_size=1024*1024) - obj_miss_ratio, byte_miss_ratio = cache.process_trace( + # Step 3.1: Further process the first 1000 requests again + req_miss_ratio, byte_miss_ratio = cache.process_trace( reader, start_req=0, max_req=1000 ) - print(f"Object miss ratio: {obj_miss_ratio:.4f}, Byte miss ratio: {byte_miss_ratio:.4f}") + print(f"Request miss ratio: {req_miss_ratio:.4f}, Byte miss ratio: {byte_miss_ratio:.4f}") ``` The above example demonstrates the basic workflow of using `libcachesim` for cache simulation: diff --git a/examples/basic_usage.py b/examples/basic_usage.py index 2a4bd60..16a1b49 100644 --- a/examples/basic_usage.py +++ b/examples/basic_usage.py @@ -1,25 +1,30 @@ import libcachesim as lcs -# Step 1: Get one trace from S3 bucket -URI = "cache_dataset_oracleGeneral/2007_msr/msr_hm_0.oracleGeneral.zst" -dl = lcs.DataLoader() -dl.load(URI) - -# Step 2: Open trace and process efficiently +# Step 1: Open a trace hosted on S3 (find more via https://github.com/cacheMon/cache_dataset) +URI = "s3://cache-datasets/cache_dataset_oracleGeneral/2007_msr/msr_hm_0.oracleGeneral.zst" reader = lcs.TraceReader( - trace=dl.get_cache_path(URI), - trace_type=lcs.TraceType.ORACLE_GENERAL_TRACE, - reader_init_params=lcs.ReaderInitParam(ignore_obj_size=False), + trace = URI, + trace_type = lcs.TraceType.ORACLE_GENERAL_TRACE, + reader_init_params = lcs.ReaderInitParam(ignore_obj_size=False) ) -# Step 3: Initialize cache -cache = lcs.S3FIFO(cache_size=1024 * 1024) +# Step 2: Initialize cache +cache = lcs.S3FIFO( + cache_size=1024*1024, + # Cache specific parameters + small_size_ratio=0.2, + ghost_size_ratio=0.8, + move_to_main_threshold=2, +) -# Step 4: Process entire trace efficiently (C++ backend) -obj_miss_ratio, byte_miss_ratio = cache.process_trace(reader) -print(f"Object miss ratio: {obj_miss_ratio:.4f}, Byte miss ratio: {byte_miss_ratio:.4f}") +# Step 3: Process entire trace efficiently (C++ backend) +req_miss_ratio, byte_miss_ratio = cache.process_trace(reader) +print(f"Request miss ratio: {req_miss_ratio:.4f}, Byte miss ratio: {byte_miss_ratio:.4f}") -# Step 4.1: Process with limited number of requests -cache = 
lcs.S3FIFO(cache_size=1024 * 1024) -obj_miss_ratio, byte_miss_ratio = cache.process_trace(reader, start_req=0, max_req=1000) -print(f"Object miss ratio: {obj_miss_ratio:.4f}, Byte miss ratio: {byte_miss_ratio:.4f}") +# Step 3.1: Further process the first 1000 requests again +req_miss_ratio, byte_miss_ratio = cache.process_trace( + reader, + start_req=0, + max_req=1000 +) +print(f"Request miss ratio: {req_miss_ratio:.4f}, Byte miss ratio: {byte_miss_ratio:.4f}") \ No newline at end of file From b1870103800215357f465576b245b9960db22f6f Mon Sep 17 00:00:00 2001 From: haochengxia Date: Tue, 5 Aug 2025 10:23:55 +0000 Subject: [PATCH 3/4] Ruff format --- examples/basic_usage.py | 16 ++---- libcachesim/_s3_cache.py | 110 +++++++++++++++++++++--------------- libcachesim/trace_reader.py | 40 ++++++------- tests/test_reader.py | 2 +- 4 files changed, 92 insertions(+), 76 deletions(-) diff --git a/examples/basic_usage.py b/examples/basic_usage.py index 16a1b49..d1bf0b1 100644 --- a/examples/basic_usage.py +++ b/examples/basic_usage.py @@ -3,14 +3,14 @@ # Step 1: Open a trace hosted on S3 (find more via https://github.com/cacheMon/cache_dataset) URI = "s3://cache-datasets/cache_dataset_oracleGeneral/2007_msr/msr_hm_0.oracleGeneral.zst" reader = lcs.TraceReader( - trace = URI, - trace_type = lcs.TraceType.ORACLE_GENERAL_TRACE, - reader_init_params = lcs.ReaderInitParam(ignore_obj_size=False) + trace=URI, + trace_type=lcs.TraceType.ORACLE_GENERAL_TRACE, + reader_init_params=lcs.ReaderInitParam(ignore_obj_size=False), ) # Step 2: Initialize cache cache = lcs.S3FIFO( - cache_size=1024*1024, + cache_size=1024 * 1024, # Cache specific parameters small_size_ratio=0.2, ghost_size_ratio=0.8, @@ -22,9 +22,5 @@ print(f"Request miss ratio: {req_miss_ratio:.4f}, Byte miss ratio: {byte_miss_ratio:.4f}") # Step 3.1: Further process the first 1000 requests again -req_miss_ratio, byte_miss_ratio = cache.process_trace( - reader, - start_req=0, - max_req=1000 -) -print(f"Request miss ratio: {req_miss_ratio:.4f}, Byte miss ratio: {byte_miss_ratio:.4f}") \ No newline at end of file +req_miss_ratio, byte_miss_ratio = cache.process_trace(reader, start_req=0, max_req=1000) +print(f"Request miss ratio: {req_miss_ratio:.4f}, Byte miss ratio: {byte_miss_ratio:.4f}") diff --git a/libcachesim/_s3_cache.py b/libcachesim/_s3_cache.py index 5fa60e3..4295886 100644 --- a/libcachesim/_s3_cache.py +++ b/libcachesim/_s3_cache.py @@ -16,7 +16,7 @@ class _DataLoader: """Internal S3 data loader with local caching.""" - + DEFAULT_BUCKET = "cache-datasets" DEFAULT_CACHE_DIR = Path(os.environ.get("LCS_HUB_CACHE", Path.home() / ".cache/libcachesim/hub")) @@ -24,9 +24,28 @@ class _DataLoader: INVALID_CHARS = set('<>:"|?*\x00') # Reserved names on Windows RESERVED_NAMES = { - 'CON', 'PRN', 'AUX', 'NUL', - 'COM1', 'COM2', 'COM3', 'COM4', 'COM5', 'COM6', 'COM7', 'COM8', 'COM9', - 'LPT1', 'LPT2', 'LPT3', 'LPT4', 'LPT5', 'LPT6', 'LPT7', 'LPT8', 'LPT9' + "CON", + "PRN", + "AUX", + "NUL", + "COM1", + "COM2", + "COM3", + "COM4", + "COM5", + "COM6", + "COM7", + "COM8", + "COM9", + "LPT1", + "LPT2", + "LPT3", + "LPT4", + "LPT5", + "LPT6", + "LPT7", + "LPT8", + "LPT9", } def __init__( @@ -42,65 +61,65 @@ def _validate_bucket_name(self, bucket_name: str) -> str: """Validate S3 bucket name according to AWS rules.""" if not bucket_name: raise ValueError("Bucket name cannot be empty") - + if len(bucket_name) < 3 or len(bucket_name) > 63: raise ValueError("Bucket name must be between 3 and 63 characters") - - if not re.match(r'^[a-z0-9.-]+$', bucket_name): + + 
if not re.match(r"^[a-z0-9.-]+$", bucket_name): raise ValueError("Bucket name can only contain lowercase letters, numbers, periods, and hyphens") - - if bucket_name.startswith('.') or bucket_name.endswith('.'): + + if bucket_name.startswith(".") or bucket_name.endswith("."): raise ValueError("Bucket name cannot start or end with a period") - - if bucket_name.startswith('-') or bucket_name.endswith('-'): + + if bucket_name.startswith("-") or bucket_name.endswith("-"): raise ValueError("Bucket name cannot start or end with a hyphen") - - if '..' in bucket_name: + + if ".." in bucket_name: raise ValueError("Bucket name cannot contain consecutive periods") - + return bucket_name def _validate_and_sanitize_key(self, key: str) -> str: """Validate and sanitize S3 key for safe local filesystem usage.""" if not key: raise ValueError("S3 key cannot be empty") - + if len(key) > 1024: # S3 limit is 1024 bytes raise ValueError("S3 key is too long (max 1024 characters)") - + # Check for path traversal attempts - if '..' in key: + if ".." in key: raise ValueError("S3 key cannot contain '..' (path traversal not allowed)") - - if key.startswith('/'): + + if key.startswith("/"): raise ValueError("S3 key cannot start with '/'") - + # Split key into parts and validate each part - parts = key.split('/') + parts = key.split("/") sanitized_parts = [] - + for part in parts: if not part: # Empty part (double slash) continue - + # Check for reserved names (case insensitive) if part.upper() in self.RESERVED_NAMES: raise ValueError(f"S3 key contains reserved name: {part}") - + # Check for invalid characters if any(c in self.INVALID_CHARS for c in part): raise ValueError(f"S3 key contains invalid characters in part: {part}") - + # Check if part is too long for filesystem if len(part) > 255: # Most filesystems have 255 char limit per component raise ValueError(f"S3 key component too long: {part}") - + sanitized_parts.append(part) - + if not sanitized_parts: raise ValueError("S3 key resulted in empty path after sanitization") - - return '/'.join(sanitized_parts) + + return "/".join(sanitized_parts) def _ensure_cache_dir(self) -> None: (self.cache_dir / self.bucket_name).mkdir(parents=True, exist_ok=True) @@ -114,10 +133,11 @@ def _get_available_disk_space(self, path: Path) -> int: # Fallback for Windows or other systems try: import shutil + return shutil.disk_usage(path).free except Exception: logger.warning("Could not determine available disk space") - return float('inf') # Assume unlimited space if we can't check + return float("inf") # Assume unlimited space if we can't check @property def s3_client(self): @@ -138,20 +158,20 @@ def _cache_path(self, key: str) -> Path: """Create cache path that mirrors S3 structure after validation.""" sanitized_key = self._validate_and_sanitize_key(key) cache_path = self.cache_dir / self.bucket_name / sanitized_key - + # Double-check that the resolved path is still within cache directory try: cache_path.resolve().relative_to(self.cache_dir.resolve()) except ValueError: raise ValueError(f"S3 key resolves outside cache directory: {key}") - + return cache_path def _get_object_size(self, key: str) -> int: """Get the size of an S3 object without downloading it.""" try: response = self.s3_client.head_object(Bucket=self.bucket_name, Key=key) - return response['ContentLength'] + return response["ContentLength"] except Exception as e: logger.warning(f"Could not determine object size for s3://{self.bucket_name}/{key}: {e}") return 0 @@ -254,10 +274,10 @@ def list_s3_objects(self, prefix: str = 
"", delimiter: str = "/") -> dict: def set_cache_dir(cache_dir: Union[str, Path]) -> None: """ Set the global cache directory for S3 downloads. - + Args: cache_dir: Path to the cache directory - + Example: >>> import libcachesim as lcs >>> lcs.set_cache_dir("/tmp/my_cache") @@ -269,10 +289,10 @@ def set_cache_dir(cache_dir: Union[str, Path]) -> None: def get_cache_dir() -> Path: """ Get the current cache directory. - + Returns: Path to the current cache directory - + Example: >>> import libcachesim as lcs >>> print(lcs.get_cache_dir()) @@ -284,10 +304,10 @@ def get_cache_dir() -> Path: def clear_cache(s3_path: Optional[str] = None) -> None: """ Clear cached files. - + Args: s3_path: Specific S3 path to clear, or None to clear all cache - + Example: >>> import libcachesim as lcs >>> # Clear specific file @@ -298,7 +318,7 @@ def clear_cache(s3_path: Optional[str] = None) -> None: if s3_path and s3_path.startswith("s3://"): parsed = urlparse(s3_path) bucket = parsed.netloc - key = parsed.path.lstrip('/') + key = parsed.path.lstrip("/") if bucket == _data_loader.bucket_name: _data_loader.clear_cache(key) else: @@ -310,10 +330,10 @@ def clear_cache(s3_path: Optional[str] = None) -> None: def get_cache_size() -> int: """ Get total size of cached files in bytes. - + Returns: Total cache size in bytes - + Example: >>> import libcachesim as lcs >>> size_mb = lcs.get_cache_size() / (1024**2) @@ -325,10 +345,10 @@ def get_cache_size() -> int: def list_cached_files() -> list[str]: """ List all cached files. - + Returns: List of cached file paths - + Example: >>> import libcachesim as lcs >>> files = lcs.list_cached_files() @@ -344,4 +364,4 @@ def get_data_loader(bucket_name: str = None) -> _DataLoader: if bucket_name is None or bucket_name == _data_loader.bucket_name: return _data_loader else: - return _DataLoader(bucket_name=bucket_name, cache_dir=_data_loader.cache_dir.parent) \ No newline at end of file + return _DataLoader(bucket_name=bucket_name, cache_dir=_data_loader.cache_dir.parent) diff --git a/libcachesim/trace_reader.py b/libcachesim/trace_reader.py index 2f66456..e593dbb 100644 --- a/libcachesim/trace_reader.py +++ b/libcachesim/trace_reader.py @@ -46,51 +46,51 @@ def __init__( def _validate_s3_uri(self, s3_uri: str) -> tuple[str, str]: """ Validate and parse S3 URI. - + Args: s3_uri: S3 URI like "s3://bucket/key" - + Returns: Tuple of (bucket, key) - + Raises: ValueError: If URI is invalid """ parsed = urlparse(s3_uri) - + if parsed.scheme != "s3": raise ValueError(f"Invalid S3 URI scheme. Expected 's3', got '{parsed.scheme}': {s3_uri}") - + if not parsed.netloc: raise ValueError(f"Missing bucket name in S3 URI: {s3_uri}") - + bucket = parsed.netloc - key = parsed.path.lstrip('/') - + key = parsed.path.lstrip("/") + if not key: raise ValueError(f"Missing key (object path) in S3 URI: {s3_uri}") - + # Check for path traversal in the key part only - if '..' in key: + if ".." in key: raise ValueError(f"S3 key contains path traversal patterns: {key}") - + # Check for double slashes in the key part (after s3://) - if '//' in key: + if "//" in key: raise ValueError(f"S3 key contains double slashes: {key}") - + # Check for backslashes (not valid in URLs) - if '\\' in s3_uri: + if "\\" in s3_uri: raise ValueError(f"S3 URI contains backslashes: {s3_uri}") - + return bucket, key def _resolve_s3_path(self, s3_path: str) -> str: """ Resolve S3 path to local cached file path. 
- + Args: s3_path: S3 URI like "s3://bucket/key" - + Returns: Local file path """ @@ -98,13 +98,13 @@ def _resolve_s3_path(self, s3_path: str) -> str: bucket, key = self._validate_s3_uri(s3_path) except ValueError as e: raise ValueError(f"Invalid S3 URI: {e}") - + # Get data loader for this bucket try: loader = get_data_loader(bucket) except ValueError as e: raise ValueError(f"Invalid bucket name '{bucket}': {e}") - + logger.info(f"Resolving S3 path: {s3_path}") try: return loader.get_cached_path(key) @@ -296,4 +296,4 @@ def __getitem__(self, index: int) -> Request: self._reader.reset() self._reader.skip_n_req(index) req = Request() - return self._reader.read_one_req(req) \ No newline at end of file + return self._reader.read_one_req(req) diff --git a/tests/test_reader.py b/tests/test_reader.py index 3af8bbb..a49466b 100644 --- a/tests/test_reader.py +++ b/tests/test_reader.py @@ -330,7 +330,7 @@ def test_invalid_sampling_ratio(self): finally: os.unlink(temp_file) - + def test_trace_reader_s3(self): """Test trace reader with S3""" URI = "s3://cache-datasets/cache_dataset_oracleGeneral/2007_msr/msr_hm_0.oracleGeneral.zst" From 5c87c4be229ed7f3c2b50253e9a66875b9dca6ab Mon Sep 17 00:00:00 2001 From: haochengxia Date: Tue, 5 Aug 2025 10:32:33 +0000 Subject: [PATCH 4/4] Update examples and remove not used import --- README.md | 11 +++++++---- docs/src/en/getting_started/quickstart.md | 11 +++++++---- libcachesim/_s3_cache.py | 1 - 3 files changed, 14 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index f7a836f..f313bd7 100644 --- a/README.md +++ b/README.md @@ -76,11 +76,14 @@ req_miss_ratio, byte_miss_ratio = cache.process_trace(reader) print(f"Request miss ratio: {req_miss_ratio:.4f}, Byte miss ratio: {byte_miss_ratio:.4f}") # Step 3.1: Further process the first 1000 requests again -req_miss_ratio, byte_miss_ratio = cache.process_trace( - reader, - start_req=0, - max_req=1000 +cache = lcs.S3FIFO( + cache_size=1024 * 1024, + # Cache specific parameters + small_size_ratio=0.2, + ghost_size_ratio=0.8, + move_to_main_threshold=2, ) +req_miss_ratio, byte_miss_ratio = cache.process_trace(reader, start_req=0, max_req=1000) print(f"Request miss ratio: {req_miss_ratio:.4f}, Byte miss ratio: {byte_miss_ratio:.4f}") ``` diff --git a/docs/src/en/getting_started/quickstart.md b/docs/src/en/getting_started/quickstart.md index 0871526..4246046 100644 --- a/docs/src/en/getting_started/quickstart.md +++ b/docs/src/en/getting_started/quickstart.md @@ -78,11 +78,14 @@ With libcachesim installed, you can start cache simulation for some eviction alg print(f"Request miss ratio: {req_miss_ratio:.4f}, Byte miss ratio: {byte_miss_ratio:.4f}") # Step 3.1: Further process the first 1000 requests again - req_miss_ratio, byte_miss_ratio = cache.process_trace( - reader, - start_req=0, - max_req=1000 + cache = lcs.S3FIFO( + cache_size=1024 * 1024, + # Cache specific parameters + small_size_ratio=0.2, + ghost_size_ratio=0.8, + move_to_main_threshold=2, ) + req_miss_ratio, byte_miss_ratio = cache.process_trace(reader, start_req=0, max_req=1000) print(f"Request miss ratio: {req_miss_ratio:.4f}, Byte miss ratio: {byte_miss_ratio:.4f}") ``` diff --git a/libcachesim/_s3_cache.py b/libcachesim/_s3_cache.py index 4295886..36874ff 100644 --- a/libcachesim/_s3_cache.py +++ b/libcachesim/_s3_cache.py @@ -2,7 +2,6 @@ from __future__ import annotations -import hashlib import logging import os import re