From 0db2e1668db4865af452f0e9a7c958fe9936b376 Mon Sep 17 00:00:00 2001 From: haochengxia Date: Tue, 5 Aug 2025 10:12:42 +0000 Subject: [PATCH 1/4] Feat: Make data load internal --- libcachesim/__init__.py | 3 - libcachesim/_s3_cache.py | 347 ++++++++++++++++++++++++++++++++++++ libcachesim/data_loader.py | 118 ------------ libcachesim/trace_reader.py | 81 ++++++++- src/libCacheSim | 2 +- tests/test_analyzer.py | 11 +- tests/test_data_loader.py | 8 - tests/test_reader.py | 12 +- 8 files changed, 439 insertions(+), 143 deletions(-) create mode 100644 libcachesim/_s3_cache.py delete mode 100644 libcachesim/data_loader.py delete mode 100644 tests/test_data_loader.py diff --git a/libcachesim/__init__.py b/libcachesim/__init__.py index c9fc1e7..86baf3a 100644 --- a/libcachesim/__init__.py +++ b/libcachesim/__init__.py @@ -59,7 +59,6 @@ from .trace_analyzer import TraceAnalyzer from .synthetic_reader import SyntheticReader, create_zipf_requests, create_uniform_requests from .util import Util -from .data_loader import DataLoader __all__ = [ # Core classes @@ -118,8 +117,6 @@ "create_uniform_requests", # Utilities "Util", - # Data loader - "DataLoader", # Metadata "__doc__", "__version__", diff --git a/libcachesim/_s3_cache.py b/libcachesim/_s3_cache.py new file mode 100644 index 0000000..5fa60e3 --- /dev/null +++ b/libcachesim/_s3_cache.py @@ -0,0 +1,347 @@ +"""S3 Bucket data loader with local caching (HuggingFace-style).""" + +from __future__ import annotations + +import hashlib +import logging +import os +import re +import shutil +from pathlib import Path +from typing import Optional, Union +from urllib.parse import urlparse + +logger = logging.getLogger(__name__) + + +class _DataLoader: + """Internal S3 data loader with local caching.""" + + DEFAULT_BUCKET = "cache-datasets" + DEFAULT_CACHE_DIR = Path(os.environ.get("LCS_HUB_CACHE", Path.home() / ".cache/libcachesim/hub")) + + # Characters that are problematic on various filesystems + INVALID_CHARS = set('<>:"|?*\x00') + # Reserved names on Windows + RESERVED_NAMES = { + 'CON', 'PRN', 'AUX', 'NUL', + 'COM1', 'COM2', 'COM3', 'COM4', 'COM5', 'COM6', 'COM7', 'COM8', 'COM9', + 'LPT1', 'LPT2', 'LPT3', 'LPT4', 'LPT5', 'LPT6', 'LPT7', 'LPT8', 'LPT9' + } + + def __init__( + self, bucket_name: str = DEFAULT_BUCKET, cache_dir: Optional[Union[str, Path]] = None, use_auth: bool = False + ): + self.bucket_name = self._validate_bucket_name(bucket_name) + self.cache_dir = Path(cache_dir) if cache_dir else self.DEFAULT_CACHE_DIR + self.use_auth = use_auth + self._s3_client = None + self._ensure_cache_dir() + + def _validate_bucket_name(self, bucket_name: str) -> str: + """Validate S3 bucket name according to AWS rules.""" + if not bucket_name: + raise ValueError("Bucket name cannot be empty") + + if len(bucket_name) < 3 or len(bucket_name) > 63: + raise ValueError("Bucket name must be between 3 and 63 characters") + + if not re.match(r'^[a-z0-9.-]+$', bucket_name): + raise ValueError("Bucket name can only contain lowercase letters, numbers, periods, and hyphens") + + if bucket_name.startswith('.') or bucket_name.endswith('.'): + raise ValueError("Bucket name cannot start or end with a period") + + if bucket_name.startswith('-') or bucket_name.endswith('-'): + raise ValueError("Bucket name cannot start or end with a hyphen") + + if '..' 
in bucket_name: + raise ValueError("Bucket name cannot contain consecutive periods") + + return bucket_name + + def _validate_and_sanitize_key(self, key: str) -> str: + """Validate and sanitize S3 key for safe local filesystem usage.""" + if not key: + raise ValueError("S3 key cannot be empty") + + if len(key) > 1024: # S3 limit is 1024 bytes + raise ValueError("S3 key is too long (max 1024 characters)") + + # Check for path traversal attempts + if '..' in key: + raise ValueError("S3 key cannot contain '..' (path traversal not allowed)") + + if key.startswith('/'): + raise ValueError("S3 key cannot start with '/'") + + # Split key into parts and validate each part + parts = key.split('/') + sanitized_parts = [] + + for part in parts: + if not part: # Empty part (double slash) + continue + + # Check for reserved names (case insensitive) + if part.upper() in self.RESERVED_NAMES: + raise ValueError(f"S3 key contains reserved name: {part}") + + # Check for invalid characters + if any(c in self.INVALID_CHARS for c in part): + raise ValueError(f"S3 key contains invalid characters in part: {part}") + + # Check if part is too long for filesystem + if len(part) > 255: # Most filesystems have 255 char limit per component + raise ValueError(f"S3 key component too long: {part}") + + sanitized_parts.append(part) + + if not sanitized_parts: + raise ValueError("S3 key resulted in empty path after sanitization") + + return '/'.join(sanitized_parts) + + def _ensure_cache_dir(self) -> None: + (self.cache_dir / self.bucket_name).mkdir(parents=True, exist_ok=True) + + def _get_available_disk_space(self, path: Path) -> int: + """Get available disk space in bytes.""" + try: + stat = os.statvfs(path) + return stat.f_bavail * stat.f_frsize + except (OSError, AttributeError): + # Fallback for Windows or other systems + try: + import shutil + return shutil.disk_usage(path).free + except Exception: + logger.warning("Could not determine available disk space") + return float('inf') # Assume unlimited space if we can't check + + @property + def s3_client(self): + if self._s3_client is None: + try: + import boto3 + from botocore.config import Config + from botocore import UNSIGNED + + self._s3_client = boto3.client( + "s3", config=None if self.use_auth else Config(signature_version=UNSIGNED) + ) + except ImportError: + raise ImportError("Install boto3: pip install boto3") + return self._s3_client + + def _cache_path(self, key: str) -> Path: + """Create cache path that mirrors S3 structure after validation.""" + sanitized_key = self._validate_and_sanitize_key(key) + cache_path = self.cache_dir / self.bucket_name / sanitized_key + + # Double-check that the resolved path is still within cache directory + try: + cache_path.resolve().relative_to(self.cache_dir.resolve()) + except ValueError: + raise ValueError(f"S3 key resolves outside cache directory: {key}") + + return cache_path + + def _get_object_size(self, key: str) -> int: + """Get the size of an S3 object without downloading it.""" + try: + response = self.s3_client.head_object(Bucket=self.bucket_name, Key=key) + return response['ContentLength'] + except Exception as e: + logger.warning(f"Could not determine object size for s3://{self.bucket_name}/{key}: {e}") + return 0 + + def _download(self, key: str, dest: Path) -> None: + temp = dest.with_suffix(dest.suffix + ".tmp") + temp.parent.mkdir(parents=True, exist_ok=True) + + try: + # Check available disk space before downloading + object_size = self._get_object_size(key) + if object_size > 0: + available_space = 
self._get_available_disk_space(temp.parent) + if object_size > available_space: + raise RuntimeError( + f"Insufficient disk space. Need {object_size / (1024**3):.2f} GB, " + f"but only {available_space / (1024**3):.2f} GB available" + ) + + logger.info(f"Downloading s3://{self.bucket_name}/{key}") + obj = self.s3_client.get_object(Bucket=self.bucket_name, Key=key) + with open(temp, "wb") as f: + f.write(obj["Body"].read()) + shutil.move(str(temp), str(dest)) + logger.info(f"Saved to: {dest}") + except Exception as e: + if temp.exists(): + temp.unlink() + raise RuntimeError(f"Download failed for s3://{self.bucket_name}/{key}: {e}") + + def load(self, key: str, force: bool = False, mode: str = "rb") -> Union[bytes, str]: + path = self._cache_path(key) + if not path.exists() or force: + self._download(key, path) + with open(path, mode) as f: + return f.read() + + def get_cached_path(self, key: str) -> str: + """Get the local cached file path, downloading if necessary.""" + path = self._cache_path(key) + if not path.exists(): + self._download(key, path) + return str(path) + + def is_cached(self, key: str) -> bool: + try: + return self._cache_path(key).exists() + except ValueError: + return False + + def clear_cache(self, key: Optional[str] = None) -> None: + if key: + try: + path = self._cache_path(key) + if path.exists(): + path.unlink() + logger.info(f"Cleared: {path}") + except ValueError as e: + logger.warning(f"Cannot clear cache for invalid key {key}: {e}") + else: + shutil.rmtree(self.cache_dir, ignore_errors=True) + logger.info(f"Cleared entire cache: {self.cache_dir}") + + def list_cached_files(self) -> list[str]: + if not self.cache_dir.exists(): + return [] + return [str(p) for p in self.cache_dir.rglob("*") if p.is_file() and not p.name.endswith(".tmp")] + + def get_cache_size(self) -> int: + return sum(p.stat().st_size for p in self.cache_dir.rglob("*") if p.is_file()) + + def list_s3_objects(self, prefix: str = "", delimiter: str = "/") -> dict: + """ + List S3 objects and pseudo-folders under a prefix. + + Args: + prefix: The S3 prefix to list under (like folder path) + delimiter: Use "/" to simulate folder structure + + Returns: + A dict with two keys: + - "folders": list of sub-prefixes (folders) + - "files": list of object keys (files) + """ + paginator = self.s3_client.get_paginator("list_objects_v2") + result = {"folders": [], "files": []} + + for page in paginator.paginate(Bucket=self.bucket_name, Prefix=prefix, Delimiter=delimiter): + # CommonPrefixes are like subdirectories + result["folders"].extend(cp["Prefix"] for cp in page.get("CommonPrefixes", [])) + result["files"].extend(obj["Key"] for obj in page.get("Contents", [])) + + return result + + +# Global data loader instance +_data_loader = _DataLoader() + + +def set_cache_dir(cache_dir: Union[str, Path]) -> None: + """ + Set the global cache directory for S3 downloads. + + Args: + cache_dir: Path to the cache directory + + Example: + >>> import libcachesim as lcs + >>> lcs.set_cache_dir("/tmp/my_cache") + """ + global _data_loader + _data_loader = _DataLoader(cache_dir=cache_dir) + + +def get_cache_dir() -> Path: + """ + Get the current cache directory. + + Returns: + Path to the current cache directory + + Example: + >>> import libcachesim as lcs + >>> print(lcs.get_cache_dir()) + /home/user/.cache/libcachesim/hub + """ + return _data_loader.cache_dir + + +def clear_cache(s3_path: Optional[str] = None) -> None: + """ + Clear cached files. 
+ + Args: + s3_path: Specific S3 path to clear, or None to clear all cache + + Example: + >>> import libcachesim as lcs + >>> # Clear specific file + >>> lcs.clear_cache("s3://cache-datasets/trace1.lcs.zst") + >>> # Clear all cache + >>> lcs.clear_cache() + """ + if s3_path and s3_path.startswith("s3://"): + parsed = urlparse(s3_path) + bucket = parsed.netloc + key = parsed.path.lstrip('/') + if bucket == _data_loader.bucket_name: + _data_loader.clear_cache(key) + else: + logger.warning(f"Cannot clear cache for different bucket: {bucket}") + else: + _data_loader.clear_cache(s3_path) + + +def get_cache_size() -> int: + """ + Get total size of cached files in bytes. + + Returns: + Total cache size in bytes + + Example: + >>> import libcachesim as lcs + >>> size_mb = lcs.get_cache_size() / (1024**2) + >>> print(f"Cache size: {size_mb:.2f} MB") + """ + return _data_loader.get_cache_size() + + +def list_cached_files() -> list[str]: + """ + List all cached files. + + Returns: + List of cached file paths + + Example: + >>> import libcachesim as lcs + >>> files = lcs.list_cached_files() + >>> for file in files: + ... print(file) + """ + return _data_loader.list_cached_files() + + +def get_data_loader(bucket_name: str = None) -> _DataLoader: + """Get data loader instance for a specific bucket or the global one.""" + global _data_loader + if bucket_name is None or bucket_name == _data_loader.bucket_name: + return _data_loader + else: + return _DataLoader(bucket_name=bucket_name, cache_dir=_data_loader.cache_dir.parent) \ No newline at end of file diff --git a/libcachesim/data_loader.py b/libcachesim/data_loader.py deleted file mode 100644 index f889364..0000000 --- a/libcachesim/data_loader.py +++ /dev/null @@ -1,118 +0,0 @@ -"""S3 Bucket data loader with local caching (HuggingFace-style).""" - -from __future__ import annotations - -import hashlib -import logging -import shutil -from pathlib import Path -from typing import Optional, Union -from urllib.parse import quote - -logger = logging.getLogger(__name__) - - -class DataLoader: - DEFAULT_BUCKET = "cache-datasets" - DEFAULT_CACHE_DIR = Path.home() / ".cache/libcachesim_hub" - - def __init__( - self, bucket_name: str = DEFAULT_BUCKET, cache_dir: Optional[Union[str, Path]] = None, use_auth: bool = False - ): - self.bucket_name = bucket_name - self.cache_dir = Path(cache_dir) if cache_dir else self.DEFAULT_CACHE_DIR - self.use_auth = use_auth - self._s3_client = None - self._ensure_cache_dir() - - def _ensure_cache_dir(self) -> None: - (self.cache_dir / self.bucket_name).mkdir(parents=True, exist_ok=True) - - @property - def s3_client(self): - if self._s3_client is None: - try: - import boto3 - from botocore.config import Config - from botocore import UNSIGNED - - self._s3_client = boto3.client( - "s3", config=None if self.use_auth else Config(signature_version=UNSIGNED) - ) - except ImportError: - raise ImportError("Install boto3: pip install boto3") - return self._s3_client - - def _cache_path(self, key: str) -> Path: - safe_name = hashlib.sha256(key.encode()).hexdigest()[:16] + "_" + quote(key, safe="") - return self.cache_dir / self.bucket_name / safe_name - - def _download(self, key: str, dest: Path) -> None: - temp = dest.with_suffix(dest.suffix + ".tmp") - temp.parent.mkdir(parents=True, exist_ok=True) - - try: - logger.info(f"Downloading s3://{self.bucket_name}/{key}") - obj = self.s3_client.get_object(Bucket=self.bucket_name, Key=key) - with open(temp, "wb") as f: - f.write(obj["Body"].read()) - shutil.move(str(temp), str(dest)) - 
logger.info(f"Saved to: {dest}") - except Exception as e: - if temp.exists(): - temp.unlink() - raise RuntimeError(f"Download failed for s3://{self.bucket_name}/{key}: {e}") - - def load(self, key: str, force: bool = False, mode: str = "rb") -> Union[bytes, str]: - path = self._cache_path(key) - if not path.exists() or force: - self._download(key, path) - with open(path, mode) as f: - return f.read() - - def is_cached(self, key: str) -> bool: - return self._cache_path(key).exists() - - def get_cache_path(self, key: str) -> Path: - return self._cache_path(key).as_posix() - - def clear_cache(self, key: Optional[str] = None) -> None: - if key: - path = self._cache_path(key) - if path.exists(): - path.unlink() - logger.info(f"Cleared: {path}") - else: - shutil.rmtree(self.cache_dir, ignore_errors=True) - logger.info(f"Cleared entire cache: {self.cache_dir}") - - def list_cached_files(self) -> list[str]: - if not self.cache_dir.exists(): - return [] - return [str(p) for p in self.cache_dir.rglob("*") if p.is_file() and not p.name.endswith(".tmp")] - - def get_cache_size(self) -> int: - return sum(p.stat().st_size for p in self.cache_dir.rglob("*") if p.is_file()) - - def list_s3_objects(self, prefix: str = "", delimiter: str = "/") -> dict: - """ - List S3 objects and pseudo-folders under a prefix. - - Args: - prefix: The S3 prefix to list under (like folder path) - delimiter: Use "/" to simulate folder structure - - Returns: - A dict with two keys: - - "folders": list of sub-prefixes (folders) - - "files": list of object keys (files) - """ - paginator = self.s3_client.get_paginator("list_objects_v2") - result = {"folders": [], "files": []} - - for page in paginator.paginate(Bucket=self.bucket_name, Prefix=prefix, Delimiter=delimiter): - # CommonPrefixes are like subdirectories - result["folders"].extend(cp["Prefix"] for cp in page.get("CommonPrefixes", [])) - result["files"].extend(obj["Key"] for obj in page.get("Contents", [])) - - return result diff --git a/libcachesim/trace_reader.py b/libcachesim/trace_reader.py index d282a68..2f66456 100644 --- a/libcachesim/trace_reader.py +++ b/libcachesim/trace_reader.py @@ -1,12 +1,15 @@ -"""Wrapper of Reader""" +"""Wrapper of Reader with S3 support.""" import logging from typing import overload, Union, Optional from collections.abc import Iterator +from urllib.parse import urlparse from .protocols import ReaderProtocol - from .libcachesim_python import TraceType, SamplerType, Request, ReaderInitParam, Reader, Sampler, ReadDirection +from ._s3_cache import get_data_loader + +logger = logging.getLogger(__name__) class TraceReader(ReaderProtocol): @@ -28,6 +31,10 @@ def __init__( self._reader = trace return + # Handle S3 URIs + if isinstance(trace, str) and trace.startswith("s3://"): + trace = self._resolve_s3_path(trace) + if reader_init_params is None: reader_init_params = ReaderInitParam() @@ -36,6 +43,74 @@ def __init__( self._reader = Reader(trace, trace_type, reader_init_params) + def _validate_s3_uri(self, s3_uri: str) -> tuple[str, str]: + """ + Validate and parse S3 URI. + + Args: + s3_uri: S3 URI like "s3://bucket/key" + + Returns: + Tuple of (bucket, key) + + Raises: + ValueError: If URI is invalid + """ + parsed = urlparse(s3_uri) + + if parsed.scheme != "s3": + raise ValueError(f"Invalid S3 URI scheme. 
Expected 's3', got '{parsed.scheme}': {s3_uri}") + + if not parsed.netloc: + raise ValueError(f"Missing bucket name in S3 URI: {s3_uri}") + + bucket = parsed.netloc + key = parsed.path.lstrip('/') + + if not key: + raise ValueError(f"Missing key (object path) in S3 URI: {s3_uri}") + + # Check for path traversal in the key part only + if '..' in key: + raise ValueError(f"S3 key contains path traversal patterns: {key}") + + # Check for double slashes in the key part (after s3://) + if '//' in key: + raise ValueError(f"S3 key contains double slashes: {key}") + + # Check for backslashes (not valid in URLs) + if '\\' in s3_uri: + raise ValueError(f"S3 URI contains backslashes: {s3_uri}") + + return bucket, key + + def _resolve_s3_path(self, s3_path: str) -> str: + """ + Resolve S3 path to local cached file path. + + Args: + s3_path: S3 URI like "s3://bucket/key" + + Returns: + Local file path + """ + try: + bucket, key = self._validate_s3_uri(s3_path) + except ValueError as e: + raise ValueError(f"Invalid S3 URI: {e}") + + # Get data loader for this bucket + try: + loader = get_data_loader(bucket) + except ValueError as e: + raise ValueError(f"Invalid bucket name '{bucket}': {e}") + + logger.info(f"Resolving S3 path: {s3_path}") + try: + return loader.get_cached_path(key) + except ValueError as e: + raise ValueError(f"Invalid S3 key '{key}': {e}") + @property def n_read_req(self) -> int: return self._reader.n_read_req @@ -221,4 +296,4 @@ def __getitem__(self, index: int) -> Request: self._reader.reset() self._reader.skip_n_req(index) req = Request() - return self._reader.read_one_req(req) + return self._reader.read_one_req(req) \ No newline at end of file diff --git a/src/libCacheSim b/src/libCacheSim index 4a2627d..e8a7194 160000 --- a/src/libCacheSim +++ b/src/libCacheSim @@ -1 +1 @@ -Subproject commit 4a2627d4221cd49c0182cc6aa1ddd82f3006cf83 +Subproject commit e8a7194d861857d4c4481b53cf24a1bfd33df2ad diff --git a/tests/test_analyzer.py b/tests/test_analyzer.py index 8712a8c..6f72fcf 100644 --- a/tests/test_analyzer.py +++ b/tests/test_analyzer.py @@ -1,4 +1,4 @@ -from libcachesim import TraceAnalyzer, TraceReader, DataLoader, AnalysisOption, AnalysisParam +from libcachesim import TraceAnalyzer, TraceReader, AnalysisOption, AnalysisParam import os import pytest @@ -9,13 +9,8 @@ def test_analyzer_common(): """ # Add debugging and error handling - loader = DataLoader() - loader.load("cache_dataset_oracleGeneral/2020_tencentBlock/1K/tencentBlock_1621.oracleGeneral.zst") - file_path = loader.get_cache_path( - "cache_dataset_oracleGeneral/2020_tencentBlock/1K/tencentBlock_1621.oracleGeneral.zst" - ) - - reader = TraceReader(file_path) + URI = "s3://cache-datasets/cache_dataset_oracleGeneral/2007_msr/msr_hm_0.oracleGeneral.zst" + reader = TraceReader(trace=URI) # For this specific small dataset (only 4 objects), configure analysis options more conservatively # to avoid bounds issues with the analysis modules diff --git a/tests/test_data_loader.py b/tests/test_data_loader.py deleted file mode 100644 index 5aba6f5..0000000 --- a/tests/test_data_loader.py +++ /dev/null @@ -1,8 +0,0 @@ -from libcachesim import DataLoader - - -def test_data_loader_common(): - loader = DataLoader() - loader.load("cache_dataset_oracleGeneral/2007_msr/msr_hm_0.oracleGeneral.zst") - path = loader.get_cache_path("cache_dataset_oracleGeneral/2007_msr/msr_hm_0.oracleGeneral.zst") - filles = loader.list_s3_objects("cache_dataset_oracleGeneral/2007_msr/") diff --git a/tests/test_reader.py b/tests/test_reader.py index 
688217a..3af8bbb 100644 --- a/tests/test_reader.py +++ b/tests/test_reader.py @@ -7,8 +7,8 @@ import pytest import tempfile import os -from libcachesim import TraceReader, SyntheticReader, DataLoader -from libcachesim.libcachesim_python import TraceType, SamplerType, Request, ReqOp, ReaderInitParam, Sampler +from libcachesim import TraceReader, SyntheticReader +from libcachesim.libcachesim_python import TraceType, SamplerType, Request, ReaderInitParam, Sampler class TestSyntheticReader: @@ -330,6 +330,14 @@ def test_invalid_sampling_ratio(self): finally: os.unlink(temp_file) + + def test_trace_reader_s3(self): + """Test trace reader with S3""" + URI = "s3://cache-datasets/cache_dataset_oracleGeneral/2007_msr/msr_hm_0.oracleGeneral.zst" + reader = TraceReader(trace=URI) + for req in reader: + assert req.valid == True + break class TestReaderCompatibility: From ef10981f90e7c9c54a015ed69ede9ffcb2a70e71 Mon Sep 17 00:00:00 2001 From: haochengxia Date: Tue, 5 Aug 2025 10:23:19 +0000 Subject: [PATCH 2/4] Update docs for data loading --- README.md | 33 +++++++++--------- docs/src/en/getting_started/quickstart.md | 33 +++++++++--------- examples/basic_usage.py | 41 +++++++++++++---------- 3 files changed, 57 insertions(+), 50 deletions(-) diff --git a/README.md b/README.md index 9996c72..f7a836f 100644 --- a/README.md +++ b/README.md @@ -54,33 +54,34 @@ With libcachesim installed, you can start cache simulation for some eviction alg ```python import libcachesim as lcs -# Step 1: Get one trace from S3 bucket -URI = "cache_dataset_oracleGeneral/2007_msr/msr_hm_0.oracleGeneral.zst" -dl = lcs.DataLoader() -dl.load(URI) - -# Step 2: Open trace and process efficiently +# Step 1: Open a trace hosted on S3 (find more via https://github.com/cacheMon/cache_dataset) +URI = "s3://cache-datasets/cache_dataset_oracleGeneral/2007_msr/msr_hm_0.oracleGeneral.zst" reader = lcs.TraceReader( - trace = dl.get_cache_path(URI), + trace = URI, trace_type = lcs.TraceType.ORACLE_GENERAL_TRACE, reader_init_params = lcs.ReaderInitParam(ignore_obj_size=False) ) -# Step 3: Initialize cache -cache = lcs.S3FIFO(cache_size=1024*1024) +# Step 2: Initialize cache +cache = lcs.S3FIFO( + cache_size=1024*1024, + # Cache specific parameters + small_size_ratio=0.2, + ghost_size_ratio=0.8, + move_to_main_threshold=2, +) -# Step 4: Process entire trace efficiently (C++ backend) -obj_miss_ratio, byte_miss_ratio = cache.process_trace(reader) -print(f"Object miss ratio: {obj_miss_ratio:.4f}, Byte miss ratio: {byte_miss_ratio:.4f}") +# Step 3: Process entire trace efficiently (C++ backend) +req_miss_ratio, byte_miss_ratio = cache.process_trace(reader) +print(f"Request miss ratio: {req_miss_ratio:.4f}, Byte miss ratio: {byte_miss_ratio:.4f}") -# Step 4.1: Process with limited number of requests -cache = lcs.S3FIFO(cache_size=1024*1024) -obj_miss_ratio, byte_miss_ratio = cache.process_trace( +# Step 3.1: Further process the first 1000 requests again +req_miss_ratio, byte_miss_ratio = cache.process_trace( reader, start_req=0, max_req=1000 ) -print(f"Object miss ratio: {obj_miss_ratio:.4f}, Byte miss ratio: {byte_miss_ratio:.4f}") +print(f"Request miss ratio: {req_miss_ratio:.4f}, Byte miss ratio: {byte_miss_ratio:.4f}") ``` ## Plugin System diff --git a/docs/src/en/getting_started/quickstart.md b/docs/src/en/getting_started/quickstart.md index 70bfe22..0871526 100644 --- a/docs/src/en/getting_started/quickstart.md +++ b/docs/src/en/getting_started/quickstart.md @@ -56,33 +56,34 @@ With libcachesim installed, you can start cache simulation for 
some eviction alg ```python import libcachesim as lcs - # Step 1: Get one trace from S3 bucket - URI = "cache_dataset_oracleGeneral/2007_msr/msr_hm_0.oracleGeneral.zst" - dl = lcs.DataLoader() - dl.load(URI) - - # Step 2: Open trace and process efficiently + # Step 1: Open a trace hosted on S3 (find more via https://github.com/cacheMon/cache_dataset) + URI = "s3://cache-datasets/cache_dataset_oracleGeneral/2007_msr/msr_hm_0.oracleGeneral.zst" reader = lcs.TraceReader( - trace = dl.get_cache_path(URI), + trace = URI, trace_type = lcs.TraceType.ORACLE_GENERAL_TRACE, reader_init_params = lcs.ReaderInitParam(ignore_obj_size=False) ) - # Step 3: Initialize cache - cache = lcs.S3FIFO(cache_size=1024*1024) + # Step 2: Initialize cache + cache = lcs.S3FIFO( + cache_size=1024*1024, + # Cache specific parameters + small_size_ratio=0.2, + ghost_size_ratio=0.8, + move_to_main_threshold=2, + ) - # Step 4: Process entire trace efficiently (C++ backend) - obj_miss_ratio, byte_miss_ratio = cache.process_trace(reader) - print(f"Object miss ratio: {obj_miss_ratio:.4f}, Byte miss ratio: {byte_miss_ratio:.4f}") + # Step 3: Process entire trace efficiently (C++ backend) + req_miss_ratio, byte_miss_ratio = cache.process_trace(reader) + print(f"Request miss ratio: {req_miss_ratio:.4f}, Byte miss ratio: {byte_miss_ratio:.4f}") - # Step 4.1: Process with limited number of requests - cache = lcs.S3FIFO(cache_size=1024*1024) - obj_miss_ratio, byte_miss_ratio = cache.process_trace( + # Step 3.1: Further process the first 1000 requests again + req_miss_ratio, byte_miss_ratio = cache.process_trace( reader, start_req=0, max_req=1000 ) - print(f"Object miss ratio: {obj_miss_ratio:.4f}, Byte miss ratio: {byte_miss_ratio:.4f}") + print(f"Request miss ratio: {req_miss_ratio:.4f}, Byte miss ratio: {byte_miss_ratio:.4f}") ``` The above example demonstrates the basic workflow of using `libcachesim` for cache simulation: diff --git a/examples/basic_usage.py b/examples/basic_usage.py index 2a4bd60..16a1b49 100644 --- a/examples/basic_usage.py +++ b/examples/basic_usage.py @@ -1,25 +1,30 @@ import libcachesim as lcs -# Step 1: Get one trace from S3 bucket -URI = "cache_dataset_oracleGeneral/2007_msr/msr_hm_0.oracleGeneral.zst" -dl = lcs.DataLoader() -dl.load(URI) - -# Step 2: Open trace and process efficiently +# Step 1: Open a trace hosted on S3 (find more via https://github.com/cacheMon/cache_dataset) +URI = "s3://cache-datasets/cache_dataset_oracleGeneral/2007_msr/msr_hm_0.oracleGeneral.zst" reader = lcs.TraceReader( - trace=dl.get_cache_path(URI), - trace_type=lcs.TraceType.ORACLE_GENERAL_TRACE, - reader_init_params=lcs.ReaderInitParam(ignore_obj_size=False), + trace = URI, + trace_type = lcs.TraceType.ORACLE_GENERAL_TRACE, + reader_init_params = lcs.ReaderInitParam(ignore_obj_size=False) ) -# Step 3: Initialize cache -cache = lcs.S3FIFO(cache_size=1024 * 1024) +# Step 2: Initialize cache +cache = lcs.S3FIFO( + cache_size=1024*1024, + # Cache specific parameters + small_size_ratio=0.2, + ghost_size_ratio=0.8, + move_to_main_threshold=2, +) -# Step 4: Process entire trace efficiently (C++ backend) -obj_miss_ratio, byte_miss_ratio = cache.process_trace(reader) -print(f"Object miss ratio: {obj_miss_ratio:.4f}, Byte miss ratio: {byte_miss_ratio:.4f}") +# Step 3: Process entire trace efficiently (C++ backend) +req_miss_ratio, byte_miss_ratio = cache.process_trace(reader) +print(f"Request miss ratio: {req_miss_ratio:.4f}, Byte miss ratio: {byte_miss_ratio:.4f}") -# Step 4.1: Process with limited number of requests -cache = 
lcs.S3FIFO(cache_size=1024 * 1024) -obj_miss_ratio, byte_miss_ratio = cache.process_trace(reader, start_req=0, max_req=1000) -print(f"Object miss ratio: {obj_miss_ratio:.4f}, Byte miss ratio: {byte_miss_ratio:.4f}") +# Step 3.1: Further process the first 1000 requests again +req_miss_ratio, byte_miss_ratio = cache.process_trace( + reader, + start_req=0, + max_req=1000 +) +print(f"Request miss ratio: {req_miss_ratio:.4f}, Byte miss ratio: {byte_miss_ratio:.4f}") \ No newline at end of file From b1870103800215357f465576b245b9960db22f6f Mon Sep 17 00:00:00 2001 From: haochengxia Date: Tue, 5 Aug 2025 10:23:55 +0000 Subject: [PATCH 3/4] Ruff format --- examples/basic_usage.py | 16 ++---- libcachesim/_s3_cache.py | 110 +++++++++++++++++++++--------------- libcachesim/trace_reader.py | 40 ++++++------- tests/test_reader.py | 2 +- 4 files changed, 92 insertions(+), 76 deletions(-) diff --git a/examples/basic_usage.py b/examples/basic_usage.py index 16a1b49..d1bf0b1 100644 --- a/examples/basic_usage.py +++ b/examples/basic_usage.py @@ -3,14 +3,14 @@ # Step 1: Open a trace hosted on S3 (find more via https://github.com/cacheMon/cache_dataset) URI = "s3://cache-datasets/cache_dataset_oracleGeneral/2007_msr/msr_hm_0.oracleGeneral.zst" reader = lcs.TraceReader( - trace = URI, - trace_type = lcs.TraceType.ORACLE_GENERAL_TRACE, - reader_init_params = lcs.ReaderInitParam(ignore_obj_size=False) + trace=URI, + trace_type=lcs.TraceType.ORACLE_GENERAL_TRACE, + reader_init_params=lcs.ReaderInitParam(ignore_obj_size=False), ) # Step 2: Initialize cache cache = lcs.S3FIFO( - cache_size=1024*1024, + cache_size=1024 * 1024, # Cache specific parameters small_size_ratio=0.2, ghost_size_ratio=0.8, @@ -22,9 +22,5 @@ print(f"Request miss ratio: {req_miss_ratio:.4f}, Byte miss ratio: {byte_miss_ratio:.4f}") # Step 3.1: Further process the first 1000 requests again -req_miss_ratio, byte_miss_ratio = cache.process_trace( - reader, - start_req=0, - max_req=1000 -) -print(f"Request miss ratio: {req_miss_ratio:.4f}, Byte miss ratio: {byte_miss_ratio:.4f}") \ No newline at end of file +req_miss_ratio, byte_miss_ratio = cache.process_trace(reader, start_req=0, max_req=1000) +print(f"Request miss ratio: {req_miss_ratio:.4f}, Byte miss ratio: {byte_miss_ratio:.4f}") diff --git a/libcachesim/_s3_cache.py b/libcachesim/_s3_cache.py index 5fa60e3..4295886 100644 --- a/libcachesim/_s3_cache.py +++ b/libcachesim/_s3_cache.py @@ -16,7 +16,7 @@ class _DataLoader: """Internal S3 data loader with local caching.""" - + DEFAULT_BUCKET = "cache-datasets" DEFAULT_CACHE_DIR = Path(os.environ.get("LCS_HUB_CACHE", Path.home() / ".cache/libcachesim/hub")) @@ -24,9 +24,28 @@ class _DataLoader: INVALID_CHARS = set('<>:"|?*\x00') # Reserved names on Windows RESERVED_NAMES = { - 'CON', 'PRN', 'AUX', 'NUL', - 'COM1', 'COM2', 'COM3', 'COM4', 'COM5', 'COM6', 'COM7', 'COM8', 'COM9', - 'LPT1', 'LPT2', 'LPT3', 'LPT4', 'LPT5', 'LPT6', 'LPT7', 'LPT8', 'LPT9' + "CON", + "PRN", + "AUX", + "NUL", + "COM1", + "COM2", + "COM3", + "COM4", + "COM5", + "COM6", + "COM7", + "COM8", + "COM9", + "LPT1", + "LPT2", + "LPT3", + "LPT4", + "LPT5", + "LPT6", + "LPT7", + "LPT8", + "LPT9", } def __init__( @@ -42,65 +61,65 @@ def _validate_bucket_name(self, bucket_name: str) -> str: """Validate S3 bucket name according to AWS rules.""" if not bucket_name: raise ValueError("Bucket name cannot be empty") - + if len(bucket_name) < 3 or len(bucket_name) > 63: raise ValueError("Bucket name must be between 3 and 63 characters") - - if not re.match(r'^[a-z0-9.-]+$', bucket_name): + + 
if not re.match(r"^[a-z0-9.-]+$", bucket_name): raise ValueError("Bucket name can only contain lowercase letters, numbers, periods, and hyphens") - - if bucket_name.startswith('.') or bucket_name.endswith('.'): + + if bucket_name.startswith(".") or bucket_name.endswith("."): raise ValueError("Bucket name cannot start or end with a period") - - if bucket_name.startswith('-') or bucket_name.endswith('-'): + + if bucket_name.startswith("-") or bucket_name.endswith("-"): raise ValueError("Bucket name cannot start or end with a hyphen") - - if '..' in bucket_name: + + if ".." in bucket_name: raise ValueError("Bucket name cannot contain consecutive periods") - + return bucket_name def _validate_and_sanitize_key(self, key: str) -> str: """Validate and sanitize S3 key for safe local filesystem usage.""" if not key: raise ValueError("S3 key cannot be empty") - + if len(key) > 1024: # S3 limit is 1024 bytes raise ValueError("S3 key is too long (max 1024 characters)") - + # Check for path traversal attempts - if '..' in key: + if ".." in key: raise ValueError("S3 key cannot contain '..' (path traversal not allowed)") - - if key.startswith('/'): + + if key.startswith("/"): raise ValueError("S3 key cannot start with '/'") - + # Split key into parts and validate each part - parts = key.split('/') + parts = key.split("/") sanitized_parts = [] - + for part in parts: if not part: # Empty part (double slash) continue - + # Check for reserved names (case insensitive) if part.upper() in self.RESERVED_NAMES: raise ValueError(f"S3 key contains reserved name: {part}") - + # Check for invalid characters if any(c in self.INVALID_CHARS for c in part): raise ValueError(f"S3 key contains invalid characters in part: {part}") - + # Check if part is too long for filesystem if len(part) > 255: # Most filesystems have 255 char limit per component raise ValueError(f"S3 key component too long: {part}") - + sanitized_parts.append(part) - + if not sanitized_parts: raise ValueError("S3 key resulted in empty path after sanitization") - - return '/'.join(sanitized_parts) + + return "/".join(sanitized_parts) def _ensure_cache_dir(self) -> None: (self.cache_dir / self.bucket_name).mkdir(parents=True, exist_ok=True) @@ -114,10 +133,11 @@ def _get_available_disk_space(self, path: Path) -> int: # Fallback for Windows or other systems try: import shutil + return shutil.disk_usage(path).free except Exception: logger.warning("Could not determine available disk space") - return float('inf') # Assume unlimited space if we can't check + return float("inf") # Assume unlimited space if we can't check @property def s3_client(self): @@ -138,20 +158,20 @@ def _cache_path(self, key: str) -> Path: """Create cache path that mirrors S3 structure after validation.""" sanitized_key = self._validate_and_sanitize_key(key) cache_path = self.cache_dir / self.bucket_name / sanitized_key - + # Double-check that the resolved path is still within cache directory try: cache_path.resolve().relative_to(self.cache_dir.resolve()) except ValueError: raise ValueError(f"S3 key resolves outside cache directory: {key}") - + return cache_path def _get_object_size(self, key: str) -> int: """Get the size of an S3 object without downloading it.""" try: response = self.s3_client.head_object(Bucket=self.bucket_name, Key=key) - return response['ContentLength'] + return response["ContentLength"] except Exception as e: logger.warning(f"Could not determine object size for s3://{self.bucket_name}/{key}: {e}") return 0 @@ -254,10 +274,10 @@ def list_s3_objects(self, prefix: str = 
"", delimiter: str = "/") -> dict: def set_cache_dir(cache_dir: Union[str, Path]) -> None: """ Set the global cache directory for S3 downloads. - + Args: cache_dir: Path to the cache directory - + Example: >>> import libcachesim as lcs >>> lcs.set_cache_dir("/tmp/my_cache") @@ -269,10 +289,10 @@ def set_cache_dir(cache_dir: Union[str, Path]) -> None: def get_cache_dir() -> Path: """ Get the current cache directory. - + Returns: Path to the current cache directory - + Example: >>> import libcachesim as lcs >>> print(lcs.get_cache_dir()) @@ -284,10 +304,10 @@ def get_cache_dir() -> Path: def clear_cache(s3_path: Optional[str] = None) -> None: """ Clear cached files. - + Args: s3_path: Specific S3 path to clear, or None to clear all cache - + Example: >>> import libcachesim as lcs >>> # Clear specific file @@ -298,7 +318,7 @@ def clear_cache(s3_path: Optional[str] = None) -> None: if s3_path and s3_path.startswith("s3://"): parsed = urlparse(s3_path) bucket = parsed.netloc - key = parsed.path.lstrip('/') + key = parsed.path.lstrip("/") if bucket == _data_loader.bucket_name: _data_loader.clear_cache(key) else: @@ -310,10 +330,10 @@ def clear_cache(s3_path: Optional[str] = None) -> None: def get_cache_size() -> int: """ Get total size of cached files in bytes. - + Returns: Total cache size in bytes - + Example: >>> import libcachesim as lcs >>> size_mb = lcs.get_cache_size() / (1024**2) @@ -325,10 +345,10 @@ def get_cache_size() -> int: def list_cached_files() -> list[str]: """ List all cached files. - + Returns: List of cached file paths - + Example: >>> import libcachesim as lcs >>> files = lcs.list_cached_files() @@ -344,4 +364,4 @@ def get_data_loader(bucket_name: str = None) -> _DataLoader: if bucket_name is None or bucket_name == _data_loader.bucket_name: return _data_loader else: - return _DataLoader(bucket_name=bucket_name, cache_dir=_data_loader.cache_dir.parent) \ No newline at end of file + return _DataLoader(bucket_name=bucket_name, cache_dir=_data_loader.cache_dir.parent) diff --git a/libcachesim/trace_reader.py b/libcachesim/trace_reader.py index 2f66456..e593dbb 100644 --- a/libcachesim/trace_reader.py +++ b/libcachesim/trace_reader.py @@ -46,51 +46,51 @@ def __init__( def _validate_s3_uri(self, s3_uri: str) -> tuple[str, str]: """ Validate and parse S3 URI. - + Args: s3_uri: S3 URI like "s3://bucket/key" - + Returns: Tuple of (bucket, key) - + Raises: ValueError: If URI is invalid """ parsed = urlparse(s3_uri) - + if parsed.scheme != "s3": raise ValueError(f"Invalid S3 URI scheme. Expected 's3', got '{parsed.scheme}': {s3_uri}") - + if not parsed.netloc: raise ValueError(f"Missing bucket name in S3 URI: {s3_uri}") - + bucket = parsed.netloc - key = parsed.path.lstrip('/') - + key = parsed.path.lstrip("/") + if not key: raise ValueError(f"Missing key (object path) in S3 URI: {s3_uri}") - + # Check for path traversal in the key part only - if '..' in key: + if ".." in key: raise ValueError(f"S3 key contains path traversal patterns: {key}") - + # Check for double slashes in the key part (after s3://) - if '//' in key: + if "//" in key: raise ValueError(f"S3 key contains double slashes: {key}") - + # Check for backslashes (not valid in URLs) - if '\\' in s3_uri: + if "\\" in s3_uri: raise ValueError(f"S3 URI contains backslashes: {s3_uri}") - + return bucket, key def _resolve_s3_path(self, s3_path: str) -> str: """ Resolve S3 path to local cached file path. 
- + Args: s3_path: S3 URI like "s3://bucket/key" - + Returns: Local file path """ @@ -98,13 +98,13 @@ def _resolve_s3_path(self, s3_path: str) -> str: bucket, key = self._validate_s3_uri(s3_path) except ValueError as e: raise ValueError(f"Invalid S3 URI: {e}") - + # Get data loader for this bucket try: loader = get_data_loader(bucket) except ValueError as e: raise ValueError(f"Invalid bucket name '{bucket}': {e}") - + logger.info(f"Resolving S3 path: {s3_path}") try: return loader.get_cached_path(key) @@ -296,4 +296,4 @@ def __getitem__(self, index: int) -> Request: self._reader.reset() self._reader.skip_n_req(index) req = Request() - return self._reader.read_one_req(req) \ No newline at end of file + return self._reader.read_one_req(req) diff --git a/tests/test_reader.py b/tests/test_reader.py index 3af8bbb..a49466b 100644 --- a/tests/test_reader.py +++ b/tests/test_reader.py @@ -330,7 +330,7 @@ def test_invalid_sampling_ratio(self): finally: os.unlink(temp_file) - + def test_trace_reader_s3(self): """Test trace reader with S3""" URI = "s3://cache-datasets/cache_dataset_oracleGeneral/2007_msr/msr_hm_0.oracleGeneral.zst" From 5c87c4be229ed7f3c2b50253e9a66875b9dca6ab Mon Sep 17 00:00:00 2001 From: haochengxia Date: Tue, 5 Aug 2025 10:32:33 +0000 Subject: [PATCH 4/4] Update examples and remove not used import --- README.md | 11 +++++++---- docs/src/en/getting_started/quickstart.md | 11 +++++++---- libcachesim/_s3_cache.py | 1 - 3 files changed, 14 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index f7a836f..f313bd7 100644 --- a/README.md +++ b/README.md @@ -76,11 +76,14 @@ req_miss_ratio, byte_miss_ratio = cache.process_trace(reader) print(f"Request miss ratio: {req_miss_ratio:.4f}, Byte miss ratio: {byte_miss_ratio:.4f}") # Step 3.1: Further process the first 1000 requests again -req_miss_ratio, byte_miss_ratio = cache.process_trace( - reader, - start_req=0, - max_req=1000 +cache = lcs.S3FIFO( + cache_size=1024 * 1024, + # Cache specific parameters + small_size_ratio=0.2, + ghost_size_ratio=0.8, + move_to_main_threshold=2, ) +req_miss_ratio, byte_miss_ratio = cache.process_trace(reader, start_req=0, max_req=1000) print(f"Request miss ratio: {req_miss_ratio:.4f}, Byte miss ratio: {byte_miss_ratio:.4f}") ``` diff --git a/docs/src/en/getting_started/quickstart.md b/docs/src/en/getting_started/quickstart.md index 0871526..4246046 100644 --- a/docs/src/en/getting_started/quickstart.md +++ b/docs/src/en/getting_started/quickstart.md @@ -78,11 +78,14 @@ With libcachesim installed, you can start cache simulation for some eviction alg print(f"Request miss ratio: {req_miss_ratio:.4f}, Byte miss ratio: {byte_miss_ratio:.4f}") # Step 3.1: Further process the first 1000 requests again - req_miss_ratio, byte_miss_ratio = cache.process_trace( - reader, - start_req=0, - max_req=1000 + cache = lcs.S3FIFO( + cache_size=1024 * 1024, + # Cache specific parameters + small_size_ratio=0.2, + ghost_size_ratio=0.8, + move_to_main_threshold=2, ) + req_miss_ratio, byte_miss_ratio = cache.process_trace(reader, start_req=0, max_req=1000) print(f"Request miss ratio: {req_miss_ratio:.4f}, Byte miss ratio: {byte_miss_ratio:.4f}") ``` diff --git a/libcachesim/_s3_cache.py b/libcachesim/_s3_cache.py index 4295886..36874ff 100644 --- a/libcachesim/_s3_cache.py +++ b/libcachesim/_s3_cache.py @@ -2,7 +2,6 @@ from __future__ import annotations -import hashlib import logging import os import re