
Commit 9a9f9c4

💥 Rely on global CACHE env var instead of --use-cache
1 parent d6db0f9 commit 9a9f9c4

15 files changed (+40 / -78 lines)


docs/cache.md

Lines changed: 5 additions & 0 deletions

````diff
@@ -2,6 +2,8 @@ For incremental processing of tasks, `leakrfc` uses a global cache to track task
 
 `leakrfc` is using [anystore](https://docs.investigraph.dev/lib/anystore/cache/) for the cache implementation, so any supported backend is possible. Recommended backends are redis or sql, but a distributed cloud-backend (such as a shared s3 bucket) can make sense, too.
 
+As long as caching is enabled (globally via `CACHE=1`, the default), all operations will look in the global cache if a task has already been processed. When disabling cache (`CACHE=0`) for a run, the cache is not respected but still populated for next runs.
+
 Per default, an in-memory cache is used, which doesn't persist.
 
 ## Configure
@@ -14,4 +16,7 @@ LEAKRFC_CACHE__URI=redis://localhost
 # additional config
 LEAKRFC_CACHE__DEFAULT_TTL=3600 # seconds
 LEAKRFC_CACHE__BACKEND_CONFIG__REDIS_PREFIX=my-prefix
+
+# disable cache
+CACHE=0
 ```
````
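
The added paragraph describes an asymmetry worth noting: `CACHE=0` disables cache *lookups* for a run, but results are still written to the cache for later runs. A minimal Python sketch of that behaviour, using a plain dict and a hypothetical `run_task()` instead of leakrfc's real anystore-backed cache:

```python
import os

# Hypothetical stand-in for the anystore-backed global cache described above.
cache: dict[str, bool] = {}


def cache_enabled() -> bool:
    # CACHE=1 is the default; CACHE=0 disables cache *lookups* for this run.
    return os.environ.get("CACHE", "1") != "0"


def run_task(key: str) -> None:
    if cache_enabled() and key in cache:
        print(f"skip {key} (already processed)")
        return
    print(f"process {key}")
    # The cache is still populated even when CACHE=0, so the next run with
    # caching enabled can skip this task.
    cache[key] = True


for key in ("file-a", "file-a"):
    run_task(key)
```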

docs/sync/aleph.md

Lines changed: 1 addition & 1 deletion

````diff
@@ -2,7 +2,7 @@ Sync a leakrfc dataset into an [Aleph](https://docs.aleph.occrp.org/) instance.
 
 Collections will be created if they don't exist and their metadata will be updated (this can be disabled via `--no-metadata`). The Aleph collections _foreign id_ can be set via `--foreign-id` and defaults to the leakrfc dataset name.
 
-As long as using `--use-cache` (default) only new documents are synced. The cache handles multiple Aleph instances and keeps track of the individual status for each of them.
+As long as using the global cache (environment `CACHE=1`, default) only new documents are synced. The cache handles multiple Aleph instances and keeps track of the individual status for each of them.
 
 Aleph api configuration can as well set via command line:
 
````
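
The note that the cache "handles multiple Aleph instances" implies the cache key includes the target host, so each instance keeps its own processed-state. The exact key layout is not shown in this commit; the sketch below is a hypothetical scheme, modelled loosely on `make_cache_key()` from `leakrfc/worker.py` further down:

```python
def aleph_sync_cache_key(prefix: str, dataset: str, host: str, key: str) -> str:
    """Hypothetical per-instance cache key: same dataset and document but a
    different Aleph host yields a different key, so a second instance starts
    with an empty processed-state."""
    return f"{prefix}/{dataset}/sync_aleph/{host}/{key}"


print(aleph_sync_cache_key("leakrfc", "my_dataset", "aleph.example.org", "report.pdf"))
# -> leakrfc/my_dataset/sync_aleph/aleph.example.org/report.pdf
```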

docs/sync/memorious.md

Lines changed: 2 additions & 3 deletions

````diff
@@ -1,6 +1,6 @@
 Import [memorious](https://github.com/alephdata/memorious) crawler results into a `leakrfc` dataset.
 
-As long as using `--use-cache` (default) only new documents are synced.
+As long as using the global cache (environment `CACHE=1`, default) only new documents are synced.
 
 ```bash
 leakrfc -d my_dataset memorious sync -i /memorious/data/store/my_dataset
@@ -16,7 +16,7 @@ leakrfc -d my_dataset memorious sync -i /memorious/data/store/my_dataset --name-
 leakrfc -d my_dataset memorious sync -i /memorious/data/store/my_dataset --strip-prefix "assets/docs"
 ```
 
-Or use a template that will replace values from the original memorious "*.json" file for the source file. Given a json file stored by memorious like this:
+Or use a template that will replace values from the original memorious "\*.json" file for the source file. Given a json file stored by memorious like this:
 
 ```json
 {
@@ -49,7 +49,6 @@ To import this file as "2022/05/Berlin/Beratungsvorgang/19-11840.pdf":
 leakrfc -d my_dataset memorious sync -i /memorious/data/store/my_dataset --key-template "{{ date[:4] }}/{{ date[5:7] }}/{{ state }}/{{ category }}/{{ reference.replace('/','-') }}.{{ url.split('.')[-1] }}"
 ```
 
-
 ## Reference
 
 ::: leakrfc.sync.memorious
````
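
The `--key-template` example above evaluates Jinja-style expressions against the fields of the memorious `*.json` record. This commit does not show leakrfc's own template machinery (the CLI wires it through `get_file_name_templ_func`), but a stand-alone rendering with `jinja2` and an invented record illustrates what the template produces:

```python
from jinja2 import Template

# Illustrative memorious record; field names mirror the --key-template example
# above, the concrete values are made up to match the target path shown there.
record = {
    "date": "2022-05-17",
    "state": "Berlin",
    "category": "Beratungsvorgang",
    "reference": "19/11840",
    "url": "https://example.org/docs/19-11840.pdf",
}

template = Template(
    "{{ date[:4] }}/{{ date[5:7] }}/{{ state }}/{{ category }}/"
    "{{ reference.replace('/','-') }}.{{ url.split('.')[-1] }}"
)

# -> 2022/05/Berlin/Beratungsvorgang/19-11840.pdf
print(template.render(**record))
```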

leakrfc/cli.py

Lines changed: 2 additions & 14 deletions

````diff
@@ -159,7 +159,6 @@ def cli_diff(
 @cli.command("make")
 def cli_make(
     out_uri: Annotated[str, typer.Option("-o")] = "-",
-    use_cache: Annotated[Optional[bool], typer.Option(help="Use runtime cache")] = True,
     check_integrity: Annotated[
         Optional[bool], typer.Option(help="Check checksums")
     ] = True,
@@ -181,9 +180,7 @@ def cli_make(
         dataset.make_index()
         obj = dataset._storage.get(dataset._get_index_path(), model=DatasetModel)
     else:
-        obj = make_dataset(
-            dataset, use_cache, check_integrity, cleanup, metadata_only
-        )
+        obj = make_dataset(dataset, check_integrity, cleanup, metadata_only)
     write_obj(obj, out_uri)
 
 
@@ -242,7 +239,6 @@ def cli_crawl(
     out_uri: Annotated[
         str, typer.Option("-o", help="Write results to this destination")
     ] = "-",
-    use_cache: Annotated[Optional[bool], typer.Option(help="Use runtime cache")] = True,
     skip_existing: Annotated[
         Optional[bool],
         typer.Option(
@@ -276,7 +272,6 @@ def cli_crawl(
     crawl(
         uri,
         dataset,
-        use_cache=use_cache,
         skip_existing=skip_existing,
         extract=extract,
         extract_keep_source=extract_keep_source,
@@ -300,7 +295,6 @@ def cli_export(out: str):
 @memorious.command("sync")
 def cli_sync_memorious(
     uri: Annotated[str, typer.Option("-i")],
-    use_cache: Annotated[Optional[bool], typer.Option(help="Use runtime cache")] = True,
     name_only: Annotated[
         Optional[bool], typer.Option(help="Use only file name as key")
     ] = False,
@@ -323,13 +317,12 @@ def cli_sync_memorious(
         key_func = get_file_name_templ_func(key_template)
     else:
         key_func = None
-    res = import_memorious(dataset, uri, key_func, use_cache=use_cache)
+    res = import_memorious(dataset, uri, key_func)
    write_obj(res, "-")
 
 
 @aleph.command("sync")
 def cli_aleph_sync(
-    use_cache: Annotated[Optional[bool], typer.Option(help="Use runtime cache")] = True,
     host: Annotated[Optional[str], typer.Option(help="Aleph host")] = None,
     api_key: Annotated[Optional[str], typer.Option(help="Aleph api key")] = None,
     folder: Annotated[Optional[str], typer.Option(help="Base folder path")] = None,
@@ -350,7 +343,6 @@ def cli_aleph_sync(
         api_key=api_key,
         prefix=folder,
         foreign_id=foreign_id,
-        use_cache=use_cache,
         metadata=metadata,
     )
     write_obj(res, "-")
@@ -359,7 +351,6 @@ def cli_aleph_sync(
 @aleph.command("load-dataset")
 def cli_aleph_load_dataset(
     uri: Annotated[str, typer.Argument(help="Dataset index.json uri")],
-    use_cache: Annotated[Optional[bool], typer.Option(help="Use runtime cache")] = True,
     host: Annotated[Optional[str], typer.Option(help="Aleph host")] = None,
     api_key: Annotated[Optional[str], typer.Option(help="Aleph api key")] = None,
     foreign_id: Annotated[
@@ -378,7 +369,6 @@ def cli_aleph_load_dataset(
         host=host,
         api_key=api_key,
         foreign_id=foreign_id,
-        use_cache=use_cache,
         metadata=metadata,
     )
     write_obj(res, "-")
@@ -395,7 +385,6 @@ def cli_aleph_load_catalog(
     # Optional[list[str]],
     # typer.Argument(help="Dataset foreign_ids to exclude, can be a glob"),
     # ] = None,
-    use_cache: Annotated[Optional[bool], typer.Option(help="Use runtime cache")] = True,
     host: Annotated[Optional[str], typer.Option(help="Aleph host")] = None,
     api_key: Annotated[Optional[str], typer.Option(help="Aleph api key")] = None,
     metadata: Annotated[
@@ -412,7 +401,6 @@ def cli_aleph_load_catalog(
         api_key=api_key,
         # include_dataset=include_dataset,
         # exclude_dataset=exclude_dataset,
-        use_cache=use_cache,
         metadata=metadata,
     ):
         write_obj(res, "-")
````

leakrfc/crawl.py

Lines changed: 0 additions & 3 deletions

````diff
@@ -138,7 +138,6 @@ def crawl(
     extract: bool | None = False,
     extract_keep_source: bool | None = False,
     extract_ensure_subdir: bool | None = False,
-    use_cache: bool | None = True,
     write_documents_db: bool | None = True,
     exclude: str | None = None,
     include: str | None = None,
@@ -157,7 +156,6 @@
         extract_ensure_subdir: Make sub-directories for extracted files with the
             archive name to avoid overwriting existing files during extraction
             of multiple archives with the same directory structure
-        use_cache: Use global processing cache to skip tasks
         write_documents_db: Create csv-based document tables at the end of crawl run
         exclude: Exclude glob for file paths not to crawl
         include: Include glob for file paths to crawl
@@ -180,7 +178,6 @@
         extract=extract,
         extract_keep_source=extract_keep_source,
         extract_ensure_subdir=extract_ensure_subdir,
-        use_cache=use_cache,
         write_documents_db=write_documents_db,
         exclude=exclude,
         include=include,
````
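
After this change a `crawl()` call no longer mentions caching at all; skipping of already-processed tasks follows the global `CACHE` setting. A hedged usage sketch under the new signature (the source path is illustrative, and obtaining the `DatasetArchive` instance is out of scope here):

```python
from leakrfc.crawl import crawl


def crawl_incoming(dataset) -> None:
    """Crawl a local folder into an existing leakrfc DatasetArchive."""
    # No use_cache= argument anymore: export CACHE=0 before the run to
    # ignore the global cache (it is still populated for later runs).
    crawl(
        "/mnt/incoming/my_dataset",  # illustrative source uri
        dataset,
        skip_existing=True,
        extract=True,                # unpack archives found while crawling
        write_documents_db=True,     # write csv document tables at the end
        exclude="*.tmp",             # glob for paths not to crawl
    )
```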

leakrfc/make.py

Lines changed: 1 addition & 5 deletions

````diff
@@ -113,7 +113,6 @@ def done(self) -> None:
 
 def make_dataset(
     dataset: DatasetArchive,
-    use_cache: bool | None = True,
     check_integrity: bool | None = True,
     cleanup: bool | None = True,
     metadata_only: bool | None = False,
@@ -129,15 +128,12 @@
 
     Args:
         dataset: leakrfc Dataset instance
-        use_cache: Use global processing cache to skip tasks
         check_integrity: Check checksum for each file (logs mismatches)
         cleanup: When checking integrity, fix mismatched metadata and delete
             unreferenced metadata files
         metadata_only: Only iterate through existing metadata files, don't look
             for new source files
 
     """
-    worker = MakeWorker(
-        check_integrity, cleanup, metadata_only, dataset, use_cache=use_cache
-    )
+    worker = MakeWorker(check_integrity, cleanup, metadata_only, dataset)
     return worker.run()
````
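
`make_dataset()` now takes only the dataset and the processing flags shown in the docstring. A hedged usage sketch (again assuming an already configured `DatasetArchive`):

```python
from leakrfc.make import make_dataset


def rebuild(dataset):
    """Re-generate metadata for `dataset` under the new signature."""
    # use_cache is gone; run with CACHE=0 in the environment to force
    # re-processing of files the global cache has already seen.
    return make_dataset(
        dataset,
        check_integrity=True,  # verify checksums and log mismatches
        cleanup=True,          # fix mismatched / delete unreferenced metadata
        metadata_only=False,   # also look for new source files
    )
```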

leakrfc/sync/aleph.py

Lines changed: 0 additions & 5 deletions

````diff
@@ -147,21 +147,17 @@ def sync_to_aleph(
     api_key: str | None,
     prefix: str | None = None,
     foreign_id: str | None = None,
-    use_cache: bool | None = True,
     metadata: bool | None = True,
 ) -> AlephUploadStatus:
     """
     Incrementally sync a leakrfc dataset into an Aleph instance.
 
-    As long as using `use_cache`, only new documents will be imported.
-
     Args:
         dataset: leakrfc Dataset instance
         host: Aleph host (can be set via env `ALEPHCLIENT_HOST`)
         api_key: Aleph api key (can be set via env `ALEPHCLIENT_API_KEY`)
         prefix: Add a folder prefix to import documents into
         foreign_id: Aleph collection foreign_id (if different from leakrfc dataset name)
-        use_cache: Use global processing cache to skip tasks
         metadata: Update Aleph collection metadata
     """
     worker = AlephUploadWorker(
@@ -170,7 +166,6 @@
         api_key=api_key,
         prefix=prefix,
         foreign_id=foreign_id,
-        use_cache=use_cache,
         metadata=metadata,
     )
     worker.log_info(f"Starting sync to Aleph `{worker.host}` ...")
````
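
A corresponding call to `sync_to_aleph()` now carries only Aleph-specific options. The keyword names follow the docstring above; the positional `dataset` argument and the placeholder host and key values are assumptions:

```python
from leakrfc.sync.aleph import sync_to_aleph


def push(dataset):
    """Sync an existing leakrfc dataset into an Aleph instance (illustrative values)."""
    # Incremental behaviour (only new documents) now follows the global CACHE setting.
    return sync_to_aleph(
        dataset,
        host="https://aleph.example.org",  # or env ALEPHCLIENT_HOST
        api_key="secret-api-key",          # or env ALEPHCLIENT_API_KEY
        foreign_id="my_dataset",
        metadata=True,                     # update the Aleph collection metadata
    )
```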

leakrfc/sync/aleph_entities.py

Lines changed: 0 additions & 4 deletions

````diff
@@ -92,7 +92,6 @@ def load_dataset(
     host: str | None,
     api_key: str | None,
     foreign_id: str | None = None,
-    use_cache: bool | None = True,
     metadata: bool | None = True,
 ) -> AlephLoadDatasetStatus:
     dataset = Dataset._from_uri(uri)
@@ -103,7 +102,6 @@
         host=host,
         api_key=api_key,
         foreign_id=foreign_id,
-        use_cache=use_cache,
         metadata=metadata,
     )
     res = worker.run()
@@ -115,7 +113,6 @@
     host: str | None,
     api_key: str | None,
     foreign_id: str | None = None,
-    use_cache: bool | None = True,
     metadata: bool | None = True,
     exclude_dataset: str | None = None,
     include_dataset: str | None = None,
@@ -132,6 +129,5 @@
         host=host,
         api_key=api_key,
         foreign_id=foreign_id,
-        use_cache=use_cache,
         metadata=metadata,
     )
````

leakrfc/sync/memorious.py

Lines changed: 2 additions & 6 deletions

````diff
@@ -93,10 +93,7 @@ def done(self) -> None:
 
 
 def import_memorious(
-    dataset: DatasetArchive,
-    uri: Uri,
-    key_func: Callable | None = None,
-    use_cache: bool | None = True,
+    dataset: DatasetArchive, uri: Uri, key_func: Callable | None = None
 ) -> MemoriousStatus:
     """
     Convert a "memorious collection" (the output format of the store->directory
@@ -118,10 +115,9 @@
             listing
         key_func: A function to generate file keys (their relative paths), per
             default it is generated from the source url.
-        use_cache: Use global processing cache to skip tasks
     """
 
-    worker = MemoriousWorker(uri, key_func, dataset=dataset, use_cache=use_cache)
+    worker = MemoriousWorker(uri, key_func, dataset=dataset)
     worker.log_info(f"Starting memorious import from `{worker.memorious.uri}` ...")
     return worker.run()
 
````
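
With the shortened signature, an import is configured through the dataset, the source uri, and an optional `key_func`. The sketch below is hypothetical; in particular, the argument passed to `key_func` (here assumed to be the memorious JSON record) is not shown in this commit:

```python
from leakrfc.sync.memorious import import_memorious


def key_from_url(record: dict) -> str:
    """Illustrative key_func: use the last url segment as the file key.
    (The exact input leakrfc passes to key_func is an assumption here.)"""
    return record["url"].split("/")[-1]


def run(dataset):
    # use_cache is no longer a parameter; incremental imports rely on the
    # global CACHE setting instead.
    return import_memorious(dataset, "/memorious/data/store/my_dataset", key_from_url)
```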

leakrfc/worker.py

Lines changed: 1 addition & 6 deletions

````diff
@@ -20,18 +20,13 @@
 
 
 def make_cache_key(worker: "DatasetWorker", action: str, *extra: str) -> str | None:
-    if not worker.use_cache:
-        return
     return f"{leakrfc_settings.cache_prefix}/{worker.dataset.name}/{action}/{'/'.join(extra)}"
 
 
 class DatasetWorker(Worker):
-    def __init__(
-        self, dataset: "DatasetArchive", use_cache: bool | None = True, *args, **kwargs
-    ) -> None:
+    def __init__(self, dataset: "DatasetArchive", *args, **kwargs) -> None:
         super().__init__(*args, **kwargs)
         self.dataset = dataset
-        self.use_cache = use_cache
 
     def get_tasks(self) -> Any:
         yield from self.dataset.iter_files()
````
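
The core of the commit: `make_cache_key()` no longer short-circuits on a per-worker flag, and `DatasetWorker` no longer stores one. Whether the key is actually consulted is decided by the cache layer configured through `CACHE` / `LEAKRFC_CACHE__*`. A stand-alone rendition of the simplified key helper, with the settings access replaced by a plain argument:

```python
def make_cache_key(cache_prefix: str, dataset: str, action: str, *extra: str) -> str:
    # Mirrors the simplified helper above: always build a key; no use_cache check.
    return f"{cache_prefix}/{dataset}/{action}/{'/'.join(extra)}"


print(make_cache_key("leakrfc", "my_dataset", "make", "docs/report.pdf"))
# -> leakrfc/my_dataset/make/docs/report.pdf
```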
