
Commit 9a9f9c4

💥 Rely on global CACHE env var instead of --use-cache
1 parent d6db0f9 commit 9a9f9c4

15 files changed (+40 / -78 lines)


docs/cache.md

Lines changed: 5 additions & 0 deletions

````diff
@@ -2,6 +2,8 @@ For incremental processing of tasks, `leakrfc` uses a global cache to track task
 
 `leakrfc` is using [anystore](https://docs.investigraph.dev/lib/anystore/cache/) for the cache implementation, so any supported backend is possible. Recommended backends are redis or sql, but a distributed cloud-backend (such as a shared s3 bucket) can make sense, too.
 
+As long as caching is enabled (globally via `CACHE=1`, the default), all operations will look in the global cache if a task has already been processed. When disabling cache (`CACHE=0`) for a run, the cache is not respected but still populated for next runs.
+
 Per default, an in-memory cache is used, which doesn't persist.
 
 ## Configure
@@ -14,4 +16,7 @@ LEAKRFC_CACHE__URI=redis://localhost
 # additional config
 LEAKRFC_CACHE__DEFAULT_TTL=3600 # seconds
 LEAKRFC_CACHE__BACKEND_CONFIG__REDIS_PREFIX=my-prefix
+
+# disable cache
+CACHE=0
 ```
````
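
The added paragraph describes an asymmetry worth noting: `CACHE=0` disables cache *lookups* for a run, but results are still written to the cache for later runs. A minimal Python sketch of that behaviour, using a plain dict and a hypothetical `run_task()` instead of leakrfc's real anystore-backed cache:

```python
import os

# Hypothetical stand-in for the anystore-backed global cache described above.
cache: dict[str, bool] = {}


def cache_enabled() -> bool:
    # CACHE=1 is the default; CACHE=0 disables cache *lookups* for this run.
    return os.environ.get("CACHE", "1") != "0"


def run_task(key: str) -> None:
    if cache_enabled() and key in cache:
        print(f"skip {key} (already processed)")
        return
    print(f"process {key}")
    # The cache is still populated even when CACHE=0, so the next run with
    # caching enabled can skip this task.
    cache[key] = True


for key in ("file-a", "file-a"):
    run_task(key)
```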

docs/sync/aleph.md

Lines changed: 1 addition & 1 deletion

````diff
@@ -2,7 +2,7 @@ Sync a leakrfc dataset into an [Aleph](https://docs.aleph.occrp.org/) instance.
 
 Collections will be created if they don't exist and their metadata will be updated (this can be disabled via `--no-metadata`). The Aleph collections _foreign id_ can be set via `--foreign-id` and defaults to the leakrfc dataset name.
 
-As long as using `--use-cache` (default) only new documents are synced. The cache handles multiple Aleph instances and keeps track of the individual status for each of them.
+As long as using the global cache (environment `CACHE=1`, default) only new documents are synced. The cache handles multiple Aleph instances and keeps track of the individual status for each of them.
 
 Aleph api configuration can as well set via command line:
 
````
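
The note that the cache "handles multiple Aleph instances" implies the cache key includes the target host, so each instance keeps its own processed-state. The exact key layout is not shown in this commit; the sketch below is a hypothetical scheme, modelled loosely on `make_cache_key()` from `leakrfc/worker.py` further down:

```python
def aleph_sync_cache_key(prefix: str, dataset: str, host: str, key: str) -> str:
    """Hypothetical per-instance cache key: same dataset and document but a
    different Aleph host yields a different key, so a second instance starts
    with an empty processed-state."""
    return f"{prefix}/{dataset}/sync_aleph/{host}/{key}"


print(aleph_sync_cache_key("leakrfc", "my_dataset", "aleph.example.org", "report.pdf"))
# -> leakrfc/my_dataset/sync_aleph/aleph.example.org/report.pdf
```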

docs/sync/memorious.md

Lines changed: 2 additions & 3 deletions

````diff
@@ -1,6 +1,6 @@
 Import [memorious](https://github.com/alephdata/memorious) crawler results into a `leakrfc` dataset.
 
-As long as using `--use-cache` (default) only new documents are synced.
+As long as using the global cache (environment `CACHE=1`, default) only new documents are synced.
 
 ```bash
 leakrfc -d my_dataset memorious sync -i /memorious/data/store/my_dataset
@@ -16,7 +16,7 @@ leakrfc -d my_dataset memorious sync -i /memorious/data/store/my_dataset --name-
 leakrfc -d my_dataset memorious sync -i /memorious/data/store/my_dataset --strip-prefix "assets/docs"
 ```
 
-Or use a template that will replace values from the original memorious "*.json" file for the source file. Given a json file stored by memorious like this:
+Or use a template that will replace values from the original memorious "\*.json" file for the source file. Given a json file stored by memorious like this:
 
 ```json
 {
@@ -49,7 +49,6 @@ To import this file as "2022/05/Berlin/Beratungsvorgang/19-11840.pdf":
 leakrfc -d my_dataset memorious sync -i /memorious/data/store/my_dataset --key-template "{{ date[:4] }}/{{ date[5:7] }}/{{ state }}/{{ category }}/{{ reference.replace('/','-') }}.{{ url.split('.')[-1] }}"
 ```
 
-
 ## Reference
 
 ::: leakrfc.sync.memorious
````
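
The `--key-template` example above evaluates Jinja-style expressions against the fields of the memorious `*.json` record. This commit does not show leakrfc's own template machinery (the CLI wires it through `get_file_name_templ_func`), but a stand-alone rendering with `jinja2` and an invented record illustrates what the template produces:

```python
from jinja2 import Template

# Illustrative memorious record; field names mirror the --key-template example
# above, the concrete values are made up to match the target path shown there.
record = {
    "date": "2022-05-17",
    "state": "Berlin",
    "category": "Beratungsvorgang",
    "reference": "19/11840",
    "url": "https://example.org/docs/19-11840.pdf",
}

template = Template(
    "{{ date[:4] }}/{{ date[5:7] }}/{{ state }}/{{ category }}/"
    "{{ reference.replace('/','-') }}.{{ url.split('.')[-1] }}"
)

# -> 2022/05/Berlin/Beratungsvorgang/19-11840.pdf
print(template.render(**record))
```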

leakrfc/cli.py

Lines changed: 2 additions & 14 deletions

````diff
@@ -159,7 +159,6 @@ def cli_diff(
 @cli.command("make")
 def cli_make(
     out_uri: Annotated[str, typer.Option("-o")] = "-",
-    use_cache: Annotated[Optional[bool], typer.Option(help="Use runtime cache")] = True,
     check_integrity: Annotated[
         Optional[bool], typer.Option(help="Check checksums")
     ] = True,
@@ -181,9 +180,7 @@ def cli_make(
         dataset.make_index()
         obj = dataset._storage.get(dataset._get_index_path(), model=DatasetModel)
     else:
-        obj = make_dataset(
-            dataset, use_cache, check_integrity, cleanup, metadata_only
-        )
+        obj = make_dataset(dataset, check_integrity, cleanup, metadata_only)
     write_obj(obj, out_uri)
 
 
@@ -242,7 +239,6 @@ def cli_crawl(
     out_uri: Annotated[
         str, typer.Option("-o", help="Write results to this destination")
     ] = "-",
-    use_cache: Annotated[Optional[bool], typer.Option(help="Use runtime cache")] = True,
     skip_existing: Annotated[
         Optional[bool],
         typer.Option(
@@ -276,7 +272,6 @@ def cli_crawl(
     crawl(
         uri,
         dataset,
-        use_cache=use_cache,
         skip_existing=skip_existing,
         extract=extract,
         extract_keep_source=extract_keep_source,
@@ -300,7 +295,6 @@ def cli_export(out: str):
 @memorious.command("sync")
 def cli_sync_memorious(
     uri: Annotated[str, typer.Option("-i")],
-    use_cache: Annotated[Optional[bool], typer.Option(help="Use runtime cache")] = True,
     name_only: Annotated[
         Optional[bool], typer.Option(help="Use only file name as key")
     ] = False,
@@ -323,13 +317,12 @@ def cli_sync_memorious(
         key_func = get_file_name_templ_func(key_template)
     else:
         key_func = None
-    res = import_memorious(dataset, uri, key_func, use_cache=use_cache)
+    res = import_memorious(dataset, uri, key_func)
    write_obj(res, "-")
 
 
 @aleph.command("sync")
 def cli_aleph_sync(
-    use_cache: Annotated[Optional[bool], typer.Option(help="Use runtime cache")] = True,
     host: Annotated[Optional[str], typer.Option(help="Aleph host")] = None,
     api_key: Annotated[Optional[str], typer.Option(help="Aleph api key")] = None,
     folder: Annotated[Optional[str], typer.Option(help="Base folder path")] = None,
@@ -350,7 +343,6 @@ def cli_aleph_sync(
         api_key=api_key,
         prefix=folder,
         foreign_id=foreign_id,
-        use_cache=use_cache,
         metadata=metadata,
     )
     write_obj(res, "-")
@@ -359,7 +351,6 @@ def cli_aleph_sync(
 @aleph.command("load-dataset")
 def cli_aleph_load_dataset(
     uri: Annotated[str, typer.Argument(help="Dataset index.json uri")],
-    use_cache: Annotated[Optional[bool], typer.Option(help="Use runtime cache")] = True,
     host: Annotated[Optional[str], typer.Option(help="Aleph host")] = None,
     api_key: Annotated[Optional[str], typer.Option(help="Aleph api key")] = None,
     foreign_id: Annotated[
@@ -378,7 +369,6 @@ def cli_aleph_load_dataset(
         host=host,
         api_key=api_key,
         foreign_id=foreign_id,
-        use_cache=use_cache,
         metadata=metadata,
     )
     write_obj(res, "-")
@@ -395,7 +385,6 @@ def cli_aleph_load_catalog(
     # Optional[list[str]],
     # typer.Argument(help="Dataset foreign_ids to exclude, can be a glob"),
     # ] = None,
-    use_cache: Annotated[Optional[bool], typer.Option(help="Use runtime cache")] = True,
     host: Annotated[Optional[str], typer.Option(help="Aleph host")] = None,
     api_key: Annotated[Optional[str], typer.Option(help="Aleph api key")] = None,
     metadata: Annotated[
@@ -412,7 +401,6 @@ def cli_aleph_load_catalog(
         api_key=api_key,
         # include_dataset=include_dataset,
         # exclude_dataset=exclude_dataset,
-        use_cache=use_cache,
         metadata=metadata,
     ):
         write_obj(res, "-")
````

leakrfc/crawl.py

Lines changed: 0 additions & 3 deletions

````diff
@@ -138,7 +138,6 @@ def crawl(
     extract: bool | None = False,
     extract_keep_source: bool | None = False,
     extract_ensure_subdir: bool | None = False,
-    use_cache: bool | None = True,
     write_documents_db: bool | None = True,
     exclude: str | None = None,
     include: str | None = None,
@@ -157,7 +156,6 @@
         extract_ensure_subdir: Make sub-directories for extracted files with the
             archive name to avoid overwriting existing files during extraction
             of multiple archives with the same directory structure
-        use_cache: Use global processing cache to skip tasks
         write_documents_db: Create csv-based document tables at the end of crawl run
         exclude: Exclude glob for file paths not to crawl
         include: Include glob for file paths to crawl
@@ -180,7 +178,6 @@
         extract=extract,
         extract_keep_source=extract_keep_source,
         extract_ensure_subdir=extract_ensure_subdir,
-        use_cache=use_cache,
         write_documents_db=write_documents_db,
         exclude=exclude,
         include=include,
````
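
After this change a `crawl()` call no longer mentions caching at all; skipping of already-processed tasks follows the global `CACHE` setting. A hedged usage sketch under the new signature (the source path is illustrative, and obtaining the `DatasetArchive` instance is out of scope here):

```python
from leakrfc.crawl import crawl


def crawl_incoming(dataset) -> None:
    """Crawl a local folder into an existing leakrfc DatasetArchive."""
    # No use_cache= argument anymore: export CACHE=0 before the run to
    # ignore the global cache (it is still populated for later runs).
    crawl(
        "/mnt/incoming/my_dataset",  # illustrative source uri
        dataset,
        skip_existing=True,
        extract=True,                # unpack archives found while crawling
        write_documents_db=True,     # write csv document tables at the end
        exclude="*.tmp",             # glob for paths not to crawl
    )
```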

leakrfc/make.py

Lines changed: 1 addition & 5 deletions

````diff
@@ -113,7 +113,6 @@ def done(self) -> None:
 
 def make_dataset(
     dataset: DatasetArchive,
-    use_cache: bool | None = True,
     check_integrity: bool | None = True,
     cleanup: bool | None = True,
     metadata_only: bool | None = False,
@@ -129,15 +128,12 @@
 
     Args:
         dataset: leakrfc Dataset instance
-        use_cache: Use global processing cache to skip tasks
         check_integrity: Check checksum for each file (logs mismatches)
         cleanup: When checking integrity, fix mismatched metadata and delete
             unreferenced metadata files
         metadata_only: Only iterate through existing metadata files, don't look
             for new source files
 
     """
-    worker = MakeWorker(
-        check_integrity, cleanup, metadata_only, dataset, use_cache=use_cache
-    )
+    worker = MakeWorker(check_integrity, cleanup, metadata_only, dataset)
     return worker.run()
````
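
`make_dataset()` now takes only the dataset and the processing flags shown in the docstring. A hedged usage sketch (again assuming an already configured `DatasetArchive`):

```python
from leakrfc.make import make_dataset


def rebuild(dataset):
    """Re-generate metadata for `dataset` under the new signature."""
    # use_cache is gone; run with CACHE=0 in the environment to force
    # re-processing of files the global cache has already seen.
    return make_dataset(
        dataset,
        check_integrity=True,  # verify checksums and log mismatches
        cleanup=True,          # fix mismatched / delete unreferenced metadata
        metadata_only=False,   # also look for new source files
    )
```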

leakrfc/sync/aleph.py

Lines changed: 0 additions & 5 deletions

````diff
@@ -147,21 +147,17 @@ def sync_to_aleph(
     api_key: str | None,
     prefix: str | None = None,
     foreign_id: str | None = None,
-    use_cache: bool | None = True,
     metadata: bool | None = True,
 ) -> AlephUploadStatus:
     """
     Incrementally sync a leakrfc dataset into an Aleph instance.
 
-    As long as using `use_cache`, only new documents will be imported.
-
     Args:
         dataset: leakrfc Dataset instance
         host: Aleph host (can be set via env `ALEPHCLIENT_HOST`)
         api_key: Aleph api key (can be set via env `ALEPHCLIENT_API_KEY`)
         prefix: Add a folder prefix to import documents into
         foreign_id: Aleph collection foreign_id (if different from leakrfc dataset name)
-        use_cache: Use global processing cache to skip tasks
         metadata: Update Aleph collection metadata
     """
     worker = AlephUploadWorker(
@@ -170,7 +166,6 @@
         api_key=api_key,
         prefix=prefix,
         foreign_id=foreign_id,
-        use_cache=use_cache,
         metadata=metadata,
     )
     worker.log_info(f"Starting sync to Aleph `{worker.host}` ...")
````
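
A corresponding call to `sync_to_aleph()` now carries only Aleph-specific options. The keyword names follow the docstring above; the positional `dataset` argument and the placeholder host and key values are assumptions:

```python
from leakrfc.sync.aleph import sync_to_aleph


def push(dataset):
    """Sync an existing leakrfc dataset into an Aleph instance (illustrative values)."""
    # Incremental behaviour (only new documents) now follows the global CACHE setting.
    return sync_to_aleph(
        dataset,
        host="https://aleph.example.org",  # or env ALEPHCLIENT_HOST
        api_key="secret-api-key",          # or env ALEPHCLIENT_API_KEY
        foreign_id="my_dataset",
        metadata=True,                     # update the Aleph collection metadata
    )
```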

leakrfc/sync/aleph_entities.py

Lines changed: 0 additions & 4 deletions

````diff
@@ -92,7 +92,6 @@ def load_dataset(
     host: str | None,
     api_key: str | None,
     foreign_id: str | None = None,
-    use_cache: bool | None = True,
     metadata: bool | None = True,
 ) -> AlephLoadDatasetStatus:
     dataset = Dataset._from_uri(uri)
@@ -103,7 +102,6 @@
         host=host,
         api_key=api_key,
         foreign_id=foreign_id,
-        use_cache=use_cache,
         metadata=metadata,
     )
     res = worker.run()
@@ -115,7 +113,6 @@
     host: str | None,
     api_key: str | None,
     foreign_id: str | None = None,
-    use_cache: bool | None = True,
     metadata: bool | None = True,
     exclude_dataset: str | None = None,
     include_dataset: str | None = None,
@@ -132,6 +129,5 @@
         host=host,
         api_key=api_key,
         foreign_id=foreign_id,
-        use_cache=use_cache,
         metadata=metadata,
     )
````

leakrfc/sync/memorious.py

Lines changed: 2 additions & 6 deletions

````diff
@@ -93,10 +93,7 @@ def done(self) -> None:
 
 
 def import_memorious(
-    dataset: DatasetArchive,
-    uri: Uri,
-    key_func: Callable | None = None,
-    use_cache: bool | None = True,
+    dataset: DatasetArchive, uri: Uri, key_func: Callable | None = None
 ) -> MemoriousStatus:
     """
     Convert a "memorious collection" (the output format of the store->directory
@@ -118,10 +115,9 @@
             listing
         key_func: A function to generate file keys (their relative paths), per
             default it is generated from the source url.
-        use_cache: Use global processing cache to skip tasks
     """
 
-    worker = MemoriousWorker(uri, key_func, dataset=dataset, use_cache=use_cache)
+    worker = MemoriousWorker(uri, key_func, dataset=dataset)
     worker.log_info(f"Starting memorious import from `{worker.memorious.uri}` ...")
     return worker.run()
 
````
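
With the shortened signature, an import is configured through the dataset, the source uri, and an optional `key_func`. The sketch below is hypothetical; in particular, the argument passed to `key_func` (here assumed to be the memorious JSON record) is not shown in this commit:

```python
from leakrfc.sync.memorious import import_memorious


def key_from_url(record: dict) -> str:
    """Illustrative key_func: use the last url segment as the file key.
    (The exact input leakrfc passes to key_func is an assumption here.)"""
    return record["url"].split("/")[-1]


def run(dataset):
    # use_cache is no longer a parameter; incremental imports rely on the
    # global CACHE setting instead.
    return import_memorious(dataset, "/memorious/data/store/my_dataset", key_from_url)
```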

leakrfc/worker.py

Lines changed: 1 addition & 6 deletions

````diff
@@ -20,18 +20,13 @@
 
 
 def make_cache_key(worker: "DatasetWorker", action: str, *extra: str) -> str | None:
-    if not worker.use_cache:
-        return
     return f"{leakrfc_settings.cache_prefix}/{worker.dataset.name}/{action}/{'/'.join(extra)}"
 
 
 class DatasetWorker(Worker):
-    def __init__(
-        self, dataset: "DatasetArchive", use_cache: bool | None = True, *args, **kwargs
-    ) -> None:
+    def __init__(self, dataset: "DatasetArchive", *args, **kwargs) -> None:
         super().__init__(*args, **kwargs)
         self.dataset = dataset
-        self.use_cache = use_cache
 
     def get_tasks(self) -> Any:
         yield from self.dataset.iter_files()
````
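
The core of the commit: `make_cache_key()` no longer short-circuits on a per-worker flag, and `DatasetWorker` no longer stores one. Whether the key is actually consulted is decided by the cache layer configured through `CACHE` / `LEAKRFC_CACHE__*`. A stand-alone rendition of the simplified key helper, with the settings access replaced by a plain argument:

```python
def make_cache_key(cache_prefix: str, dataset: str, action: str, *extra: str) -> str:
    # Mirrors the simplified helper above: always build a key; no use_cache check.
    return f"{cache_prefix}/{dataset}/{action}/{'/'.join(extra)}"


print(make_cache_key("leakrfc", "my_dataset", "make", "docs/report.pdf"))
# -> leakrfc/my_dataset/make/docs/report.pdf
```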
