Skip to content

Commit d6db0f9

Browse files
committed
🩹 (sync/aleph) Respect existing source url in metadata
1 parent dcea6c4 commit d6db0f9

File tree

1 file changed

+13
-1
lines changed

1 file changed

+13
-1
lines changed

leakrfc/sync/aleph.py

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,9 @@
99

1010
from anystore import anycache
1111
from anystore.io import logged_items
12+
from anystore.types import SDict
1213
from anystore.worker import WorkerStatus
14+
from banal import ensure_dict
1315

1416
from leakrfc.archive.cache import get_cache
1517
from leakrfc.archive.dataset import DatasetArchive
@@ -39,6 +41,16 @@ def make_current_version_cache_key(self: "AlephUploadWorker") -> str:
3941
return aleph.make_aleph_cache_key(self, version)
4042

4143

44+
def get_source_url(data: SDict) -> str | None:
    """Resolve the source url for a metadata mapping.

    Candidates are tried in order and the first truthy one wins:

    1. the top-level ``"source_url"`` entry,
    2. ``"source_url"`` nested under the ``"extra"`` entry (the ``"extra"``
       value is passed through ``ensure_dict`` before the lookup),
    3. the top-level ``"url"`` entry, returned as-is even when it is falsy
       or missing (in which case the result is ``None``).
    """
    # `or` short-circuits, so the nested lookup only runs when the top-level
    # "source_url" is falsy, and "url" is only consulted as the last resort.
    return (
        data.get("source_url")
        or ensure_dict(data.get("extra")).get("source_url")
        or data.get("url")
    )
52+
53+
4254
class AlephUploadStatus(WorkerStatus):
4355
uploaded: int = 0
4456
folders_created: int = 0
@@ -107,7 +119,7 @@ def handle_task(self, task: File) -> dict[str, Any]:
107119
foreign_id=self.foreign_id,
108120
)
109121
metadata = {**task.extra, "file_name": task.name, "foreign_id": task.key}
110-
metadata["source_url"] = metadata.get("url")
122+
metadata["source_url"] = get_source_url(metadata)
111123
parent = self.get_parent(task.key, self.prefix)
112124
if parent:
113125
metadata["parent"] = parent

0 commit comments

Comments
 (0)