From a50c1ded462900cda8e9cc6f91c928473df26681 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Mon, 7 Jul 2025 21:25:01 +0200 Subject: [PATCH 1/4] Add initial mapping based on #436 --- src/sssom/sexpr.py | 74 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) create mode 100644 src/sssom/sexpr.py diff --git a/src/sssom/sexpr.py b/src/sssom/sexpr.py new file mode 100644 index 00000000..b988fcf6 --- /dev/null +++ b/src/sssom/sexpr.py @@ -0,0 +1,74 @@ +"""Generate canonical s-expressions and mapping hashes.""" + +import hashlib +import re +import unittest + +import zbase32 + +from sssom import Mapping +from sssom.constants import _get_sssom_schema_object + +__all__ = [ + "get_mapping_hash", +] + +def get_mapping_hash(x: Mapping) -> str: + """Hash the mapping by converting to canonical s-expression, sha256 hashing, then zbase32 encoding.""" + s = hashlib.sha256() + s.update(to_sexpr(x).encode("utf-8")) + dig = s.digest() + return zbase32.encode(dig) + + +SKIP_SLOTS = {"record_id", "mapping_cardinality"} + + +def to_sexpr(x: Mapping) -> str: + # todo get canonical order + rv = "(7:mapping(" + for slot in _get_sssom_schema_object().slots: + if slot in SKIP_SLOTS: + continue + value = getattr(x, slot, None) + if not value: + continue + elif isinstance(value, str): + rv += f"({len(slot)}:{slot}{len(value)}:{value})" + elif isinstance(value, float): + raise NotImplementedError + elif isinstance(value, list): + rv += f"({len(slot)}:{slot}(" + for v in value: + rv += f"{len(v)}:{v}" + rv += "))" + return rv + "))" + + +class TestSExpressions(unittest.TestCase): + def test_big_example(self) -> None: + """""" + s = """ + (7:mapping( + (10:subject_id44:http://purl.obolibrary.org/obo/FBbt_00001234) + (12:predicate_id46:http://www.w3.org/2004/02/skos/core#exactMatch) + (9:object_id45:http://purl.obolibrary.org/obo/UBERON_0005678) + (21:mapping_justification51:https://w3id.org/semapv/vocab/ManualMappingCuration) + (10:creator_id( + 37:https://orcid.org/0000-0000-1234-5678 + 37:https://orcid.org/0000-0000-5678-1234 + )) + )) + """ + x = Mapping( + subject_id="http://purl.obolibrary.org/obo/FBbt_00001234", + predicate_id="http://www.w3.org/2004/02/skos/core#exactMatch", + object_id="http://purl.obolibrary.org/obo/UBERON_0005678", + mapping_justification="https://w3id.org/semapv/vocab/ManualMappingCuration", + creator_id=[ + "https://orcid.org/0000-0000-1234-5678", + "https://orcid.org/0000-0000-5678-1234", + ], + ) + self.assertEqual(re.sub("\s", "", s), to_sexpr(x)) + self.assertEqual("hq6bs14aptzepwgk6pw7j8ysnft6riqrw7har84rtk8r9xmbcwty", get_mapping_hash(x)) From ac7312186bbb904729e7f242f59e3402ffb5cf46 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Tue, 9 Sep 2025 15:37:45 +0200 Subject: [PATCH 2/4] Reorg --- src/sssom/sexpr.py | 48 +++++++--------------- tests/data/sexpr_test.sssom.tsv | 7 ++++ tests/test_sexpr.py | 71 +++++++++++++++++++++++++++++++++ 3 files changed, 92 insertions(+), 34 deletions(-) create mode 100644 tests/data/sexpr_test.sssom.tsv create mode 100644 tests/test_sexpr.py diff --git a/src/sssom/sexpr.py b/src/sssom/sexpr.py index b988fcf6..bd2baffa 100644 --- a/src/sssom/sexpr.py +++ b/src/sssom/sexpr.py @@ -1,9 +1,8 @@ """Generate canonical s-expressions and mapping hashes.""" import hashlib -import re -import unittest +import curies import zbase32 from sssom import Mapping @@ -13,10 +12,11 @@ "get_mapping_hash", ] -def get_mapping_hash(x: Mapping) -> str: + +def get_mapping_hash(mapping: Mapping, converter: curies.Converter) -> str: """Hash the mapping by converting to canonical s-expression, sha256 hashing, then zbase32 encoding.""" s = hashlib.sha256() - s.update(to_sexpr(x).encode("utf-8")) + s.update(to_sexpr(mapping, converter).encode("utf-8")) dig = s.digest() return zbase32.encode(dig) @@ -24,7 +24,11 @@ def get_mapping_hash(x: Mapping) -> str: SKIP_SLOTS = {"record_id", "mapping_cardinality"} -def to_sexpr(x: Mapping) -> str: +def _should_expand(slot: str) -> bool: + return True + + +def to_sexpr(x: Mapping, converter: curies.Converter) -> str: # todo get canonical order rv = "(7:mapping(" for slot in _get_sssom_schema_object().slots: @@ -34,41 +38,17 @@ def to_sexpr(x: Mapping) -> str: if not value: continue elif isinstance(value, str): + if _should_expand(slot): + value = converter.expand_or_standardize(value, strict=True) + # TODO check if it's an entity reference and should be expanded rv += f"({len(slot)}:{slot}{len(value)}:{value})" elif isinstance(value, float): raise NotImplementedError elif isinstance(value, list): rv += f"({len(slot)}:{slot}(" for v in value: + if _should_expand(slot): + v = converter.expand_or_standardize(v, strict=True) rv += f"{len(v)}:{v}" rv += "))" return rv + "))" - - -class TestSExpressions(unittest.TestCase): - def test_big_example(self) -> None: - """""" - s = """ - (7:mapping( - (10:subject_id44:http://purl.obolibrary.org/obo/FBbt_00001234) - (12:predicate_id46:http://www.w3.org/2004/02/skos/core#exactMatch) - (9:object_id45:http://purl.obolibrary.org/obo/UBERON_0005678) - (21:mapping_justification51:https://w3id.org/semapv/vocab/ManualMappingCuration) - (10:creator_id( - 37:https://orcid.org/0000-0000-1234-5678 - 37:https://orcid.org/0000-0000-5678-1234 - )) - )) - """ - x = Mapping( - subject_id="http://purl.obolibrary.org/obo/FBbt_00001234", - predicate_id="http://www.w3.org/2004/02/skos/core#exactMatch", - object_id="http://purl.obolibrary.org/obo/UBERON_0005678", - mapping_justification="https://w3id.org/semapv/vocab/ManualMappingCuration", - creator_id=[ - "https://orcid.org/0000-0000-1234-5678", - "https://orcid.org/0000-0000-5678-1234", - ], - ) - self.assertEqual(re.sub("\s", "", s), to_sexpr(x)) - self.assertEqual("hq6bs14aptzepwgk6pw7j8ysnft6riqrw7har84rtk8r9xmbcwty", get_mapping_hash(x)) diff --git a/tests/data/sexpr_test.sssom.tsv b/tests/data/sexpr_test.sssom.tsv new file mode 100644 index 00000000..8a28ffe3 --- /dev/null +++ b/tests/data/sexpr_test.sssom.tsv @@ -0,0 +1,7 @@ +#curie_map: +# FBbt: "http://purl.obolibrary.org/obo/FBbt_" +# UBERON: "http://purl.obolibrary.org/obo/UBERON_" +# sssom.record: "https://example.org/sssom.record/" +# orcid: "https://orcid.org/" +record_id subject_id predicate_id object_id mapping_justification creator_id +sssom.record:hq6bs14aptzepwgk6pw7j8ysnft6riqrw7har84rtk8r9xmbcwty FBbt:00001234 skos:exactMatch UBERON:0005678 semapv:ManualMappingCuration orcid:0000-0000-1234-5678|orcid:0000-0000-5678-1234 diff --git a/tests/test_sexpr.py b/tests/test_sexpr.py new file mode 100644 index 00000000..09a4db6d --- /dev/null +++ b/tests/test_sexpr.py @@ -0,0 +1,71 @@ +"""Test s-expressions.""" + +import re +import unittest +from pathlib import Path + +import pandas as pd +from curies import Converter + +import sssom.io +from sssom import Mapping +from sssom.sexpr import get_mapping_hash, to_sexpr + +HERE = Path(__file__).parent.resolve() +PATH = HERE.joinpath("data", "sexpr_test.sssom.tsv") + + +class TestSExpressions(unittest.TestCase): + """Test creation of canonical S-expressions.""" + + def test_explicit_example(self) -> None: + """Test a hard-coded example, explicit in the code.""" + converter = Converter.from_prefix_map( + { + "FBbt": "http://purl.obolibrary.org/obo/FBbt_", + "UBERON": "http://purl.obolibrary.org/obo/UBERON_", + "orcid": "https://orcid.org/", + "semapv": "https://w3id.org/semapv/vocab/", + "skos": "http://www.w3.org/2004/02/skos/core#", + } + ) + sexpr = """ + (7:mapping( + (10:subject_id44:http://purl.obolibrary.org/obo/FBbt_00001234) + (12:predicate_id46:http://www.w3.org/2004/02/skos/core#exactMatch) + (9:object_id45:http://purl.obolibrary.org/obo/UBERON_0005678) + (21:mapping_justification51:https://w3id.org/semapv/vocab/ManualMappingCuration) + (10:creator_id( + 37:https://orcid.org/0000-0000-1234-5678 + 37:https://orcid.org/0000-0000-5678-1234 + )) + )) + """ + mapping = Mapping( + subject_id="http://purl.obolibrary.org/obo/FBbt_00001234", + predicate_id="http://www.w3.org/2004/02/skos/core#exactMatch", + object_id="http://purl.obolibrary.org/obo/UBERON_0005678", + mapping_justification="https://w3id.org/semapv/vocab/ManualMappingCuration", + creator_id=[ + "https://orcid.org/0000-0000-1234-5678", + "https://orcid.org/0000-0000-5678-1234", + ], + ) + self.assertEqual(re.sub(r"\s", "", sexpr), to_sexpr(mapping, converter)) + self.assertEqual( + "hq6bs14aptzepwgk6pw7j8ysnft6riqrw7har84rtk8r9xmbcwty", + get_mapping_hash(mapping, converter), + ) + + def test_all(self) -> None: + """Test all.""" + msdf = sssom.parse_tsv(PATH) + + # After new SSSOM schema release, this will be part of the mapping data model + record_ids = pd.read_csv(PATH, sep="\t", skiprows=5)["record_id"] + for record_id, mapping in zip(record_ids, msdf.to_mappings()): + self.assertEqual( + record_id.removeprefix("sssom.record:"), + get_mapping_hash(mapping, msdf.converter), + msg=to_sexpr(mapping, msdf.converter), + ) From dafb3a876bc23ce023c851afe31da43d5bad3513 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Tue, 9 Sep 2025 15:52:36 +0200 Subject: [PATCH 3/4] Start constructing meta-test --- tests/data/sexpr_test.sssom.tsv | 4 ++-- tests/test_sexpr.py | 34 ++++++++++++++++++++++++++++++++- 2 files changed, 35 insertions(+), 3 deletions(-) diff --git a/tests/data/sexpr_test.sssom.tsv b/tests/data/sexpr_test.sssom.tsv index 8a28ffe3..8fb0d4ef 100644 --- a/tests/data/sexpr_test.sssom.tsv +++ b/tests/data/sexpr_test.sssom.tsv @@ -3,5 +3,5 @@ # UBERON: "http://purl.obolibrary.org/obo/UBERON_" # sssom.record: "https://example.org/sssom.record/" # orcid: "https://orcid.org/" -record_id subject_id predicate_id object_id mapping_justification creator_id -sssom.record:hq6bs14aptzepwgk6pw7j8ysnft6riqrw7har84rtk8r9xmbcwty FBbt:00001234 skos:exactMatch UBERON:0005678 semapv:ManualMappingCuration orcid:0000-0000-1234-5678|orcid:0000-0000-5678-1234 +record_id subject_id predicate_id object_id mapping_justification creator_id reviewer_id author_label mapping_tool_version similarity_score comment license author_id mapping_tool object_type predicate_modifier reviewer_label issue_tracker_item subject_source object_match_field mapping_provider subject_label object_category subject_source_version subject_preprocessing subject_category object_label mapping_source predicate_label curation_rule_text similarity_measure see_also publication_date mapping_date other object_source mapping_cardinality subject_type confidence subject_match_field curation_rule object_source_version object_preprocessing match_string creator_label +sssom.record:hq6bs14aptzepwgk6pw7j8ysnft6riqrw7har84rtk8r9xmbcwty FBbt:00001234 skos:exactMatch UBERON:0005678 semapv:ManualMappingCuration orcid:0000-0000-1234-5678|orcid:0000-0000-5678-1234 ventral abdominal es5 exact match someone diff --git a/tests/test_sexpr.py b/tests/test_sexpr.py index 09a4db6d..8445fac9 100644 --- a/tests/test_sexpr.py +++ b/tests/test_sexpr.py @@ -9,6 +9,7 @@ import sssom.io from sssom import Mapping +from sssom.constants import SSSOMSchemaView from sssom.sexpr import get_mapping_hash, to_sexpr HERE = Path(__file__).parent.resolve() @@ -57,12 +58,43 @@ def test_explicit_example(self) -> None: get_mapping_hash(mapping, converter), ) + def test_test_completion(self) -> None: + """Test that the example file is complete over the whole SSSOM schema.""" + view = SSSOMSchemaView() + + df = pd.read_csv(PATH, sep="\t", comment="#") + missing = set(view.mapping_slots).difference(df.columns) + if missing: + msg = "\n".join(sorted(missing)) + self.fail(msg=f"comprehensive testing file is missing slots:\n{msg}") + + for slot in view.mapping_slots: + with self.subTest(slot=slot): + series = df[slot] + self.assertTrue(series.any(), msg=f"there is no row that has a value for: {slot}") + + values = series.unique() + if slot in view.multivalued_slots: + self.assertTrue( + any("|" in value for value in values), + msg=f"missing a multi-valued example for slot: {slot}", + ) + self.assertTrue( + any("|" not in value for value in values), + msg=f"missing a single valued example for slot: {slot}", + ) + else: + self.assertFalse( + any("|" in value for value in values), + msg=f"should not have a pipe delimiter in single valued slot: {slot}", + ) + def test_all(self) -> None: """Test all.""" msdf = sssom.parse_tsv(PATH) # After new SSSOM schema release, this will be part of the mapping data model - record_ids = pd.read_csv(PATH, sep="\t", skiprows=5)["record_id"] + record_ids = pd.read_csv(PATH, sep="\t", comment="#")["record_id"] for record_id, mapping in zip(record_ids, msdf.to_mappings()): self.assertEqual( record_id.removeprefix("sssom.record:"), From 8ede3bf4f5a6c03cee49c7bf6c078765306f7509 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Tue, 9 Sep 2025 15:58:39 +0200 Subject: [PATCH 4/4] Update sexpr.py --- src/sssom/sexpr.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/sssom/sexpr.py b/src/sssom/sexpr.py index bd2baffa..8b884011 100644 --- a/src/sssom/sexpr.py +++ b/src/sssom/sexpr.py @@ -25,13 +25,15 @@ def get_mapping_hash(mapping: Mapping, converter: curies.Converter) -> str: def _should_expand(slot: str) -> bool: - return True + return slot in _get_sssom_schema_object().entity_reference_slots def to_sexpr(x: Mapping, converter: curies.Converter) -> str: # todo get canonical order + + schema_object = _get_sssom_schema_object() rv = "(7:mapping(" - for slot in _get_sssom_schema_object().slots: + for slot in schema_object.mapping_slots: if slot in SKIP_SLOTS: continue value = getattr(x, slot, None)