Add reusable test for split dataframe function (#610)

cthoyt · web-flow · commit 3721439ca9b6 · 2025-09-06T15:41:29.000+03:00
diff --git a/src/sssom/parsers.py b/src/sssom/parsers.py
@@ -977,6 +977,11 @@ def split_dataframe(
     )
 
 
+def _get_split_key(subject_prefix: str, relation_luid: str, object_prefix: str) -> str:
+    """Build the lowercase ``{subject}_{relation}_{object}`` key that names a split."""
+    return "_".join(part.lower() for part in (subject_prefix, relation_luid, object_prefix))
+
+
 def split_dataframe_by_prefix(
     msdf: MappingSetDataFrame,
     subject_prefixes: Iterable[str],
@@ -998,7 +1003,7 @@ def split_dataframe_by_prefix(
         subject_prefixes, object_prefixes, relations
     ):
         relation_prefix, relation_id = relation.split(":")
-        split = f"{subject_prefix.lower()}_{relation_id.lower()}_{object_prefix.lower()}"
+        split = _get_split_key(subject_prefix, relation_id, object_prefix)
         if subject_prefix not in msdf.converter.bimap:
             logging.warning(f"{split} - missing subject prefix - {subject_prefix}")
             continue
diff --git a/tests/test_parsers.py b/tests/test_parsers.py
@@ -28,6 +28,7 @@
     from_sssom_json,
     from_sssom_rdf,
     parse_sssom_table,
+    split_dataframe_by_prefix,
 )
 from sssom.util import MappingSetDataFrame, sort_df_rows_columns
 from sssom.writers import WRITER_FUNCTIONS, write_table
@@ -513,3 +514,60 @@ def test_check_irregular_metadata(self):
         self.assertTrue(is_irregular_metadata_fail_missing_property_case)
         self.assertTrue(is_valid_extension)
         self.assertFalse(is_irregular_metadata_ok_case)
+
+
+class TestSplit(unittest.TestCase):
+    """Test splitting a mapping set dataframe by subject/object prefix and relation."""
+
+    def test_split_df(self) -> None:
+        """Check ``split_dataframe_by_prefix`` with empty, unknown, and matching inputs."""
+        converter = Converter.from_prefix_map(
+            {
+                "p1": "https://example.org/p1/",
+                "p2": "https://example.org/p2/",
+                "p3": "https://example.org/p3/",
+                "p4": "https://example.org/p4/",
+                "p5": "https://example.org/p5/",
+                "p6": "https://example.org/p6/",
+                "skos": "http://www.w3.org/2004/02/skos/core#",
+                "semapv": "https://w3id.org/semapv/vocab/",
+            }
+        )
+        subrows = [
+            ("p1:1", "skos:exactMatch", "p2:1", "semapv:ManualMappingCuration"),
+            ("p1:2", "skos:exactMatch", "p2:2", "semapv:ManualMappingCuration"),
+        ]
+        rows = [
+            *subrows,
+            ("p1:2", "skos:exactMatch", "p3:2", "semapv:ManualMappingCuration"),
+            ("p4:1", "skos:exactMatch", "p1:1", "semapv:ManualMappingCuration"),
+            ("p5:1", "skos:broadMatch", "p6:1", "semapv:ManualMappingCuration"),
+            ("p1:7", "skos:broadMatch", "p2:7", "semapv:ManualMappingCuration"),
+        ]
+        columns = ["subject_id", "predicate_id", "object_id", "mapping_justification"]
+        df = pd.DataFrame(rows, columns=columns)
+        msdf = from_sssom_dataframe(df, converter)
+
+        # an empty subject, relation, or object list must produce a falsy (empty) result
+        self.assertFalse(split_dataframe_by_prefix(msdf, [], ["p2"], ["skos:exactMatch"]))
+        self.assertFalse(split_dataframe_by_prefix(msdf, ["p1"], ["p2"], []))
+        self.assertFalse(split_dataframe_by_prefix(msdf, ["p1"], [], ["skos:exactMatch"]))
+
+        # a subject, object, or relation prefix unknown to the converter must produce nothing
+        self.assertFalse(split_dataframe_by_prefix(msdf, ["nope"], ["p2"], ["skos:exactMatch"]))
+        self.assertFalse(split_dataframe_by_prefix(msdf, ["p1"], ["nope"], ["skos:exactMatch"]))
+        self.assertFalse(split_dataframe_by_prefix(msdf, ["p1"], ["p2"], ["nope:nope"]))
+
+        sdf = pd.DataFrame(subrows, columns=columns)
+        # one subject/object/relation combination yields a single split holding only ``subrows``
+        rv = split_dataframe_by_prefix(msdf, ["p1"], ["p2"], ["skos:exactMatch"])
+        self.assertEqual(1, len(rv), msg="nothing was indexed")
+        self.assertIn("p1_exactmatch_p2", rv)
+        self.assertEqual(sdf.values.tolist(), rv["p1_exactmatch_p2"].df.values.tolist())
+
+        # a second object prefix adds a second split and leaves the p1/p2 split unchanged
+        rv = split_dataframe_by_prefix(msdf, ["p1"], ["p2", "p3"], ["skos:exactMatch"])
+        self.assertEqual(2, len(rv), msg="nothing was indexed")
+        self.assertIn("p1_exactmatch_p2", rv)
+        self.assertIn("p1_exactmatch_p3", rv)
+        self.assertEqual(sdf.values.tolist(), rv["p1_exactmatch_p2"].df.values.tolist())