Merge pull request #268 from linkml/issue-1337-csv-delimiter

pkalita-lbl · web-flow · commit ad5c7f0868a4 · 2023-05-30T08:37:06.000-07:00
Add new TSV loader/dumper classes
diff --git a/linkml_runtime/dumpers/__init__.py b/linkml_runtime/dumpers/__init__.py
@@ -1,6 +1,7 @@
 from linkml_runtime.dumpers.json_dumper import JSONDumper
 from linkml_runtime.dumpers.rdf_dumper import RDFDumper
 from linkml_runtime.dumpers.rdflib_dumper import RDFLibDumper
+from linkml_runtime.dumpers.tsv_dumper import TSVDumper
 from linkml_runtime.dumpers.yaml_dumper import YAMLDumper
 from linkml_runtime.dumpers.csv_dumper import CSVDumper
 
@@ -9,3 +10,4 @@
 rdflib_dumper = RDFLibDumper()
 yaml_dumper = YAMLDumper()
 csv_dumper = CSVDumper()
+tsv_dumper = TSVDumper()
diff --git a/linkml_runtime/dumpers/csv_dumper.py b/linkml_runtime/dumpers/csv_dumper.py
@@ -1,34 +1,10 @@
-import io
-import yaml
-import json
-from typing import Union
-from pydantic import BaseModel
+from linkml_runtime.dumpers.delimited_file_dumper import DelimitedFileDumper
 
-from linkml_runtime.dumpers.dumper_root import Dumper
-from linkml_runtime.dumpers.json_dumper import JSONDumper
-from linkml_runtime.utils.yamlutils import YAMLRoot
-from linkml_runtime.linkml_model.meta import SlotDefinitionName, SchemaDefinition
-from linkml_runtime.utils.schemaview import SchemaView
 
-from linkml_runtime.utils.csvutils import GlobalConfig, get_configmap
-from json_flattener import flatten_to_csv
+class CSVDumper(DelimitedFileDumper):
+    """Delimited-file dumper whose field separator is a comma."""
 
-
-class CSVDumper(Dumper):
-
-    def dumps(self, element: Union[BaseModel, YAMLRoot],
-              index_slot: SlotDefinitionName = None,
-              schema: SchemaDefinition = None,
-              schemaview: SchemaView = None,
-              **kwargs) -> str:
-        """ Return element formatted as CSV lines """
-        json_dumper = JSONDumper()
-        element_j = json.loads(json_dumper.dumps(element))
-        objs = element_j[index_slot]
-        if schemaview is None:
-            schemaview = SchemaView(schema)
-        configmap = get_configmap(schemaview, index_slot)
-        config = GlobalConfig(key_configs=configmap)
-        output = io.StringIO()
-        flatten_to_csv(objs, output, config=config, **kwargs)
-        return output.getvalue()
+    @property
+    def delimiter(self):
+        """Return the comma that the inherited ``dumps`` passes on as ``csv_delimiter``."""
+        return ","
diff --git a/linkml_runtime/dumpers/delimited_file_dumper.py b/linkml_runtime/dumpers/delimited_file_dumper.py
@@ -0,0 +1,51 @@
+import io
+import yaml
+import json
+from abc import ABC, abstractmethod
+from typing import Optional, Union
+from pydantic import BaseModel
+
+from linkml_runtime.dumpers.dumper_root import Dumper
+from linkml_runtime.dumpers.json_dumper import JSONDumper
+from linkml_runtime.utils.yamlutils import YAMLRoot
+from linkml_runtime.linkml_model.meta import SlotDefinitionName, SchemaDefinition
+from linkml_runtime.utils.schemaview import SchemaView
+
+from linkml_runtime.utils.csvutils import GlobalConfig, get_configmap
+from json_flattener import flatten_to_csv
+
+
+class DelimitedFileDumper(Dumper, ABC):
+    """Base for dumpers that flatten one slot of an object into delimiter-separated text.
+
+    Subclasses only supply :attr:`delimiter`; :meth:`dumps` is shared.
+    """
+
+    @property
+    @abstractmethod
+    def delimiter(self) -> str:
+        """Field separator handed to ``GlobalConfig`` as ``csv_delimiter``."""
+
+    def dumps(self, element: Union[BaseModel, YAMLRoot],
+              index_slot: Optional[SlotDefinitionName] = None,
+              schema: Optional[SchemaDefinition] = None,
+              schemaview: Optional[SchemaView] = None,
+              **kwargs) -> str:
+        """Return the contents of ``index_slot`` of ``element`` as delimited lines.
+
+        :param element: object to dump; it is first serialized with ``JSONDumper``
+        :param index_slot: key looked up in the serialized element; its value is flattened
+        :param schema: only used to construct a ``SchemaView`` when ``schemaview`` is None
+        :param schemaview: passed to ``get_configmap`` to derive the key configuration
+        :param kwargs: forwarded unchanged to ``flatten_to_csv``
+        :return: the flattened text, written with :attr:`delimiter` as ``csv_delimiter``
+        """
+        element_j = json.loads(JSONDumper().dumps(element))
+        objs = element_j[index_slot]
+        if schemaview is None:
+            schemaview = SchemaView(schema)
+        config = GlobalConfig(key_configs=get_configmap(schemaview, index_slot),
+                              csv_delimiter=self.delimiter)
+        output = io.StringIO()
+        flatten_to_csv(objs, output, config=config, **kwargs)
+        return output.getvalue()
diff --git a/linkml_runtime/dumpers/tsv_dumper.py b/linkml_runtime/dumpers/tsv_dumper.py
@@ -0,0 +1,10 @@
+from linkml_runtime.dumpers.delimited_file_dumper import DelimitedFileDumper
+
+
+class TSVDumper(DelimitedFileDumper):
+    """Delimited-file dumper whose field separator is a tab."""
+
+    @property
+    def delimiter(self):
+        """Return the tab that the inherited ``dumps`` passes on as ``csv_delimiter``."""
+        return "\t"
diff --git a/linkml_runtime/loaders/__init__.py b/linkml_runtime/loaders/__init__.py
@@ -1,6 +1,7 @@
 from linkml_runtime.loaders.json_loader import JSONLoader
 from linkml_runtime.loaders.rdf_loader import RDFLoader
 from linkml_runtime.loaders.rdflib_loader import RDFLibLoader
+from linkml_runtime.loaders.tsv_loader import TSVLoader
 from linkml_runtime.loaders.yaml_loader import YAMLLoader
 from linkml_runtime.loaders.csv_loader import CSVLoader
 
@@ -9,3 +10,4 @@
 rdflib_loader = RDFLibLoader()
 yaml_loader = YAMLLoader()
 csv_loader = CSVLoader()
+tsv_loader = TSVLoader()
diff --git a/linkml_runtime/loaders/csv_loader.py b/linkml_runtime/loaders/csv_loader.py
@@ -1,45 +1,9 @@
-from json_flattener import unflatten_from_csv, KeyConfig, GlobalConfig, Serializer
-import json
-from typing import Type, Union, List
-from linkml_runtime.utils.yamlutils import YAMLRoot
-from pydantic import BaseModel
+from linkml_runtime.loaders.delimited_file_loader import DelimitedFileLoader
 
-from linkml_runtime.loaders.loader_root import Loader
-from linkml_runtime.loaders.json_loader import JSONLoader
-from linkml_runtime.linkml_model.meta import SlotDefinitionName, SchemaDefinition, ClassDefinition
-from linkml_runtime.utils.yamlutils import YAMLRoot
-from linkml_runtime.utils.schemaview import SchemaView
-from linkml_runtime.utils.csvutils import get_configmap
-
-class CSVLoader(Loader):
-
-    def load_any(self, *args, **kwargs) -> Union[YAMLRoot, List[YAMLRoot]]:
-        return self.load(*args, **kwargs)
-
-
-    def loads(self, input,
-              target_class: Type[Union[BaseModel, YAMLRoot]],
-              index_slot: SlotDefinitionName = None,
-              schema: SchemaDefinition = None,
-              schemaview: SchemaView = None,
-              **kwargs) -> str:
-        if schemaview is None:
-            schemaview = SchemaView(schema)
-        configmap = get_configmap(schemaview, index_slot)
-        config = GlobalConfig(key_configs=configmap)
-        objs = unflatten_from_csv(input, config=config, **kwargs)
-        return JSONLoader().loads(json.dumps({index_slot: objs}), target_class=target_class)
-
-    def load(self, source: str,
-             target_class: Type[Union[BaseModel, YAMLRoot]],
-             index_slot: SlotDefinitionName = None,
-             schema: SchemaDefinition = None,
-             schemaview: SchemaView = None,
-             **kwargs) -> str:
-        if schemaview is None:
-            schemaview = SchemaView(schema)
-        configmap = get_configmap(schemaview, index_slot)
-        config = GlobalConfig(key_configs=configmap)
-        print(f'Loading from {source}')
-        objs = unflatten_from_csv(source, config=config, **kwargs)
-        return JSONLoader().loads(json.dumps({index_slot: objs}), target_class=target_class)
+class CSVLoader(DelimitedFileLoader):
+    """Delimited-file loader whose field separator is a comma."""
+
+    @property
+    def delimiter(self):
+        """Return the comma that the inherited loaders pass on as ``csv_delimiter``."""
+        return ","
diff --git a/linkml_runtime/loaders/delimited_file_loader.py b/linkml_runtime/loaders/delimited_file_loader.py
@@ -0,0 +1,78 @@
+from abc import ABC, abstractmethod
+from json_flattener import unflatten_from_csv, KeyConfig, GlobalConfig, Serializer
+import json
+import logging
+from typing import Optional, Type, Union, List
+from linkml_runtime.utils.yamlutils import YAMLRoot
+from pydantic import BaseModel
+
+from linkml_runtime.loaders.loader_root import Loader
+from linkml_runtime.loaders.json_loader import JSONLoader
+from linkml_runtime.linkml_model.meta import SlotDefinitionName, SchemaDefinition, ClassDefinition
+from linkml_runtime.utils.yamlutils import YAMLRoot
+from linkml_runtime.utils.schemaview import SchemaView
+from linkml_runtime.utils.csvutils import get_configmap
+
+logger = logging.getLogger(__name__)
+
+
+class DelimitedFileLoader(Loader, ABC):
+    """Base for loaders that rebuild objects from delimiter-separated text.
+
+    Subclasses only supply :attr:`delimiter`; the loading logic is shared.
+    """
+
+    @property
+    @abstractmethod
+    def delimiter(self) -> str:
+        """Field separator handed to ``GlobalConfig`` as ``csv_delimiter``."""
+
+    def load_any(self, *args, **kwargs) -> Union[YAMLRoot, List[YAMLRoot]]:
+        """Forward all arguments to :meth:`load`."""
+        return self.load(*args, **kwargs)
+
+    def _flattener_config(self,
+                          index_slot: Optional[SlotDefinitionName],
+                          schema: Optional[SchemaDefinition],
+                          schemaview: Optional[SchemaView]) -> GlobalConfig:
+        """Build the json_flattener configuration shared by :meth:`loads` and :meth:`load`."""
+        if schemaview is None:
+            schemaview = SchemaView(schema)
+        configmap = get_configmap(schemaview, index_slot)
+        return GlobalConfig(key_configs=configmap, csv_delimiter=self.delimiter)
+
+    def loads(self, input,
+              target_class: Type[Union[BaseModel, YAMLRoot]],
+              index_slot: Optional[SlotDefinitionName] = None,
+              schema: Optional[SchemaDefinition] = None,
+              schemaview: Optional[SchemaView] = None,
+              **kwargs) -> Union[BaseModel, YAMLRoot]:
+        """Unflatten ``input`` and load the result as ``target_class``.
+
+        :param input: handed as-is to ``unflatten_from_csv``
+        :param target_class: class passed on to ``JSONLoader().loads``
+        :param index_slot: key under which the unflattened rows are wrapped before loading
+        :param schema: only used to construct a ``SchemaView`` when ``schemaview`` is None
+        :param schemaview: passed to ``get_configmap`` to derive the key configuration
+        :param kwargs: forwarded unchanged to ``unflatten_from_csv``
+        :return: whatever ``JSONLoader().loads`` produces for ``target_class``
+        """
+        config = self._flattener_config(index_slot, schema, schemaview)
+        objs = unflatten_from_csv(input, config=config, **kwargs)
+        return JSONLoader().loads(json.dumps({index_slot: objs}), target_class=target_class)
+
+    def load(self, source: str,
+             target_class: Type[Union[BaseModel, YAMLRoot]],
+             index_slot: Optional[SlotDefinitionName] = None,
+             schema: Optional[SchemaDefinition] = None,
+             schemaview: Optional[SchemaView] = None,
+             **kwargs) -> Union[BaseModel, YAMLRoot]:
+        """Unflatten ``source`` and load the result as ``target_class``.
+
+        Parameters other than ``source`` have the same meaning as in :meth:`loads`.
+        """
+        config = self._flattener_config(index_slot, schema, schemaview)
+        # Library code should not write to stdout; report the source via logging instead.
+        logger.debug('Loading from %s', source)
+        objs = unflatten_from_csv(source, config=config, **kwargs)
+        return JSONLoader().loads(json.dumps({index_slot: objs}), target_class=target_class)
diff --git a/linkml_runtime/loaders/tsv_loader.py b/linkml_runtime/loaders/tsv_loader.py
@@ -0,0 +1,9 @@
+from linkml_runtime.loaders.delimited_file_loader import DelimitedFileLoader
+
+class TSVLoader(DelimitedFileLoader):
+    """Delimited-file loader whose field separator is a tab."""
+
+    @property
+    def delimiter(self):
+        """Return the tab that the inherited loaders pass on as ``csv_delimiter``."""
+        return "\t"
diff --git a/tests/test_loaders_dumpers/test_csv_tsv_loader_dumper.py b/tests/test_loaders_dumpers/test_csv_tsv_loader_dumper.py
@@ -9,10 +9,10 @@
 from linkml_runtime.loaders import yaml_loader
 from linkml_runtime.utils.formatutils import remove_empty_items, is_empty
 from linkml_runtime.utils.schemaview import SchemaView
-from linkml_runtime.dumpers import csv_dumper
-from linkml_runtime.loaders import csv_loader
+from linkml_runtime.dumpers import csv_dumper, tsv_dumper
+from linkml_runtime.loaders import csv_loader, tsv_loader
 from linkml_runtime.utils.yamlutils import as_json_object
-from tests.test_loaders_dumpers.models.books_normalized import Shop, Book, GenreEnum, BookSeries
+from tests.test_loaders_dumpers.models.books_normalized import Author, Review, Shop, Book, GenreEnum, BookSeries
 
 
 ROOT = os.path.abspath(os.path.dirname(__file__))
@@ -30,25 +30,27 @@ def _json(obj) -> str:
     return json.dumps(obj, indent=' ', sort_keys=True)
 
 
-class CSVGenTestCase(unittest.TestCase):
+class CsvAndTsvGenTestCase(unittest.TestCase):
 
     def test_object_model(self):
         book = Book(id='B1', genres=['fantasy'], creator={})
-        print(book.genres)
-        print(type(book.genres[0]))
         logging.debug(as_json_obj(book.genres[0]))
         assert str(book.genres[0]) == 'fantasy'
         assert book.genres[0].code.text == 'fantasy'
         processed = remove_empty_items(book.genres)
-        print(f'PR={processed}')
         assert processed[0] == 'fantasy'
-        series = BookSeries(id='S1')
+        series = BookSeries(id='S1', creator=Author(name="Q. Writer"), reviews=[Review(rating=5)])
         series.books.append(book)
         schemaview = SchemaView(SCHEMA)
         shop = Shop()
-        shop.all_book_series.append(book)
-        #csvstr = csv_dumper.dumps(shop, index_slot='all_book_series', schemaview=schemaview)
-        #logging.debug(csvstr)
+        shop.all_book_series.append(series)  # the slot receives the series, not the bare book
+        # CSV dump of the shop: must contain commas and no tab at all
+        csvstr = csv_dumper.dumps(shop, index_slot='all_book_series', schemaview=schemaview)
+        assert "," in csvstr
+        assert "\t" not in csvstr
+        # TSV dump of the same shop: must contain tabs
+        tsvstr = tsv_dumper.dumps(shop, index_slot='all_book_series', schemaview=schemaview)
+        assert "\t" in tsvstr
 
     def test_csvgen_roundtrip(self):
         schemaview = SchemaView(SCHEMA)
@@ -60,6 +62,13 @@ def test_csvgen_roundtrip(self):
         logging.debug(f'COMPARE 2: {data}')
         assert roundtrip == data
 
+    def test_tsvgen_roundtrip(self):  # dump a Shop to TSV, reload it, expect equality
+        view = SchemaView(SCHEMA)
+        expected = yaml_loader.load(DATA, target_class=Shop)
+        tsv_dumper.dump(expected, to_file=OUTPUT, index_slot='all_book_series', schemaview=view)
+        reloaded = tsv_loader.load(OUTPUT, target_class=Shop, index_slot='all_book_series', schemaview=view)
+        assert reloaded == expected
+
     def test_csvgen_unroundtrippable(self):
         schemaview = SchemaView(SCHEMA)
         #schema = YAMLGenerator(SCHEMA).schema
@@ -84,6 +93,13 @@ def test_csvgen_unroundtrippable(self):
         logging.debug(json_dumper.dumps(roundtrip))
         assert roundtrip == data
 
+    def test_tsvgen_unroundtrippable(self):
+        schemaview = SchemaView(SCHEMA)
+        data = yaml_loader.load(DATA2, target_class=Shop)
+        assert str(data.all_book_series[0].genres[0]) == 'fantasy'
+        tsv_dumper.dump(data, to_file=OUTPUT2, index_slot='all_book_series', schemaview=schemaview)
+        roundtrip = tsv_loader.load(OUTPUT2, target_class=Shop, index_slot='all_book_series', schemaview=schemaview)
+        assert roundtrip == data