Commit 566506d

Use adaptor in save_data
1 parent a6fc8fb · commit 566506d

File tree: 4 files changed (+205, -95 lines)


pins/adaptors.py

Lines changed: 86 additions & 15 deletions
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import json
 from abc import abstractmethod
 from typing import TYPE_CHECKING, Any, ClassVar, Self, TypeAlias, overload
 
@@ -25,6 +26,47 @@ class _Adaptor:
     def __init__(self, data: Any) -> None:
         self._d = data
 
+    @overload
+    def write_json(self, file: str) -> None: ...
+    @overload
+    def write_json(self, file: None) -> str: ...
+    def write_json(self, file=None):
+        if file is None:
+            msg = (
+                f"Writing to JSON string rather than file is not supported for "
+                f"{type(self._d)}"
+            )
+            raise NotImplementedError(msg)
+
+        import json
+
+        json.dump(self._d, open(file, mode="w"))
+
+    def write_joblib(self, file: str) -> None:
+        import joblib
+
+        joblib.dump(self._d, file)
+
+    def write_csv(self, file: str) -> None:
+        msg = f"Writing to CSV is not supported for {type(self._d)}"
+        raise NotImplementedError(msg)
+
+    def write_parquet(self, file: str) -> None:
+        msg = f"Writing to Parquet is not supported for {type(self._d)}"
+        raise NotImplementedError(msg)
+
+    def write_feather(self, file: str) -> None:
+        msg = f"Writing to Feather is not supported for {type(self._d)}"
+        raise NotImplementedError(msg)
+
+    @property
+    def data_preview(self) -> str:
+        # note that the R library uses jsonlite::toJSON
+        import json
+
+        # TODO(compat): set display none in index.html
+        return json.dumps({})
+
 
 class _DFAdaptor(_Adaptor):
     _d: ClassVar[_DataFrame]
@@ -39,12 +81,20 @@ def columns(self) -> list[Any]: ...
     @abstractmethod
     def head(self, n: int) -> Self: ...
 
-    @abstractmethod
-    def write_json(self) -> str:
-        """Write the dataframe to a JSON string.
+    @property
+    def data_preview(self) -> str:
+        # TODO(compat) is 100 hard-coded?
+        # Note that we go df -> json -> dict, to take advantage of type conversions in the dataframe library
+        data: list[dict[Any, Any]] = json.loads(self.head(100).write_json())
+        columns = [
+            {"name": [col], "label": [col], "align": ["left"], "type": [""]}
+            for col in self.columns
+        ]
 
-        In the format: list like [{column -> value}, ... , {column -> value}]
-        """
+        # this reproduces R pins behavior, by omitting entries that would be null
+        data_no_nulls = [{k: v for k, v in row.items() if v is not None} for row in data]
+
+        return json.dumps({"data": data_no_nulls, "columns": columns})
 
 
 class _PandasAdaptor(_DFAdaptor):
@@ -53,22 +103,43 @@ def __init__(self, data: _AbstractPandasFrame) -> None:
 
     @property
     def columns(self) -> list[Any]:
-        return self._d.columns
+        return self._d.columns.tolist()
 
     def head(self, n: int) -> Self:
         return _PandasAdaptor(self._d.head(n))
 
-    def write_json(self) -> str:
+    @overload
+    def write_json(self, file: str) -> None: ...
+    @overload
+    def write_json(self, file: None) -> str: ...
+    def write_json(self, file=None):
+        if file is not None:
+            msg = (
+                f"Writing to file rather than JSON string is not supported for "
+                f"{type(self._d)}"
+            )
+            raise NotImplementedError(msg)
+
         return self._d.to_json(orient="records")
 
+    def write_csv(self, file: str) -> None:
+        self._d.to_csv(file, index=False)
+
+    def write_parquet(self, file: str) -> None:
+        self._d.to_parquet(file)
+
+    def write_feather(self, file: str) -> None:
+        self._d.to_feather(file)
+
 
 @overload
-def _create_df_adaptor(df: _DataFrame) -> _DFAdaptor: ...
+def _create_adaptor(obj: Any) -> _Adaptor: ...
 @overload
-def _create_df_adaptor(df: _PandasDataFrame) -> _PandasAdaptor: ...
-def _create_df_adaptor(df):
-    if isinstance(df, _AbstractPandasFrame):
-        return _PandasAdaptor(df)
-
-    msg = f"Could not determine dataframe adaptor for {df}"
-    raise NotImplementedError(msg)
+def _create_adaptor(obj: _DataFrame) -> _DFAdaptor: ...
+@overload
+def _create_adaptor(obj: _PandasDataFrame) -> _PandasAdaptor: ...
+def _create_adaptor(obj):
+    if isinstance(obj, _AbstractPandasFrame):
+        return _PandasAdaptor(obj)
+    else:
+        return _Adaptor(obj)
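
After this change, the renamed `_create_adaptor` always succeeds: pandas dataframes get a `_PandasAdaptor`, and everything else falls back to the base `_Adaptor`, whose writers raise `NotImplementedError` for formats they cannot produce. A minimal usage sketch against the internals added in this diff (these are private names, so the `pins.adaptors` import path is an assumption and may change):

    import pandas as pd

    from pins.adaptors import _Adaptor, _PandasAdaptor, _create_adaptor

    df_adaptor = _create_adaptor(pd.DataFrame({"x": [1, 2]}))
    assert isinstance(df_adaptor, _PandasAdaptor)

    # Non-dataframe objects no longer raise NotImplementedError at dispatch;
    # they receive the base adaptor instead.
    obj_adaptor = _create_adaptor({"a": 1})
    assert type(obj_adaptor) is _Adaptor

    # The base adaptor rejects formats it cannot write...
    try:
        obj_adaptor.write_csv("out.csv")
    except NotImplementedError as e:
        print(e)  # Writing to CSV is not supported for <class 'dict'>

    # ...while the pandas adaptor delegates to the underlying frame.
    df_adaptor.write_csv("out.csv")

This moves the "is this a dataframe?" decision out of callers and into the type hierarchy: callers just invoke the writer they need and let the adaptor object raise when the combination is unsupported.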

pins/boards.py

Lines changed: 5 additions & 36 deletions
@@ -7,11 +7,11 @@
 from datetime import datetime, timedelta
 from io import IOBase
 from pathlib import Path
-from typing import Any, Mapping, Optional, Protocol, Sequence
+from typing import Mapping, Optional, Protocol, Sequence
 
 from importlib_resources import files
 
-from .adaptors import _create_df_adaptor, _DFAdaptor
+from .adaptors import _create_adaptor
 from .cache import PinsCache
 from .config import get_allow_rsc_short_name
 from .drivers import default_title, load_data, load_file, save_data
@@ -1124,11 +1124,7 @@ def user_name(self):
 
     # TODO(NAMC) what about the functions that call this one?
     def prepare_pin_version(self, pin_dir_path, x, name: "str | None", *args, **kwargs):
-        try:
-            x = _create_df_adaptor(x)
-        except NotImplementedError:
-            # Not a dataframe.
-            pass
+        adaptor = _create_adaptor(x)
 
         # RSC pin names can have form <user_name>/<name>, but this will try to
         # create the object in a directory named <user_name>. So we grab just
@@ -1138,7 +1134,7 @@ def prepare_pin_version(self, pin_dir_path, x, name: "str | None", *args, **kwar
         # TODO(compat): py pins always uses the short name, R pins uses w/e the
         # user passed, but guessing people want the long name?
         meta = super()._create_meta(
-            pin_dir_path, x, short_name, *args, **kwargs
+            pin_dir_path, adaptor, short_name, *args, **kwargs
         )  # TODO(NAMC) ensure .create_meta can accept adaptor
         meta.name = name
 
@@ -1165,36 +1161,9 @@ def prepare_pin_version(self, pin_dir_path, x, name: "str | None", *args, **kwar
             "pin_files": pin_files,
             "pin_metadata": meta,
             "board_deparse": board_deparse(self),
+            "data_preview": adaptor.data_preview,
         }
 
-        # data preview ----
-
-        # TODO: move out data_preview logic? Can we draw some limits here?
-        # note that the R library uses jsonlite::toJSON
-
-        import json
-
-        if isinstance(x, _DFAdaptor):
-            # TODO(compat) is 100 hard-coded?
-            # Note that we go df -> json -> dict, to take advantage of type conversions in the dataframe library
-            data: list[dict[Any, Any]] = json.loads(x.head(100).write_json())
-            columns = [
-                {"name": [col], "label": [col], "align": ["left"], "type": [""]}
-                for col in x.columns
-            ]
-
-            # this reproduces R pins behavior, by omitting entries that would be null
-            data_no_nulls = [
-                {k: v for k, v in row.items() if v is not None} for row in data
-            ]
-
-            context["data_preview"] = json.dumps(
-                {"data": data_no_nulls, "columns": columns}
-            )
-        else:
-            # TODO(compat): set display none in index.html
-            context["data_preview"] = json.dumps({})
-
         # do not show r code if not round-trip friendly
         if meta.type in ["joblib"]:
             context["show_r_style"] = "display:none"

pins/drivers.py

Lines changed: 11 additions & 34 deletions
@@ -1,6 +1,8 @@
 from pathlib import Path
 from typing import Sequence
 
+from pins.adaptors import _create_adaptor
+
 from .config import PINS_ENV_INSECURE_READ, get_allow_pickle_read
 from .errors import PinsInsecureReadError
 from .meta import Meta
@@ -13,15 +15,6 @@
 REQUIRES_SINGLE_FILE = frozenset(["csv", "joblib", "file"])
 
 
-def _assert_is_pandas_df(x, file_type: str) -> None:
-    import pandas as pd
-
-    if not isinstance(x, pd.DataFrame):
-        raise NotImplementedError(
-            f"Currently only pandas.DataFrame can be saved as type {file_type!r}."
-        )
-
-
 def load_path(meta, path_to_version):
     # Check that only a single file name was given
     fnames = [meta.file] if isinstance(meta.file, str) else meta.file
@@ -141,6 +134,8 @@ def save_data(obj, fname, type=None, apply_suffix: bool = True) -> "str | Sequen
     # as argument to board, and then type dispatchers for explicit cases
     # of saving / loading objects different ways.
 
+    adaptor = _create_adaptor(obj)
+
     if apply_suffix:
         if type == "file":
             suffix = "".join(Path(obj).suffixes)
@@ -152,47 +147,29 @@ def save_data(obj, fname, type=None, apply_suffix: bool = True) -> "str | Sequen
     final_name = f"{fname}{suffix}"
 
     if type == "csv":
-        _assert_is_pandas_df(obj, file_type=type)
-
-        obj.to_csv(final_name, index=False)
-
+        adaptor.write_csv(final_name)
     elif type == "arrow":
         # NOTE: R pins accepts the type arrow, and saves it as feather.
         # we allow reading this type, but raise an error for writing.
-        _assert_is_pandas_df(obj, file_type=type)
-
-        obj.to_feather(final_name)
-
+        adaptor.write_feather(final_name)
     elif type == "feather":
-        _assert_is_pandas_df(obj, file_type=type)
-
-        raise NotImplementedError(
+        msg = (
             'Saving data as type "feather" no longer supported. Use type "arrow" instead.'
         )
-
+        raise NotImplementedError(msg)
     elif type == "parquet":
-        _assert_is_pandas_df(obj, file_type=type)
-
-        obj.to_parquet(final_name)
-
+        adaptor.write_parquet(final_name)
    elif type == "joblib":
-        import joblib
-
-        joblib.dump(obj, final_name)
-
+        adaptor.write_joblib(final_name)
     elif type == "json":
-        import json
-
-        json.dump(obj, open(final_name, "w"))
-
+        adaptor.write_json(final_name)
     elif type == "file":
         import contextlib
         import shutil
 
         # ignore the case where the source is the same as the target
         with contextlib.suppress(shutil.SameFileError):
             shutil.copyfile(str(obj), final_name)
-
     else:
         raise NotImplementedError(f"Cannot save type: {type}")
 
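
With the adaptor built once at the top of `save_data`, every per-type branch reduces to a one-line delegation, and the removed `_assert_is_pandas_df` check is replaced by the `NotImplementedError` raised inside the base `_Adaptor` writers. A hypothetical call pattern (output file names assume `apply_suffix` appends the type's extension, which the surrounding code suggests but these hunks do not show):

    import pandas as pd

    from pins.drivers import save_data

    # A dataframe can be written to any of the tabular formats.
    save_data(pd.DataFrame({"x": [1, 2]}), "./my_pin", type="csv")

    # A plain object still works for json/joblib...
    save_data({"a": 1}, "./my_pin", type="json")

    # ...but tabular types now fail inside _Adaptor.write_csv,
    # replacing the old pandas-only assertion.
    save_data({"a": 1}, "./my_pin", type="csv")  # raises NotImplementedError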
