Implement drivers for geoparquet.

nathanjmcdougall · nathanjmcdougall · commit cd68b2a80931 · 2024-07-25T20:05:31.000+12:00
diff --git a/pins/drivers.py b/pins/drivers.py
@@ -22,6 +22,16 @@ def _assert_is_pandas_df(x, file_type: str) -> None:
         )
 
 
+def _assert_is_geopandas_df(x):
+    # Assume we have already protected against uninstalled geopandas
+    import geopandas as gpd
+
+    if not isinstance(x, gpd.GeoDataFrame):
+        raise NotImplementedError(
+            "Currently only geopandas.GeoDataFrame can be saved to a GeoParquet."
+        )
+
+
 def load_path(meta, path_to_version):
     # Check that only a single file name was given
     fnames = [meta.file] if isinstance(meta.file, str) else meta.file
@@ -104,6 +114,17 @@ def load_data(
 
             return pd.read_csv(f)
 
+        elif meta.type == "geoparquet":
+            try:
+                import geopandas as gpd
+            except ModuleNotFoundError:
+                raise ModuleNotFoundError(
+                    'The "geopandas" package is required to read "geoparquet" type '
+                    "files."
+                ) from None
+
+            return gpd.read_parquet(f)
+
         elif meta.type == "joblib":
             import joblib
 
@@ -144,6 +165,8 @@ def save_data(obj, fname, type=None, apply_suffix: bool = True) -> "str | Sequen
     if apply_suffix:
         if type == "file":
             suffix = "".join(Path(obj).suffixes)
+        elif type == "geoparquet":
+            suffix = ".parquet"
         else:
             suffix = f".{type}"
     else:
@@ -175,6 +198,11 @@ def save_data(obj, fname, type=None, apply_suffix: bool = True) -> "str | Sequen
 
         obj.to_parquet(final_name)
 
+    elif type == "geoparquet":
+        _assert_is_geopandas_df(obj)
+
+        obj.to_parquet(final_name)
+
     elif type == "joblib":
         import joblib
 
diff --git a/pins/tests/test_drivers.py b/pins/tests/test_drivers.py
@@ -76,6 +76,27 @@ def test_driver_roundtrip(tmp_path: Path, type_):
     assert df.equals(obj)
 
 
+def test_driver_geoparquet_roundtrip(tmp_dir2):
+    import geopandas as gpd
+
+    gdf = gpd.GeoDataFrame(
+        {"x": [1, 2, 3], "geometry": gpd.points_from_xy([1, 2, 3], [1, 2, 3])}
+    )
+
+    fname = "some_gdf"
+    full_file = f"{fname}.parquet"
+
+    p_obj = tmp_dir2 / fname
+    res_fname = save_data(gdf, p_obj, "geoparquet")
+
+    assert Path(res_fname).name == full_file
+
+    meta = MetaRaw(full_file, "geoparquet", "my_pin")
+    obj = load_data(meta, fsspec.filesystem("file"), tmp_dir2, allow_pickle_read=True)
+
+    assert gdf.equals(obj)
+
+
 @pytest.mark.parametrize(
     "type_",
     [