Skip to content

Commit 708f8af

Browse files
authored
Remove pyarrow dependency (#582)
For #581
1 parent abcb5d4 commit 708f8af

25 files changed

+1233
-1122
lines changed

lonboard/_cli.py

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -5,29 +5,32 @@
55
from typing import Dict, List, Optional
66

77
import click
8-
import pyarrow as pa
98
import pyarrow.parquet as pq
9+
from arro3.core import Table
1010
from pyproj import CRS
1111

1212
from lonboard import viz
1313
from lonboard._constants import EXTENSION_NAME
1414

1515

16-
def read_pyogrio(path: Path) -> pa.Table:
16+
def read_pyogrio(path: Path) -> Table:
1717
"""Read path using pyogrio and convert field metadata to geoarrow
1818
1919
Args:
2020
path: Path to file readable by pyogrio
2121
"""
2222
try:
23-
from pyogrio.raw import read_arrow
23+
from pyogrio.raw import open_arrow
2424
except ImportError as e:
2525
raise ImportError(
2626
"pyogrio is a required dependency for the CLI. "
2727
"Install with `pip install pyogrio`."
2828
) from e
2929

30-
meta, table = read_arrow(path)
30+
with open_arrow(path, use_pyarrow=False) as source:
31+
meta, stream = source
32+
table = Table.from_arrow(stream)
33+
3134
# The `geometry_name` key always exists but can be an empty string. In the case of
3235
# an empty string, we want to default to `wkb_geometry`
3336
geometry_column_name = meta.get("geometry_name") or "wkb_geometry"
@@ -53,10 +56,10 @@ def read_pyogrio(path: Path) -> pa.Table:
5356

5457
new_field = field.with_name("geometry").with_metadata(metadata)
5558
new_schema = schema.set(geometry_column_index, new_field)
56-
return pa.Table.from_arrays(table.columns, schema=new_schema)
59+
return table.with_schema(new_schema)
5760

5861

59-
def read_geoparquet(path: Path):
62+
def read_geoparquet(path: Path) -> Table:
6063
"""Read GeoParquet file at path using pyarrow
6164
6265
Args:
@@ -67,7 +70,8 @@ def read_geoparquet(path: Path):
6770
if not geo_meta:
6871
raise ValueError("Expected geo metadata in Parquet file")
6972

70-
table = file.read()
73+
pyarrow_table = file.read()
74+
table = Table.from_arrow(pyarrow_table)
7175

7276
geo_meta = json.loads(geo_meta)
7377
geometry_column_name = geo_meta["primary_column"]
@@ -86,7 +90,7 @@ def read_geoparquet(path: Path):
8690

8791
new_field = table.schema.field(geometry_column_index).with_metadata(metadata)
8892
new_schema = table.schema.set(geometry_column_index, new_field)
89-
return pa.Table.from_arrays(table.columns, schema=new_schema)
93+
return table.with_schema(new_schema)
9094

9195

9296
@click.command()

lonboard/_geoarrow/_duckdb.py

Lines changed: 39 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,18 @@
22

33
import json
44
import re
5-
from typing import TYPE_CHECKING, Optional, Union
5+
from typing import TYPE_CHECKING, List, Optional, Union
66

77
import numpy as np
8-
import pyarrow as pa
9-
import pyarrow.compute as pc
8+
from arro3.compute import struct_field
9+
from arro3.core import (
10+
Array,
11+
ChunkedArray,
12+
Field,
13+
Table,
14+
fixed_size_list_array,
15+
list_array,
16+
)
1017

1118
from lonboard._constants import EXTENSION_NAME
1219

@@ -29,7 +36,7 @@ def from_duckdb(
2936
*,
3037
con: Optional[duckdb.DuckDBPyConnection] = None,
3138
crs: Optional[Union[str, pyproj.CRS]] = None,
32-
) -> pa.Table:
39+
) -> Table:
3340
geom_col_idxs = [
3441
i for i, t in enumerate(rel.types) if str(t) in DUCKDB_SPATIAL_TYPES
3542
]
@@ -89,9 +96,9 @@ def _from_geometry(
8996
con: Optional[duckdb.DuckDBPyConnection] = None,
9097
geom_col_idx: int,
9198
crs: Optional[Union[str, pyproj.CRS]] = None,
92-
) -> pa.Table:
99+
) -> Table:
93100
other_col_names = [name for i, name in enumerate(rel.columns) if i != geom_col_idx]
94-
non_geo_table = rel.select(*other_col_names).arrow()
101+
non_geo_table = Table.from_arrow(rel.select(*other_col_names).arrow())
95102
geom_col_name = rel.columns[geom_col_idx]
96103

97104
# A poor-man's string interpolation check
@@ -102,9 +109,11 @@ def _from_geometry(
102109
), f"Expected geometry column name to match regex: {re_match}"
103110

104111
if con is not None:
105-
geom_table = con.sql(f"""
112+
geom_table = Table.from_arrow(
113+
con.sql(f"""
106114
SELECT ST_AsWKB( {geom_col_name} ) as {geom_col_name} FROM rel;
107115
""").arrow()
116+
)
108117
else:
109118
import duckdb
110119

@@ -119,7 +128,9 @@ def _from_geometry(
119128
SELECT ST_AsWKB( {geom_col_name} ) as {geom_col_name} FROM rel;
120129
"""
121130
try:
122-
geom_table = duckdb.execute(sql).arrow()
131+
geom_table = Table.from_arrow(
132+
duckdb.execute(sql, connection=duckdb.default_connection).arrow()
133+
)
123134
except duckdb.CatalogException as err:
124135
msg = (
125136
"Could not coerce type GEOMETRY to WKB.\n"
@@ -140,8 +151,8 @@ def _from_geoarrow(
140151
extension_type: EXTENSION_NAME,
141152
geom_col_idx: int,
142153
crs: Optional[Union[str, pyproj.CRS]] = None,
143-
) -> pa.Table:
144-
table = rel.arrow()
154+
) -> Table:
155+
table = Table.from_arrow(rel.arrow())
145156
metadata = _make_geoarrow_field_metadata(extension_type, crs)
146157
geom_field = table.schema.field(geom_col_idx).with_metadata(metadata)
147158
return table.set_column(geom_col_idx, geom_field, table.column(geom_col_idx))
@@ -152,21 +163,24 @@ def _from_box2d(
152163
*,
153164
geom_col_idx: int,
154165
crs: Optional[Union[str, pyproj.CRS]] = None,
155-
) -> pa.Table:
156-
table = rel.arrow()
166+
) -> Table:
167+
table = Table.from_arrow(rel.arrow())
157168
geom_col = table.column(geom_col_idx)
158169

159-
polygon_array = _convert_box2d_to_geoarrow_polygon_array(geom_col)
170+
polygon_chunks: List[Array] = []
171+
for geom_chunk in geom_col.chunks:
172+
polygon_array = _convert_box2d_to_geoarrow_polygon_array(geom_chunk)
173+
polygon_chunks.append(polygon_array)
160174

161175
metadata = _make_geoarrow_field_metadata(EXTENSION_NAME.POLYGON, crs)
162176
prev_field = table.schema.field(geom_col_idx)
163-
geom_field = pa.field(prev_field.name, polygon_array.type, metadata=metadata)
164-
return table.set_column(geom_col_idx, geom_field, polygon_array)
177+
geom_field = Field(prev_field.name, polygon_chunks[0].type, metadata=metadata)
178+
return table.set_column(geom_col_idx, geom_field, ChunkedArray(polygon_chunks))
165179

166180

167181
def _convert_box2d_to_geoarrow_polygon_array(
168-
geom_col: pa.StructArray,
169-
) -> pa.ListArray:
182+
geom_col: Array,
183+
) -> Array:
170184
"""
171185
This is a manual conversion of the duckdb box_2d type to a GeoArrow Polygon array.
172186
@@ -176,10 +190,10 @@ def _convert_box2d_to_geoarrow_polygon_array(
176190
# Extract the bounding box columns from the Arrow struct
177191
# NOTE: this assumes that the box ordering is minx, miny, maxx, maxy
178192
# Not sure whether the positional ordering or the named fields are more stable
179-
min_x = pc.struct_field(geom_col, 0)
180-
min_y = pc.struct_field(geom_col, 1)
181-
max_x = pc.struct_field(geom_col, 2)
182-
max_y = pc.struct_field(geom_col, 3)
193+
min_x = struct_field(geom_col, 0)
194+
min_y = struct_field(geom_col, 1)
195+
max_x = struct_field(geom_col, 2)
196+
max_y = struct_field(geom_col, 3)
183197

184198
# Provision memory for the output coordinates. For closed polygons, each input box
185199
# becomes 5 coordinates.
@@ -208,9 +222,10 @@ def _convert_box2d_to_geoarrow_polygon_array(
208222
geom_offsets = np.arange(0, len(ring_offsets), dtype=np.int32)
209223

210224
# Construct the final PolygonArray
211-
coords = pa.FixedSizeListArray.from_arrays(coords.ravel("C"), 2)
212-
ring_array = pa.ListArray.from_arrays(ring_offsets, coords)
213-
polygon_array = pa.ListArray.from_arrays(geom_offsets, ring_array)
225+
flat_coords: Array = Array.from_numpy(coords.ravel("C"))
226+
coords = fixed_size_list_array(flat_coords, 2)
227+
ring_array = list_array(Array.from_numpy(ring_offsets), coords)
228+
polygon_array = list_array(Array.from_numpy(geom_offsets), ring_array)
214229
return polygon_array
215230

216231

lonboard/_geoarrow/crs.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,12 @@
11
import json
22
from typing import Optional
33

4-
import pyarrow as pa
4+
from arro3.core import Field
55

66

77
# Note: According to the spec, if the metadata key exists, its value should never be
88
# `null` or an empty dict, but we still check for those to be safe
9-
def get_field_crs(field: pa.Field) -> Optional[str]:
9+
def get_field_crs(field: Field) -> Optional[str]:
1010
extension_metadata_value = field.metadata.get(b"ARROW:extension:metadata")
1111
if not extension_metadata_value:
1212
return None

0 commit comments

Comments
 (0)