From 130566c5359235e292c6b15c07ca1bbb786c8f2c Mon Sep 17 00:00:00 2001 From: Mikejmnez Date: Tue, 12 Aug 2025 11:00:37 -0700 Subject: [PATCH 01/40] update PydapArrayWrapper to support backend batching --- xarray/backends/pydap_.py | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/xarray/backends/pydap_.py b/xarray/backends/pydap_.py index 4fbfe8ee210..f143959d010 100644 --- a/xarray/backends/pydap_.py +++ b/xarray/backends/pydap_.py @@ -35,8 +35,10 @@ class PydapArrayWrapper(BackendArray): - def __init__(self, array): + def __init__(self, array, batch=False, cache=None): self.array = array + self._batch = batch + self._cache = cache @property def shape(self) -> tuple[int, ...]: @@ -52,13 +54,29 @@ def __getitem__(self, key): ) def _getitem(self, key): - result = robust_getitem(self.array, key, catch=ValueError) - # in some cases, pydap doesn't squeeze axes automatically like numpy - result = np.asarray(result) + if self.array.id in self._cache.keys(): + # safely avoid re-downloading some coordinates + result = self._cache[self.array.id] + elif self._batch and hasattr(self.array, "dataset"): + # this are both True only for pydap>3.5.5 + from pydap.lib import resolve_batch_for_all_variables + + parent = self.array.parent # could be root ds | group + variables = list(parent.variables()) + resolve_batch_for_all_variables(parent, variables, key) + + result = np.asarray( + parent.dataset._current_batch_promise.wait_for_result(self.array.id) + ) + else: + result = robust_getitem(self.array, key, catch=ValueError) + try: + result = np.asarray(result.data) + except AttributeError: + result = np.asarray(result) axis = tuple(n for n, k in enumerate(key) if isinstance(k, integer_types)) if result.ndim + len(axis) != self.array.ndim and axis: result = np.squeeze(result, axis) - return result From 6e5e2bd6f5053bedc59eec614ec1c7bf430fa3b6 Mon Sep 17 00:00:00 2001 From: Mikejmnez Date: Tue, 12 Aug 2025 11:07:21 -0700 Subject: [PATCH 
02/40] update PydapDataStore to use backend logic in dap4 to batch variables all together in single dap url --- xarray/backends/pydap_.py | 67 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 62 insertions(+), 5 deletions(-) diff --git a/xarray/backends/pydap_.py b/xarray/backends/pydap_.py index f143959d010..26957c0175c 100644 --- a/xarray/backends/pydap_.py +++ b/xarray/backends/pydap_.py @@ -1,5 +1,6 @@ from __future__ import annotations +import warnings from collections.abc import Iterable from typing import TYPE_CHECKING, Any @@ -99,7 +100,7 @@ class PydapDataStore(AbstractDataStore): be useful if the netCDF4 library is not available. """ - def __init__(self, dataset, group=None): + def __init__(self, dataset, group=None, session=None, batch=False, protocol=None): """ Parameters ---------- @@ -109,6 +110,11 @@ def __init__(self, dataset, group=None): """ self.dataset = dataset self.group = group + self.session = session + self._batch = batch + self._batch_done = False + self._array_cache = {} # holds 1D dimension data + self._protocol = protocol @classmethod def open( @@ -121,6 +127,7 @@ def open( timeout=None, verify=None, user_charset=None, + batch=False, ): from pydap.client import open_url from pydap.net import DEFAULT_TIMEOUT @@ -135,6 +142,7 @@ def open( DeprecationWarning, ) output_grid = False # new default behavior + kwargs = { "url": url, "application": application, @@ -152,12 +160,26 @@ def open( dataset = url.ds args = {"dataset": dataset} if group: - # only then, change the default args["group"] = group + if url.startswith(("https", "dap2")): + args["protocol"] = "dap2" + else: + args["protocol"] = "dap4" + if batch: + if args["protocol"] == "dap2": + warnings.warn( + f"`batch={batch}` is currently only compatible with the `DAP4` " + "protocol. Make sue the OPeNDAP server implements the `DAP4` " + "protocol and then replace the scheme of the url with `dap4` " + "to make use of it. 
Setting `batch=False`.", + stacklevel=2, + ) + else: + # only update if dap4 + args["batch"] = batch return cls(**args) def open_store_variable(self, var): - data = indexing.LazilyIndexedArray(PydapArrayWrapper(var)) try: dimensions = [ dim.split("/")[-1] if dim.startswith("/") else dim for dim in var.dims @@ -166,6 +188,25 @@ def open_store_variable(self, var): # GridType does not have a dims attribute - instead get `dimensions` # see https://github.com/pydap/pydap/issues/485 dimensions = var.dimensions + if ( + self._protocol == "dap4" + and var.name in dimensions + and hasattr(var, "dataset") # only True for pydap>3.5.5 + ): + if not var.dataset._batch_mode: + # for dap4, always batch all dimensions at once + var.dataset.enable_batch_mode() + data_array = self._get_data_array(var) + data = indexing.LazilyIndexedArray(data_array) + if not self._batch and var.dataset._batch_mode: + # if `batch=False``, restore it for all other variables + var.dataset.disable_batch_mode() + else: + # all non-dimension variables + data = indexing.LazilyIndexedArray( + PydapArrayWrapper(var, self._batch, self._array_cache) + ) + return Variable(dimensions, data, var.attributes) def get_variables(self): @@ -183,6 +224,7 @@ def get_variables(self): # check the key is not a BaseType or GridType if not isinstance(self.ds[var], GroupType) ] + return FrozenDict((k, self.open_store_variable(self.ds[k])) for k in _vars) def get_attrs(self): @@ -194,9 +236,11 @@ def get_attrs(self): "libdap", "invocation", "dimensions", + "path", + "Maps", ) - attrs = self.ds.attributes - list(map(attrs.pop, opendap_attrs, [None] * 6)) + attrs = dict(self.ds.attributes) + list(map(attrs.pop, opendap_attrs, [None] * 8)) return Frozen(attrs) def get_dimensions(self): @@ -206,6 +250,19 @@ def get_dimensions(self): def ds(self): return get_group(self.dataset, self.group) + def _get_data_array(self, var): + """gets dimension data all at once, storing the numpy + arrays within a cached dictionary + """ + from 
pydap.lib import get_batch_data + + if not self._batch_done or var.id not in self._array_cache: + # store all dim data into a dict for reuse + self._array_cache = get_batch_data(var.parent, self._array_cache) + self._batch_done = True + + return self._array_cache[var.id] + class PydapBackendEntrypoint(BackendEntrypoint): """ From 3b66103d968d71734175ad5509ac16765f43d88c Mon Sep 17 00:00:00 2001 From: Mikejmnez Date: Tue, 12 Aug 2025 11:17:51 -0700 Subject: [PATCH 03/40] pydap-server it not necessary --- ci/requirements/environment.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/ci/requirements/environment.yml b/ci/requirements/environment.yml index f56b2bc1d1c..eff54fe469e 100644 --- a/ci/requirements/environment.yml +++ b/ci/requirements/environment.yml @@ -37,7 +37,6 @@ dependencies: - pre-commit - pyarrow # pandas raises a deprecation warning without this, breaking doctests - pydap - - pydap-server - pytest - pytest-asyncio - pytest-cov From 6e517f7855ce6a9d222a46113d8c5d6686aeb593 Mon Sep 17 00:00:00 2001 From: Mikejmnez Date: Tue, 12 Aug 2025 11:35:52 -0700 Subject: [PATCH 04/40] set `batch=False` as default --- xarray/backends/pydap_.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/xarray/backends/pydap_.py b/xarray/backends/pydap_.py index 26957c0175c..f8e55034146 100644 --- a/xarray/backends/pydap_.py +++ b/xarray/backends/pydap_.py @@ -306,6 +306,7 @@ def open_dataset( timeout=None, verify=None, user_charset=None, + batch=False, ) -> Dataset: store = PydapDataStore.open( url=filename_or_obj, @@ -316,6 +317,7 @@ def open_dataset( timeout=timeout, verify=verify, user_charset=user_charset, + batch=batch, ) store_entrypoint = StoreBackendEntrypoint() with close_on_error(store): From 2dfd4eec011821e812901c1ed0d96000f9ee9318 Mon Sep 17 00:00:00 2001 From: Mikejmnez Date: Tue, 12 Aug 2025 11:36:40 -0700 Subject: [PATCH 05/40] set `batch=False` as default in datatree --- xarray/backends/pydap_.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 
deletions(-) diff --git a/xarray/backends/pydap_.py b/xarray/backends/pydap_.py index f8e55034146..084260be423 100644 --- a/xarray/backends/pydap_.py +++ b/xarray/backends/pydap_.py @@ -350,6 +350,7 @@ def open_datatree( timeout=None, verify=None, user_charset=None, + batch=False, ) -> DataTree: groups_dict = self.open_groups_as_dict( filename_or_obj, @@ -362,10 +363,11 @@ def open_datatree( decode_timedelta=decode_timedelta, group=group, application=None, - session=None, - timeout=None, - verify=None, - user_charset=None, + session=session, + timeout=timeout, + verify=application, + user_charset=user_charset, + batch=batch, ) return datatree_from_dict_with_io_cleanup(groups_dict) From b6304d0e7875c02f4afb327119261d9660ebea78 Mon Sep 17 00:00:00 2001 From: Mikejmnez Date: Tue, 12 Aug 2025 11:37:42 -0700 Subject: [PATCH 06/40] set `batch=False` as default in open groups as dict --- xarray/backends/pydap_.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/xarray/backends/pydap_.py b/xarray/backends/pydap_.py index 084260be423..acbc385a10e 100644 --- a/xarray/backends/pydap_.py +++ b/xarray/backends/pydap_.py @@ -389,6 +389,7 @@ def open_groups_as_dict( timeout=None, verify=None, user_charset=None, + batch=False, ) -> dict[str, Dataset]: from xarray.core.treenode import NodePath @@ -400,6 +401,7 @@ def open_groups_as_dict( timeout=timeout, verify=verify, user_charset=user_charset, + batch=batch, ) # Check for a group and make it a parent if it exists From 5d6af4adbdac0be4e67dbb969463feb2f5fbf797 Mon Sep 17 00:00:00 2001 From: Mikejmnez Date: Tue, 12 Aug 2025 12:16:42 -0700 Subject: [PATCH 07/40] for flaky, install pydap from repo for now --- ci/requirements/environment.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ci/requirements/environment.yml b/ci/requirements/environment.yml index eff54fe469e..91f2a70d0d6 100644 --- a/ci/requirements/environment.yml +++ b/ci/requirements/environment.yml @@ -36,7 +36,7 @@ dependencies: - pooch - 
pre-commit - pyarrow # pandas raises a deprecation warning without this, breaking doctests - - pydap + # - pydap - pytest - pytest-asyncio - pytest-cov @@ -65,3 +65,4 @@ dependencies: - jax # no way to get cpu-only jaxlib from conda if gpu is present - types-defusedxml - types-pexpect + - git+https://github.com/pydap/pydap.git # just for now - will restore to conda after new release From ef1fca07088b5f1d7aadc8f20b85f587e5bc81f1 Mon Sep 17 00:00:00 2001 From: Mikejmnez Date: Tue, 12 Aug 2025 17:38:11 -0700 Subject: [PATCH 08/40] initial tests - quantify cached url --- xarray/backends/pydap_.py | 4 +-- xarray/tests/test_backends.py | 47 +++++++++++++++++++++++++++++++++++ 2 files changed, 49 insertions(+), 2 deletions(-) diff --git a/xarray/backends/pydap_.py b/xarray/backends/pydap_.py index acbc385a10e..4dfc6bb4015 100644 --- a/xarray/backends/pydap_.py +++ b/xarray/backends/pydap_.py @@ -161,9 +161,9 @@ def open( args = {"dataset": dataset} if group: args["group"] = group - if url.startswith(("https", "dap2")): + if url.startswith(("http", "dap2")): args["protocol"] = "dap2" - else: + elif url.startswith("dap4"): args["protocol"] = "dap4" if batch: if args["protocol"] == "dap2": diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 996644e5c16..bc133a46f8c 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -6473,6 +6473,53 @@ def test_session(self) -> None: ) +@requires_pydap +@network +@pytest.mark.parametrize("protocol", ["dap2", "dap4"]) +def test_batchdap4_downloads(protocol) -> None: + """Test that in dap4, all dimensions are downloaded at once""" + import pydap + from requests_cache import CachedSession + + _version_ = Version(pydap.__version__) + session = CachedSession() + session.cache.clear() + url = "https://test.opendap.org/opendap/hyrax/data/nc/coads_climatology.nc" + + open_dataset( + url.replace("https", protocol), + engine="pydap", + session=session, + decode_times=False, + ) + if protocol 
== "dap4": + if _version_ > Version("3.5.5"): + # should download 2 urls only (1 dmr and 1 dap) + assert len(session.cache.urls()) == 2 + else: + assert len(session.cache.urls()) == 4 + # das + dds + 3 dods urls + elif protocol == "dap2": + assert len(session.cache.urls()) == 5 + + +@requires_pydap +@network +def test_batch_warnswithdap2() -> None: + from requests_cache import CachedSession + + session = CachedSession() + session.cache.clear() + url = "dap2://test.opendap.org/opendap/hyrax/data/nc/coads_climatology.nc" + with pytest.warns(UserWarning): + open_dataset( + url, engine="pydap", session=session, batch=True, decode_times=False + ) + + # no batching is supported here + assert len(session.cache.urls()) == 5 + + class TestEncodingInvalid: def test_extract_nc4_variable_encoding(self) -> None: var = xr.Variable(("x",), [1, 2, 3], {}, {"foo": "bar"}) From fe6d0aabe273e4ca161a1006e30420de957d4fc4 Mon Sep 17 00:00:00 2001 From: Mikejmnez Date: Tue, 12 Aug 2025 21:30:40 -0700 Subject: [PATCH 09/40] adds tests to datatree backend to assert multiple dimensions downloaded at once (per group) --- xarray/tests/test_backends_datatree.py | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/xarray/tests/test_backends_datatree.py b/xarray/tests/test_backends_datatree.py index 6b15e74c2e9..84e86cc189a 100644 --- a/xarray/tests/test_backends_datatree.py +++ b/xarray/tests/test_backends_datatree.py @@ -9,6 +9,7 @@ import numpy as np import pytest +from packaging.version import Version import xarray as xr from xarray import DataTree, load_datatree, open_datatree, open_groups @@ -639,7 +640,15 @@ def test_inherited_coords(self, url=simplegroup_datatree_url) -> None: │ Temperature (time, Z, Y, X) float32 ... | Salinity (time, Z, Y, X) float32 ... 
""" - tree = open_datatree(url, engine=self.engine) + import pydap + from requests_cache import CachedSession + + _version_ = Version(pydap.__version__) + + session = CachedSession() + session.cache.clear() + + tree = open_datatree(url, engine=self.engine, session=session) assert set(tree.dims) == {"time", "Z", "nv"} assert tree["/SimpleGroup"].coords["time"].dims == ("time",) assert tree["/SimpleGroup"].coords["Z"].dims == ("Z",) @@ -650,6 +659,19 @@ def test_inherited_coords(self, url=simplegroup_datatree_url) -> None: list(expected.dims) + ["Z", "nv"] ) + # group (including root). So in this case 3. In the future there + # should a only be 2 downloads (all dimensions should be downloaded) + # within single + + if _version_ > Version("3.5.5"): + # Total downloads are: 1 dmr, + 1 dap url per Group | root. + # since there is a group then 2 dap url. In the future there + # should only be 1 dap url downloaded. + assert len(session.cache.urls()) == 3 + else: + # 1 dmr + 1 dap url per dimension (total there are 4 dimension arrays) + assert len(session.cache.urls()) == 5 + def test_open_groups_to_dict(self, url=all_aligned_child_nodes_url) -> None: aligned_dict_of_datasets = open_groups(url, engine=self.engine) aligned_dt = DataTree.from_dict(aligned_dict_of_datasets) From ba67bdbda8f99f17997c00898d069e629aa03c7e Mon Sep 17 00:00:00 2001 From: Mikejmnez Date: Tue, 12 Aug 2025 22:47:22 -0700 Subject: [PATCH 10/40] update testing to show number of download urls --- xarray/tests/test_backends.py | 35 ++++++++++++++++++++++++++--------- 1 file changed, 26 insertions(+), 9 deletions(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index bc133a46f8c..2b157554ca9 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -6476,7 +6476,8 @@ def test_session(self) -> None: @requires_pydap @network @pytest.mark.parametrize("protocol", ["dap2", "dap4"]) -def test_batchdap4_downloads(protocol) -> None: 
+@pytest.mark.parametrize("batch", [False, True]) +def test_batchdap4_downloads(protocol, batch) -> None: """Test that in dap4, all dimensions are downloaded at once""" import pydap from requests_cache import CachedSession @@ -6486,20 +6487,36 @@ def test_batchdap4_downloads(protocol) -> None: session.cache.clear() url = "https://test.opendap.org/opendap/hyrax/data/nc/coads_climatology.nc" - open_dataset( - url.replace("https", protocol), - engine="pydap", - session=session, - decode_times=False, - ) + args = { + "filename_or_obj": url.replace("https", protocol), + "engine": "pydap", + "session": session, + "decode_times": False, + } + if protocol == "dap4": + ds = open_dataset(**args, batch=batch) if _version_ > Version("3.5.5"): - # should download 2 urls only (1 dmr and 1 dap) + # total downloads are: + # 1 dmr + 1 dap (dimensions) assert len(session.cache.urls()) == 2 + # now load the rest of the variables + ds.load() + if batch: + # all non-dimensions are downloaded in a single https requests + assert len(session.cache.urls()) == 2 + 1 + if not batch: + # each non-dimension array is downloaded with an individual + # https requests + assert len(session.cache.urls()) == 2 + 4 else: assert len(session.cache.urls()) == 4 - # das + dds + 3 dods urls + ds.load() + assert len(session.cache.urls()) == 4 + 4 elif protocol == "dap2": + ds = open_dataset(**args) + # das + dds + 3 dods urls + assert len(session.cache.urls()) == 5 From d379f2d604861bc22b0e009ba0ea102b8c46bdac Mon Sep 17 00:00:00 2001 From: Mikejmnez Date: Wed, 13 Aug 2025 00:01:01 -0700 Subject: [PATCH 11/40] simplified logic --- xarray/backends/pydap_.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/xarray/backends/pydap_.py b/xarray/backends/pydap_.py index 4dfc6bb4015..49a9103c697 100644 --- a/xarray/backends/pydap_.py +++ b/xarray/backends/pydap_.py @@ -62,12 +62,10 @@ def _getitem(self, key): # this are both True only for pydap>3.5.5 from pydap.lib import 
resolve_batch_for_all_variables - parent = self.array.parent # could be root ds | group - variables = list(parent.variables()) - resolve_batch_for_all_variables(parent, variables, key) - + dataset = self.array.dataset + resolve_batch_for_all_variables(self.array, key) result = np.asarray( - parent.dataset._current_batch_promise.wait_for_result(self.array.id) + dataset._current_batch_promise.wait_for_result(self.array.id) ) else: result = robust_getitem(self.array, key, catch=ValueError) From 4d9b85d7216131f13fb40afb5294d0bf5f919bcf Mon Sep 17 00:00:00 2001 From: Mikejmnez Date: Wed, 13 Aug 2025 00:01:24 -0700 Subject: [PATCH 12/40] specify cached session debug name to actually cache urls --- xarray/tests/test_backends.py | 2 +- xarray/tests/test_backends_datatree.py | 12 +++--------- 2 files changed, 4 insertions(+), 10 deletions(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 2b157554ca9..95761569a53 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -6483,7 +6483,7 @@ def test_batchdap4_downloads(protocol, batch) -> None: from requests_cache import CachedSession _version_ = Version(pydap.__version__) - session = CachedSession() + session = CachedSession(cache_name="debug") # so that urls are cached session.cache.clear() url = "https://test.opendap.org/opendap/hyrax/data/nc/coads_climatology.nc" diff --git a/xarray/tests/test_backends_datatree.py b/xarray/tests/test_backends_datatree.py index 84e86cc189a..e38610b2ce3 100644 --- a/xarray/tests/test_backends_datatree.py +++ b/xarray/tests/test_backends_datatree.py @@ -645,7 +645,7 @@ def test_inherited_coords(self, url=simplegroup_datatree_url) -> None: _version_ = Version(pydap.__version__) - session = CachedSession() + session = CachedSession(cache_name="debug") # so that urls are cached session.cache.clear() tree = open_datatree(url, engine=self.engine, session=session) @@ -659,15 +659,9 @@ def test_inherited_coords(self, 
url=simplegroup_datatree_url) -> None: list(expected.dims) + ["Z", "nv"] ) - # group (including root). So in this case 3. In the future there - # should a only be 2 downloads (all dimensions should be downloaded) - # within single - if _version_ > Version("3.5.5"): - # Total downloads are: 1 dmr, + 1 dap url per Group | root. - # since there is a group then 2 dap url. In the future there - # should only be 1 dap url downloaded. - assert len(session.cache.urls()) == 3 + # Total downloads are: 1 dmr, + 1 dap url for all dimensions across groups + assert len(session.cache.urls()) == 2 else: # 1 dmr + 1 dap url per dimension (total there are 4 dimension arrays) assert len(session.cache.urls()) == 5 From c8a6d72cb28567adab31fb8412a22157a906b54c Mon Sep 17 00:00:00 2001 From: Mikejmnez Date: Wed, 13 Aug 2025 00:24:30 -0700 Subject: [PATCH 13/40] fix for mypy --- xarray/tests/test_backends.py | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 95761569a53..9cc80d77b14 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -6487,15 +6487,14 @@ def test_batchdap4_downloads(protocol, batch) -> None: session.cache.clear() url = "https://test.opendap.org/opendap/hyrax/data/nc/coads_climatology.nc" - args = { - "filename_or_obj": url.replace("https", protocol), - "engine": "pydap", - "session": session, - "decode_times": False, - } - if protocol == "dap4": - ds = open_dataset(**args, batch=batch) + ds = open_dataset( + url.replace("https", protocol), + engine="pydap", + session=session, + decode_times=False, + batch=batch, + ) if _version_ > Version("3.5.5"): # total downloads are: # 1 dmr + 1 dap (dimensions) @@ -6514,9 +6513,13 @@ def test_batchdap4_downloads(protocol, batch) -> None: ds.load() assert len(session.cache.urls()) == 4 + 4 elif protocol == "dap2": - ds = open_dataset(**args) + ds = open_dataset( + url.replace("https", protocol), + 
engine="pydap", + session=session, + decode_times=False, + ) # das + dds + 3 dods urls - assert len(session.cache.urls()) == 5 From 6b21bef246e524c6485aa9812538cc2ad5d347e3 Mon Sep 17 00:00:00 2001 From: Mikejmnez Date: Wed, 13 Aug 2025 09:21:26 -0700 Subject: [PATCH 14/40] user visible changes on `whats-new.rst` --- doc/whats-new.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index b7093c0043f..6fdb0531332 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -268,6 +268,9 @@ New Features By `Matthew Willson `_. - Added exception handling for invalid files in :py:func:`open_mfdataset`. (:issue:`6736`) By `Pratiman Patel `_. +- Improved ``pydap`` backend behavior and performance when using :py:func:`open_dataset`, :py:func:`open_datatree` when downloading + dap4 (opendap) data (:issue:`10628`, :pull:`10629`). ``batch=True|False`` is a new ``backend_kwarg`` that further enables + downloading multiple arrays in single response. Breaking changes ~~~~~~~~~~~~~~~~ From 284ee1d070d7219f780c7bc4cddc03e33d6550f9 Mon Sep 17 00:00:00 2001 From: Mikejmnez Date: Wed, 13 Aug 2025 09:50:21 -0700 Subject: [PATCH 15/40] impose sorted to `get_dimensions` method --- xarray/backends/pydap_.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/backends/pydap_.py b/xarray/backends/pydap_.py index 49a9103c697..72027a4cefa 100644 --- a/xarray/backends/pydap_.py +++ b/xarray/backends/pydap_.py @@ -242,7 +242,7 @@ def get_attrs(self): return Frozen(attrs) def get_dimensions(self): - return Frozen(self.ds.dimensions) + return Frozen(sorted(self.ds.dimensions)) @property def ds(self): From 1249ef1b003026ff09461f014a54dbab2f930880 Mon Sep 17 00:00:00 2001 From: Mikejmnez Date: Wed, 13 Aug 2025 09:52:45 -0700 Subject: [PATCH 16/40] reformat `whats-new.rst` --- doc/whats-new.rst | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 6fdb0531332..c08d0334a29 
100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -268,9 +268,8 @@ New Features By `Matthew Willson `_. - Added exception handling for invalid files in :py:func:`open_mfdataset`. (:issue:`6736`) By `Pratiman Patel `_. -- Improved ``pydap`` backend behavior and performance when using :py:func:`open_dataset`, :py:func:`open_datatree` when downloading - dap4 (opendap) data (:issue:`10628`, :pull:`10629`). ``batch=True|False`` is a new ``backend_kwarg`` that further enables - downloading multiple arrays in single response. +- Improved ``pydap`` backend behavior and performance when using :py:func:`open_dataset`, :py:func:`open_datatree` when downloading dap4 (opendap) data (:issue:`10628`, :pull:`10629`). ``batch=True|False`` is a new ``backend_kwarg`` that further enables downloading multiple arrays in single response. + By `Miguel Jimenez-Urias `_. Breaking changes ~~~~~~~~~~~~~~~~ From 4ec9b73e9778ca0bc3388f9aa1647a35e6e9e9ac Mon Sep 17 00:00:00 2001 From: Mikejmnez Date: Wed, 13 Aug 2025 10:12:24 -0700 Subject: [PATCH 17/40] revert to install pydap from conda and not from repo --- ci/requirements/environment.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/ci/requirements/environment.yml b/ci/requirements/environment.yml index 91f2a70d0d6..eff54fe469e 100644 --- a/ci/requirements/environment.yml +++ b/ci/requirements/environment.yml @@ -36,7 +36,7 @@ dependencies: - pooch - pre-commit - pyarrow # pandas raises a deprecation warning without this, breaking doctests - # - pydap + - pydap - pytest - pytest-asyncio - pytest-cov @@ -65,4 +65,3 @@ dependencies: - jax # no way to get cpu-only jaxlib from conda if gpu is present - types-defusedxml - types-pexpect - - git+https://github.com/pydap/pydap.git # just for now - will restore to conda after new release From 20e64f15f1d348dff9f5111673657ccd376ad81d Mon Sep 17 00:00:00 2001 From: Mikejmnez Date: Wed, 13 Aug 2025 14:34:26 -0700 Subject: [PATCH 18/40] expose checksum as user kwarg --- 
ci/requirements/environment.yml | 3 ++- xarray/backends/pydap_.py | 30 +++++++++++++++++++++++++----- 2 files changed, 27 insertions(+), 6 deletions(-) diff --git a/ci/requirements/environment.yml b/ci/requirements/environment.yml index eff54fe469e..91f2a70d0d6 100644 --- a/ci/requirements/environment.yml +++ b/ci/requirements/environment.yml @@ -36,7 +36,7 @@ dependencies: - pooch - pre-commit - pyarrow # pandas raises a deprecation warning without this, breaking doctests - - pydap + # - pydap - pytest - pytest-asyncio - pytest-cov @@ -65,3 +65,4 @@ dependencies: - jax # no way to get cpu-only jaxlib from conda if gpu is present - types-defusedxml - types-pexpect + - git+https://github.com/pydap/pydap.git # just for now - will restore to conda after new release diff --git a/xarray/backends/pydap_.py b/xarray/backends/pydap_.py index 72027a4cefa..4d86a55dbab 100644 --- a/xarray/backends/pydap_.py +++ b/xarray/backends/pydap_.py @@ -36,10 +36,11 @@ class PydapArrayWrapper(BackendArray): - def __init__(self, array, batch=False, cache=None): + def __init__(self, array, batch=False, cache=None, checksums=True): self.array = array self._batch = batch self._cache = cache + self._checksums = checksums @property def shape(self) -> tuple[int, ...]: @@ -63,7 +64,7 @@ def _getitem(self, key): from pydap.lib import resolve_batch_for_all_variables dataset = self.array.dataset - resolve_batch_for_all_variables(self.array, key) + resolve_batch_for_all_variables(self.array, key, checksums=self._checksums) result = np.asarray( dataset._current_batch_promise.wait_for_result(self.array.id) ) @@ -98,7 +99,15 @@ class PydapDataStore(AbstractDataStore): be useful if the netCDF4 library is not available. 
""" - def __init__(self, dataset, group=None, session=None, batch=False, protocol=None): + def __init__( + self, + dataset, + group=None, + session=None, + batch=False, + protocol=None, + checksums=True, + ): """ Parameters ---------- @@ -113,6 +122,7 @@ def __init__(self, dataset, group=None, session=None, batch=False, protocol=None self._batch_done = False self._array_cache = {} # holds 1D dimension data self._protocol = protocol + self._checksums = checksums # true by default @classmethod def open( @@ -126,6 +136,7 @@ def open( verify=None, user_charset=None, batch=False, + checksums=True, ): from pydap.client import open_url from pydap.net import DEFAULT_TIMEOUT @@ -157,6 +168,7 @@ def open( # pydap dataset dataset = url.ds args = {"dataset": dataset} + args["checksums"] = checksums if group: args["group"] = group if url.startswith(("http", "dap2")): @@ -202,7 +214,7 @@ def open_store_variable(self, var): else: # all non-dimension variables data = indexing.LazilyIndexedArray( - PydapArrayWrapper(var, self._batch, self._array_cache) + PydapArrayWrapper(var, self._batch, self._array_cache, self._checksums) ) return Variable(dimensions, data, var.attributes) @@ -256,7 +268,9 @@ def _get_data_array(self, var): if not self._batch_done or var.id not in self._array_cache: # store all dim data into a dict for reuse - self._array_cache = get_batch_data(var.parent, self._array_cache) + self._array_cache = get_batch_data( + var.parent, self._array_cache, self._checksums + ) self._batch_done = True return self._array_cache[var.id] @@ -305,6 +319,7 @@ def open_dataset( verify=None, user_charset=None, batch=False, + checksums=True, ) -> Dataset: store = PydapDataStore.open( url=filename_or_obj, @@ -316,6 +331,7 @@ def open_dataset( verify=verify, user_charset=user_charset, batch=batch, + checksums=checksums, ) store_entrypoint = StoreBackendEntrypoint() with close_on_error(store): @@ -349,6 +365,7 @@ def open_datatree( verify=None, user_charset=None, batch=False, + 
checksums=True, ) -> DataTree: groups_dict = self.open_groups_as_dict( filename_or_obj, @@ -366,6 +383,7 @@ def open_datatree( verify=application, user_charset=user_charset, batch=batch, + checksums=checksums, ) return datatree_from_dict_with_io_cleanup(groups_dict) @@ -388,6 +406,7 @@ def open_groups_as_dict( verify=None, user_charset=None, batch=False, + checksums=True, ) -> dict[str, Dataset]: from xarray.core.treenode import NodePath @@ -400,6 +419,7 @@ def open_groups_as_dict( verify=verify, user_charset=user_charset, batch=batch, + checksums=checksums, ) # Check for a group and make it a parent if it exists From 842728f8649b16ca8aa581184476c04508306bae Mon Sep 17 00:00:00 2001 From: Mikejmnez Date: Wed, 13 Aug 2025 14:47:17 -0700 Subject: [PATCH 19/40] include `checksums` optional argument in `whats-new` --- doc/whats-new.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index c08d0334a29..7959b31d7de 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -268,7 +268,8 @@ New Features By `Matthew Willson `_. - Added exception handling for invalid files in :py:func:`open_mfdataset`. (:issue:`6736`) By `Pratiman Patel `_. -- Improved ``pydap`` backend behavior and performance when using :py:func:`open_dataset`, :py:func:`open_datatree` when downloading dap4 (opendap) data (:issue:`10628`, :pull:`10629`). ``batch=True|False`` is a new ``backend_kwarg`` that further enables downloading multiple arrays in single response. +- Improved ``pydap`` backend behavior and performance when using :py:func:`open_dataset`, :py:func:`open_datatree` when downloading dap4 (opendap) data (:issue:`10628`, :pull:`10629`). + ``batch=True|False`` is a new ``backend_kwarg`` that further enables downloading multiple arrays in single response. In addition ``checksums`` is added as optional argument to be passed to ``pydap`` backend. By `Miguel Jimenez-Urias `_. 
Breaking changes From 476aa4652206f60e0452e5e014b27ca7b8f97d28 Mon Sep 17 00:00:00 2001 From: Mikejmnez Date: Wed, 13 Aug 2025 15:06:08 -0700 Subject: [PATCH 20/40] update to newest release of pydap via pip until conda install is available --- ci/requirements/environment.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/requirements/environment.yml b/ci/requirements/environment.yml index 91f2a70d0d6..b012fe82e56 100644 --- a/ci/requirements/environment.yml +++ b/ci/requirements/environment.yml @@ -65,4 +65,4 @@ dependencies: - jax # no way to get cpu-only jaxlib from conda if gpu is present - types-defusedxml - types-pexpect - - git+https://github.com/pydap/pydap.git # just for now - will restore to conda after new release + - pydap==3.5.6 # just for now - will restore to conda after new release From c9786f4802213e80760c75d58fde2055de563c9c Mon Sep 17 00:00:00 2001 From: Mikejmnez Date: Wed, 13 Aug 2025 16:12:24 -0700 Subject: [PATCH 21/40] use requests_cache session with retry-params when 500 errors occur --- xarray/backends/pydap_.py | 1 - xarray/tests/test_backends.py | 11 +++++++---- xarray/tests/test_backends_datatree.py | 9 +++++---- 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/xarray/backends/pydap_.py b/xarray/backends/pydap_.py index 4d86a55dbab..fd66bce2643 100644 --- a/xarray/backends/pydap_.py +++ b/xarray/backends/pydap_.py @@ -117,7 +117,6 @@ def __init__( """ self.dataset = dataset self.group = group - self.session = session self._batch = batch self._batch_done = False self._array_cache = {} # holds 1D dimension data diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 9cc80d77b14..06e3dafa3d8 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -6480,10 +6480,11 @@ def test_session(self) -> None: def test_batchdap4_downloads(protocol, batch) -> None: """Test that in dap4, all dimensions are downloaded at once""" import pydap - from requests_cache import 
CachedSession + from pydap.net import create_session _version_ = Version(pydap.__version__) - session = CachedSession(cache_name="debug") # so that urls are cached + # Create a session with pre-set params in pydap backend, to cache urls + session = create_session(use_cache=True, cache_kwargs={"cache_name": "debug"}) session.cache.clear() url = "https://test.opendap.org/opendap/hyrax/data/nc/coads_climatology.nc" @@ -6526,10 +6527,12 @@ def test_batchdap4_downloads(protocol, batch) -> None: @requires_pydap @network def test_batch_warnswithdap2() -> None: - from requests_cache import CachedSession + from pydap.net import create_session - session = CachedSession() + # Create a session with pre-set retry params in pydap backend, to cache urls + session = create_session(use_cache=True, cache_kwargs={"cache_name": "debug"}) session.cache.clear() + url = "dap2://test.opendap.org/opendap/hyrax/data/nc/coads_climatology.nc" with pytest.warns(UserWarning): open_dataset( diff --git a/xarray/tests/test_backends_datatree.py b/xarray/tests/test_backends_datatree.py index e38610b2ce3..c573716ad41 100644 --- a/xarray/tests/test_backends_datatree.py +++ b/xarray/tests/test_backends_datatree.py @@ -641,13 +641,14 @@ def test_inherited_coords(self, url=simplegroup_datatree_url) -> None: | Salinity (time, Z, Y, X) float32 ... 
""" import pydap - from requests_cache import CachedSession + from pydap.net import create_session - _version_ = Version(pydap.__version__) - - session = CachedSession(cache_name="debug") # so that urls are cached + # Create a session with pre-set retry params in pydap backend, to cache urls + session = create_session(use_cache=True, cache_kwargs={"cache_name": "debug"}) session.cache.clear() + _version_ = Version(pydap.__version__) + tree = open_datatree(url, engine=self.engine, session=session) assert set(tree.dims) == {"time", "Z", "nv"} assert tree["/SimpleGroup"].coords["time"].dims == ("time",) From d4dd68dbaf8935c9e286d1c0252c3ca938ceccd1 Mon Sep 17 00:00:00 2001 From: Mikejmnez Date: Thu, 14 Aug 2025 09:45:07 -0700 Subject: [PATCH 22/40] update env yml file to use new pydap release via conda --- ci/requirements/environment.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/ci/requirements/environment.yml b/ci/requirements/environment.yml index b012fe82e56..eff54fe469e 100644 --- a/ci/requirements/environment.yml +++ b/ci/requirements/environment.yml @@ -36,7 +36,7 @@ dependencies: - pooch - pre-commit - pyarrow # pandas raises a deprecation warning without this, breaking doctests - # - pydap + - pydap - pytest - pytest-asyncio - pytest-cov @@ -65,4 +65,3 @@ dependencies: - jax # no way to get cpu-only jaxlib from conda if gpu is present - types-defusedxml - types-pexpect - - pydap==3.5.6 # just for now - will restore to conda after new release From 06ee7b4d878d8a0d3a9f7e60766c5467ed4bfedc Mon Sep 17 00:00:00 2001 From: Miguel Jimenez-Urias Date: Mon, 25 Aug 2025 10:19:05 -0700 Subject: [PATCH 23/40] let `pydap` handle exceptions/warning --- xarray/backends/pydap_.py | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/xarray/backends/pydap_.py b/xarray/backends/pydap_.py index fd66bce2643..cf36a5ecb08 100644 --- a/xarray/backends/pydap_.py +++ b/xarray/backends/pydap_.py @@ -1,6 +1,5 @@ from __future__ 
import annotations -import warnings from collections.abc import Iterable from typing import TYPE_CHECKING, Any @@ -166,8 +165,7 @@ def open( elif hasattr(url, "ds"): # pydap dataset dataset = url.ds - args = {"dataset": dataset} - args["checksums"] = checksums + args = {"dataset": dataset, "checksums": checksums} if group: args["group"] = group if url.startswith(("http", "dap2")): @@ -175,17 +173,7 @@ def open( elif url.startswith("dap4"): args["protocol"] = "dap4" if batch: - if args["protocol"] == "dap2": - warnings.warn( - f"`batch={batch}` is currently only compatible with the `DAP4` " - "protocol. Make sue the OPeNDAP server implements the `DAP4` " - "protocol and then replace the scheme of the url with `dap4` " - "to make use of it. Setting `batch=False`.", - stacklevel=2, - ) - else: - # only update if dap4 - args["batch"] = batch + args["batch"] = batch return cls(**args) def open_store_variable(self, var): From 2d9c4a6efb4b39ff26b55396c352a0070d1044a6 Mon Sep 17 00:00:00 2001 From: Miguel Jimenez-Urias Date: Fri, 5 Sep 2025 16:20:39 -0700 Subject: [PATCH 24/40] process dims at once, one per group --- xarray/backends/pydap_.py | 37 +++++++++++++------------------------ 1 file changed, 13 insertions(+), 24 deletions(-) diff --git a/xarray/backends/pydap_.py b/xarray/backends/pydap_.py index cf36a5ecb08..b381efd0275 100644 --- a/xarray/backends/pydap_.py +++ b/xarray/backends/pydap_.py @@ -35,10 +35,9 @@ class PydapArrayWrapper(BackendArray): - def __init__(self, array, batch=False, cache=None, checksums=True): + def __init__(self, array, batch=False, checksums=True): self.array = array self._batch = batch - self._cache = cache self._checksums = checksums @property @@ -55,10 +54,7 @@ def __getitem__(self, key): ) def _getitem(self, key): - if self.array.id in self._cache.keys(): - # safely avoid re-downloading some coordinates - result = self._cache[self.array.id] - elif self._batch and hasattr(self.array, "dataset"): + if self._batch and hasattr(self.array, 
"dataset"): # is self.array not loaded? # this are both True only for pydap>3.5.5 from pydap.lib import resolve_batch_for_all_variables @@ -69,10 +65,10 @@ def _getitem(self, key): ) else: result = robust_getitem(self.array, key, catch=ValueError) - try: - result = np.asarray(result.data) - except AttributeError: - result = np.asarray(result) + # try: + result = np.asarray(result.data) + # except AttributeError: + # result = np.asarray(result) axis = tuple(n for n, k in enumerate(key) if isinstance(k, integer_types)) if result.ndim + len(axis) != self.array.ndim and axis: result = np.squeeze(result, axis) @@ -117,8 +113,6 @@ def __init__( self.dataset = dataset self.group = group self._batch = batch - self._batch_done = False - self._array_cache = {} # holds 1D dimension data self._protocol = protocol self._checksums = checksums # true by default @@ -201,7 +195,7 @@ def open_store_variable(self, var): else: # all non-dimension variables data = indexing.LazilyIndexedArray( - PydapArrayWrapper(var, self._batch, self._array_cache, self._checksums) + PydapArrayWrapper(var, self._batch, self._checksums) ) return Variable(dimensions, data, var.attributes) @@ -248,19 +242,14 @@ def ds(self): return get_group(self.dataset, self.group) def _get_data_array(self, var): - """gets dimension data all at once, storing the numpy - arrays within a cached dictionary - """ + """gets dimension data all at once""" from pydap.lib import get_batch_data - if not self._batch_done or var.id not in self._array_cache: - # store all dim data into a dict for reuse - self._array_cache = get_batch_data( - var.parent, self._array_cache, self._checksums - ) - self._batch_done = True - - return self._array_cache[var.id] + if not var._is_data_loaded(): + # this implies dat has not been deserialized yet + # runs only once per store/hierarchy + get_batch_data(var.parent, checksums=self._checksums) + return self.dataset[var.id].data class PydapBackendEntrypoint(BackendEntrypoint): From 
53e1b8280105eb83f85c9f8283bb1a082bb9720f Mon Sep 17 00:00:00 2001 From: Miguel Jimenez-Urias Date: Tue, 9 Sep 2025 13:18:18 -0700 Subject: [PATCH 25/40] debug --- xarray/backends/pydap_.py | 32 +++++++++++++++++++++----------- 1 file changed, 21 insertions(+), 11 deletions(-) diff --git a/xarray/backends/pydap_.py b/xarray/backends/pydap_.py index b381efd0275..cd7979bc015 100644 --- a/xarray/backends/pydap_.py +++ b/xarray/backends/pydap_.py @@ -54,24 +54,34 @@ def __getitem__(self, key): ) def _getitem(self, key): - if self._batch and hasattr(self.array, "dataset"): # is self.array not loaded? + if self._batch and hasattr(self.array, "dataset"): # this are both True only for pydap>3.5.5 - from pydap.lib import resolve_batch_for_all_variables + # from pydap.lib import resolve_batch_for_all_variables + from pydap.lib import get_batch_data dataset = self.array.dataset - resolve_batch_for_all_variables(self.array, key, checksums=self._checksums) - result = np.asarray( - dataset._current_batch_promise.wait_for_result(self.array.id) - ) + print("[batching]", self.array.id) + if not dataset[self.array.id]._is_data_loaded(): + print("data not loaded", self.array.id) + # data has not been deserialized yet + # runs only once per store/hierarchy + get_batch_data(self.array, checksums=self._checksums, key=key) + result = np.asarray(dataset[self.array.id].data) + result = robust_getitem(result, key, catch=ValueError) else: + print("[non-batching]", self.array.id) result = robust_getitem(self.array, key, catch=ValueError) - # try: result = np.asarray(result.data) - # except AttributeError: - # result = np.asarray(result) axis = tuple(n for n, k in enumerate(key) if isinstance(k, integer_types)) + print(key) + print("axis:", axis) + # print("ndim", result.ndim) + # print("array.ndim", self.array.ndim) if result.ndim + len(axis) != self.array.ndim and axis: + # print('here????') + # print("squeezed result", np.shape(result)) result = np.squeeze(result, axis) + # print("squeezed 
result", np.shape(result)) return result @@ -246,9 +256,9 @@ def _get_data_array(self, var): from pydap.lib import get_batch_data if not var._is_data_loaded(): - # this implies dat has not been deserialized yet + # data has not been deserialized yet # runs only once per store/hierarchy - get_batch_data(var.parent, checksums=self._checksums) + get_batch_data(var, checksums=self._checksums) return self.dataset[var.id].data From fb314bdf4c271016469f40aa9bf71d4bec126140 Mon Sep 17 00:00:00 2001 From: Miguel Jimenez-Urias Date: Thu, 18 Sep 2025 10:20:41 -0700 Subject: [PATCH 26/40] revert what`s new from previous commit --- doc/whats-new.rst | 3 --- 1 file changed, 3 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 7959b31d7de..b7093c0043f 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -268,9 +268,6 @@ New Features By `Matthew Willson `_. - Added exception handling for invalid files in :py:func:`open_mfdataset`. (:issue:`6736`) By `Pratiman Patel `_. -- Improved ``pydap`` backend behavior and performance when using :py:func:`open_dataset`, :py:func:`open_datatree` when downloading dap4 (opendap) data (:issue:`10628`, :pull:`10629`). - ``batch=True|False`` is a new ``backend_kwarg`` that further enables downloading multiple arrays in single response. In addition ``checksums`` is added as optional argument to be passed to ``pydap`` backend. - By `Miguel Jimenez-Urias `_. 
Breaking changes ~~~~~~~~~~~~~~~~ From adc6ff57e7c863b5eb22e4ba04f1b7eb2acc28d7 Mon Sep 17 00:00:00 2001 From: Miguel Jimenez-Urias Date: Thu, 18 Sep 2025 10:21:57 -0700 Subject: [PATCH 27/40] enable data checker for batched deserialized data --- xarray/backends/pydap_.py | 26 ++++++-------------------- 1 file changed, 6 insertions(+), 20 deletions(-) diff --git a/xarray/backends/pydap_.py b/xarray/backends/pydap_.py index cd7979bc015..acd2af44428 100644 --- a/xarray/backends/pydap_.py +++ b/xarray/backends/pydap_.py @@ -57,31 +57,17 @@ def _getitem(self, key): if self._batch and hasattr(self.array, "dataset"): # this are both True only for pydap>3.5.5 # from pydap.lib import resolve_batch_for_all_variables - from pydap.lib import get_batch_data + from pydap.lib import data_check, get_batch_data dataset = self.array.dataset - print("[batching]", self.array.id) - if not dataset[self.array.id]._is_data_loaded(): - print("data not loaded", self.array.id) - # data has not been deserialized yet - # runs only once per store/hierarchy - get_batch_data(self.array, checksums=self._checksums, key=key) - result = np.asarray(dataset[self.array.id].data) - result = robust_getitem(result, key, catch=ValueError) + get_batch_data(self.array, checksums=self._checksums, key=key) + result = data_check(np.asarray(dataset[self.array.id].data), key) else: - print("[non-batching]", self.array.id) result = robust_getitem(self.array, key, catch=ValueError) result = np.asarray(result.data) - axis = tuple(n for n, k in enumerate(key) if isinstance(k, integer_types)) - print(key) - print("axis:", axis) - # print("ndim", result.ndim) - # print("array.ndim", self.array.ndim) - if result.ndim + len(axis) != self.array.ndim and axis: - # print('here????') - # print("squeezed result", np.shape(result)) - result = np.squeeze(result, axis) - # print("squeezed result", np.shape(result)) + axis = tuple(n for n, k in enumerate(key) if isinstance(k, integer_types)) + if result.ndim + len(axis) != 
self.array.ndim and axis: + result = np.squeeze(result, axis) return result From e43459b1d919457ab5898b70b1b807022ce5f6a7 Mon Sep 17 00:00:00 2001 From: Miguel Jimenez-Urias Date: Thu, 18 Sep 2025 10:56:19 -0700 Subject: [PATCH 28/40] temporarily install from source for testing - will revert to conda install after new release if no further change to backend --- ci/requirements/environment.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ci/requirements/environment.yml b/ci/requirements/environment.yml index eff54fe469e..9dd53b0ec48 100644 --- a/ci/requirements/environment.yml +++ b/ci/requirements/environment.yml @@ -36,7 +36,7 @@ dependencies: - pooch - pre-commit - pyarrow # pandas raises a deprecation warning without this, breaking doctests - - pydap + # - pydap - pytest - pytest-asyncio - pytest-cov @@ -65,3 +65,4 @@ dependencies: - jax # no way to get cpu-only jaxlib from conda if gpu is present - types-defusedxml - types-pexpect + - git+https://github.com/pydap/pydap.git From 742ecbaf3631c7c6a621f2c258e7ef2bd7bab269 Mon Sep 17 00:00:00 2001 From: Miguel Jimenez-Urias Date: Thu, 18 Sep 2025 13:04:41 -0700 Subject: [PATCH 29/40] update `whats new` --- doc/whats-new.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index b7093c0043f..5d501751558 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -258,6 +258,9 @@ Alfonso Ladino, Brigitta Sipőcz, Claude, Deepak Cherian, Dimitri Papadopoulos O New Features ~~~~~~~~~~~~ +- Improved ``pydap`` backend behavior and performance when using :py:func:`open_dataset`, :py:func:`open_datatree` when downloading dap4 (opendap) data (:issue:`10628`, :pull:`10629`). + ``batch=True|False`` is a new ``backend_kwarg`` that further enables downloading multiple arrays in single response. In addition ``checksums`` is added as optional argument to be passed to ``pydap`` backend. + By `Miguel Jimenez-Urias `_. 
- Added :py:meth:`DataTree.prune` method to remove empty nodes while preserving tree structure. Useful for cleaning up DataTree after time-based filtering operations (:issue:`10590`, :pull:`10598`). By `Alfonso Ladino `_. From 78a5c4bf941293adc3bc3d42cfeb1e88d4c9a5e6 Mon Sep 17 00:00:00 2001 From: Miguel Jimenez-Urias Date: Thu, 18 Sep 2025 13:32:06 -0700 Subject: [PATCH 30/40] update tests --- xarray/tests/test_backends.py | 19 ------------------- xarray/tests/test_backends_datatree.py | 4 ++-- 2 files changed, 2 insertions(+), 21 deletions(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 06e3dafa3d8..0e019cef5c5 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -6524,25 +6524,6 @@ def test_batchdap4_downloads(protocol, batch) -> None: assert len(session.cache.urls()) == 5 -@requires_pydap -@network -def test_batch_warnswithdap2() -> None: - from pydap.net import create_session - - # Create a session with pre-set retry params in pydap backend, to cache urls - session = create_session(use_cache=True, cache_kwargs={"cache_name": "debug"}) - session.cache.clear() - - url = "dap2://test.opendap.org/opendap/hyrax/data/nc/coads_climatology.nc" - with pytest.warns(UserWarning): - open_dataset( - url, engine="pydap", session=session, batch=True, decode_times=False - ) - - # no batching is supported here - assert len(session.cache.urls()) == 5 - - class TestEncodingInvalid: def test_extract_nc4_variable_encoding(self) -> None: var = xr.Variable(("x",), [1, 2, 3], {}, {"foo": "bar"}) diff --git a/xarray/tests/test_backends_datatree.py b/xarray/tests/test_backends_datatree.py index c573716ad41..76d9cc0c00f 100644 --- a/xarray/tests/test_backends_datatree.py +++ b/xarray/tests/test_backends_datatree.py @@ -661,8 +661,8 @@ def test_inherited_coords(self, url=simplegroup_datatree_url) -> None: ) if _version_ > Version("3.5.5"): - # Total downloads are: 1 dmr, + 1 dap url for all dimensions across groups - assert 
len(session.cache.urls()) == 2 + # Total downloads are: 1 dmr, + 1 dap url for all dimensions across groups per group + assert len(session.cache.urls()) == 3 else: # 1 dmr + 1 dap url per dimension (total there are 4 dimension arrays) assert len(session.cache.urls()) == 5 From 0caa288b3dad5b493628140445292c051db766b6 Mon Sep 17 00:00:00 2001 From: Miguel Jimenez-Urias Date: Thu, 18 Sep 2025 14:14:10 -0700 Subject: [PATCH 31/40] set `batch=None` as default --- xarray/backends/pydap_.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/xarray/backends/pydap_.py b/xarray/backends/pydap_.py index acd2af44428..50add6dc073 100644 --- a/xarray/backends/pydap_.py +++ b/xarray/backends/pydap_.py @@ -95,7 +95,7 @@ def __init__( dataset, group=None, session=None, - batch=False, + batch=None, protocol=None, checksums=True, ): @@ -123,7 +123,7 @@ def open( timeout=None, verify=None, user_charset=None, - batch=False, + batch=None, checksums=True, ): from pydap.client import open_url @@ -290,7 +290,7 @@ def open_dataset( timeout=None, verify=None, user_charset=None, - batch=False, + batch=None, checksums=True, ) -> Dataset: store = PydapDataStore.open( @@ -336,7 +336,7 @@ def open_datatree( timeout=None, verify=None, user_charset=None, - batch=False, + batch=None, checksums=True, ) -> DataTree: groups_dict = self.open_groups_as_dict( @@ -377,7 +377,7 @@ def open_groups_as_dict( timeout=None, verify=None, user_charset=None, - batch=False, + batch=None, checksums=True, ) -> dict[str, Dataset]: from xarray.core.treenode import NodePath From 20c4186e70763c4ef62c871e29993ea72fa0e5b7 Mon Sep 17 00:00:00 2001 From: Miguel Jimenez-Urias Date: Fri, 19 Sep 2025 09:06:42 -0700 Subject: [PATCH 32/40] improve handling of dims vs dimensions deprecation warning --- xarray/backends/pydap_.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/xarray/backends/pydap_.py b/xarray/backends/pydap_.py index 50add6dc073..5824bdc6e8b 100644 --- 
a/xarray/backends/pydap_.py +++ b/xarray/backends/pydap_.py @@ -35,7 +35,7 @@ class PydapArrayWrapper(BackendArray): - def __init__(self, array, batch=False, checksums=True): + def __init__(self, array, batch=None, checksums=True): self.array = array self._batch = batch self._checksums = checksums @@ -167,11 +167,11 @@ def open( return cls(**args) def open_store_variable(self, var): - try: + if hasattr(var, "dims"): dimensions = [ dim.split("/")[-1] if dim.startswith("/") else dim for dim in var.dims ] - except AttributeError: + else: # GridType does not have a dims attribute - instead get `dimensions` # see https://github.com/pydap/pydap/issues/485 dimensions = var.dimensions From d2e505b91bcee02e75cadcfaa50a48037ab05581 Mon Sep 17 00:00:00 2001 From: Miguel Jimenez-Urias Date: Fri, 26 Sep 2025 08:25:12 -0700 Subject: [PATCH 33/40] update to use latest version of pydap --- ci/requirements/environment.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/ci/requirements/environment.yml b/ci/requirements/environment.yml index 9dd53b0ec48..eff54fe469e 100644 --- a/ci/requirements/environment.yml +++ b/ci/requirements/environment.yml @@ -36,7 +36,7 @@ dependencies: - pooch - pre-commit - pyarrow # pandas raises a deprecation warning without this, breaking doctests - # - pydap + - pydap - pytest - pytest-asyncio - pytest-cov @@ -65,4 +65,3 @@ dependencies: - jax # no way to get cpu-only jaxlib from conda if gpu is present - types-defusedxml - types-pexpect - - git+https://github.com/pydap/pydap.git From 85ec17b9a204cc611f514462ec2e40a57b6643f7 Mon Sep 17 00:00:00 2001 From: Miguel Jimenez-Urias Date: Fri, 26 Sep 2025 08:25:23 -0700 Subject: [PATCH 34/40] update import --- xarray/backends/pydap_.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/xarray/backends/pydap_.py b/xarray/backends/pydap_.py index 5824bdc6e8b..89eafcb0092 100644 --- a/xarray/backends/pydap_.py +++ b/xarray/backends/pydap_.py @@ -57,7 +57,7 @@ def 
_getitem(self, key): if self._batch and hasattr(self.array, "dataset"): # this are both True only for pydap>3.5.5 # from pydap.lib import resolve_batch_for_all_variables - from pydap.lib import data_check, get_batch_data + from pydap.client import data_check, get_batch_data dataset = self.array.dataset get_batch_data(self.array, checksums=self._checksums, key=key) @@ -239,7 +239,7 @@ def ds(self): def _get_data_array(self, var): """gets dimension data all at once""" - from pydap.lib import get_batch_data + from pydap.client import get_batch_data if not var._is_data_loaded(): # data has not been deserialized yet From bf30ed373fcc2cc6bfbd78c109886198a934ea4a Mon Sep 17 00:00:00 2001 From: Miguel Jimenez-Urias Date: Fri, 26 Sep 2025 08:33:40 -0700 Subject: [PATCH 35/40] update `whats new docs --- doc/whats-new.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 5d501751558..4bdb754852f 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -78,6 +78,9 @@ Claude, Deepak Cherian, Dimitri Papadopoulos Orfanos, Dylan H. Morris, Emmanuel New Features ~~~~~~~~~~~~ +- Improved ``pydap`` backend behavior and performance when using :py:func:`open_dataset`, :py:func:`open_datatree` when downloading dap4 (opendap) data (:issue:`10628`, :pull:`10629`). + ``batch=True|False`` is a new ``backend_kwarg`` that further enables downloading multiple arrays in single response. In addition ``checksums`` is added as optional argument to be passed to ``pydap`` backend. + By `Miguel Jimenez-Urias `_. - :py:func:`DataTree.from_dict` now supports passing in ``DataArray`` and nested dictionary values, and has a ``coords`` argument for specifying coordinates as ``DataArray`` objects (:pull:`10658`). 
@@ -258,9 +261,6 @@ Alfonso Ladino, Brigitta Sipőcz, Claude, Deepak Cherian, Dimitri Papadopoulos O New Features ~~~~~~~~~~~~ -- Improved ``pydap`` backend behavior and performance when using :py:func:`open_dataset`, :py:func:`open_datatree` when downloading dap4 (opendap) data (:issue:`10628`, :pull:`10629`). - ``batch=True|False`` is a new ``backend_kwarg`` that further enables downloading multiple arrays in single response. In addition ``checksums`` is added as optional argument to be passed to ``pydap`` backend. - By `Miguel Jimenez-Urias `_. - Added :py:meth:`DataTree.prune` method to remove empty nodes while preserving tree structure. Useful for cleaning up DataTree after time-based filtering operations (:issue:`10590`, :pull:`10598`). By `Alfonso Ladino `_. From 01fc07ca164c9e6d093c0714512b7aab58898049 Mon Sep 17 00:00:00 2001 From: Miguel Jimenez-Urias Date: Fri, 26 Sep 2025 10:14:57 -0700 Subject: [PATCH 36/40] move cache session to `tmpdir` --- xarray/tests/test_backends.py | 5 +++-- xarray/tests/test_backends_datatree.py | 7 +++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 0e019cef5c5..ac422fb04bc 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -6477,14 +6477,15 @@ def test_session(self) -> None: @network @pytest.mark.parametrize("protocol", ["dap2", "dap4"]) @pytest.mark.parametrize("batch", [False, True]) -def test_batchdap4_downloads(protocol, batch) -> None: +def test_batchdap4_downloads(tmpdir, protocol, batch) -> None: """Test that in dap4, all dimensions are downloaded at once""" import pydap from pydap.net import create_session _version_ = Version(pydap.__version__) # Create a session with pre-set params in pydap backend, to cache urls - session = create_session(use_cache=True, cache_kwargs={"cache_name": "debug"}) + cache_name = tmpdir / "debug" + session = create_session(use_cache=True, cache_kwargs={"cache_name": 
cache_name}) session.cache.clear() url = "https://test.opendap.org/opendap/hyrax/data/nc/coads_climatology.nc" diff --git a/xarray/tests/test_backends_datatree.py b/xarray/tests/test_backends_datatree.py index 76d9cc0c00f..fc58db1b6e8 100644 --- a/xarray/tests/test_backends_datatree.py +++ b/xarray/tests/test_backends_datatree.py @@ -614,7 +614,7 @@ def test_open_groups(self, url=unaligned_datatree_url) -> None: ) as expected: assert_identical(unaligned_dict_of_datasets["/Group1/subgroup1"], expected) - def test_inherited_coords(self, url=simplegroup_datatree_url) -> None: + def test_inherited_coords(self, tmpdir, url=simplegroup_datatree_url) -> None: """Test that `open_datatree` inherits coordinates from root tree. This particular h5 file is a test file that inherits the time coordinate from the root @@ -644,7 +644,10 @@ def test_inherited_coords(self, url=simplegroup_datatree_url) -> None: from pydap.net import create_session # Create a session with pre-set retry params in pydap backend, to cache urls - session = create_session(use_cache=True, cache_kwargs={"cache_name": "debug"}) + cache_name = tmpdir / "debug" + session = create_session( + use_cache=True, cache_kwargs={"cache_name": cache_name} + ) session.cache.clear() _version_ = Version(pydap.__version__) From 87092d7b1d1493b2d6ee31f5528f3194b05ca09c Mon Sep 17 00:00:00 2001 From: Miguel Jimenez-Urias Date: Tue, 30 Sep 2025 13:34:02 -0700 Subject: [PATCH 37/40] remove added functionality from whats new from newly released version --- doc/whats-new.rst | 3 --- xarray/backends/pydap_.py | 3 +-- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 4bdb754852f..b7093c0043f 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -78,9 +78,6 @@ Claude, Deepak Cherian, Dimitri Papadopoulos Orfanos, Dylan H. 
Morris, Emmanuel New Features ~~~~~~~~~~~~ -- Improved ``pydap`` backend behavior and performance when using :py:func:`open_dataset`, :py:func:`open_datatree` when downloading dap4 (opendap) data (:issue:`10628`, :pull:`10629`). - ``batch=True|False`` is a new ``backend_kwarg`` that further enables downloading multiple arrays in single response. In addition ``checksums`` is added as optional argument to be passed to ``pydap`` backend. - By `Miguel Jimenez-Urias `_. - :py:func:`DataTree.from_dict` now supports passing in ``DataArray`` and nested dictionary values, and has a ``coords`` argument for specifying coordinates as ``DataArray`` objects (:pull:`10658`). diff --git a/xarray/backends/pydap_.py b/xarray/backends/pydap_.py index 89eafcb0092..b29b92abdeb 100644 --- a/xarray/backends/pydap_.py +++ b/xarray/backends/pydap_.py @@ -55,8 +55,7 @@ def __getitem__(self, key): def _getitem(self, key): if self._batch and hasattr(self.array, "dataset"): - # this are both True only for pydap>3.5.5 - # from pydap.lib import resolve_batch_for_all_variables + # True only for pydap>3.5.5 from pydap.client import data_check, get_batch_data dataset = self.array.dataset From 9b8b4a65f64f5569d180566e4e39a113bcbe3f14 Mon Sep 17 00:00:00 2001 From: Miguel Jimenez-Urias Date: Tue, 30 Sep 2025 13:36:45 -0700 Subject: [PATCH 38/40] add to `whats-new` for next release --- doc/whats-new.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index b7093c0043f..cd5f297040f 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -40,6 +40,10 @@ v2025.10.0 (October 6, 2025) ---------------------------- This release reverts a breaking change to Xarray's preferred netCDF backend. +- Improved ``pydap`` backend behavior and performance when using :py:func:`open_dataset`, :py:func:`open_datatree` when downloading dap4 (opendap) data (:issue:`10628`, :pull:`10629`). 
+ ``batch=True|False`` is a new ``backend_kwarg`` that further enables downloading multiple arrays in single response. In addition ``checksums`` is added as optional argument to be passed to ``pydap`` backend. + By `Miguel Jimenez-Urias `_. + Breaking changes ~~~~~~~~~~~~~~~~ From 9d819a2dcc667f94af0274ac09067f43bcaa96ac Mon Sep 17 00:00:00 2001 From: Miguel Jimenez-Urias Date: Mon, 6 Oct 2025 10:36:55 -0700 Subject: [PATCH 39/40] update `whats new` to describe changes in this PR --- doc/whats-new.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index cd5f297040f..d35d1db2e3a 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -13,6 +13,9 @@ v2025.10.1 (unreleased) New Features ~~~~~~~~~~~~ +- Improved ``pydap`` backend behavior and performance when using :py:func:`open_dataset`, :py:func:`open_datatree` when downloading dap4 (opendap) data (:issue:`10628`, :pull:`10629`). + ``batch=True|False`` is a new ``backend_kwarg`` that further enables downloading multiple arrays in a single response. In addition, ``checksums`` is added as an optional argument to be passed to the ``pydap`` backend. + By `Miguel Jimenez-Urias <https://github.com/Mikejmnez>`_. Breaking changes ~~~~~~~~~~~~~~~~ From b0826fed3e0b079c8787f3179c7d19d07367788d Mon Sep 17 00:00:00 2001 From: Miguel Jimenez-Urias Date: Mon, 6 Oct 2025 11:15:54 -0700 Subject: [PATCH 40/40] fix double entry on whats new --- doc/whats-new.rst | 3 --- 1 file changed, 3 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index d35d1db2e3a..c2435db8a1c 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -43,9 +43,6 @@ v2025.10.0 (October 6, 2025) ---------------------------- This release reverts a breaking change to Xarray's preferred netCDF backend. -- Improved ``pydap`` backend behavior and performance when using :py:func:`open_dataset`, :py:func:`open_datatree` when downloading dap4 (opendap) data (:issue:`10628`, :pull:`10629`). 
- ``batch=True|False`` is a new ``backend_kwarg`` that further enables downloading multiple arrays in single response. In addition ``checksums`` is added as optional argument to be passed to ``pydap`` backend. - By `Miguel Jimenez-Urias `_. Breaking changes