Add support for reading .wfdb archive files.

tompollard · tompollard · commit 35a8e4fee58f · 2025-05-03T15:08:51.000-04:00
diff --git a/wfdb/io/_signal.py b/wfdb/io/_signal.py
@@ -1120,6 +1120,7 @@ def _rd_segment(
     no_file=False,
     sig_data=None,
     return_res=64,
+    wfdb_archive=None,
 ):
     """
     Read the digital samples from a single segment record's associated
@@ -1264,6 +1265,7 @@ def _rd_segment(
             sampto=sampto,
             no_file=no_file,
             sig_data=sig_data,
+            wfdb_archive=wfdb_archive,
         )
 
         # Copy over the wanted signals
@@ -1288,6 +1290,7 @@ def _rd_dat_signals(
     sampto,
     no_file=False,
     sig_data=None,
+    wfdb_archive=None,
 ):
     """
     Read all signals from a WFDB dat file.
@@ -1390,7 +1393,8 @@ def _rd_dat_signals(
         )
     else:
         data_to_read = _rd_dat_file(
-            file_name, dir_name, pn_dir, fmt, start_byte, n_read_samples
+            file_name, dir_name, pn_dir, fmt, start_byte, n_read_samples,
+            wfdb_archive=wfdb_archive
         )
 
     if extra_flat_samples:
@@ -1630,7 +1634,8 @@ def _required_byte_num(mode, fmt, n_samp):
     return int(n_bytes)
 
 
-def _rd_dat_file(file_name, dir_name, pn_dir, fmt, start_byte, n_samp):
+def _rd_dat_file(file_name, dir_name, pn_dir, fmt, start_byte, n_samp,
+                 wfdb_archive=None):
     """
     Read data from a dat file, either local or remote, into a 1d numpy
     array.
@@ -1688,14 +1693,19 @@ def _rd_dat_file(file_name, dir_name, pn_dir, fmt, start_byte, n_samp):
         element_count = n_samp
         byte_count = n_samp * BYTES_PER_SAMPLE[fmt]
 
-    # Local or cloud dat file
-    if pn_dir is None:
+    # Member of a .wfdb archive if one was supplied, otherwise a local or cloud dat file
+    if wfdb_archive is not None:
+        with wfdb_archive.open(file_name, "rb") as fp:
+            fp.seek(start_byte)
+            sig_data = util.fromfile(
+                fp, dtype=np.dtype(DATA_LOAD_TYPES[fmt]), count=element_count
+            )
+    elif pn_dir is None:
         with fsspec.open(os.path.join(dir_name, file_name), "rb") as fp:
             fp.seek(start_byte)
             sig_data = util.fromfile(
                 fp, dtype=np.dtype(DATA_LOAD_TYPES[fmt]), count=element_count
             )
-
     # Stream dat file from PhysioNet
     else:
         # check to make sure a cloud path isn't being passed under pn_dir
diff --git a/wfdb/io/archive.py b/wfdb/io/archive.py
@@ -0,0 +1,85 @@
+import os
+import zipfile
+from contextlib import contextmanager
+
+_archive_cache = {}
+
+
+class WFDBArchive:
+    """
+    Helper class for working with WFDB .wfdb ZIP archives.
+
+    Used only if:
+      - .wfdb is included in the record_name explicitly, or
+      - .wfdb is passed directly to the file loading function.
+    """
+    def __init__(self, record_name):
+        """
+        Initialize a WFDBArchive for a given record name (without extension).
+
+        record_name : str
+          The base name of the archive, without the .wfdb extension.
+        """
+        self.record_name = record_name
+        self.archive_path = f"{record_name}.wfdb"
+
+        if not os.path.exists(self.archive_path):
+            raise FileNotFoundError(f"Archive not found: {self.archive_path}")
+        if not zipfile.is_zipfile(self.archive_path):
+            raise ValueError(f"Invalid WFDB archive: {self.archive_path}")
+        self.zipfile = zipfile.ZipFile(self.archive_path, mode="r")
+
+    def exists(self, filename):
+        """
+        Check if a file exists in the archive.
+        """
+        return self.zipfile and filename in self.zipfile.namelist()
+
+    @contextmanager
+    def open(self, filename, mode="r"):
+        """
+        Open a file stored inside the archive; loose files on disk are not searched.
+        Mode 'r' (text) or 'rb' (binary) supported.
+        """
+        if self.zipfile and filename in self.zipfile.namelist():
+            with self.zipfile.open(filename, 'r') as f:
+                if "b" in mode:
+                    yield f
+                else:
+                    import io
+                    yield io.TextIOWrapper(f)
+        else:
+            raise FileNotFoundError(
+                f"Could not find '{filename}' as loose file or inside '{self.archive_path}'."
+                )
+
+    def close(self):
+        """
+        Close the archive if open.
+        """
+        if self.zipfile:
+            self.zipfile.close()
+
+    def create_archive(self, file_list, output_path=None):
+        """
+        Create a .wfdb archive containing the specified list of files.
+        If output_path is not specified, uses self.archive_path.
+        """
+        output_path = output_path or self.archive_path
+        with zipfile.ZipFile(output_path, mode="w") as zf:
+            for file in file_list:
+                compress = (
+                    zipfile.ZIP_STORED
+                    if file.endswith((".hea", ".hea.json", ".hea.yml"))
+                    else zipfile.ZIP_DEFLATED
+                )
+                zf.write(file, arcname=os.path.basename(file), compress_type=compress)
+
+
+def get_archive(record_base_name):
+    """
+    Return the cached WFDBArchive for the given record base name, opening it on first use.
+    """
+    if record_base_name not in _archive_cache:
+        _archive_cache[record_base_name] = WFDBArchive(record_base_name)
+    return _archive_cache[record_base_name]
diff --git a/wfdb/io/record.py b/wfdb/io/record.py
@@ -11,6 +11,7 @@
 from wfdb.io import _header
 from wfdb.io import _signal
 from wfdb.io import _url
+from wfdb.io.archive import get_archive
 from wfdb.io import download
 from wfdb.io import header
 from wfdb.io import util
@@ -2030,25 +2031,41 @@ def rdrecord(
                                channels=[1, 3])
 
     """
-    dir_name, base_record_name = os.path.split(record_name)
-    # Update the dir_name using abspath unless it is a cloud path
-    if not any(dir_name.startswith(proto) for proto in CLOUD_PROTOCOLS):
-        dir_name = os.path.abspath(dir_name)
+    is_wfdb_archive = record_name.endswith(".wfdb")
 
-    # Read the header fields
-    if pn_dir is not None:
-        # check to make sure a cloud path isn't being passed under pn_dir
-        if any(pn_dir.startswith(proto) for proto in CLOUD_PROTOCOLS):
-            raise ValueError(
-                "Cloud paths should be passed under record_name, not under pn_dir"
-            )
-        if "." not in pn_dir:
-            dir_list = pn_dir.split("/")
-            pn_dir = posixpath.join(
-                dir_list[0], download.get_version(dir_list[0]), *dir_list[1:]
-            )
+    if is_wfdb_archive:
+        record_base = record_name[:-5]  # remove ".wfdb"
+        archive = get_archive(record_base)
+        hea_file = os.path.basename(record_base) + ".hea"
+
+        with archive.open(hea_file, "r") as f:
+            record = Record()
+            record.wfdb_archive = archive
+            record._read_header(f.read())
+
+        # Set dir_name to the directory containing the archive (needed for _rd_segment)
+        dir_name = os.path.dirname(record_base)
+
+    else:
+        dir_name, base_record_name = os.path.split(record_name)
+        # Update the dir_name using abspath unless it is a cloud path
+        if not any(dir_name.startswith(proto) for proto in CLOUD_PROTOCOLS):
+            dir_name = os.path.abspath(dir_name)
+
+        # Read the header fields
+        if pn_dir is not None:
+            # check to make sure a cloud path isn't being passed under pn_dir
+            if any(pn_dir.startswith(proto) for proto in CLOUD_PROTOCOLS):
+                raise ValueError(
+                    "Cloud paths should be passed under record_name, not under pn_dir"
+                )
+            if "." not in pn_dir:
+                dir_list = pn_dir.split("/")
+                pn_dir = posixpath.join(
+                    dir_list[0], download.get_version(dir_list[0]), *dir_list[1:]
+                )
 
-    record = rdheader(record_name, pn_dir=pn_dir, rd_segments=False)
+        record = rdheader(record_name, pn_dir=pn_dir, rd_segments=False)
 
     # Set defaults for sampto and channels input variables
     if sampto is None: