Implement reading signals in format 24.

Benjamin Moody · Benjamin Moody · commit 37d5a12635c4 · 2021-08-04T15:35:44.000-04:00
In signal format 24, each sample is stored in three bytes, as a
little-endian signed integer.  Since most architectures don't natively
support 24-bit integers, neither does numpy, and therefore in order to
read the signal, it must be converted from the on-disk format into
some format that numpy can work with (i.e., 32-bit signed integers.)

Previously, when loading a format-24 record using rdrecord or rdsamp,
the internal function _rd_dat_file made a halfhearted attempt to read
the signals, by extracting the two most significant bytes of each
sample, discarding the least significant byte, and storing the result
as a 16-bit integer.  This meant not only that the precision was
silently reduced, but the resulting physical sample values were
incorrect by a factor of 256.

(Additionally, the implementation of _rd_dat_file was broken if the
signal file contained a prolog, if the signal file could not be
mmapped, or if the entire signal file was not meant to be read at
once.)

Fix this by handling format 24 in the same way as the other unaligned
formats (212, 310, and 311):

 - Read the data initially as an array of unsigned bytes; this means
   the 'dtype' passed to numpy.fromfile is numpy.dtype('&lt;u1') and the
   'count' is the number of bytes rather than the number of samples.

 - Reformat the array of bytes into an array of integers in the
   _blocks_to_samples function.  This means format 24 must be
   considered an "unaligned" rather than an "aligned" format.

In order to avoid making unnecessary copies of the data, rather than
using numpy arithmetic operations, the middle and low bytes are
*copied* from the input array into the corresponding locations in an
8-bit view of the 32-bit output array.  This is dependent on the
system byte order.
diff --git a/wfdb/io/_signal.py b/wfdb/io/_signal.py
@@ -1,5 +1,6 @@
 import math
 import os
+import sys
 
 import numpy as np
 
@@ -10,10 +11,10 @@
 MAX_I32 = 2147483647
 MIN_I32 = -2147483648
 
-# Formats in which all samples align with byte boundaries
-ALIGNED_FMTS = ['8', '16', '24', '32', '61', '80', '160']
-# Formats in which not all samples align with byte boundaries
-UNALIGNED_FMTS = ['212', '310', '311']
+# Formats in which all samples align with integer (power-of-two) boundaries
+ALIGNED_FMTS = ['8', '16', '32', '61', '80', '160']
+# Formats in which not all samples align with integer boundaries
+UNALIGNED_FMTS = ['212', '310', '311', '24']
 # Formats which are stored in offset binary form
 OFFSET_FMTS = ['80', '160']
 # All WFDB dat formats - https://www.physionet.org/physiotools/wag/signal-5.htm
@@ -29,7 +30,7 @@
            '160': 16, '212': 12, '310': 10, '311': 10}
 
 # Numpy dtypes used to load dat files of each format.
-DATA_LOAD_TYPES = {'8': '<i1', '16': '<i2', '24': '<i3', '32': '<i4',
+DATA_LOAD_TYPES = {'8': '<i1', '16': '<i2', '24': '<u1', '32': '<i4',
                    '61': '>i2', '80': '<u1', '160': '<u2', '212': '<u1',
                    '310': '<u1', '311': '<u1'}
 
@@ -1398,6 +1399,9 @@ def _rd_dat_file(file_name, dir_name, pn_dir, fmt, start_byte, n_samp):
     elif fmt in ['310', '311']:
         byte_count = _required_byte_num('read', fmt, n_samp)
         element_count = byte_count
+    elif fmt == '24':
+        byte_count = n_samp * 3
+        element_count = byte_count
     else:
         element_count = n_samp
         byte_count = n_samp * BYTES_PER_SAMPLE[fmt]
@@ -1535,6 +1539,26 @@ def _blocks_to_samples(sig_data, n_samp, fmt):
         # Loaded values as un_signed. Convert to 2's complement form.
         # Values > 2^9-1 are negative.
         sig[sig > 511] -= 1024
+
+    elif fmt == '24':
+        # The following is equivalent to:
+        #   sig = (sig_data[2::3].view('int8').astype('int32') * 65536
+        #          + sig_data[1::3].astype('uint16') * 256
+        #          + sig_data[0::3])
+
+        # Treat the high byte as signed and shift it by 16 bits.
+        sig = np.left_shift(sig_data[2::3].view('int8'), 16, dtype='int32')
+
+        # Directly copy the low and middle bytes.
+        if sys.byteorder == 'little':
+            sig.view('uint8')[0::4] = sig_data[0::3]
+            sig.view('uint8')[1::4] = sig_data[1::3]
+        elif sys.byteorder == 'big':
+            sig.view('uint8')[3::4] = sig_data[0::3]
+            sig.view('uint8')[2::4] = sig_data[1::3]
+        else:
+            raise NotImplementedError
+
     return sig