Merge pull request #12 from matthew-brett/refactor-tck-read

MarcCote · web-flow · commit 2df75181af2a · 2017-03-29T08:54:08.000-04:00
RF: refactor tck read for speed
diff --git a/.travis.yml b/.travis.yml
@@ -33,19 +33,19 @@ matrix:
     # Absolute minimum dependencies
     - python: 2.7
       env:
-        - DEPENDS="numpy==1.6.0" PYDICOM=0
+        - DEPENDS="numpy==1.7.1" PYDICOM=0
     # Absolute minimum dependencies plus oldest MPL
     # Check these against:
     # nibabel/info.py
     # doc/source/installation.rst
     # requirements.txt
     - python: 2.7
       env:
-        - DEPENDS="numpy==1.6.0 matplotlib==1.3.1" PYDICOM=0
+        - DEPENDS="numpy==1.7.1 matplotlib==1.3.1" PYDICOM=0
     # Minimum pydicom dependency
     - python: 2.7
       env:
-        - DEPENDS="numpy==1.6.0 pydicom==0.9.7 pillow==2.6"
+        - DEPENDS="numpy==1.7.1 pydicom==0.9.7 pillow==2.6"
     # test against numpy 1.7
     - python: 2.7
       env:
diff --git a/doc/source/installation.rst b/doc/source/installation.rst
@@ -87,7 +87,7 @@ Requirements
     .travis.yml
 
 *  Python_ 2.7, or >= 3.4
-*  NumPy_ 1.6 or greater
+*  NumPy_ 1.7 or greater
 *  Six_ 1.3 or greater
 *  SciPy_ (optional, for full SPM-ANALYZE support)
 *  PyDICOM_ 0.9.7 or greater (optional, for DICOM support)
diff --git a/nibabel/info.py b/nibabel/info.py
@@ -186,7 +186,7 @@ def cmp_pkg_version(version_str, pkg_version_str=__version__):
 # doc/source/installation.rst
 # requirements.txt
 # .travis.yml
-NUMPY_MIN_VERSION = '1.6.0'
+NUMPY_MIN_VERSION = '1.7.1'
 PYDICOM_MIN_VERSION = '0.9.7'
 SIX_MIN_VERSION = '1.3'
 
diff --git a/nibabel/streamlines/tck.py b/nibabel/streamlines/tck.py
@@ -21,7 +21,6 @@
 from .header import Field
 
 MEGABYTE = 1024 * 1024
-BUFFER_SIZE = 1000000
 
 
 def create_empty_header():
@@ -342,8 +341,8 @@ def _read_header(fileobj):
 
         return hdr
 
-    @staticmethod
-    def _read(fileobj, header, buffer_size=4):
+    @classmethod
+    def _read(cls, fileobj, header, buffer_size=4):
         """ Return generator that reads TCK data from `fileobj` given `header`
 
         Parameters
@@ -369,65 +368,60 @@ def _read(fileobj, header, buffer_size=4):
         buffer_size = int(buffer_size * MEGABYTE)
         buffer_size += coordinate_size - (buffer_size % coordinate_size)
 
+        # Markers for streamline end and file end
+        fiber_marker = cls.FIBER_DELIMITER.astype(dtype).tostring()
+        eof_marker = cls.EOF_DELIMITER.astype(dtype).tostring()
+
         with Opener(fileobj) as f:
             start_position = f.tell()
 
             # Set the file position at the beginning of the data.
             f.seek(header["_offset_data"], os.SEEK_SET)
 
             eof = False
-            buff = b""
-            pts = []
-
-            i = 0
-
-            while not eof or not np.all(np.isinf(pts)):
-
-                if not eof:
-                    bytes_read = f.read(buffer_size)
-                    buff += bytes_read
-                    eof = len(bytes_read) == 0
+            buffs = []
+            n_streams = 0
 
-                # Read floats.
-                pts = np.frombuffer(buff, dtype=dtype)
+            while not eof:
 
-                # Convert data to little-endian if needed.
-                if dtype != '<f4':
-                    pts = pts.astype('<f4')
-
-                pts = pts.reshape([-1, 3])
-                idx_nan = np.arange(len(pts))[np.isnan(pts[:, 0])]
+                bytes_read = f.read(buffer_size)
+                buffs.append(bytes_read)
+                eof = len(bytes_read) != buffer_size
 
                 # Make sure we've read enough to find a streamline delimiter.
-                if len(idx_nan) == 0:
+                if fiber_marker not in bytes_read:
                     # If we've read the whole file, then fail.
-                    if eof and not np.all(np.isinf(pts)):
-                        msg = ("Cannot find a streamline delimiter. This file"
-                               " might be corrupted.")
-                        raise DataError(msg)
-
-                    # Otherwise read a bit more.
-                    continue
-
-                nb_pts_total = 0
-                idx_start = 0
-                for idx_end in idx_nan:
-                    nb_pts = len(pts[idx_start:idx_end, :])
-                    nb_pts_total += nb_pts
-
-                    if nb_pts > 0:
-                        yield pts[idx_start:idx_end, :]
-                        i += 1
-
-                    idx_start = idx_end + 1
-
-                # Remove pts plus the first triplet of NaN.
-                nb_tiplets_to_remove = nb_pts_total + len(idx_nan)
-                nb_bytes_to_remove = nb_tiplets_to_remove * 3 * dtype.itemsize
-                buff = buff[nb_bytes_to_remove:]
+                    if eof:
+                        # Could have minimal buffering, and have read only the
+                        # EOF delimiter
+                        buffs = [b''.join(buffs)]
+                        if not buffs[0] == eof_marker:
+                            raise DataError(
+                                "Cannot find a streamline delimiter. This file"
+                                " might be corrupted.")
+                    else:
+                        # Otherwise read a bit more.
+                        continue
+
+                all_parts = b''.join(buffs).split(fiber_marker)
+                point_parts, buffs = all_parts[:-1], all_parts[-1:]
+                point_parts = [p for p in point_parts if p != b'']
+
+                for point_part in point_parts:
+                    # Read floats.
+                    pts = np.frombuffer(point_part, dtype=dtype)
+                    # Enforce ability to write to underlying bytes object
+                    pts.flags.writeable = True
+                    # Convert data to little-endian if needed.
+                    yield pts.astype('<f4', copy=False).reshape([-1, 3])
+
+                n_streams += len(point_parts)
+
+            if not buffs[-1] == eof_marker:
+                raise DataError('Expecting end-of-file marker ' 'inf inf inf')
 
             # In case the 'count' field was not provided.
-            header[Field.NB_STREAMLINES] = i
+            header[Field.NB_STREAMLINES] = n_streams
 
             # Set the file position where it was (in case it was already open).
             f.seek(start_position, os.SEEK_CUR)
diff --git a/nibabel/streamlines/tests/test_tck.py b/nibabel/streamlines/tests/test_tck.py
@@ -6,16 +6,16 @@
 from nibabel.externals.six import BytesIO
 from nibabel.py3k import asbytes
 
-from nose.tools import assert_equal, assert_raises
-
-from nibabel.testing import data_path
-from .test_tractogram import assert_tractogram_equal
 from ..array_sequence import ArraySequence
 from ..tractogram import Tractogram
 from ..tractogram_file import DataError
 
 from ..tck import TckFile
 
+from nose.tools import assert_equal, assert_raises, assert_true
+from numpy.testing import assert_array_equal
+from nibabel.testing import data_path
+from .test_tractogram import assert_tractogram_equal
 
 DATA = {}
 
@@ -62,6 +62,17 @@ def test_load_simple_file(self):
         tck = TckFile(tractogram, header=hdr)
         assert_tractogram_equal(tck.tractogram, DATA['simple_tractogram'])
 
+    def test_writeable_data(self):
+        data = DATA['simple_tractogram']
+        for key in ('simple_tck_fname', 'simple_tck_big_endian_fname'):
+            for lazy_load in [False, True]:
+                tck = TckFile.load(DATA[key], lazy_load=lazy_load)
+                for actual, expected_tgi in zip(tck.streamlines, data):
+                    assert_array_equal(actual, expected_tgi.streamline)
+                    # Test we can write to arrays
+                    assert_true(actual.flags.writeable)
+                    actual[0, 0] = 99
+
     def test_load_simple_file_in_big_endian(self):
         for lazy_load in [False, True]:
             tck = TckFile.load(DATA['simple_tck_big_endian_fname'],
diff --git a/requirements.txt b/requirements.txt
@@ -6,4 +6,4 @@
 #   doc/source/installation.rst
 
 six>=1.3
-numpy>=1.6
+numpy>=1.7