1111import sys
1212import warnings
1313import zlib
14+ import os .path as op
1415from io import StringIO
1516from xml .parsers .expat import ExpatError
1617
@@ -30,45 +31,109 @@ class GiftiParseError(ExpatError):
3031 """ Gifti-specific parsing error """
3132
3233
33- def read_data_block (encoding , endian , ordering , datatype , shape , data ):
34- """ Tries to unzip, decode, parse the funny string data """
35- enclabel = gifti_encoding_codes .label [encoding ]
36- dtype = data_type_codes .type [datatype ]
34+ def read_data_block (darray , fname , data , mmap ):
35+ """Parses data from a <Data> element, or loads from an external file.
36+
37+ Parameters
38+ ----------
39+ darray : GiftiDataArray
40+ GiftiDataArray object representing the parent <DataArray> of this
41+ <Data> element
42+
43+ fname : str or None
44+ Name of GIFTI file being loaded, or None if in-memory
45+
46+ data : str or None
47+ Data to parse, or None if data is in an external file
48+
49+ mmap : {True, False, 'c', 'r', 'r+'}
50+ Controls the use of numpy memory mapping for reading data. Only has
51+ an effect when loading GIFTI images with data stored in external files
52+ (``DataArray`` elements with an ``Encoding`` equal to
53+ ``ExternalFileBinary``). If ``False``, do not try numpy ``memmap``
54+ for data array. If one of ``{'c', 'r', 'r+'}``, try numpy ``memmap``
55+ with ``mode=mmap``. A `mmap` value of ``True`` gives the same
56+ behavior as ``mmap='c'``. If the file cannot be memory-mapped, ignore
57+ `mmap` value and read array from file.
58+
59+ Returns
60+ -------
61+ ``numpy.ndarray`` or ``numpy.memmap`` containing the parsed data
62+ """
63+ if mmap not in (True , False , 'c' , 'r' , 'r+' ):
64+ raise ValueError ("mmap value should be one of True, False, 'c', "
65+ "'r', 'r+'" )
66+ if mmap is True :
67+ mmap = 'c'
68+ enclabel = gifti_encoding_codes .label [darray .encoding ]
69+ dtype = data_type_codes .type [darray .datatype ]
70+
3771 if enclabel == 'ASCII' :
3872 # GIFTI_ENCODING_ASCII
3973 c = StringIO (data )
4074 da = np .loadtxt (c , dtype = dtype )
4175 return da # independent of the endianness
42-
43- elif enclabel == 'External' :
44- # GIFTI_ENCODING_EXTBIN
45- raise NotImplementedError ("In what format are the external files?" )
46-
47- elif enclabel not in ('B64BIN' , 'B64GZ' ):
76+ elif enclabel not in ('B64BIN' , 'B64GZ' , 'External' ):
4877 return 0
4978
79+ # GIFTI_ENCODING_EXTBIN
80+ # We assume that the external data file is raw uncompressed binary, with
81+ # the data type/endianness/ordering specified by the other DataArray
82+ # attributes
83+ if enclabel == 'External' :
84+ if fname is None :
85+ raise GiftiParseError ('ExternalFileBinary is not supported '
86+ 'when loading from in-memory XML' )
87+ ext_fname = op .join (op .dirname (fname ), darray .ext_fname )
88+ if not op .exists (ext_fname ):
89+ raise GiftiParseError ('Cannot locate external file ' + ext_fname )
90+ # We either create a memmap, or load into memory
91+ newarr = None
92+ if mmap :
93+ try :
94+ newarr = np .memmap (ext_fname ,
95+ dtype = dtype ,
96+ mode = mmap ,
97+ offset = darray .ext_offset ,
98+ shape = tuple (darray .dims ))
99+ # If the memmap fails, we ignore the error and load the data into
100+ # memory below
101+ except (AttributeError , TypeError , ValueError ):
102+ pass
103+ # mmap=False or np.memmap failed
104+ if newarr is None :
105+ # We can replace this with a call to np.fromfile in numpy>=1.17,
106+ # as an "offset" parameter was added in that version.
107+ with open (ext_fname , 'rb' ) as f :
108+ f .seek (darray .ext_offset )
109+ nbytes = np .prod (darray .dims ) * dtype ().itemsize
110+ buff = f .read (nbytes )
111+ newarr = np .frombuffer (buff , dtype = dtype )
112+
50113 # Numpy arrays created from bytes objects are read-only.
51114 # Neither b64decode nor decompress will return bytearrays, and there
52115 # are no equivalents to fobj.readinto to allow us to pass them, so
53116 # there is not a simple way to avoid making copies.
54117 # If this becomes a problem, we should write a decoding interface with
55118 # a tunable chunk size.
56- dec = base64 .b64decode (data .encode ('ascii' ))
57- if enclabel == 'B64BIN' :
58- # GIFTI_ENCODING_B64BIN
59- buff = bytearray (dec )
60119 else :
61- # GIFTI_ENCODING_B64GZ
62- buff = bytearray (zlib .decompress (dec ))
63- del dec
64-
65- sh = tuple (shape )
66- newarr = np .frombuffer (buff , dtype = dtype )
120+ dec = base64 .b64decode (data .encode ('ascii' ))
121+ if enclabel == 'B64BIN' :
122+ # GIFTI_ENCODING_B64BIN
123+ buff = bytearray (dec )
124+ else :
125+ # GIFTI_ENCODING_B64GZ
126+ buff = bytearray (zlib .decompress (dec ))
127+ del dec
128+ newarr = np .frombuffer (buff , dtype = dtype )
129+
130+ sh = tuple (darray .dims )
67131 if len (newarr .shape ) != len (sh ):
68- newarr = newarr .reshape (sh , order = array_index_order_codes .npcode [ordering ])
132+ newarr = newarr .reshape (
133+ sh , order = array_index_order_codes .npcode [darray .ind_ord ])
69134
70135 # check if we need to byteswap
71- required_byteorder = gifti_endian_codes .byteorder [endian ]
136+ required_byteorder = gifti_endian_codes .byteorder [darray . endian ]
72137 if (required_byteorder in ('big' , 'little' ) and
73138 required_byteorder != sys .byteorder ):
74139 newarr = newarr .byteswap ()
@@ -82,13 +147,17 @@ def _str2int(in_str):
82147
83148class GiftiImageParser (XmlParser ):
84149
85- def __init__ (self , encoding = None , buffer_size = 35000000 , verbose = 0 ):
150+ def __init__ (self , encoding = None , buffer_size = 35000000 , verbose = 0 ,
151+ mmap = True ):
86152 super (GiftiImageParser , self ).__init__ (encoding = encoding ,
87153 buffer_size = buffer_size ,
88154 verbose = verbose )
89155 # output
90156 self .img = None
91157
158+ # Queried when loading data from <Data> elements - see read_data_block
159+ self .mmap = mmap
160+
92161 # finite state machine stack
93162 self .fsm_state = []
94163
@@ -288,12 +357,17 @@ def CharacterDataHandler(self, data):
288357
289358 def flush_chardata (self ):
290359 """ Collate and process collected character data"""
291- if self ._char_blocks is None :
360+ # Nothing to do for empty elements, except for Data elements which
361+ # are within a DataArray with an external file
362+ if self .write_to != 'Data' and self ._char_blocks is None :
292363 return
293364 # Just join the strings to get the data. Maybe there are some memory
294365 # optimizations we could do by passing the list of strings to the
295366 # read_data_block function.
296- data = '' .join (self ._char_blocks )
367+ if self ._char_blocks is not None :
368+ data = '' .join (self ._char_blocks )
369+ else :
370+ data = None
297371 # Reset the char collector
298372 self ._char_blocks = None
299373
@@ -321,10 +395,8 @@ def flush_chardata(self):
321395 c .close ()
322396
323397 elif self .write_to == 'Data' :
324- da_tmp = self .img .darrays [- 1 ]
325- da_tmp .data = read_data_block (da_tmp .encoding , da_tmp .endian ,
326- da_tmp .ind_ord , da_tmp .datatype ,
327- da_tmp .dims , data )
398+ self .da .data = read_data_block (self .da , self .fname , data ,
399+ self .mmap )
328400 # update the endianness according to the
329401 # current machine setting
330402 self .endian = gifti_endian_codes .code [sys .byteorder ]
0 commit comments