Skip to content

Commit da4df79

Browse files
authored
PYTHON-3508 Improve the performance of GridOut.readline and GridOut.read (#1109)
1 parent ff94b0e commit da4df79

File tree

2 files changed

+71
-57
lines changed

2 files changed

+71
-57
lines changed

doc/changelog.rst

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,23 @@ Changelog
44
Changes in Version 4.3.3
55
------------------------
66

7-
- Fixed a performance regression in :meth:`~gridfs.GridOut.download_to_stream`
8-
and :meth:`~gridfs.GridOut.download_to_stream_by_name` by reading in chunks
9-
instead of line by line.
7+
Version 4.3.3 fixes a number of bugs:
108

9+
- Fixed a performance regression in :meth:`~gridfs.GridFSBucket.download_to_stream`
10+
and :meth:`~gridfs.GridFSBucket.download_to_stream_by_name` by reading in chunks
11+
instead of line by line (`PYTHON-3502`_).
12+
- Improved performance of :meth:`gridfs.grid_file.GridOut.read` and
13+
:meth:`gridfs.grid_file.GridOut.readline` (`PYTHON-3508`_).
14+
15+
Issues Resolved
16+
...............
17+
18+
See the `PyMongo 4.3.3 release notes in JIRA`_ for the list of resolved issues
19+
in this release.
20+
21+
.. _PYTHON-3502: https://jira.mongodb.org/browse/PYTHON-3502
22+
.. _PYTHON-3508: https://jira.mongodb.org/browse/PYTHON-3508
23+
.. _PyMongo 4.3.3 release notes in JIRA: https://jira.mongodb.org/secure/ReleaseNote.jspa?projectId=10004&version=34709
1124

1225
Changes in Version 4.3 (4.3.2)
1326
------------------------------

gridfs/grid_file.py

Lines changed: 55 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -463,7 +463,10 @@ def __init__(
463463
self.__files = root_collection.files
464464
self.__file_id = file_id
465465
self.__buffer = EMPTY
466+
# Start position within the current buffered chunk.
467+
self.__buffer_pos = 0
466468
self.__chunk_iter = None
469+
# Position within the total file.
467470
self.__position = 0
468471
self._file = file_document
469472
self._session = session
@@ -510,12 +513,12 @@ def readchunk(self) -> bytes:
510513
"""Reads a chunk at a time. If the current position is within a
511514
chunk the remainder of the chunk is returned.
512515
"""
513-
received = len(self.__buffer)
516+
received = len(self.__buffer) - self.__buffer_pos
514517
chunk_data = EMPTY
515518
chunk_size = int(self.chunk_size)
516519

517520
if received > 0:
518-
chunk_data = self.__buffer
521+
chunk_data = self.__buffer[self.__buffer_pos :]
519522
elif self.__position < int(self.length):
520523
chunk_number = int((received + self.__position) / chunk_size)
521524
if self.__chunk_iter is None:
@@ -531,25 +534,12 @@ def readchunk(self) -> bytes:
531534

532535
self.__position += len(chunk_data)
533536
self.__buffer = EMPTY
537+
self.__buffer_pos = 0
534538
return chunk_data
535539

536-
def read(self, size: int = -1) -> bytes:
537-
"""Read at most `size` bytes from the file (less if there
538-
isn't enough data).
539-
540-
The bytes are returned as an instance of :class:`str` (:class:`bytes`
541-
in python 3). If `size` is negative or omitted all data is read.
542-
543-
:Parameters:
544-
- `size` (optional): the number of bytes to read
545-
546-
.. versionchanged:: 3.8
547-
This method now only checks for extra chunks after reading the
548-
entire file. Previously, this method would check for extra chunks
549-
on every call.
550-
"""
540+
def _read_size_or_line(self, size: int = -1, line: bool = False) -> bytes:
541+
"""Internal read() and readline() helper."""
551542
self._ensure_file()
552-
553543
remainder = int(self.length) - self.__position
554544
if size < 0 or size > remainder:
555545
size = remainder
@@ -558,11 +548,36 @@ def read(self, size: int = -1) -> bytes:
558548
return EMPTY
559549

560550
received = 0
561-
data = io.BytesIO()
551+
data = []
562552
while received < size:
563-
chunk_data = self.readchunk()
553+
needed = size - received
554+
if self.__buffer:
555+
# Optimization: Read the buffer with zero byte copies.
556+
buf = self.__buffer
557+
chunk_start = self.__buffer_pos
558+
chunk_data = memoryview(buf)[self.__buffer_pos :]
559+
self.__buffer = EMPTY
560+
self.__buffer_pos = 0
561+
self.__position += len(chunk_data)
562+
else:
563+
buf = self.readchunk()
564+
chunk_start = 0
565+
chunk_data = memoryview(buf)
566+
if line:
567+
pos = buf.find(NEWLN, chunk_start, chunk_start + needed) - chunk_start
568+
if pos >= 0:
569+
# Decrease size to exit the loop.
570+
size = received + pos + 1
571+
needed = pos + 1
572+
if len(chunk_data) > needed:
573+
data.append(chunk_data[:needed])
574+
# Optimization: Save the buffer with zero byte copies.
575+
self.__buffer = buf
576+
self.__buffer_pos = chunk_start + needed
577+
self.__position -= len(self.__buffer) - self.__buffer_pos
578+
else:
579+
data.append(chunk_data)
564580
received += len(chunk_data)
565-
data.write(chunk_data)
566581

567582
# Detect extra chunks after reading the entire file.
568583
if size == remainder and self.__chunk_iter:
@@ -571,47 +586,32 @@ def read(self, size: int = -1) -> bytes:
571586
except StopIteration:
572587
pass
573588

574-
self.__position -= received - size
589+
return b"".join(data)
590+
591+
def read(self, size: int = -1) -> bytes:
592+
"""Read at most `size` bytes from the file (less if there
593+
isn't enough data).
594+
595+
The bytes are returned as an instance of :class:`str` (:class:`bytes`
596+
in python 3). If `size` is negative or omitted all data is read.
597+
598+
:Parameters:
599+
- `size` (optional): the number of bytes to read
575600
576-
# Return 'size' bytes and store the rest.
577-
data.seek(size)
578-
self.__buffer = data.read()
579-
data.seek(0)
580-
return data.read(size)
601+
.. versionchanged:: 3.8
602+
This method now only checks for extra chunks after reading the
603+
entire file. Previously, this method would check for extra chunks
604+
on every call.
605+
"""
606+
return self._read_size_or_line(size=size)
581607

582608
def readline(self, size: int = -1) -> bytes: # type: ignore[override]
583609
"""Read one line or up to `size` bytes from the file.
584610
585611
:Parameters:
586612
- `size` (optional): the maximum number of bytes to read
587613
"""
588-
remainder = int(self.length) - self.__position
589-
if size < 0 or size > remainder:
590-
size = remainder
591-
592-
if size == 0:
593-
return EMPTY
594-
595-
received = 0
596-
data = io.BytesIO()
597-
while received < size:
598-
chunk_data = self.readchunk()
599-
pos = chunk_data.find(NEWLN, 0, size)
600-
if pos != -1:
601-
size = received + pos + 1
602-
603-
received += len(chunk_data)
604-
data.write(chunk_data)
605-
if pos != -1:
606-
break
607-
608-
self.__position -= received - size
609-
610-
# Return 'size' bytes and store the rest.
611-
data.seek(size)
612-
self.__buffer = data.read()
613-
data.seek(0)
614-
return data.read(size)
614+
return self._read_size_or_line(size=size, line=True)
615615

616616
def tell(self) -> int:
617617
"""Return the current position of this file."""
@@ -651,6 +651,7 @@ def seek(self, pos: int, whence: int = _SEEK_SET) -> int:
651651

652652
self.__position = new_pos
653653
self.__buffer = EMPTY
654+
self.__buffer_pos = 0
654655
if self.__chunk_iter:
655656
self.__chunk_iter.close()
656657
self.__chunk_iter = None

0 commit comments

Comments
 (0)