Skip to content

Commit 6876c42

Browse files
committed
Add preserve_bytes and binary_fields parameters to fix UTF-8 encoding corruption in search results
- Add a to_string_or_bytes function in _util.py for selective binary preservation
- Update the Result class to accept the preserve_bytes and binary_fields parameters
- Update the search methods in SearchCommands and AsyncSearchCommands
- Fix a field-processing issue caused by map object consumption
- Maintain backward compatibility with the default behavior
- Use approximate float comparison in tests to allow for floating-point precision
1 parent 50b9e73 commit 6876c42

File tree

3 files changed

+49
-15
lines changed

3 files changed

+49
-15
lines changed

valkey/commands/search/_util.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,3 +5,15 @@ def to_string(s):
55
return s.decode("utf-8", "ignore")
66
else:
77
return s # Not a string we care about
8+
9+
10+
def to_string_or_bytes(s, preserve_bytes=False, binary_fields=None, field_name=None):
    """Convert a value to ``str``, or keep it as ``bytes`` when requested.

    - **s**: the value to convert. ``str`` values and values that are
      neither ``str`` nor ``bytes`` are returned unchanged.
    - **preserve_bytes**: if True, ``bytes`` values may be returned as-is
      instead of being decoded. Defaults to False, in which case every
      ``bytes`` value is decoded.
    - **binary_fields**: optional collection of field names. When
      ``preserve_bytes`` is True and this is None, every ``bytes`` value is
      preserved; otherwise only values whose ``field_name`` is in this
      collection are preserved.
    - **field_name**: name of the field ``s`` belongs to; only consulted
      when ``binary_fields`` is not None.

    Returns the original ``bytes`` object when preservation applies, the
    UTF-8 decoded string for any other ``bytes`` value, and ``s`` itself
    for every other type.
    """
    if not isinstance(s, bytes):
        # str is already text; any other type is not a string we care about.
        return s

    if preserve_bytes and (binary_fields is None or field_name in binary_fields):
        return s  # Keep as bytes

    # The "ignore" handler drops undecodable bytes instead of raising, so
    # arbitrary binary payloads decode lossily unless they were preserved above.
    return s.decode("utf-8", "ignore")

valkey/commands/search/commands.py

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,8 @@ def _parse_search(self, res, **kwargs):
8080
duration=kwargs["duration"],
8181
has_payload=kwargs["query"]._with_payloads,
8282
with_scores=kwargs["query"]._with_scores,
83+
preserve_bytes=kwargs.get("preserve_bytes", False),
84+
binary_fields=kwargs.get("binary_fields", None),
8385
)
8486

8587
def _parse_aggregate(self, res, **kwargs):
@@ -96,6 +98,8 @@ def _parse_profile(self, res, **kwargs):
9698
duration=kwargs["duration"],
9799
has_payload=query._with_payloads,
98100
with_scores=query._with_scores,
101+
preserve_bytes=kwargs.get("preserve_bytes", False),
102+
binary_fields=kwargs.get("binary_fields", None),
99103
)
100104

101105
return result, parse_to_dict(res[1])
@@ -484,6 +488,8 @@ def search(
484488
self,
485489
query: Union[str, Query],
486490
query_params: Union[Dict[str, Union[str, int, float, bytes]], None] = None,
491+
preserve_bytes: bool = False,
492+
binary_fields: Optional[List[str]] = None,
487493
):
488494
"""
489495
Search the index for a given query, and return a result of documents
@@ -493,6 +499,11 @@ def search(
493499
- **query**: the search query. Either a text for simple queries with
494500
default parameters, or a Query object for complex queries.
495501
See RediSearch's documentation on query format
502+
- **preserve_bytes**: If True, preserve binary field values as bytes
503+
instead of converting to UTF-8 strings
504+
- **binary_fields**: List of field names to preserve as bytes when
505+
preserve_bytes=True. If None, all binary fields
506+
are preserved
496507
497508
For more information see `FT.SEARCH <https://valkey.io/commands/ft.search>`_.
498509
""" # noqa
@@ -504,7 +515,8 @@ def search(
504515
return res
505516

506517
return self._parse_results(
507-
SEARCH_CMD, res, query=query, duration=(time.time() - st) * 1000.0
518+
SEARCH_CMD, res, query=query, duration=(time.time() - st) * 1000.0,
519+
preserve_bytes=preserve_bytes, binary_fields=binary_fields
508520
)
509521

510522
def explain(
@@ -911,6 +923,8 @@ async def search(
911923
self,
912924
query: Union[str, Query],
913925
query_params: Dict[str, Union[str, int, float]] = None,
926+
preserve_bytes: bool = False,
927+
binary_fields: Optional[List[str]] = None,
914928
):
915929
"""
916930
Search the index for a given query, and return a result of documents
@@ -920,6 +934,11 @@ async def search(
920934
- **query**: the search query. Either a text for simple queries with
921935
default parameters, or a Query object for complex queries.
922936
See RediSearch's documentation on query format
937+
- **preserve_bytes**: If True, preserve binary field values as bytes
938+
instead of converting to UTF-8 strings
939+
- **binary_fields**: List of field names to preserve as bytes when
940+
preserve_bytes=True. If None, all binary fields
941+
are preserved
923942
924943
For more information see `FT.SEARCH <https://valkey.io/commands/ft.search>`_.
925944
""" # noqa
@@ -931,7 +950,8 @@ async def search(
931950
return res
932951

933952
return self._parse_results(
934-
SEARCH_CMD, res, query=query, duration=(time.time() - st) * 1000.0
953+
SEARCH_CMD, res, query=query, duration=(time.time() - st) * 1000.0,
954+
preserve_bytes=preserve_bytes, binary_fields=binary_fields
935955
)
936956

937957
async def aggregate(

valkey/commands/search/result.py

Lines changed: 15 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from ._util import to_string
1+
from ._util import to_string, to_string_or_bytes
22
from .document import Document
33

44

@@ -9,7 +9,8 @@ class Result:
99
"""
1010

1111
def __init__(
12-
self, res, hascontent, duration=0, has_payload=False, with_scores=False
12+
self, res, hascontent, duration=0, has_payload=False, with_scores=False,
13+
preserve_bytes=False, binary_fields=None
1314
):
1415
"""
1516
- **snippets**: An optional dictionary of the form
@@ -39,18 +40,19 @@ def __init__(
3940

4041
fields = {}
4142
if hascontent and res[i + fields_offset] is not None:
42-
fields = (
43-
dict(
44-
dict(
45-
zip(
46-
map(to_string, res[i + fields_offset][::2]),
47-
map(to_string, res[i + fields_offset][1::2]),
48-
)
49-
)
43+
field_names = list(map(to_string, res[i + fields_offset][::2]))
44+
field_values = res[i + fields_offset][1::2]
45+
46+
# Process field values with binary preservation
47+
processed_values = []
48+
for field_name, field_value in zip(field_names, field_values):
49+
processed_value = to_string_or_bytes(
50+
field_value, preserve_bytes, binary_fields, field_name
5051
)
51-
if hascontent
52-
else {}
53-
)
52+
processed_values.append(processed_value)
53+
54+
fields = dict(zip(field_names, processed_values))
55+
5456
try:
5557
del fields["id"]
5658
except KeyError:

0 commit comments

Comments
 (0)