Skip to content

Commit 1c91111

Browse files
authored
Merge pull request #396 from django-haystack/pr-257
Replace ''.join after char filtering with regex filter
2 parents 1edd85b + ff90912 commit 1c91111

File tree

1 file changed

+5
-18
lines changed

1 file changed

+5
-18
lines changed

pysolr.py

Lines changed: 5 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -83,6 +83,10 @@ def get_version():
8383
# dict key used to add nested documents to a document
8484
NESTED_DOC_KEY = "_childDocuments_"
8585

86+
VALID_XML_CHARS_REGEX = re.compile(
87+
"[^\u0020-\uD7FF\u0009\u000A\u000D\uE000-\uFFFD\U00010000-\U0010FFFF]+"
88+
)
89+
8690

8791
class NullHandler(logging.Handler):
8892
def emit(self, record):
@@ -205,23 +209,6 @@ def safe_urlencode(params, doseq=0):
205209
return urlencode(new_params, doseq)
206210

207211

208-
def is_valid_xml_char_ordinal(i):
209-
"""
210-
Defines whether char is valid to use in xml document
211-
212-
XML standard defines a valid char as::
213-
214-
Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
215-
"""
216-
# conditions ordered by presumed frequency
217-
return (
218-
0x20 <= i <= 0xD7FF
219-
or i in (0x9, 0xA, 0xD)
220-
or 0xE000 <= i <= 0xFFFD
221-
or 0x10000 <= i <= 0x10FFFF
222-
)
223-
224-
225212
def clean_xml_string(s):
226213
"""
227214
Cleans string from invalid xml chars
@@ -230,7 +217,7 @@ def clean_xml_string(s):
230217
231218
http://stackoverflow.com/questions/8733233/filtering-out-certain-bytes-in-python
232219
"""
233-
return "".join(c for c in s if is_valid_xml_char_ordinal(ord(c)))
220+
return VALID_XML_CHARS_REGEX.sub("", s)
234221

235222

236223
class SolrError(Exception):

0 commit comments

Comments (0)