@@ -83,6 +83,10 @@ def get_version():
8383# dict key used to add nested documents to a document
8484NESTED_DOC_KEY = "_childDocuments_"
8585
86+ VALID_XML_CHARS_REGEX = re.compile(
87+     "[^\u0020-\uD7FF\u0009\u000A\u000D\uE000-\uFFFD\U00010000-\U0010FFFF]+"
88+ )  # matches runs of characters outside the XML 1.0 ``Char`` production
89+
8690
8791class NullHandler (logging .Handler ):
8892 def emit (self , record ):
@@ -205,23 +209,6 @@ def safe_urlencode(params, doseq=0):
205209 return urlencode (new_params , doseq )
206210
207211
208- def is_valid_xml_char_ordinal (i ):
209- """
210- Defines whether char is valid to use in xml document
211-
212- XML standard defines a valid char as::
213-
214- Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
215- """
216- # conditions ordered by presumed frequency
217- return (
218- 0x20 <= i <= 0xD7FF
219- or i in (0x9 , 0xA , 0xD )
220- or 0xE000 <= i <= 0xFFFD
221- or 0x10000 <= i <= 0x10FFFF
222- )
223-
224-
225212def clean_xml_string (s ):
226213 """
227214 Cleans string from invalid xml chars
@@ -230,7 +217,7 @@ def clean_xml_string(s):
230217
231218 http://stackoverflow.com/questions/8733233/filtering-out-certain-bytes-in-python
232219 """
233- return "" . join ( c for c in s if is_valid_xml_char_ordinal ( ord ( c )) )
220+ return VALID_XML_CHARS_REGEX . sub ( "" , s )
234221
235222
236223class SolrError (Exception ):
0 commit comments