@@ -149,13 +149,14 @@ class UnicodeRegex(object):
149149 """Ad-hoc hack to recognize all punctuation and symbols."""
150150
def __init__(self):
    """Compile the regexes used to split punctuation and symbols from text.

    Sets three attributes:
      nondigit_punct_re: a non-digit character followed by a punctuation
        character (two groups).
      punct_nondigit_re: a punctuation character followed by a non-digit
        character (two groups).
      symbol_re: a single symbol character (one group).
    """
    # NOTE(review): the characters are placed into the character class
    # unescaped, so `re` reads sequences such as `\]` as escapes -- confirm
    # this matches the intended tokenization before changing it.
    punct_class = "[" + self.property_chars("P") + "]"
    symbol_class = "[" + self.property_chars("S") + "]"

    self.nondigit_punct_re = re.compile(r"([^\d])(" + punct_class + r")")
    self.punct_nondigit_re = re.compile(r"(" + punct_class + r")([^\d])")
    self.symbol_re = re.compile("(" + symbol_class + ")")
156+
def property_chars(self, prefix):
    """Return all characters whose Unicode category starts with `prefix`.

    Args:
      prefix: prefix of a Unicode general category name as reported by
        `unicodedata.category`, e.g. "P" or "S".

    Returns:
      a string containing every matching character, in ascending code-point
      order.
    """
    # NOTE: this walks the entire code-point range on every call, so callers
    # should keep the result rather than recomputing it.
    matching = []
    # sys.maxunicode is itself a valid code point, hence the inclusive bound.
    for code_point in range(sys.maxunicode + 1):
        char = six.unichr(code_point)
        if unicodedata.category(char).startswith(prefix):
            matching.append(char)
    return "".join(matching)
159160
160161
161162def bleu_tokenize (string ):
@@ -182,9 +183,10 @@ def bleu_tokenize(string):
182183 Returns:
183184 a list of tokens
184185 """
185- string = UnicodeRegex .nondigit_punct_re .sub (r"\1 \2 " , string )
186- string = UnicodeRegex .punct_nondigit_re .sub (r" \1 \2" , string )
187- string = UnicodeRegex .symbol_re .sub (r" \1 " , string )
186+ uregex = UnicodeRegex ()
187+ string = uregex .nondigit_punct_re .sub (r"\1 \2 " , string )
188+ string = uregex .punct_nondigit_re .sub (r" \1 \2" , string )
189+ string = uregex .symbol_re .sub (r" \1 " , string )
188190 return string .split ()
189191
190192
0 commit comments