Internal merge

Ryan Sepassi · Ryan Sepassi · commit c7f24da3785f · 2018-01-12T17:16:44.000-08:00
PiperOrigin-RevId: 181811258
diff --git a/tensor2tensor/utils/bleu_hook.py b/tensor2tensor/utils/bleu_hook.py
@@ -149,13 +149,14 @@ class UnicodeRegex(object):
   """Ad-hoc hack to recognize all punctuation and symbols."""
 
   def __init__(self):
-    def _property_chars(prefix):
-      return "".join(six.unichr(x) for x in range(sys.maxunicode)
-                     if unicodedata.category(six.unichr(x)).startswith(prefix))
-    punctuation = self._property_chars("P")
+    punctuation = self.property_chars("P")
     self.nondigit_punct_re = re.compile(r"([^\d])([" + punctuation + r"])")
     self.punct_nondigit_re = re.compile(r"([" + punctuation + r"])([^\d])")
-    self.symbol_re = re.compile("([" + _property_chars("S") + "])")
+    self.symbol_re = re.compile("([" + self.property_chars("S") + "])")
+
+  def property_chars(self, prefix):
+    return "".join(six.unichr(x) for x in range(sys.maxunicode)
+                   if unicodedata.category(six.unichr(x)).startswith(prefix))
 
 
 def bleu_tokenize(string):
@@ -182,9 +183,10 @@ def bleu_tokenize(string):
   Returns:
     a list of tokens
   """
-  string = UnicodeRegex.nondigit_punct_re.sub(r"\1 \2 ", string)
-  string = UnicodeRegex.punct_nondigit_re.sub(r" \1 \2", string)
-  string = UnicodeRegex.symbol_re.sub(r" \1 ", string)
+  uregex = UnicodeRegex()
+  string = uregex.nondigit_punct_re.sub(r"\1 \2 ", string)
+  string = uregex.punct_nondigit_re.sub(r" \1 \2", string)
+  string = uregex.symbol_re.sub(r" \1 ", string)
   return string.split()
 
 
diff --git a/tensor2tensor/utils/bleu_hook_test.py b/tensor2tensor/utils/bleu_hook_test.py
@@ -13,6 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+# coding=utf-8
 """Tests for tensor2tensor.utils.bleu_hook."""
 
 from __future__ import absolute_import
@@ -57,5 +58,10 @@ def testComputeMultipleNgrams(self):
     actual_bleu = 0.3436
     self.assertAllClose(bleu, actual_bleu, atol=1e-03)
 
-if __name__ == '__main__':
+  def testBleuTokenize(self):
+    self.assertEqual(bleu_hook.bleu_tokenize(u"hi, “there”"),
+                     [u"hi", u",", u"“", u"there", u"”"])
+
+
+if __name__ == "__main__":
   tf.test.main()