
import six
from six.moves import xrange  # pylint: disable=redefined-builtin
-from collections import defaultdict
from tensor2tensor.data_generators import tokenizer

import tensorflow as tf

PAD = '<pad>'
EOS = '<EOS>'
RESERVED_TOKENS = [PAD, EOS]
-if six.PY2:
-  RESERVED_TOKENS_BYTES = RESERVED_TOKENS
-else:
-  RESERVED_TOKENS_BYTES = [bytes(PAD, 'ascii'), bytes(EOS, 'ascii')]
+


class TextEncoder(object):
  """Base class for converting from ints to/from human readable strings."""
@@ -91,25 +87,17 @@ class ByteTextEncoder(TextEncoder):
  """Encodes each byte to an id. For 8-bit strings only."""

  def encode(self, s):
-    numres = self._num_reserved_ids
-    if six.PY2:
-      return [ord(c) + numres for c in s]
-    # Python3: explicitly convert to UTF-8
-    return [c + numres for c in s.encode("utf-8")]
+    return [ord(c) + self._num_reserved_ids for c in s]

  def decode(self, ids):
-    numres = self._num_reserved_ids
    decoded_ids = []
-    int2byte = six.int2byte
    for id_ in ids:
-      if 0 <= id_ < numres:
-        decoded_ids.append(RESERVED_TOKENS_BYTES[int(id_)])
+      if 0 <= id_ < self._num_reserved_ids:
+        decoded_ids.append(RESERVED_TOKENS[int(id_)])
      else:
-        decoded_ids.append(int2byte(id_ - numres))
-    if six.PY2:
-      return ''.join(decoded_ids)
-    # Python3: join byte arrays and then decode string
-    return b''.join(decoded_ids).decode("utf-8")
+        decoded_ids.append(chr(id_))
+
+    return ''.join(decoded_ids)

  @property
  def vocab_size(self):
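
As context for the byte-level scheme above: raw byte values are shifted past the reserved ids so that 0 and 1 stay free for PAD and EOS. A minimal standalone sketch (names here are illustrative, not the module's API):

    NUM_RESERVED = 2  # id 0 = '<pad>', id 1 = '<EOS>'

    def byte_encode(s):
      # Offset each byte value so the reserved ids stay unused.
      return [ord(c) + NUM_RESERVED for c in s]

    print(byte_encode('hi'))  # [106, 107]
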
@@ -123,16 +111,20 @@ def __init__(self, vocab_filename, reverse=False, num_reserved_ids=2):
    """Initialize from a file, one token per line."""
    super(TokenTextEncoder, self).__init__(num_reserved_ids=num_reserved_ids)
    self._reverse = reverse
-    self._load_vocab_from_file(vocab_filename)
+    if vocab_filename is not None:
+      self._load_vocab_from_file(vocab_filename)

  def encode(self, sentence):
    """Converts a space-separated string of tokens to a list of ids."""
    ret = [self._token_to_id[tok] for tok in sentence.strip().split()]
-    return ret[::-1] if self._reverse else ret
+    if self._reverse:
+      ret = ret[::-1]
+    return ret

  def decode(self, ids):
-    seq = reversed(ids) if self._reverse else ids
-    return ' '.join([self._safe_id_to_token(i) for i in seq])
+    if self._reverse:
+      ids = ids[::-1]
+    return ' '.join([self._safe_id_to_token(i) for i in ids])

  @property
  def vocab_size(self):
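
For reference, the reverse flag applies the same reversal on both sides, so encode/decode still round-trips; a toy sketch with a hypothetical two-word vocabulary:

    vocab = {'hello': 2, 'world': 3}
    inv = {v: k for k, v in vocab.items()}

    def encode(sentence, reverse=False):
      ids = [vocab[tok] for tok in sentence.strip().split()]
      return ids[::-1] if reverse else ids

    def decode(ids, reverse=False):
      if reverse:
        ids = ids[::-1]
      return ' '.join(inv[i] for i in ids)

    assert decode(encode('hello world', reverse=True), reverse=True) == 'hello world'
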
@@ -251,22 +243,15 @@ def _escaped_token_to_subtokens(self, escaped_token):
    """
    ret = []
    pos = 0
-    lesc = len(escaped_token)
-    while pos < lesc:
-      end = lesc
-      while end > pos:
+    while pos < len(escaped_token):
+      end = len(escaped_token)
+      while True:
        subtoken = self._subtoken_string_to_id.get(escaped_token[pos:end], -1)
        if subtoken != -1:
          break
        end -= 1
      ret.append(subtoken)
-      if end > pos:
-        pos = end
-      else:
-        # This kinda should not happen, but it does. Cop out by skipping the
-        # nonexistent subtoken from the returned list.
-        # print("Unable to find subtoken in string '{0}'".format(escaped_token))
-        pos += 1
+      pos = end
    return ret

  @classmethod
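
The loop above is a greedy longest-match-first segmentation: take the longest known prefix of the remainder, emit it, and continue from there. A standalone sketch, under the assumption (which the builder below enforces by seeding all single characters) that every character appears in the vocabulary; the toy vocabulary is made up:

    SUBTOKENS = {'hell', 'o_', 'h', 'e', 'l', 'o', '_'}

    def segment(escaped_token):
      ret, pos = [], 0
      while pos < len(escaped_token):
        # Shrink the candidate until it matches a known subtoken string.
        end = len(escaped_token)
        while end > pos and escaped_token[pos:end] not in SUBTOKENS:
          end -= 1
        assert end > pos, 'character %r missing from vocab' % escaped_token[pos]
        ret.append(escaped_token[pos:end])
        pos = end
      return ret

    print(segment('hello_'))  # ['hell', 'o_']
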
@@ -337,13 +322,13 @@ def build_from_token_counts(self,
    # then count the resulting potential subtokens, keeping the ones
    # with high enough counts for our new vocabulary.
    for i in xrange(num_iterations):
-      counts = defaultdict(int)
+      counts = {}
      for token, count in six.iteritems(token_counts):
        escaped_token = self._escape_token(token)
        # we will count all tails of the escaped_token, starting from boundaries
        # determined by our current segmentation.
        if i == 0:
-          starts = xrange(len(escaped_token))
+          starts = list(range(len(escaped_token)))
        else:
          subtokens = self._escaped_token_to_subtokens(escaped_token)
          pos = 0
@@ -352,33 +337,31 @@
            starts.append(pos)
            pos += len(self.subtoken_to_subtoken_string(subtoken))
        for start in starts:
-          for end in xrange(start + 1, len(escaped_token)):
+          for end in xrange(start + 1, len(escaped_token) + 1):
            subtoken_string = escaped_token[start:end]
-            counts[subtoken_string] += count
+            counts[subtoken_string] = counts.get(subtoken_string, 0) + count
      # array of lists of candidate subtoken strings, by length
      len_to_subtoken_strings = []
      for subtoken_string, count in six.iteritems(counts):
-        lsub = len(subtoken_string)
-        # all subtoken strings of length 1 are included regardless of count
-        if count < min_count and lsub != 1:
+        if count < min_count or len(subtoken_string) <= 1:
          continue
-        while len(len_to_subtoken_strings) <= lsub:
+        while len(len_to_subtoken_strings) <= len(subtoken_string):
          len_to_subtoken_strings.append([])
-        len_to_subtoken_strings[lsub].append(subtoken_string)
+        len_to_subtoken_strings[len(subtoken_string)].append(subtoken_string)
      new_subtoken_strings = []
      # consider the candidates longest to shortest, so that if we accept
      # a longer subtoken string, we can decrement the counts of its prefixes.
      for subtoken_strings in len_to_subtoken_strings[::-1]:
        for subtoken_string in subtoken_strings:
          count = counts[subtoken_string]
-          if count < min_count and len(subtoken_string) != 1:
-            # subtoken strings of length 1 are included regardless of count
+          if count < min_count:
            continue
          new_subtoken_strings.append((-count, subtoken_string))
          for l in xrange(1, len(subtoken_string)):
            counts[subtoken_string[:l]] -= count
-      # Make sure to include the underscore as a subtoken string
-      new_subtoken_strings.append((0, '_'))
+      # make sure we have all single characters.
+      new_subtoken_strings.extend([(-counts.get(chr(i), 0), chr(i))
+                                   for i in xrange(2**8)])
      new_subtoken_strings.sort()
      self._init_from_list([''] * self._num_reserved_ids +
                           [p[1] for p in new_subtoken_strings])
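
To see why the candidates are visited longest to shortest: every occurrence of an accepted string also counted once toward each of its proper prefixes, so those prefix counts are discounted and a prefix only survives on independent evidence. A toy trace with made-up counts and min_count = 3:

    min_count = 3
    counts = {'the': 10, 'th': 12, 't': 15}
    accepted = []
    for s in sorted(counts, key=len, reverse=True):
      c = counts[s]
      if c < min_count:
        continue
      accepted.append((-c, s))
      for l in range(1, len(s)):
        counts[s[:l]] -= c   # discount occurrences explained by s

    print(accepted)  # [(-10, 'the'), (-5, 't')] -- 'th' dropped to 2 and fell out
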
@@ -407,19 +390,13 @@ def _load_from_file(self, filename):
    subtoken_strings = []
    with tf.gfile.Open(filename) as f:
      for line in f:
-        if six.PY2:
-          subtoken_strings.append(line.strip()[1:-1].decode('string-escape'))
-        else:
-          subtoken_strings.append(line.strip()[1:-1])
+        subtoken_strings.append(line.strip()[1:-1].decode('string-escape'))
    self._init_from_list(subtoken_strings)

  def _store_to_file(self, filename):
    with tf.gfile.Open(filename, 'w') as f:
      for subtoken_string in self._all_subtoken_strings:
-        if six.PY2:
-          f.write('\'' + subtoken_string.encode('string-escape') + '\'\n')
-        else:
-          f.write('\'' + subtoken_string + '\'\n')
+        f.write('\'' + subtoken_string.encode('string-escape') + '\'\n')

  def _escape_token(self, token):
    r"""Translate '\'->'\\' and '_'->'\u', then append '_'.
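
The escaping named in the docstring above protects '_' so it can serve as the end-of-token marker. A round-trip sketch, assuming tokens contain no literal backslashes (helper names are illustrative):

    def escape_token(token):
      # '\' -> '\\' and '_' -> '\u', then a trailing '_' marks the boundary.
      return token.replace('\\', '\\\\').replace('_', '\\u') + '_'

    def unescape_token(escaped):
      # Drop the trailing '_' and undo the substitutions (exact inverse
      # only for tokens without literal backslashes).
      assert escaped.endswith('_')
      return escaped[:-1].replace('\\u', '_').replace('\\\\', '\\')

    assert unescape_token(escape_token('foo_bar')) == 'foo_bar'
    print(escape_token('foo_bar'))  # foo\ubar_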