
Commit af235c1

Merge pull request #66 from vthorsteinsson/fix-tokens

Unicode in SubwordTextEncoder

2 parents: c4c768f + bc75385

5 files changed: +152 −124 lines

tensor2tensor/data_generators/generator_utils.py

File mode changed from 100644 to 100755
Lines changed: 7 additions & 3 deletions

@@ -242,9 +242,12 @@ def get_or_generate_vocab(tmp_dir, vocab_filename, vocab_size):

       # For some datasets a second extraction is necessary.
       if ".gz" in lang_file:
-        tf.logging.info("Unpacking subdirectory %s" % filepath)
         new_filepath = os.path.join(tmp_dir, lang_file[:-3])
-        gunzip_file(filepath, new_filepath)
+        if os.path.exists(new_filepath):
+          tf.logging.info("Subdirectory %s already exists, skipping unpacking" % filepath)
+        else:
+          tf.logging.info("Unpacking subdirectory %s" % filepath)
+          gunzip_file(filepath, new_filepath)
         filepath = new_filepath

       # Use Tokenizer to count the word occurrences.

@@ -258,7 +261,8 @@ def get_or_generate_vocab(tmp_dir, vocab_filename, vocab_size):
       _ = tokenizer.encode(line)

   vocab = SubwordTextEncoder.build_to_target_size(
-      vocab_size, tokenizer.token_counts, vocab_filepath, 1, 1e3)
+      vocab_size, tokenizer.token_counts, 1, 1e3)
+  vocab.store_to_file(vocab_filepath)
   return vocab

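The net effect of this hunk is twofold: the gunzip step is skipped when the extracted file already exists, and build_to_target_size no longer writes the vocabulary itself; the caller persists it explicitly. A minimal sketch of the new caller-side flow, assuming the module path tensor2tensor.data_generators.text_encoder and toy token counts (the real code derives them from the downloaded corpus):

from tensor2tensor.data_generators import text_encoder

# Toy counts standing in for tokenizer.token_counts built from the corpus.
token_counts = {u"hello": 12, u"world": 7, u"heldur": 3}

# The store_filename argument is gone from build_to_target_size();
# writing the vocabulary is now an explicit, separate step.
vocab = text_encoder.SubwordTextEncoder.build_to_target_size(
    100, token_counts, 1, 1e3)
vocab.store_to_file("/tmp/vocab.subwords")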

tensor2tensor/data_generators/snli.py

File mode changed from 100644 to 100755
Lines changed: 8 additions & 8 deletions

@@ -136,14 +136,14 @@ def _get_or_generate_vocab(tmp_dir, vocab_filename, vocab_size):
   if tf.gfile.Exists(vocab_filepath):
     gs = text_encoder.SubwordTextEncoder(vocab_filepath)
     return gs
-  else:
-    example_file = os.path.join(tmp_dir, _EXAMPLES_FILE)
-    gs = text_encoder.SubwordTextEncoder()
-    token_counts = text_encoder.SubwordTextEncoder.get_token_counts(
-        example_file, corpus_max_lines=1000000)
-    gs = gs.build_to_target_size(
-        vocab_size, token_counts, vocab_filepath, min_val=1, max_val=1e3)
-    return gs
+  example_file = os.path.join(tmp_dir, _EXAMPLES_FILE)
+  gs = text_encoder.SubwordTextEncoder()
+  token_counts = text_encoder.SubwordTextEncoder.get_token_counts(
+      example_file, corpus_max_lines=1000000)
+  gs = gs.build_to_target_size(
+      vocab_size, token_counts, min_val=1, max_val=1e3)
+  gs.store_to_file(vocab_filepath)
+  return gs


 def snli_token_generator(tmp_dir, train, vocab_size):
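The same API change, seen from the SNLI generator: the vocabulary file becomes a cache that is reused when present and written explicitly otherwise. A hedged sketch of that cache-or-build pattern, with illustrative arguments standing in for the real SNLI constants (_EXAMPLES_FILE, tmp_dir):

import tensorflow as tf
from tensor2tensor.data_generators import text_encoder

def get_or_build_vocab(vocab_filepath, examples_file, vocab_size):
  # Reuse a previously stored vocabulary if one exists.
  if tf.gfile.Exists(vocab_filepath):
    return text_encoder.SubwordTextEncoder(vocab_filepath)
  token_counts = text_encoder.SubwordTextEncoder.get_token_counts(
      examples_file, corpus_max_lines=1000000)
  gs = text_encoder.SubwordTextEncoder.build_to_target_size(
      vocab_size, token_counts, min_val=1, max_val=1e3)
  gs.store_to_file(vocab_filepath)  # persist for the next run
  return gs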

tensor2tensor/data_generators/text_encoder.py

File mode changed from 100644 to 100755
Lines changed: 89 additions & 85 deletions
@@ -175,9 +175,9 @@ class SubwordTextEncoder(TextEncoder):
   """

   def __init__(self, filename=None, num_reserved_ids=2):
-    """Read from a file."""
     self._tokenizer = tokenizer.Tokenizer()
     if filename is not None:
+      # Read from a file.
       self._load_from_file(filename)

     super(SubwordTextEncoder, self).__init__(num_reserved_ids=num_reserved_ids)
@@ -235,14 +235,13 @@ def _subtokens_to_tokens(self, subtokens):

   def subtoken_to_subtoken_string(self, subtoken):
     """Subtoken_String (string) corresponding to the given subtoken (id)."""
-    if (subtoken >= 0 and subtoken < self.vocab_size and
-        self._all_subtoken_strings[subtoken]):
-      return self._all_subtoken_strings[subtoken]
-    else:
-      if 0 <= subtoken < self._num_reserved_ids:
-        return '%s_' % RESERVED_TOKENS[subtoken]
-      else:
-        return 'ID%d_' % subtoken
+    if 0 <= subtoken < self.vocab_size:
+      subtoken_string = self._all_subtoken_strings[subtoken]
+      if subtoken_string:
+        return subtoken_string
+    if 0 <= subtoken < self._num_reserved_ids:
+      return '%s_' % RESERVED_TOKENS[subtoken]
+    return 'ID%d_' % subtoken

   def _escaped_token_to_subtokens(self, escaped_token):
     """Converts an escaped token string to a list of subtokens.
@@ -262,21 +261,32 @@ def _escaped_token_to_subtokens(self, escaped_token):
         if subtoken != -1:
           break
         end -= 1
-      ret.append(subtoken)
       if end > pos:
+        ret.append(subtoken)
         pos = end
       else:
-        # This kinda should not happen, but it does. Cop out by skipping the
-        # nonexistent subtoken from the returned list.
-        # print("Unable to find subtoken in string '{0}'".format(escaped_token))
+        # No subtoken in the vocabulary matches escaped_token[pos].
+        # This can happen if the token contains a Unicode character
+        # that did not occur in the vocabulary training set.
+        # The id self.vocab_size - 1 is decoded as Unicode uFFFD,
+        # REPLACEMENT_CHARACTER.
+        ret.append(self.vocab_size - 1)
+        # Ensure that the outer loop continues
         pos += 1
     return ret

+  @classmethod
+  def alphabet(cls, token_counts):
+    """Return the set of Unicode characters that appear in the tokens"""
+    alphabet_set = set()
+    for token in six.iterkeys(token_counts):
+      alphabet_set |= set(token)
+    return alphabet_set
+
   @classmethod
   def build_to_target_size(cls,
                            target_size,
                            token_counts,
-                           store_filename,
                            min_val,
                            max_val,
                            num_iterations=4):
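Two additions above work together: alphabet() collects every Unicode character seen in the training tokens, and _escaped_token_to_subtokens() falls back to id vocab_size - 1, which decodes to U+FFFD (the REPLACEMENT CHARACTER), when it meets a character outside that alphabet. A standalone sketch of the alphabet computation, on toy counts:

import six

def alphabet(token_counts):
  # Union of the characters of every token key.
  alphabet_set = set()
  for token in six.iterkeys(token_counts):
    alphabet_set |= set(token)
  return alphabet_set

print(sorted(alphabet({u"þú": 3, u"ert": 2})))  # characters e, r, t, ú, þ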
@@ -296,50 +306,51 @@ def build_to_target_size(cls,
     Returns:
       a SubwordTextEncoder instance.
     """
-    present_count = (max_val + min_val) // 2
-    tf.logging.info('Trying min_count %d' % present_count)
-    subtokenizer = cls()
-    subtokenizer.build_from_token_counts(token_counts, store_filename,
-                                         present_count, num_iterations)
-
-    if min_val >= max_val or subtokenizer.vocab_size == target_size:
-      return subtokenizer
-    elif subtokenizer.vocab_size > target_size:
-      other_subtokenizer = cls.build_to_target_size(
-          target_size, token_counts, store_filename, present_count + 1, max_val,
-          num_iterations)
-      if (abs(other_subtokenizer.vocab_size - target_size) <
-          abs(subtokenizer.vocab_size - target_size)):
-        return other_subtokenizer
-      else:
+
+    # Calculate the alphabet, i.e. the set of all Unicode characters
+    # that appear in the tokens
+    alphabet_set = cls.alphabet(token_counts)
+    tf.logging.info('Alphabet contains %d characters' % len(alphabet_set))
+
+    def bisect(min_val, max_val):
+      present_count = (max_val + min_val) // 2
+      tf.logging.info('Trying min_count %d' % present_count)
+      subtokenizer = cls()
+      subtokenizer.build_from_token_counts(token_counts, alphabet_set,
+                                           present_count, num_iterations)
+
+      if min_val >= max_val or subtokenizer.vocab_size == target_size:
         return subtokenizer
-    else:
-      other_subtokenizer = cls.build_to_target_size(
-          target_size, token_counts, store_filename, min_val, present_count - 1,
-          num_iterations)
+      if subtokenizer.vocab_size > target_size:
+        other_subtokenizer = bisect(present_count + 1, max_val)
+      else:
+        other_subtokenizer = bisect(min_val, present_count - 1)
       if (abs(other_subtokenizer.vocab_size - target_size) <
           abs(subtokenizer.vocab_size - target_size)):
         return other_subtokenizer
       else:
         return subtokenizer

+    return bisect(min_val, max_val)
+
   def build_from_token_counts(self,
                               token_counts,
-                              store_filename,
+                              alphabet_set,
                               min_count,
                               num_iterations=4):
     """Train a SubwordTextEncoder based on a dictionary of word counts.

     Args:
-      token_counts: a dictionary of string to int.
-      store_filename: a string - where to write the vocabulary.
+      token_counts: a dictionary of Unicode strings to int.
+      alphabet_set: the set of Unicode characters that appear in the tokens.
       min_count: an integer - discard subtokens with lower counts.
       num_iterations: an integer. how many iterations of refinement.
     """
     # We build iteratively. On each iteration, we segment all the words,
     # then count the resulting potential subtokens, keeping the ones
     # with high enough counts for our new vocabulary.
     for i in xrange(num_iterations):
+      tf.logging.info("Iteration {0}".format(i))
       counts = defaultdict(int)
       for token, count in six.iteritems(token_counts):
         escaped_token = self._escape_token(token)
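The recursive calls to build_to_target_size are replaced by a nested bisect() helper, so the alphabet is computed once and the store_filename threading disappears. A simplified, standalone sketch of the same search, where build_vocab_size is a hypothetical stand-in for constructing a subtokenizer at a given min_count and reading its vocab_size:

def closest_min_count(target_size, build_vocab_size, min_val, max_val):
  def bisect(lo, hi):
    mid = (lo + hi) // 2
    size = build_vocab_size(mid)
    if lo >= hi or size == target_size:
      return mid, size
    if size > target_size:
      other = bisect(mid + 1, hi)   # a larger min_count shrinks the vocabulary
    else:
      other = bisect(lo, mid - 1)   # a smaller min_count grows it
    # Keep whichever candidate lands closer to the target.
    return other if abs(other[1] - target_size) < abs(size - target_size) else (mid, size)
  return bisect(min_val, max_val)

# Pretend the vocabulary size falls off roughly as 5000 // min_count.
print(closest_min_count(250, lambda c: 5000 // c, 1, 1000))  # -> (20, 250)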
@@ -353,39 +364,49 @@ def build_from_token_counts(self,
         starts = []
         for subtoken in subtokens:
           starts.append(pos)
-          pos += len(self.subtoken_to_subtoken_string(subtoken))
+          pos += len(self._all_subtoken_strings[subtoken])
         for start in starts:
-          for end in xrange(start + 1, len(escaped_token)):
+          for end in xrange(start + 1, len(escaped_token) + 1):
             subtoken_string = escaped_token[start:end]
             counts[subtoken_string] += count
-      # array of lists of candidate subtoken strings, by length
+      # Array of sets of candidate subtoken strings, by length
       len_to_subtoken_strings = []
       for subtoken_string, count in six.iteritems(counts):
         lsub = len(subtoken_string)
-        # all subtoken strings of length 1 are included regardless of count
-        if count < min_count and lsub != 1:
+        # All subtoken strings of length 1 are automatically included
+        # later, so we don't need to consider them here
+        if count < min_count or lsub <= 1:
           continue
+        # Add this subtoken string to its length set
        while len(len_to_subtoken_strings) <= lsub:
-          len_to_subtoken_strings.append([])
-        len_to_subtoken_strings[lsub].append(subtoken_string)
+          len_to_subtoken_strings.append(set())
+        len_to_subtoken_strings[lsub].add(subtoken_string)
       new_subtoken_strings = []
       # consider the candidates longest to shortest, so that if we accept
       # a longer subtoken string, we can decrement the counts of its prefixes.
-      for subtoken_strings in len_to_subtoken_strings[::-1]:
+      for subtoken_strings in reversed(len_to_subtoken_strings[2:]):
         for subtoken_string in subtoken_strings:
           count = counts[subtoken_string]
-          if count < min_count and len(subtoken_string) != 1:
-            # subtoken strings of length 1 are included regardless of count
+          if count < min_count:
             continue
-          new_subtoken_strings.append((-count, subtoken_string))
+          new_subtoken_strings.append((count, subtoken_string))
           for l in xrange(1, len(subtoken_string)):
             counts[subtoken_string[:l]] -= count
-      # Make sure to include the underscore as a subtoken string
-      new_subtoken_strings.append((0, '_'))
-      new_subtoken_strings.sort()
-      self._init_from_list([''] * self._num_reserved_ids +
+      # Sort what we've got so far in decreasing order by count
+      new_subtoken_strings.sort(reverse = True)
+      # Add the alphabet set at the end of the vocabulary list
+      for char in alphabet_set:
+        new_subtoken_strings.append((0, char))
+      # Also include the Unicode REPLACEMENT CHARACTER to use
+      # when encountering previously unseen Unicode characters
+      # in the input (i.e. input external to the tokenizer training
+      # set, which may thus contain characters not in the alphabet_set).
+      # This must be the last entry in the subtoken vocabulary list.
+      new_subtoken_strings.append((0, u'\uFFFD'))
+      # Now we have a candidate vocabulary
+      self._init_from_list([u''] * self._num_reserved_ids +
                            [p[1] for p in new_subtoken_strings])
-      print('vocab_size = %d' % self.vocab_size)
+      tf.logging.info('vocab_size = %d' % self.vocab_size)

     original = 'This sentence was encoded by the SubwordTextEncoder.'
     encoded = self.encode(original)
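The end of each iteration now assembles the vocabulary in a fixed order: reserved slots first, multi-character candidates sorted by descending count, then the single-character alphabet, and finally u'\uFFFD' so the last id always decodes to the REPLACEMENT CHARACTER. A toy illustration of that ordering (the counts and alphabet below are made up):

num_reserved_ids = 2
candidates = [(17, u'the_'), (4, u'ing_'), (9, u'er')]
alphabet_set = {u'a', u'b', u'c'}

entries = sorted(candidates, reverse=True)        # by count, descending
entries += [(0, ch) for ch in alphabet_set]       # every character seen in training
entries += [(0, u'\uFFFD')]                       # must stay last
vocab = [u''] * num_reserved_ids + [s for _, s in entries]
assert vocab[-1] == u'\uFFFD'
print(vocab)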
@@ -394,33 +415,33 @@ def build_from_token_counts(self,
     decoded = self.decode(encoded)
     print(decoded)
     assert decoded == original
-    self._store_to_file(store_filename)
+
+  def dump(self):
+    """ Debugging dump of the current subtoken vocabulary """
+    subtoken_strings = [(i, s) for s, i in six.iteritems(self._subtoken_string_to_id)]
+    print(u", ".join(u"{0} : '{1}'".format(i, s) for i, s in sorted(subtoken_strings)))

   def _init_from_list(self, subtoken_strings):
     """Initialize from a list of subtoken strings."""
     self._all_subtoken_strings = subtoken_strings
-    self._subtoken_string_to_id = {}
-    for i in xrange(len(subtoken_strings)):
-      subtoken_string = subtoken_strings[i]
-      if subtoken_string:
-        self._subtoken_string_to_id[subtoken_string] = i
+    self._subtoken_string_to_id = { s : i for i, s in enumerate(subtoken_strings) if s }

   def _load_from_file(self, filename):
     """Load from a file."""
     subtoken_strings = []
     with tf.gfile.Open(filename) as f:
       for line in f:
         if six.PY2:
-          subtoken_strings.append(line.strip()[1:-1].decode('string-escape'))
+          subtoken_strings.append(line.strip()[1:-1].decode('utf-8'))
         else:
           subtoken_strings.append(line.strip()[1:-1])
     self._init_from_list(subtoken_strings)

-  def _store_to_file(self, filename):
+  def store_to_file(self, filename):
     with tf.gfile.Open(filename, 'w') as f:
       for subtoken_string in self._all_subtoken_strings:
         if six.PY2:
-          f.write('\'' + subtoken_string.encode('string-escape') + '\'\n')
+          f.write('\'' + subtoken_string.encode('utf-8') + '\'\n')
         else:
           f.write('\'' + subtoken_string + '\'\n')
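The rename to store_to_file() makes persistence part of the public API, and both directions now use UTF-8 under Python 2 instead of 'string-escape'. A sketch of the round trip implied by the quoted-line format, using io.open as a stand-in for tf.gfile.Open and an illustrative path:

import io

subtokens = [u'', u'', u'the_', u'þ']   # leading empty strings are reserved slots
with io.open('/tmp/vocab.subwords', 'w', encoding='utf-8') as f:
  for s in subtokens:
    f.write(u"'" + s + u"'\n")          # one quoted subtoken per line

with io.open('/tmp/vocab.subwords', encoding='utf-8') as f:
  loaded = [line.strip()[1:-1] for line in f]
assert loaded == subtokens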

@@ -437,43 +458,26 @@ def _escape_token(self, token):
   def _unescape_token(self, escaped_token):
     r"""Remove '_' from end, then translate '\\'->'\' and '\u'->'_'.

-    TODO(noam): There must be some better way to do this with regexps.
-
     Args:
       escaped_token: a string
     Returns:
       token: a string
     """
     assert escaped_token[-1] == '_'
-    escaped_token = escaped_token[:-1]
-    if '\\' not in escaped_token:
-      return escaped_token
-    ret = ''
-    pos = 0
-    while pos < len(escaped_token):
-      if escaped_token[pos] == '\\' and pos + 1 < len(escaped_token):
-        if escaped_token[pos + 1] == 'u':
-          ret += '_'
-        else:
-          ret += escaped_token[pos + 1]
-        pos += 1
-      pos += 1
-    return ret
+    return escaped_token[:-1].replace('\\u', '_').replace('\\\\', '\\')

   @classmethod
   def get_token_counts(cls, text_filepattern, corpus_max_lines):
-    """Read the corpus and compute a dictionary of word counts."""
+    """Read the corpus and compute a dictionary of token counts."""
     tok = tokenizer.Tokenizer()
-    token_counts = {}
     lines_read = 0
     filenames = tf.gfile.Glob(text_filepattern)
     for text_filename in filenames:
       with tf.gfile.Open(text_filename) as f:
         for line in f:
-          tokens = tok.encode(line.strip())
-          for t in tokens:
-            token_counts[t] = token_counts.get(t, 0) + 1
+          # The tokenizer updates token_counts in encode()
+          tok.encode(line.strip())
           lines_read += 1
           if corpus_max_lines > 0 and lines_read > corpus_max_lines:
-            return token_counts
-    return token_counts
+            return tok.token_counts
+    return tok.token_counts
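The hand-rolled unescaping loop collapses into two str.replace calls, and get_token_counts now simply drives the tokenizer, which counts tokens itself during encode(). A standalone sketch of the escape scheme the one-liner reverses, as described in the docstring; escape_token here is an illustrative inverse, not the library's _escape_token:

def escape_token(token):
  # '\' is stored as '\\', '_' as '\u', and a trailing '_' closes the token.
  return token.replace(u'\\', u'\\\\').replace(u'_', u'\\u') + u'_'

def unescape_token(escaped_token):
  assert escaped_token[-1] == u'_'
  return escaped_token[:-1].replace(u'\\u', u'_').replace(u'\\\\', u'\\')

for token in [u'under_score', u'back\\slash', u'plain']:
  assert unescape_token(escape_token(token)) == token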

tensor2tensor/data_generators/text_encoder_build_subword.py

File mode changed from 100644 to 100755
Lines changed: 4 additions & 1 deletion

@@ -59,8 +59,11 @@ def main(unused_argv):
     raise ValueError('Must provide --corpus_filepattern')
   token_counts = text_encoder.SubwordTextEncoder.get_token_counts(
       FLAGS.corpus_filepattern, FLAGS.corpus_max_lines)
-  gs.build_from_token_counts(token_counts, FLAGS.output_fn, FLAGS.min_count,
+  alphabet_set = SubwordTextEncoder.alphabet(token_counts)
+  gs.build_from_token_counts(token_counts, alphabet_set,
+                             FLAGS.min_count,
                              FLAGS.num_iterations)
+  gs.store_to_file(FLAGS.output_fn)


 if __name__ == '__main__':
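With these changes the command-line tool builds a vocabulary in four explicit steps: count tokens, derive the alphabet, train, then store. A minimal sketch of that flow outside the flags machinery, assuming the module path tensor2tensor.data_generators.text_encoder and illustrative file names:

from tensor2tensor.data_generators import text_encoder

token_counts = text_encoder.SubwordTextEncoder.get_token_counts(
    '/tmp/corpus-*.txt', corpus_max_lines=10000)
alphabet_set = text_encoder.SubwordTextEncoder.alphabet(token_counts)

gs = text_encoder.SubwordTextEncoder()
gs.build_from_token_counts(token_counts, alphabet_set,
                           min_count=5, num_iterations=4)
gs.store_to_file('/tmp/my.subword_text_encoder')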
