This repository was archived by the owner on Jul 7, 2023. It is now read-only.

Commit f65b5e4

rjpower authored and copybara-github committed
Add basic support for TF2 modeling.
This is not complete, but can be extended to add support for TPUs and more
models as required. Tested on CPU/GPU with the following configuration:

    PROBLEM=translate_envi_iwslt32k
    MODEL=transformer
    HPARAMS=transformer_base_single_gpu
    DATA_DIR=$HOME/t2t_data
    TMP_DIR=/tmp/t2t_datagen
    TRAIN_DIR=$HOME/t2t_train/$PROBLEM/$MODEL-$HPARAMS

    t2t-trainer \
      --data_dir=$DATA_DIR \
      --problem=$PROBLEM \
      --model=$MODEL \
      --hparams_set=$HPARAMS \
      --output_dir=$TRAIN_DIR

Verified the loss decreases as expected and checkpoints etc. work.

PiperOrigin-RevId: 312557333
1 parent c762954 commit f65b5e4

File tree

8 files changed: +144 additions, -48 deletions


setup.py

Lines changed: 3 additions & 1 deletion

@@ -63,13 +63,15 @@
         'scipy',
         'six>=1.12.0',
         'sympy',
+        'tensorflow-addons',
         'tensorflow-datasets',
         'tensorflow-gan',
         'tensorflow-probability==0.7.0',
+        'tf_slim',
         'tqdm',
     ],
     extras_require={
-        'tensorflow': ['tensorflow>=1.15.0,<2.0'],
+        'tensorflow': ['tensorflow>=1.15.0'],
         'tensorflow-hub': ['tensorflow-hub>=0.1.1'],
         'tests': [
             # Needed to fix a Travis pytest error.

tensor2tensor/bin/t2t-datagen

Lines changed: 1 addition & 1 deletion

@@ -17,7 +17,7 @@ from __future__ import print_function
 
 from tensor2tensor.bin import t2t_datagen
 
-import tensorflow as tf
+import tensorflow.compat.v1 as tf
 
 def main(argv):
   t2t_datagen.main(argv)
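
The move to tensorflow.compat.v1 is what keeps these TF1-style entry points
working when a TF2 package is installed. A minimal standalone sketch of the
pattern (not part of this commit):

    import tensorflow.compat.v1 as tf

    tf.disable_v2_behavior()  # Restore graph-mode semantics under TF2.

    # TF1-style graph code now runs against a TF2 installation.
    x = tf.placeholder(tf.float32, shape=[None])
    y = x * 2.0
    with tf.Session() as sess:
      print(sess.run(y, feed_dict={x: [1.0, 2.0]}))  # [2.0, 4.0]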

tensor2tensor/bin/t2t-trainer

Lines changed: 2 additions & 2 deletions

@@ -22,12 +22,12 @@ from __future__ import print_function
 
 from tensor2tensor.bin import t2t_trainer
 
-import tensorflow as tf
+import tensorflow.compat.v1 as tf
 
 def main(argv):
   t2t_trainer.main(argv)
 
 
 if __name__ == "__main__":
   tf.logging.set_verbosity(tf.logging.INFO)
-  tf.app.run()
+  tf.app.run(main)
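
Passing main explicitly to tf.app.run avoids depending on the runner
discovering a main function in the __main__ module, which is fragile when the
binary is a thin wrapper around another module. A sketch of the entry-point
pattern in isolation:

    import tensorflow.compat.v1 as tf

    def main(argv):
      del argv  # Unused.
      tf.logging.info("running")

    if __name__ == "__main__":
      tf.logging.set_verbosity(tf.logging.INFO)
      tf.app.run(main)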

tensor2tensor/bin/t2t_trainer.py

Lines changed: 4 additions & 4 deletions

@@ -36,8 +36,6 @@
 from tensor2tensor.utils import usr_dir
 import tensorflow.compat.v1 as tf
 
-from tensorflow.contrib.tpu.python.tpu import tpu_config
-
 
 flags = tf.flags
 FLAGS = flags.FLAGS
@@ -242,8 +240,10 @@ def create_run_config(hp, output_dir=None):
   save_ckpt_steps = None  # Disable the default saver
   save_ckpt_secs = None  # Disable the default saver
   tpu_config_extra_kwargs = {
-      "num_cores_per_replica": 1,
-      "per_host_input_for_training": tpu_config.InputPipelineConfig.BROADCAST,
+      "num_cores_per_replica":
+          1,
+      "per_host_input_for_training":
+          tf.estimator.tpu.InputPipelineConfig.BROADCAST,
   }
 
   # the various custom getters we have written do not play well together yet.
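
tf.estimator.tpu is the non-contrib home of the TPU Estimator config classes,
so InputPipelineConfig.BROADCAST survives the removal of
tensorflow.contrib.tpu. A hedged sketch of how kwargs like these feed a
TPUConfig (the RunConfig values below are placeholders, not this repo's exact
plumbing):

    import tensorflow.compat.v1 as tf

    tpu_config = tf.estimator.tpu.TPUConfig(
        iterations_per_loop=100,
        num_cores_per_replica=1,
        per_host_input_for_training=(
            tf.estimator.tpu.InputPipelineConfig.BROADCAST))
    run_config = tf.estimator.tpu.RunConfig(
        model_dir="/tmp/model", tpu_config=tpu_config)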

tensor2tensor/models/__init__.py

Lines changed: 20 additions & 15 deletions

@@ -30,10 +30,6 @@
 from tensor2tensor.models import image_transformer
 from tensor2tensor.models import image_transformer_2d
 from tensor2tensor.models import lstm
-from tensor2tensor.models import mtf_image_transformer
-from tensor2tensor.models import mtf_resnet
-from tensor2tensor.models import mtf_transformer
-from tensor2tensor.models import mtf_transformer2
 from tensor2tensor.models import neural_assistant
 from tensor2tensor.models import neural_gpu
 from tensor2tensor.models import resnet
@@ -47,15 +43,9 @@
 from tensor2tensor.models.neural_architecture_search import nas_model
 from tensor2tensor.models.research import adafactor_experiments
 from tensor2tensor.models.research import aligned
-from tensor2tensor.models.research import attention_lm
-from tensor2tensor.models.research import attention_lm_moe
 from tensor2tensor.models.research import autoencoders
 from tensor2tensor.models.research import cycle_gan
 from tensor2tensor.models.research import gene_expression
-from tensor2tensor.models.research import glow
-from tensor2tensor.models.research import lm_experiments
-from tensor2tensor.models.research import moe_experiments
-from tensor2tensor.models.research import multiquery_paper
 from tensor2tensor.models.research import neural_stack
 from tensor2tensor.models.research import rl
 from tensor2tensor.models.research import shuffle_network
@@ -69,19 +59,34 @@
 from tensor2tensor.models.research import transformer_symshard
 from tensor2tensor.models.research import transformer_vae
 from tensor2tensor.models.research import universal_transformer
-from tensor2tensor.models.research import vqa_attention
-from tensor2tensor.models.research import vqa_recurrent_self_attention
-from tensor2tensor.models.research import vqa_self_attention
 from tensor2tensor.models.video import basic_deterministic
 from tensor2tensor.models.video import basic_recurrent
 from tensor2tensor.models.video import basic_stochastic
 from tensor2tensor.models.video import emily
-from tensor2tensor.models.video import epva
-from tensor2tensor.models.video import next_frame_glow
 from tensor2tensor.models.video import savp
 from tensor2tensor.models.video import sv2p
+from tensor2tensor.utils import contrib
 from tensor2tensor.utils import registry
 
+# The following models can't be imported under TF2
+if not contrib.is_tf2:
+  # pylint: disable=g-import-not-at-top
+  from tensor2tensor.models.research import attention_lm
+  from tensor2tensor.models.research import attention_lm_moe
+  from tensor2tensor.models.research import glow
+  from tensor2tensor.models.research import lm_experiments
+  from tensor2tensor.models.research import moe_experiments
+  from tensor2tensor.models.research import multiquery_paper
+  from tensor2tensor.models import mtf_image_transformer
+  from tensor2tensor.models import mtf_resnet
+  from tensor2tensor.models import mtf_transformer
+  from tensor2tensor.models import mtf_transformer2
+  from tensor2tensor.models.research import vqa_attention
+  from tensor2tensor.models.research import vqa_recurrent_self_attention
+  from tensor2tensor.models.research import vqa_self_attention
+  from tensor2tensor.models.video import epva
+  from tensor2tensor.models.video import next_frame_glow
+  # pylint: enable=g-import-not-at-top
 
 # pylint: disable=unused-import
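
Because the guarded models are skipped at import time, they never register,
and the registry under TF2 simply lacks them. A quick way to observe the
effect (sketch; registry.list_models is the package's public listing):

    import tensor2tensor.models  # Importing the package triggers registration.
    from tensor2tensor.utils import registry

    # Under TF2 the list omits contrib-dependent models such as "glow".
    print(registry.list_models())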

tensor2tensor/utils/contrib.py

Lines changed: 80 additions & 16 deletions

@@ -23,23 +23,40 @@
 from __future__ import print_function  # Not necessary in a Python 3-only module
 
 from absl import logging
-from tensorflow.python import tf2  # pylint: disable=g-direct-tensorflow-import,g-import-not-at-top
-is_tf2 = tf2.enabled()
+import tensorflow.compat.v1 as tf
+
+# Check if we have contrib available
+try:
+  from tensorflow.contrib import slim as tf_slim  # pylint: disable=g-import-not-at-top
+  is_tf2 = False
+except:  # pylint: disable=bare-except
+  # tf.contrib, including slim and certain optimizers are not available in TF2
+  # Some features are now available in separate packages. We shim support for
+  # these as needed.
+  import tensorflow_addons as tfa  # pylint: disable=g-import-not-at-top
+  import tf_slim  # pylint: disable=g-import-not-at-top
+  is_tf2 = True
 
 
 def err_if_tf2(msg='err'):
   if is_tf2:
-    msg = 'contrib is unavailable in tf2.'
     if msg == 'err':
+      msg = 'contrib is unavailable in tf2.'
       raise ImportError(msg)
     else:
+      msg = 'contrib is unavailable in tf2.'
      logging.info(msg)
 
 
+class DummyModule(object):
+
+  def __init__(self, **kw):
+    for k, v in kw.items():
+      setattr(self, k, v)
+
+
 def slim():
-  err_if_tf2()
-  from tensorflow.contrib import slim as contrib_slim  # pylint: disable=g-import-not-at-top
-  return contrib_slim
+  return tf_slim
 
 
 def util():
@@ -54,8 +71,26 @@ def tfe():
   return contrib_eager
 
 
+def deprecated(reason, date):
+  del reason
+  del date
+  def decorator(fn):
+    return fn
+  return decorator
+
+
 def framework(msg='err'):
-  err_if_tf2(msg=msg)
+  """Return framework module or dummy version."""
+  del msg
+  if is_tf2:
+    return DummyModule(
+        arg_scope=None,
+        get_name_scope=lambda: tf.get_default_graph().get_name_scope(),
+        name_scope=tf.name_scope,
+        deprecated=deprecated,
+        nest=tf.nest,
+        argsort=tf.argsort)
+
   from tensorflow.contrib import framework as contrib_framework  # pylint: disable=g-direct-tensorflow-import,g-import-not-at-top
   return contrib_framework
 
@@ -67,9 +102,13 @@ def nn():
 
 
 def layers():
-  err_if_tf2(msg='err')
-  from tensorflow.contrib import layers as contrib_layers  # pylint: disable=g-direct-tensorflow-import,g-import-not-at-top
-  return contrib_layers
+  """Return layers module or dummy version."""
+  try:
+    from tensorflow.contrib import layers as contrib_layers  # pylint: disable=g-direct-tensorflow-import,g-import-not-at-top
+    return contrib_layers
+  except:  # pylint: disable=bare-except
+    return DummyModule(
+        OPTIMIZER_CLS_NAMES={}, optimize_loss=tf_slim.optimize_loss)
 
 
 def rnn():
@@ -109,9 +148,13 @@ def metrics():
 
 
 def opt():
-  err_if_tf2(msg='err')
-  from tensorflow.contrib import opt as contrib_opt  # pylint: disable=g-direct-tensorflow-import,g-import-not-at-top
-  return contrib_opt
+  if not is_tf2:
+    from tensorflow.contrib import opt as contrib_opt  # pylint: disable=g-direct-tensorflow-import,g-import-not-at-top
+    return contrib_opt
+  return DummyModule(
+      LazyAdam=tfa.optimizers.LazyAdam,
+      LazyAdamOptimizer=tfa.optimizers.LazyAdam,
+  )
 
 
 def mixed_precision():
@@ -132,10 +175,31 @@ def distribute():
   return contrib_distribute
 
 
+def replace_monitors_with_hooks(monitors_or_hooks, estimator):
+  """Stub for missing function."""
+  del estimator
+  monitors_or_hooks = monitors_or_hooks or []
+  hooks = [
+      m for m in monitors_or_hooks if isinstance(m, tf.estimator.SessionRunHook)
+  ]
+  deprecated_monitors = [
+      m for m in monitors_or_hooks
+      if not isinstance(m, tf.estimator.SessionRunHook)
+  ]
+  assert not deprecated_monitors
+  return hooks
+
+
 def learn():
-  err_if_tf2(msg='err')
-  from tensorflow.contrib import learn as contrib_learn  # pylint: disable=g-direct-tensorflow-import,g-import-not-at-top
-  return contrib_learn
+  """Return tf.contrib.learn module or dummy version."""
+  if not is_tf2:
+    from tensorflow.contrib import learn as contrib_learn  # pylint: disable=g-direct-tensorflow-import,g-import-not-at-top
+    return contrib_learn
+  return DummyModule(
+      RunConfig=tf.estimator.RunConfig,
+      monitors=DummyModule(
+          replace_monitors_with_hooks=replace_monitors_with_hooks),
+  )
 
 
 def tf_prof():
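
DummyModule gives callers a contrib-shaped namespace whose attributes are
backed by TF2-era equivalents, so call sites like
contrib.opt().LazyAdamOptimizer(...) stay unchanged. A self-contained sketch
of the same pattern, with generic names rather than this file's members:

    class DummyModule(object):
      """Ad-hoc namespace: attributes are supplied as keyword arguments."""

      def __init__(self, **kw):
        for k, v in kw.items():
          setattr(self, k, v)

    # Stand-in for a removed module; attribute-style call sites keep working.
    fake_math = DummyModule(double=lambda x: 2 * x)
    assert fake_math.double(21) == 42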

tensor2tensor/utils/optimize.py

Lines changed: 27 additions & 8 deletions

@@ -112,13 +112,22 @@ def optimize(loss,
 
 @registry.register_optimizer
 def adam(learning_rate, hparams):
+  """Return adam optimizer for the given params."""
   # We change the default epsilon for Adam.
   # Using LazyAdam as it's much faster for large vocabulary embeddings.
-  return contrib.opt().LazyAdamOptimizer(
-      learning_rate,
-      beta1=hparams.optimizer_adam_beta1,
-      beta2=hparams.optimizer_adam_beta2,
-      epsilon=hparams.optimizer_adam_epsilon)
+  if contrib.is_tf2:
+    # in TF2 beta1 -> beta_1 :/
+    return contrib.opt().LazyAdamOptimizer(
+        learning_rate,
+        beta_1=hparams.optimizer_adam_beta1,
+        beta_2=hparams.optimizer_adam_beta2,
+        epsilon=hparams.optimizer_adam_epsilon)
+  else:
+    return contrib.opt().LazyAdamOptimizer(
+        learning_rate,
+        beta1=hparams.optimizer_adam_beta1,
+        beta2=hparams.optimizer_adam_beta2,
+        epsilon=hparams.optimizer_adam_epsilon)
 
 
 @registry.register_optimizer
@@ -229,7 +238,12 @@ def __init__(self, optimizer_name, lr, hparams, use_tpu=False):  # pylint: disab
     self._zero_grads = hparams.optimizer_zero_grads
 
   def compute_gradients(self, loss, var_list=None, **kwargs):  # pylint: disable=arguments-differ
-    gradients = self._opt.compute_gradients(loss, var_list, **kwargs)
+    if contrib.is_tf2:
+      gradients = self._opt.get_gradients(loss, var_list)
+      gradients = zip(gradients, var_list)
+    else:
+      gradients = self._opt.compute_gradients(loss, var_list, **kwargs)
+
     def cast_grad(g, v):
       if v is not None and g is not None:
         g = common_layers.cast_like(g, v)
@@ -240,8 +254,13 @@ def cast_grad(g, v):
     return gradients
 
   def apply_gradients(self, grads_and_vars, global_step=None, name=None):
-    return self._opt.apply_gradients(
-        grads_and_vars, global_step=global_step, name=name)
+    if contrib.is_tf2:
+      with tf.control_dependencies(
+          [tf.assign_add(tf.train.get_or_create_global_step(), 1)]):
+        return self._opt.apply_gradients(grads_and_vars, name=name)
+    else:
+      return self._opt.apply_gradients(
+          grads_and_vars, global_step=global_step, name=name)
 
 
 def weight_decay_and_noise(loss, hparams, learning_rate, var_list=None):
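
Both branches exist because tfa.optimizers.LazyAdam is a Keras OptimizerV2: it
renames beta1/beta2 to beta_1/beta_2, exposes get_gradients instead of
compute_gradients, and no longer touches the global step in apply_gradients
(hence the tf.assign_add above). A sketch of the v2-side calls in isolation,
assuming tensorflow-addons is installed:

    import tensorflow as tf
    import tensorflow_addons as tfa

    opt = tfa.optimizers.LazyAdam(
        learning_rate=0.01, beta_1=0.9, beta_2=0.999, epsilon=1e-8)
    var = tf.Variable(1.0)
    with tf.GradientTape() as tape:
      loss = var * var
    grads = tape.gradient(loss, [var])
    opt.apply_gradients(zip(grads, [var]))  # Note: no global_step argument.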

tensor2tensor/utils/trainer_lib.py

Lines changed: 7 additions & 1 deletion

@@ -200,7 +200,7 @@ def create_run_config(model_name,
       "keep_checkpoint_max": keep_checkpoint_max,
       "keep_checkpoint_every_n_hours": keep_checkpoint_every_n_hours,
       "tf_random_seed": random_seed,
-      "log_step_count_steps": log_step_count_steps
+      "log_step_count_steps": log_step_count_steps,
   }
   if save_checkpoints_secs:
     del run_config_args["save_checkpoints_steps"]
@@ -239,6 +239,12 @@
     del run_config_args["master"]
     del run_config_args["evaluation_master"]
 
+  # tf.estimator RunConfig construction got totally broken in TF2.
+  # we now have to specify master in a global environment variable
+  if contrib.is_tf2:
+    del run_config_args["evaluation_master"]
+    del run_config_args["master"]
+
   config = run_config_cls(**run_config_args)
 
   # If not using TPU, add device info for data_parallelism
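
The TF2 Estimator RunConfig constructor no longer accepts
master/evaluation_master; cluster and master information is instead read from
the TF_CONFIG environment variable. A hedged sketch of supplying it that way
(hostnames are placeholders):

    import json
    import os

    os.environ["TF_CONFIG"] = json.dumps({
        "cluster": {"chief": ["host0:2222"], "worker": ["host1:2222"]},
        "task": {"type": "chief", "index": 0},
    })
    # tf.estimator.RunConfig() now picks up the cluster spec from the env.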
