This repository was archived by the owner on Jul 7, 2023. It is now read-only.

Commit 8bdaeb2

Lukasz Kaiser authored and Copybara-Service committed

RL bugfixes.

PiperOrigin-RevId: 212026070

1 parent 2506671 · commit 8bdaeb2

File tree: 7 files changed, +87 −76 lines

tensor2tensor/layers/common_layers.py

Lines changed: 5 additions & 5 deletions

@@ -253,16 +253,16 @@ def expand_squeeze_to_nd(x, n, squeeze_dim=2, expand_dim=-1):
 
 
 def standardize_images(x):
-  """Image standardization on batches."""
+  """Image standardization on batches and videos."""
   with tf.name_scope("standardize_images", [x]):
-    x = tf.to_float(x)
+    x_shape = shape_list(x)
+    x = tf.to_float(tf.reshape(x, [-1] + x_shape[-3:]))
     x_mean = tf.reduce_mean(x, axis=[1, 2, 3], keepdims=True)
     x_variance = tf.reduce_mean(
         tf.square(x - x_mean), axis=[1, 2, 3], keepdims=True)
-    x_shape = shape_list(x)
-    num_pixels = tf.to_float(x_shape[1] * x_shape[2] * x_shape[3])
+    num_pixels = tf.to_float(x_shape[-1] * x_shape[-2] * x_shape[-3])
    x = (x - x_mean) / tf.maximum(tf.sqrt(x_variance), tf.rsqrt(num_pixels))
-    return x
+    return tf.reshape(x, x_shape)
 
 
 def flatten4d3d(x):
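
The reshape trick is what lets the same function now handle videos: every leading axis is folded into the batch, frames are standardized individually, and the original shape is restored. A minimal usage sketch (shapes are illustrative, not from the commit):

import tensorflow as tf
from tensor2tensor.layers import common_layers

# 4-D image batches worked before: [batch, height, width, channels].
images = tf.zeros([8, 64, 64, 3], dtype=tf.uint8)
images_std = common_layers.standardize_images(images)

# 5-D videos now work too: [batch, time, height, width, channels] is
# reshaped to [-1, height, width, channels], standardized per frame,
# then reshaped back, so the output shape matches the input shape.
videos = tf.zeros([8, 4, 64, 64, 3], dtype=tf.uint8)
videos_std = common_layers.standardize_images(videos)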

tensor2tensor/layers/modalities.py

Lines changed: 6 additions & 12 deletions

@@ -535,17 +535,8 @@ def bottom(self, x):
     inputs = x
     with tf.variable_scope(self.name, reuse=tf.AUTO_REUSE):
       common_layers.summarize_video(inputs, "inputs")
-      inputs_shape = common_layers.shape_list(inputs)
-      # Standardize frames.
-      inputs = tf.reshape(inputs, [-1] + inputs_shape[2:])
       inputs = common_layers.standardize_images(inputs)
-      inputs = tf.reshape(inputs, inputs_shape)
-      # Concatenate the time dimension on channels for image models to work.
-      transposed = tf.transpose(inputs, [0, 2, 3, 1, 4])
-      return tf.reshape(transposed, [
-          inputs_shape[0], inputs_shape[2], inputs_shape[3],
-          inputs_shape[1] * inputs_shape[4]
-      ])
+      return common_layers.time_to_channels(inputs)
 
   def targets_bottom(self, x, summary_prefix="targets_bottom"):  # pylint: disable=arguments-differ
     inputs = x
@@ -573,10 +564,13 @@ def top(self, body_output, targets):
     num_frames = common_layers.shape_list(targets)[1]
     body_output_shape = common_layers.shape_list(body_output)
     # We assume the body output is of this shape and layout.
+    # Note: if you tf.concat([frames], axis=-1) at the end of your model,
+    # then you need to reshape to [..., num_frames, depth] like below, not
+    # into [..., depth, num_frames] due to memory layout of concat/reshape.
     reshape_shape = body_output_shape[:-1] + [
-        num_channels, self.top_dimensionality, num_frames]
+        num_channels, num_frames, self.top_dimensionality]
     res = tf.reshape(body_output, reshape_shape)
-    res = tf.transpose(res, [0, 5, 1, 2, 3, 4])
+    res = tf.transpose(res, [0, 4, 1, 2, 3, 5])
     res_shape = common_layers.shape_list(res)
     res_argmax = tf.argmax(tf.reshape(res, [-1, res_shape[-1]]), axis=-1)
     res_argmax = tf.reshape(res_argmax, res_shape[:-1])
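
The helper common_layers.time_to_channels that replaces the inline code is not shown in this commit; judging from the deleted lines, it presumably folds the time axis into channels roughly like this (a sketch, not the library's definition):

import tensorflow as tf
from tensor2tensor.layers import common_layers

def time_to_channels_sketch(video):
  """Reconstruction of the deleted inline code: [b, t, h, w, c] -> [b, h, w, t*c]."""
  shape = common_layers.shape_list(video)
  transposed = tf.transpose(video, [0, 2, 3, 1, 4])  # move time next to channels
  return tf.reshape(
      transposed, [shape[0], shape[2], shape[3], shape[1] * shape[4]])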

tensor2tensor/models/research/autoencoders.py

Lines changed: 2 additions & 1 deletion

@@ -594,6 +594,7 @@ def encoder(self, x):
           activation=common_layers.belu,
           name="strided")
       y = x
+      y = tf.nn.dropout(y, 1.0 - hparams.residual_dropout)
       for r in range(hparams.num_residual_layers):
         residual_filters = filters
         if r < hparams.num_residual_layers - 1:
@@ -606,7 +607,7 @@ def encoder(self, x):
             padding="SAME",
             activation=common_layers.belu,
             name="residual_%d" % r)
-        x += tf.nn.dropout(y, 1.0 - hparams.residual_dropout)
+        x += y
        x = common_layers.layer_norm(x, name="ln")
      return x, layers
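
This change moves residual dropout from each branch output to the block input, so the skip additions themselves are no longer noisy. Schematically (hypothetical helpers, not the encoder's exact dataflow):

import tensorflow as tf

def residual_step_old(x, conv, keep_prob):
  # Old placement: every residual branch output was dropped before adding.
  return x + tf.nn.dropout(conv(x), keep_prob)

def residual_step_new(x, conv, keep_prob):
  # New placement: the input is dropped once, and the branch output is
  # added back to x undropped.
  y = tf.nn.dropout(x, keep_prob)
  return x + conv(y)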

tensor2tensor/models/video/basic_deterministic.py

Lines changed: 3 additions & 2 deletions

@@ -85,9 +85,9 @@ def body_single(self, features):
     # Run a stack of convolutions.
     for i in range(hparams.num_hidden_layers):
       with tf.variable_scope("layer%d" % i):
-        y = tf.layers.conv2d(x, filters, kernel1, activation=common_layers.belu,
+        y = tf.nn.dropout(x, 1.0 - hparams.dropout)
+        y = tf.layers.conv2d(y, filters, kernel1, activation=common_layers.belu,
                              strides=(1, 1), padding="SAME")
-        y = tf.nn.dropout(y, 1.0 - hparams.dropout)
         if i == 0:
           x = y
         else:
@@ -172,6 +172,7 @@ def body(self, features):
       sampled_frame = tf.reshape(
           res_frames[-1], shape[:-1] + [hparams.problem.num_channels, 256])
       sampled_frame = tf.to_float(tf.argmax(sampled_frame, axis=-1))
+      sampled_frame = common_layers.standardize_images(sampled_frame)
       if is_predicting:
         all_frames[i + hparams.video_num_input_frames] = sampled_frame
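
The added standardize_images call fixes a feedback mismatch: real input frames are standardized in the modality bottom, but the scheduled-sampling path was feeding back raw argmax pixel values in [0, 255]. A schematic of the fixed step (the 256-way per-pixel logits come from the hunk; shapes are illustrative):

import tensorflow as tf
from tensor2tensor.layers import common_layers

logits = tf.zeros([1, 64, 64, 3, 256])  # per-pixel, per-channel logits
sampled_frame = tf.to_float(tf.argmax(logits, axis=-1))  # values in [0, 255]
# Standardize so the fed-back frame matches the scale of the real,
# standardized frames it replaces in all_frames:
sampled_frame = common_layers.standardize_images(sampled_frame)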

tensor2tensor/models/video/basic_deterministic_params.py

Lines changed: 2 additions & 2 deletions

@@ -65,8 +65,8 @@ def next_frame_sampling():
   """Basic conv model with scheduled sampling."""
   hparams = next_frame_basic_deterministic()
   hparams.video_num_target_frames = 2
-  hparams.scheduled_sampling_warmup_steps = 30000
-  hparams.scheduled_sampling_prob = 0.1
+  hparams.scheduled_sampling_warmup_steps = 50000
+  hparams.scheduled_sampling_prob = 0.5
   return hparams
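
These two hparams control how aggressively the model is fed its own predictions: scheduled_sampling_prob is the final probability of replacing a ground-truth frame with a sampled one, and the warmup steps are how long that probability takes to ramp up. A hedged sketch of a linear ramp (the actual schedule lives elsewhere in t2t; this exact form is an assumption):

def scheduled_sampling_prob_at(step, warmup_steps=50000, final_prob=0.5):
  """Assumed linear ramp from 0 to final_prob over warmup_steps."""
  ramp = min(float(step) / warmup_steps, 1.0)
  return ramp * final_prob

# Under this assumption: 0.0 at step 0, 0.25 at step 25000, 0.5 from 50000 on.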

tensor2tensor/rl/rl_trainer_lib.py

Lines changed: 51 additions & 50 deletions

@@ -41,61 +41,62 @@ def define_train(hparams):
 
 
 def train(hparams, event_dir=None, model_dir=None,
-          restore_agent=True, epoch=0):
+          restore_agent=True, epoch=0, name_scope="rl_train"):
   """Train."""
   with tf.Graph().as_default():
-    train_summary_op, _, initialization = define_train(hparams)
-    if event_dir:
-      summary_writer = tf.summary.FileWriter(
-          event_dir, graph=tf.get_default_graph(), flush_secs=60)
-    else:
-      summary_writer = None
+    with tf.name_scope(name_scope):
+      train_summary_op, _, initialization = define_train(hparams)
+      if event_dir:
+        summary_writer = tf.summary.FileWriter(
+            event_dir, graph=tf.get_default_graph(), flush_secs=60)
+      else:
+        summary_writer = None
 
-    if model_dir:
-      model_saver = tf.train.Saver(
-          tf.global_variables(".*network_parameters.*"))
-    else:
-      model_saver = None
+      if model_dir:
+        model_saver = tf.train.Saver(
+            tf.global_variables(".*network_parameters.*"))
+      else:
+        model_saver = None
 
-    # TODO(piotrmilos): This should be refactored, possibly with
-    # handlers for each type of env
-    if hparams.environment_spec.simulated_env:
-      env_model_loader = tf.train.Saver(
-          tf.global_variables("next_frame*"))
-    else:
-      env_model_loader = None
+      # TODO(piotrmilos): This should be refactored, possibly with
+      # handlers for each type of env
+      if hparams.environment_spec.simulated_env:
+        env_model_loader = tf.train.Saver(
+            tf.global_variables("next_frame*"))
+      else:
+        env_model_loader = None
 
-    with tf.Session() as sess:
-      sess.run(tf.global_variables_initializer())
-      initialization(sess)
-      if env_model_loader:
-        trainer_lib.restore_checkpoint(
-            hparams.world_model_dir, env_model_loader, sess,
-            must_restore=True)
-      start_step = 0
-      if model_saver and restore_agent:
-        start_step = trainer_lib.restore_checkpoint(
-            model_dir, model_saver, sess)
+      with tf.Session() as sess:
+        sess.run(tf.global_variables_initializer())
+        initialization(sess)
+        if env_model_loader:
+          trainer_lib.restore_checkpoint(
+              hparams.world_model_dir, env_model_loader, sess,
+              must_restore=True)
+        start_step = 0
+        if model_saver and restore_agent:
+          start_step = trainer_lib.restore_checkpoint(
+              model_dir, model_saver, sess)
 
-      # Fail-friendly, don't train if already trained for this epoch
-      if start_step >= ((hparams.epochs_num * (epoch + 1))):
-        tf.logging.info("Skipping PPO training for epoch %d as train steps "
-                        "(%d) already reached", epoch, start_step)
-        return
+        # Fail-friendly, don't train if already trained for this epoch
+        if start_step >= ((hparams.epochs_num * (epoch + 1))):
+          tf.logging.info("Skipping PPO training for epoch %d as train steps "
+                          "(%d) already reached", epoch, start_step)
+          return
 
-      for epoch_index in range(hparams.epochs_num):
-        summary = sess.run(train_summary_op)
-        if summary_writer:
-          summary_writer.add_summary(summary, epoch_index)
-        if (hparams.eval_every_epochs and
-            epoch_index % hparams.eval_every_epochs == 0):
-          if summary_writer and summary:
+        for epoch_index in range(hparams.epochs_num):
+          summary = sess.run(train_summary_op)
+          if summary_writer:
            summary_writer.add_summary(summary, epoch_index)
-          else:
-            tf.logging.info("Eval summary not saved")
-        if (model_saver and hparams.save_models_every_epochs and
-            (epoch_index % hparams.save_models_every_epochs == 0 or
-             (epoch_index + 1) == hparams.epochs_num)):
-          ckpt_path = os.path.join(
-              model_dir, "model.ckpt-{}".format(epoch_index + 1 + start_step))
-          model_saver.save(sess, ckpt_path)
+          if (hparams.eval_every_epochs and
+              epoch_index % hparams.eval_every_epochs == 0):
+            if summary_writer and summary:
+              summary_writer.add_summary(summary, epoch_index)
+            else:
+              tf.logging.info("Eval summary not saved")
+          if (model_saver and hparams.save_models_every_epochs and
+              (epoch_index % hparams.save_models_every_epochs == 0 or
+               (epoch_index + 1) == hparams.epochs_num)):
+            ckpt_path = os.path.join(
+                model_dir, "model.ckpt-{}".format(epoch_index + 1 + start_step))
+            model_saver.save(sess, ckpt_path)
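
The new name_scope argument exists so that PPO graphs built for different environments get distinct op and summary name prefixes; trainer_model_based.py below passes "ppo_sim" and "ppo_real". A minimal illustration of the scoping:

import tensorflow as tf

with tf.Graph().as_default():
  with tf.name_scope("ppo_sim"):
    loss = tf.constant(0.0, name="loss")
  print(loss.name)  # "ppo_sim/loss:0" -- summaries inherit the same prefix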

tensor2tensor/rl/trainer_model_based.py

Lines changed: 18 additions & 4 deletions

@@ -183,7 +183,8 @@ def train_agent(problem_name, agent_model_dir,
       "output_dir": world_model_dir,
       "data_dir": epoch_data_dir,
   }):
-    rl_trainer_lib.train(ppo_hparams, event_dir, agent_model_dir, epoch=epoch)
+    rl_trainer_lib.train(ppo_hparams, event_dir, agent_model_dir, epoch=epoch,
+                         name_scope="ppo_sim")
 
 
 def train_agent_real_env(
@@ -218,7 +219,8 @@ def train_agent_real_env(
       "data_dir": epoch_data_dir,
   }):
     # epoch = 10**20 is a hackish way to avoid skipping training
-    rl_trainer_lib.train(ppo_hparams, event_dir, agent_model_dir, epoch=10**20)
+    rl_trainer_lib.train(ppo_hparams, event_dir, agent_model_dir, epoch=10**20,
+                         name_scope="ppo_real")
 
 
 def evaluate_world_model(simulated_problem_name, problem_name, hparams,
@@ -266,6 +268,7 @@ def train_world_model(problem_name, data_dir, output_dir, hparams, epoch):
       "hparams_set": hparams.generative_model_params,
       "hparams": "learning_rate_constant=%.6f" % learning_rate,
       "eval_steps": 100,
+      "local_eval_frequency": 2000,
       "train_steps": train_steps,
   }):
     t2t_trainer.main([])
@@ -519,6 +522,7 @@ def training_loop(hparams, output_dir, report_fn=None, report_metric=None):
     mean_reward_summary.value[0].simple_value = mean_reward
     eval_metrics_writer.add_summary(model_reward_accuracy_summary, epoch)
     eval_metrics_writer.add_summary(mean_reward_summary, epoch)
+    eval_metrics_writer.flush()
 
     # Report metrics
     eval_metrics = {"model_reward_accuracy": model_reward_accuracy,
@@ -599,10 +603,12 @@ def rl_modelrl_base():
 
 @registry.register_hparams
 def rl_modelrl_base_quick():
-  """Base setting with only 2 epochs and 500 PPO steps per epoch."""
+  """Base setting but quicker with only 2 epochs."""
   hparams = rl_modelrl_base()
   hparams.epochs = 2
-  hparams.ppo_epochs_num = 500
+  hparams.ppo_epochs_num = 1000
+  hparams.ppo_epoch_length = 50
+  hparams.real_ppo_epochs_num = 10
   return hparams
 
 
@@ -615,6 +621,14 @@ def rl_modelrl_base_quick_sd():
   return hparams
 
 
+@registry.register_hparams
+def rl_modelrl_base_quick_sm():
+  """Quick setting with sampling."""
+  hparams = rl_modelrl_base_quick()
+  hparams.generative_model_params = "next_frame_sampling"
+  return hparams
+
+
 @registry.register_hparams
 def rl_modelrl_base_stochastic():
   """Base setting with a stochastic next-frame model."""
