import tensorflow as tf


+def get_optimiser(config):
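+  """Returns the optimizer: AdamOptimizer when config.optimizer is 'Adam',
+  otherwise config.optimizer is called with config.learning_rate."""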
+  if config.optimizer == 'Adam':
+    return tf.train.AdamOptimizer(config.learning_rate)
+
+  return config.optimizer(config.learning_rate)
+
+
def define_ppo_step(observation, action, reward, done, value, old_pdf,
                    policy_factory, config):
-  """A step of PPO."""
+
  new_policy_dist, new_value, _ = policy_factory(observation)
  new_pdf = new_policy_dist.prob(action)

@@ -43,27 +51,30 @@ def define_ppo_step(observation, action, reward, done, value, old_pdf,
                                   ratio * advantage_normalized)
  policy_loss = -tf.reduce_mean(surrogate_objective)

-  value_error = calculate_discounted_return(
-      reward, new_value, done, config.gae_gamma, config.gae_lambda) - value
+  value_error = calculate_generalized_advantage_estimator(
+      reward, new_value, done, config.gae_gamma, config.gae_lambda)
  value_loss = config.value_loss_coef * tf.reduce_mean(value_error ** 2)

  entropy = new_policy_dist.entropy()
  entropy_loss = -config.entropy_loss_coef * tf.reduce_mean(entropy)

-  total_loss = policy_loss + value_loss + entropy_loss
+  optimizer = get_optimiser(config)
+  losses = [policy_loss, value_loss, entropy_loss]

-  optimization_op = tf.contrib.layers.optimize_loss(
-      loss=total_loss,
-      global_step=tf.train.get_or_create_global_step(),
-      optimizer=config.optimizer,
-      learning_rate=config.learning_rate)
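+  # Compute the gradients of each loss separately so that per-loss gradient
+  # norms can be reported below.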
+  gradients = [list(zip(*optimizer.compute_gradients(loss))) for loss in losses]

-  with tf.control_dependencies([optimization_op]):
-    return [tf.identity(x) for x in (policy_loss, value_loss, entropy_loss)]
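+  # Global norm of each loss's gradients, returned for logging.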
+  gradients_norms = [tf.global_norm(gradient[0]) for gradient in gradients]
+
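+  # Flatten the (gradients, variables) pairs from all three losses so they
+  # can be applied in a single apply_gradients call.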
+  gradients_flat = sum([gradient[0] for gradient in gradients], ())
+  gradients_variables_flat = sum([gradient[1] for gradient in gradients], ())
+
+  optimize_op = optimizer.apply_gradients(
+      zip(gradients_flat, gradients_variables_flat))
+
+  with tf.control_dependencies([optimize_op]):
+    return [tf.identity(x) for x in losses + gradients_norms]


def define_ppo_epoch(memory, policy_factory, config):
-  """An epoch of PPO."""
  observation, reward, done, action, old_pdf, value = memory

  # This is to avoid propagating gradients through simulation of simulation
@@ -74,59 +85,39 @@ def define_ppo_epoch(memory, policy_factory, config):
  value = tf.stop_gradient(value)
  old_pdf = tf.stop_gradient(old_pdf)

-  policy_loss, value_loss, entropy_loss = tf.scan(
-      lambda _1, _2: define_ppo_step(  # pylint: disable=g-long-lambda
-          observation, action, reward, done, value,
-          old_pdf, policy_factory, config),
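+  # Run define_ppo_step for config.optimization_epochs iterations; each step
+  # returns six scalars (three losses and three gradient norms), hence the
+  # six-element initializer below.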
+  ppo_step_rets = tf.scan(
+      lambda _1, _2: define_ppo_step(observation, action, reward, done, value,
+                                     old_pdf, policy_factory, config),
      tf.range(config.optimization_epochs),
-      [0., 0., 0.],
+      [0., 0., 0., 0., 0., 0.],
      parallel_iterations=1)

-  summaries = [tf.summary.scalar("policy loss", tf.reduce_mean(policy_loss)),
-               tf.summary.scalar("value loss", tf.reduce_mean(value_loss)),
-               tf.summary.scalar("entropy loss", tf.reduce_mean(entropy_loss))]
+  ppo_summaries = [tf.reduce_mean(ret) for ret in ppo_step_rets]
+  summaries_names = ["policy_loss", "value_loss", "entropy_loss",
+                     "policy_gradient", "value_gradient", "entropy_gradient"]

+  summaries = [tf.summary.scalar(summary_name, summary)
+               for summary_name, summary in zip(summaries_names, ppo_summaries)]
  losses_summary = tf.summary.merge(summaries)

-  losses_summary = tf.Print(losses_summary,
-                            [tf.reduce_mean(policy_loss)], "policy loss: ")
-  losses_summary = tf.Print(losses_summary,
-                            [tf.reduce_mean(value_loss)], "value loss: ")
-  losses_summary = tf.Print(losses_summary,
-                            [tf.reduce_mean(entropy_loss)], "entropy loss: ")
+  for summary_name, summary in zip(summaries_names, ppo_summaries):
+    losses_summary = tf.Print(losses_summary, [summary], summary_name + ": ")

  return losses_summary

+def calculate_generalized_advantage_estimator(
+    reward, value, done, gae_gamma, gae_lambda):
+  """Generalized advantage estimator."""
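+  # TD residual:   delta_t = r_t + gamma * (1 - done_{t+1}) * V_{t+1} - V_t
+  # GAE recursion: A_t = delta_t + gamma * lambda * (1 - done_{t+1}) * A_{t+1}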

-def calculate_discounted_return(reward, value, done, discount, unused_lambda):
-  """Discounted Monte-Carlo returns."""
-  done = tf.cast(done, tf.float32)
-  reward2 = done[-1, :] * reward[-1, :] + (1 - done[-1, :]) * value[-1, :]
-  reward = tf.concat([reward[:-1,], reward2[None, ...]], axis=0)
-  return_ = tf.reverse(tf.scan(
-      lambda agg, cur: cur[0] + (1 - cur[1]) * discount * agg,  # fn
-      [tf.reverse(reward, [0]),  # elem
-       tf.reverse(done, [0])],
-      tf.zeros_like(reward[0, :]),  # initializer
-      1,
-      False), [0])
-  return tf.check_numerics(return_, "return")
-
-
-def calculate_generalized_advantage_estimator(
-    reward, value, done, gae_gamma, gae_lambda):
-  """Generalized advantage estimator."""
-  # Below is slight weirdness, we set the last reward to 0.
-  # This makes the advantage 0 in the last timestep.
-  reward = tf.concat([reward[:-1, :], value[-1:, :]], axis=0)
-  next_value = tf.concat([value[1:, :], tf.zeros_like(value[-1:, :])], axis=0)
-  next_not_done = 1 - tf.cast(tf.concat(
-      [done[1:, :], tf.zeros_like(done[-1:, :])], axis=0), tf.float32)
+  # Below is slight weirdness: the last reward is replaced by the last value
+  # estimate, which makes the advantage 0 in the last timestep.
+  reward = tf.concat([reward[:-1, :], value[-1:, :]], axis=0)
+  next_value = tf.concat([value[1:, :], tf.zeros_like(value[-1:, :])], axis=0)
+  next_not_done = 1 - tf.cast(
+      tf.concat([done[1:, :], tf.zeros_like(done[-1:, :])], axis=0),
+      tf.float32)
  delta = reward + gae_gamma * next_value * next_not_done - value

  return_ = tf.reverse(tf.scan(
      lambda agg, cur: cur[0] + cur[1] * gae_gamma * gae_lambda * agg,
      [tf.reverse(delta, [0]), tf.reverse(next_not_done, [0])],
      tf.zeros_like(delta[0, :]),
-      1, False), [0])
-  return tf.check_numerics(tf.stop_gradient(return_), "return")
+      parallel_iterations=1), [0])
+  return tf.check_numerics(return_, "return")