@@ -20,15 +20,15 @@
 
 import tensorflow as tf
 
-def get_otimiser(config):
+
+def get_optimiser(config):
 
   if config.optimizer=='Adam':
     return tf.train.AdamOptimizer(config.learning_rate)
 
   return config.optimizer(config.learning_rate)
 
 
-
 def define_ppo_step(observation, action, reward, done, value, old_pdf,
                     policy_factory, config):
 
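The `get_optimiser` helper above picks `tf.train.AdamOptimizer` when `config.optimizer` is the string `'Adam'` and otherwise treats `config.optimizer` as a callable optimizer constructor. Below is a minimal sketch of both branches, assuming only that the config object exposes `optimizer` and `learning_rate`; `DummyConfig` is a hypothetical stand-in and not part of this commit.

import tensorflow as tf  # TF1-style API, as used in the diff above

class DummyConfig(object):
  """Hypothetical stand-in for the real config object."""
  optimizer = 'Adam'
  learning_rate = 1e-4

opt = get_optimiser(DummyConfig())    # string branch -> tf.train.AdamOptimizer(1e-4)

DummyConfig.optimizer = tf.train.GradientDescentOptimizer
opt = get_optimiser(DummyConfig())    # callable branch -> config.optimizer(learning_rate)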
@@ -58,7 +58,7 @@ def define_ppo_step(observation, action, reward, done, value, old_pdf,
   entropy = new_policy_dist.entropy()
   entropy_loss = -config.entropy_loss_coef * tf.reduce_mean(entropy)
 
-  optimizer = get_otimiser(config)
+  optimizer = get_optimiser(config)
   losses = [policy_loss, value_loss, entropy_loss]
 
   gradients = [list(zip(*optimizer.compute_gradients(loss))) for loss in losses]
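The gradient line above relies on `tf.train.Optimizer.compute_gradients`, which returns a list of `(gradient, variable)` pairs; wrapping it in `zip(*...)` transposes that list into one tuple of gradients and one tuple of variables. A small illustration of the transposition alone, with strings standing in for tensors (illustrative sketch, not part of this commit):

pairs = [('g0', 'v0'), ('g1', 'v1'), ('g2', 'v2')]   # shape of compute_gradients output
grads, variables = zip(*pairs)                       # transpose the (grad, var) pairs
assert grads == ('g0', 'g1', 'g2')
assert variables == ('v0', 'v1', 'v2')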
@@ -86,8 +86,8 @@ def define_ppo_epoch(memory, policy_factory, config):
   old_pdf = tf.stop_gradient(old_pdf)
 
   ppo_step_rets = tf.scan(
-      lambda a, x: define_ppo_step(observation, action, reward, done, value,
-                                   old_pdf, policy_factory, config),
+      lambda _1, _2: define_ppo_step(observation, action, reward, done, value,
+                                     old_pdf, policy_factory, config),
       tf.range(config.optimization_epochs),
       [0., 0., 0., 0., 0., 0.],
       parallel_iterations=1)
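`tf.scan` calls the supplied function as `fn(accumulator, element)`; in `define_ppo_epoch` neither argument is used, since the same PPO step is simply rebuilt `config.optimization_epochs` times, so renaming `a, x` to `_1, _2` only makes the "ignored" intent explicit. A minimal sketch of that pattern, assuming the TF1 API used elsewhere in the diff (not part of this commit):

import tensorflow as tf

# fn ignores both the accumulator and the element coming from tf.range(3);
# tf.scan still runs it once per element, threading the accumulator through.
repeated = tf.scan(
    lambda _1, _2: tf.constant(1.0),   # must match the initializer's structure/dtype
    tf.range(3),                       # drives three iterations
    initializer=0.,
    parallel_iterations=1)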