
Commit 4de9708

wangpengmit authored and copybara-github committed
Add state to layers
PiperOrigin-RevId: 264644849
1 parent 46cf96b commit 4de9708

23 files changed: +546 −317 lines changed

tensor2tensor/envs/env_problem_utils.py

Lines changed: 6 additions & 4 deletions
@@ -57,6 +57,7 @@ def play_env_problem_with_policy(env,
                                  num_trajectories=1,
                                  max_timestep=None,
                                  reset=True,
+                                 state=None,
                                  rng=None,
                                  temperature=1.0,
                                  boundary=32,
@@ -73,7 +74,8 @@ def play_env_problem_with_policy(env,
       trajectory that exceeds this time put it in the completed bin, and *dont*
       reset the env.
     reset: bool, true if we want to reset the envs. The envs are also reset if
-      max_max_timestep is None or < 0
+      max_max_timestep is None or < 0.
+    state: the state for `policy_fn`.
     rng: jax rng, splittable.
     temperature: float, temperature used in Gumbel sampling.
     boundary: int, pad the sequences to the multiples of this number.
@@ -118,8 +120,8 @@ def gumbel_sample(log_probs):
     assert (B,) == lengths.shape

     t1 = time.time()
-    log_prob_actions, value_predictions, rng = policy_fun(
-        padded_observations, rng=rng)
+    log_prob_actions, value_predictions, state, rng = policy_fun(
+        padded_observations, state=state, rng=rng)
     policy_application_total_time += (time.time() - t1)

     assert (B, T) == log_prob_actions.shape[:2]
@@ -192,7 +194,7 @@ def gumbel_sample(log_probs):
   }
   timing_info = {k: round(1000 * v, 2) for k, v in timing_info.items()}

-  return completed_trajectories, num_done_trajectories, timing_info
+  return completed_trajectories, num_done_trajectories, timing_info, state


 def make_env(batch_size=1,
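
Taken together, these hunks thread an explicit `state` through the rollout helper: the policy function now accepts `state=` and returns it as an extra element, and `play_env_problem_with_policy` returns the final state as a fourth value. A minimal sketch of a policy written against the new signature (the uniform-random policy and its action count are invented for illustration, not part of this commit):

import numpy as np

def uniform_policy(observations, state=None, rng=None):
  """Illustrative policy function matching the new signature."""
  b, t = observations.shape[:2]
  n_actions = 4  # hypothetical action count
  log_probs = np.log(np.full((b, t, n_actions), 1.0 / n_actions))
  value_predictions = np.zeros((b, t))
  # A stateful policy (e.g. one carrying an RNN hidden state) would return an
  # updated state here; this one passes it through unchanged.
  return log_probs, value_predictions, state, rng

# Hypothetical call site, mirroring the updated 4-tuple return:
# trajectories, n_done, timing_info, state = (
#     env_problem_utils.play_env_problem_with_policy(
#         env, uniform_policy, num_trajectories=2, state=None, rng=rng))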

tensor2tensor/envs/env_problem_utils_test.py

Lines changed: 3 additions & 3 deletions
@@ -53,7 +53,7 @@ def test_play_env_problem_with_policy(self):
    # Let's make sure that at-most 4 observations come to the policy function.
    len_history_for_policy = 4

-    def policy_fun(observations, rng=None):
+    def policy_fun(observations, state=None, rng=None):
      b, t = observations.shape[:2]
      # Assert that observations from time-step len_history_for_policy onwards
      # are zeros.
@@ -65,11 +65,11 @@ def policy_fun(observations, rng=None):
      p = np.random.uniform(size=(b, t, a))
      p = np.exp(p)
      p = p / np.sum(p, axis=-1, keepdims=True)
-      return np.log(p), np.log(p), rng
+      return np.log(p), np.log(p), state, rng

    max_timestep = 15
    num_trajectories = 2
-    trajectories, _, _ = env_problem_utils.play_env_problem_with_policy(
+    trajectories, _, _, _ = env_problem_utils.play_env_problem_with_policy(
        env,
        policy_fun,
        num_trajectories=num_trajectories,

tensor2tensor/trax/layers/attention.py

Lines changed: 1 addition & 1 deletion
@@ -169,7 +169,7 @@ def AttentionQKV(d_feature, n_heads=1, dropout=0.0, mode='train'):
          core.Dense(d_feature),
      ),
      PureAttention(  # pylint: disable=no-value-for-parameter
-          d_feature=d_feature, n_heads=n_heads, dropout=dropout, mode=mode),
+          n_heads=n_heads, dropout=dropout, mode=mode),
      core.Dense(d_feature),
  ]

tensor2tensor/trax/layers/attention_test.py

Lines changed: 2 additions & 2 deletions
@@ -30,7 +30,7 @@ def test_shift_right(self):
    # Test shifts right on axis=1
    layer = attention.ShiftRight()
    input_np = onp.arange(2*3*3).reshape(2, 3, 3)
-    output_np = layer(input_np)
+    output_np, _ = layer(input_np)
    self.assertEqual(input_np.shape, output_np.shape)
    self.assertAllEqual(onp.array([[[0, 0, 0],
                                    [0, 1, 2],
@@ -49,7 +49,7 @@ def test_shift_right_float(self):
    input_np /= 2.0
    self.assertEqual(input_np.dtype, onp.float32)

-    output_np = layer(input_np)
+    output_np, _ = layer(input_np)
    self.assertEqual(input_np.shape, output_np.shape)
    self.assertEqual(output_np.dtype, onp.float32)

tensor2tensor/trax/layers/base.py

Lines changed: 50 additions & 23 deletions
@@ -81,7 +81,7 @@ def __repr__(self):
     else:
       return '{}[{}]'.format(class_str, fields_str)

-  def call(self, inputs, params=(), **kwargs):
+  def call(self, inputs, params=(), state=(), **kwargs):
     """Applies this layer to given activation tensors, using trainable params.

     Args:
@@ -94,6 +94,7 @@ def call(self, inputs, params=(), **kwargs):
        and one for each of this layer's sublayers. If a layer (or sublayer)
        has no trainable parameters, the corresponding params element is an
        empty tuple.
+      state: start state.
      **kwargs: Layer-specific keyword args.

    Returns:
@@ -106,6 +107,7 @@ def call(self, inputs, params=(), **kwargs):
     """
     raise NotImplementedError

+  # TODO(wangpeng): Should be called `new_parameters_and_state`.
   def new_parameters(self, input_shapes, input_dtype, rng):
     """Creates layer-specific parameters based on data shape, dtype and rng.

@@ -144,7 +146,7 @@ def has_custom_grad(self):
     """Whether to use custom gradients (in which case, see below)."""
     return False

-  def custom_grad(self, inputs, output, grad, params, **kwargs):
+  def custom_grad(self, inputs, output, grad, params, state, **kwargs):
     """Custom backward pass to propagate gradients in a custom way.

     Args:
@@ -153,6 +155,7 @@ def custom_grad(self, inputs, output, grad, params, **kwargs):
      grad: gradient signal (called cotangent in jax) computed based on
        subsequent layers. The structure and shape must match output.
      params: layer parameters
+      state: start state.
      **kwargs: kwargs for the layer

    Returns:
@@ -164,14 +167,15 @@ def custom_grad(self, inputs, output, grad, params, **kwargs):

   # End of subclassing interface, all functions below are internal.

-  def pseudo_call(self, pseudo_inputs, params):
+  def pseudo_call(self, pseudo_inputs, params, state):
     """Computes shapes and types this layer would produce for the given inputs.

     Args:
       pseudo_inputs: A ShapeType instance (input data minus the actual values)
         or a tuple of ShapeType instances, following the same conventions as
         Layer.call's input arg.
       params: Parameters for this layer.
+      state: start state.

     Returns:
       A ShapeType instance representing the shape and type of the output (if
@@ -183,12 +187,12 @@ def pseudo_call(self, pseudo_inputs, params):
      # cause a large number of dropout masks to be computed and permanently
      # stored in global memory.
      rng = ShapeType(shape=(2,), dtype=onp.uint32)
-      def call_on_input(x, params, rng):
-        return self.call(x, params=params, rng=rng)
+      def call_on_input(x, params, state, rng):
+        return self.call(x, params=params, state=state, rng=rng)
      params_shapes = nested_map(
          params, lambda x: ShapeType(shape=x.shape, dtype=x.dtype))
      s = backend.eval_on_shapes(call_on_input)(pseudo_inputs,
-                                                params_shapes, rng)
+                                                params_shapes, state, rng)
      return s
    except Exception:
      name, trace = self.__class__.__name__, _short_traceback(skip=3)
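
After these hunks, the subclassing contract is: `new_parameters` returns a `(params, state)` pair and `call` takes a `state` and returns an `(output, new_state)` pair. A minimal sketch of a stateful layer written against that contract (the counting layer is invented for illustration, not part of this commit):

import numpy as onp  # matches the 'onp' alias used in this file

from tensor2tensor.trax.layers import base


class CountingIdentity(base.Layer):
  """Identity layer whose state counts how many times it has been applied."""

  def call(self, x, params=(), state=(), **kwargs):
    del params, kwargs
    # `state` is expected to come from new_parameters / initialize below.
    return x, state + 1

  def new_parameters(self, input_shapes, input_dtype, rng):
    del input_shapes, input_dtype, rng
    return (), onp.zeros((), dtype=onp.int32)  # no params, a scalar counter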
@@ -213,52 +217,74 @@ def initialize(self, input_shapes, input_dtype, rng):
     """
     try:
       # Initialize params once; store them for use when this layer is called.
+      # Needs to call new_parameters regardless of _init_finished because state
+      # also needs to be initialized. After jitting, graph pruning should be
+      # able to remove unnecessary computation.
+      # TODO(lukaszkaiser): Revisit this decision and see whether layers sharing
+      # params should also share states.
+      params, state = self.new_parameters(input_shapes, input_dtype, rng)
       if not self._init_finished:
-        self._params = self.new_parameters(input_shapes, input_dtype, rng)
         self._init_finished = True
-        return self._params
+        self._params = params
       else:
-        return ()
+        params = ()
+      return (params, state)
     except Exception:
       name, trace = self.__class__.__name__, _short_traceback(skip=3)
       raise LayerError(name, 'initialize', self._caller, input_shapes, trace)

-  def __call__(self, x, params=(), **kwargs):
+  def __call__(self, x, params=(), state=(), **kwargs):
     try:
       # If params are nothing, we may be reusing this layer.
       # Use the cached parameters to calculate the value.
       # Note: to make sure jit tracers can decide this branch in python we
       # use "params is ()" instead of, e.g., "not params" or "params == ()".
       if params is ():  # pylint: disable=literal-comparison
         params = self._params
-      # In this case, we're called for the first time: cache parameters.
-      self._params = params
+      else:
+        # In this case, we're called for the first time: cache parameters.
+        self._params = params

       if not self.has_custom_grad:
-        return self.call(x, params=params, **kwargs)
+        return self.call(x, params=params, state=state, **kwargs)

       # Custom gradients part.
       assert backend.get_name() == 'jax', (
           'Custom gradients are only supported in JAX for now.')

+      # TODO(wangpeng): JAX doesn't support custom grads for functions with
+      # auxiliary output yet (https://github.com/google/jax/issues/844). Will
+      # remove the constraints on state below when this feature is added to
+      # JAX.
+
+      assert state is (), (  # pylint: disable=literal-comparison
+          'Custom gradients do not allow non-trivial start state.')
+
+      def check_end_state(output_state):
+        output, state = output_state
+        assert state is (), (  # pylint: disable=literal-comparison
+            'Custom gradients do not allow non-trivial end state.')
+        return output
+
       # See this link for how custom transformations are defined in JAX:
       # https://jax.readthedocs.io/en/latest/jax.html#jax.custom_transforms
       # Note that we capture the kwargs and don't calculate gradients wrt. them.
       @jax.custom_transforms
       def do_call(y, params):
-        return self.call(y, params=params, **kwargs)
+        return check_end_state(self.call(y, params=params, state=(), **kwargs))

       # This is the custom gradient (vector-jacobian product in JAX) function.
       # For the exact specification of this custom transformation see this link:
       # https://jax.readthedocs.io/en/latest/jax.html#jax.defjvp_all
       def do_call_vjp(y, params):
-        output = self.call(y, params=params, **kwargs)
+        output = check_end_state(self.call(y, params=params, state=(),
+                                           **kwargs))
         def vjpfun(grad):
           return self.custom_grad(y, output, grad, params, **kwargs)
         return output, vjpfun

       jax.defvjp_all(do_call, do_call_vjp)
-      return do_call(x, params)
+      return do_call(x, params), ()

     except Exception:
       name, trace = self.__class__.__name__, _short_traceback()
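
On the caller side, `initialize` now returns `(params, state)` and `__call__` returns `(output, state)`, so state is threaded explicitly from one application to the next. A rough usage sketch, reusing the hypothetical `CountingIdentity` layer sketched above (the JAX PRNG key stands in for whatever rng the surrounding code supplies):

import numpy as onp
from jax import random as jax_random

layer = CountingIdentity()
rng = jax_random.PRNGKey(0)
params, state = layer.initialize((2, 3), onp.int32, rng)
x = onp.ones((2, 3), dtype=onp.int32)
y, state = layer(x, params=params, state=state, rng=rng)  # state is now 1
y, state = layer(x, params=params, state=state, rng=rng)  # state is now 2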
@@ -413,22 +439,23 @@ def _n_outputs(self):

   def _new_parameters(self, input_shapes, input_dtype, rng):
     if new_parameters is None:
-      return ()
+      return (), ()
     kwargs = self._init_kwargs  # pylint: disable=protected-access
-    return new_parameters(input_shapes, input_dtype, rng, **kwargs)
+    return new_parameters(input_shapes, input_dtype, rng, **kwargs), ()

   def _is_empty(raw_output):
     return raw_output is None or (isinstance(raw_output, (list, tuple))
                                   and len(raw_output) == 0)  # pylint: disable=g-explicit-length-test

-  def _call_with_context(self, x, params=(), **kwargs):
+  def _call_with_context(self, x, params=(), state=(), **kwargs):
     """Calls raw_call_fn with extra keyword args from Layer.__init__."""
     merged_kwargs = kwargs.copy()
     merged_kwargs.update(self._init_kwargs)  # pylint: disable=protected-access

     _validate_call_input(x, n_inputs)
     raw_output = raw_call_fn(x, params=params, **merged_kwargs)
-    return () if _is_empty(raw_output) else raw_output
+    output = () if _is_empty(raw_output) else raw_output
+    return (output, state)

   # Set docstrings and create the class.
   _call_with_context.__doc__ = raw_call_fn.__doc__
@@ -502,15 +529,15 @@ def check_shape_agreement(layer_fn, input_shapes, integer_inputs=False):
     input_dtype = tuple(input_dtype for _ in input_shapes)
   else:
     pseudo_data = ShapeType(input_shapes, input_dtype)
-  params = layer_fn.initialize(input_shapes, input_dtype, rng1)
-  pseudo_output = layer_fn.pseudo_call(pseudo_data, params)
+  params, state = layer_fn.initialize(input_shapes, input_dtype, rng1)
+  pseudo_output, _ = layer_fn.pseudo_call(pseudo_data, params, state)
   if isinstance(pseudo_output, tuple):
     output_shape = tuple(x.shape for x in pseudo_output)
   else:
     output_shape = pseudo_output.shape

   random_input = _random_values(input_shapes, rng2, integer_inputs)
-  real_output = layer_fn(random_input, params, rng=rng3)
+  real_output, _ = layer_fn(random_input, params, state=state, rng=rng3)
   result_shape = shapes(real_output)

   msg = 'output shape %s != real result shape %s' % (output_shape, result_shape)
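
Layers built from plain functions via the decorator in this file stay stateless: the generated `_new_parameters` appends an empty state and `_call_with_context` returns the incoming state untouched. A rough sketch, assuming the `@base.layer()` decorator pattern used elsewhere in trax (the `Double` layer and the shapes are invented for illustration):

import numpy as onp
from jax import random as jax_random

from tensor2tensor.trax.layers import base


@base.layer()
def Double(x, **unused_kwargs):  # hypothetical function-based layer
  return 2 * x

layer = Double()
rng = jax_random.PRNGKey(0)
params, state = layer.initialize((2, 3), onp.float32, rng)  # -> ((), ())
y, state = layer(onp.ones((2, 3)), params, state=state, rng=rng)  # state stays ()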

tensor2tensor/trax/layers/base_test.py

Lines changed: 6 additions & 6 deletions
@@ -40,11 +40,11 @@ class IdWithZeroGrad(base.Layer):

   def call(self, x, params, **kwargs):
     del params, kwargs
-    return x
+    return x, ()

   def new_parameters(self, input_shapes, input_dtype, rng):
     del input_shapes, input_dtype, rng
-    return ()
+    return (), ()

   @property
   def has_custom_grad(self):
@@ -59,7 +59,7 @@ def custom_grad(self, inputs, output, ct, params, **kwargs):
     input_shape = (9, 17)
     random_input = backend.random.uniform(rng, input_shape, minval=-1.0,
                                           maxval=1.0)
-    f = lambda x: backend.numpy.mean(layer(x, params, rng=rng))
+    f = lambda x: backend.numpy.mean(layer(x, params, rng=rng)[0])
     grad = backend.grad(f)(random_input)
     self.assertEqual(grad.shape, input_shape)  # Gradient for each input.
     self.assertEqual(sum(sum(grad * grad)), 0.0)  # Each one is 0.
@@ -70,11 +70,11 @@ class IdWithIdGrad(base.Layer):

   def call(self, x, params, **kwargs):
     del params, kwargs
-    return x
+    return x, ()

   def new_parameters(self, input_shapes, input_dtype, rng):
     del input_shapes, input_dtype, rng
-    return ()
+    return (), ()

   @property
   def has_custom_grad(self):
@@ -89,7 +89,7 @@ def custom_grad(self, inputs, output, ct, params, **kwargs):
     input_shape = (9, 17)
     random_input = backend.random.uniform(rng, input_shape, minval=-1.0,
                                           maxval=1.0)
-    f = lambda x: backend.numpy.mean(layer(x, params, rng=rng))
+    f = lambda x: backend.numpy.mean(layer(x, params, rng=rng)[0])
     grad = backend.grad(f)(random_input)
     self.assertEqual(grad.shape, input_shape)  # Gradient for each input.
     self.assertEqual(sum(sum(grad)), sum(sum(random_input)))  # Same as input.
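
Because the custom-gradient path cannot carry auxiliary state yet (see the TODO in base.py), custom-grad layers such as the two test classes above must keep both start and end state empty: `call` returns `(output, ())` and `new_parameters` returns `((), ())`. A compressed, hypothetical sketch of that kind of layer with a pass-through gradient (not part of this test):

from tensor2tensor.trax.layers import base


class IdWithPassThroughGrad(base.Layer):  # illustrative custom-grad layer
  """Identity whose custom gradient passes the cotangent straight through."""

  def call(self, x, params, **kwargs):
    del params, kwargs
    return x, ()   # custom-grad layers must end with an empty state

  def new_parameters(self, input_shapes, input_dtype, rng):
    del input_shapes, input_dtype, rng
    return (), ()  # no params, empty start state

  @property
  def has_custom_grad(self):
    return True

  def custom_grad(self, inputs, output, ct, params, **kwargs):
    del inputs, output, params, kwargs
    return ct, ()  # (cotangent w.r.t. inputs, cotangent w.r.t. params)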
