@@ -1555,15 +1555,32 @@ def call(self, inputs, inputs_positions=None):
 class TransformerEncoderBlock(nlp_modeling.layers.TransformerEncoderBlock):
   """TransformerEncoderBlock layer with stochastic depth and layerscale."""
 
-  def __init__(self,
-               *args,
-               stochastic_depth_drop_rate=0.0,
-               layer_scale_init_value=0.0,
-               **kwargs):
-    """Initializes TransformerEncoderBlock."""
+  def __init__(
+      self,
+      *args,
+      stochastic_depth_drop_rate=0.0,
+      layer_scale_init_value=0.0,
+      max_attention_inference_parallelism=None,
+      **kwargs
+  ):
+    """Initializes TransformerEncoderBlock.
+
+    Args:
+      *args: positional arguments passed to super().__init__.
+      stochastic_depth_drop_rate: the drop rate for the stochastic depth layer.
+      layer_scale_init_value: the initial value for the layer scale; 0.0 disables layer scale.
+      max_attention_inference_parallelism: the number of examples to run in
+        parallel in the attention blocks during inference. Set this limit to
+        reduce the peak memory usage. If None, use vectorized operations to run
+        the whole batch in parallel.
+      **kwargs: keyword arguments passed to super().__init__.
+    """
     super().__init__(*args, **kwargs)
     self._stochastic_depth_drop_rate = stochastic_depth_drop_rate
     self._layer_scale_init_value = layer_scale_init_value
+    self._max_attention_inference_parallelism = (
+        max_attention_inference_parallelism
+    )
 
   def build(self, input_shape):
     if self._stochastic_depth_drop_rate:
@@ -1582,10 +1599,25 @@ def build(self, input_shape):
       self._layer_scale_mlp = lambda x, *args, **kwargs: tf.identity(x)
     super().build(input_shape)
 
+    if self._max_attention_inference_parallelism is not None:
+      attention_layer_config = self._attention_layer.get_config()
+      self._attention_layer = nn_layers.MultiHeadAttention.from_config({
+          **attention_layer_config,
+          'max_inference_parallelism': (
+              self._max_attention_inference_parallelism
+          ),
+      })
+
   def get_config(self):
-    config = {'stochastic_depth_drop_rate': self._stochastic_depth_drop_rate}
-    base_config = super().get_config()
-    return dict(list(base_config.items()) + list(config.items()))
+    config = super().get_config()
+    config.update({
+        'stochastic_depth_drop_rate': self._stochastic_depth_drop_rate,
+        'layer_scale_init_value': self._layer_scale_init_value,
+        'max_attention_inference_parallelism': (
+            self._max_attention_inference_parallelism
+        ),
+    })
+    return config
 
   def call(self, inputs, output_range=None, training=None):
     """Transformer self-attention encoder block call."""
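
As a quick sanity check on the new argument, here is a minimal usage sketch (not part of the diff). It assumes the parent NLP TransformerEncoderBlock's usual num_attention_heads / inner_dim / inner_activation constructor arguments; the concrete sizes and the parallelism cap of 8 are illustrative only.

# Illustrative construction of the Vision TransformerEncoderBlock with the
# new inference-parallelism cap. All numeric values are made up.
block = TransformerEncoderBlock(
    num_attention_heads=4,
    inner_dim=256,
    inner_activation='gelu',
    stochastic_depth_drop_rate=0.1,
    layer_scale_init_value=1e-5,
    max_attention_inference_parallelism=8,
)
# With the updated get_config(), the new fields round-trip through the
# standard Keras serialization path.
restored = TransformerEncoderBlock.from_config(block.get_config())

The build() change above simply re-creates nn_layers.MultiHeadAttention from its own config with the 'max_inference_parallelism' field overridden, so no other attention settings change.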
@@ -1675,29 +1707,39 @@ def call(self, inputs, output_range=None, training=None):
 
 @tf.keras.utils.register_keras_serializable(package='Vision')
 class TransformerScaffold(nlp_modeling.layers.TransformerScaffold):
-  """TransformerScaffold layer for vision applications.
-
-  This layer is a subclass of NLP TransformerScaffold:
+  """TransformerScaffold layer for vision applications."""
 
-  Attributes:
-    stochastic_depth_drop_rate: Drop rate for the residual connections.
-    return_attention_scores: Optionally return the attention output.
-    ffn_has_residual_connection: Whether the feedforward network has internal
-      residual connection and layer norm. If False, the residual connection and
-      the layer norm op are called inside TransformerScaffold.
-  """
+  def __init__(
+      self,
+      *args,
+      stochastic_depth_drop_rate: float = 0.0,
+      return_attention_scores: bool = False,
+      ffn_has_residual_connection: bool = False,
+      max_attention_inference_parallelism: Optional[int] = None,
+      **kwargs
+  ):
+    """Initializes TransformerScaffold.
 
-  def __init__(self,
-               *args,
-               stochastic_depth_drop_rate: float = 0.0,
-               return_attention_scores: bool = False,
-               ffn_has_residual_connection: bool = False,
-               **kwargs):
-    """Initializes TransformerEncoderBlock."""
+    Args:
+      *args: positional arguments passed to super().__init__.
+      stochastic_depth_drop_rate: the drop rate for the stochastic depth layer.
+      return_attention_scores: whether to return the attention output.
+      ffn_has_residual_connection: whether the feedforward network has internal
+        residual connection and layer norm. If False, the residual connection
+        and the layer norm op are called inside TransformerScaffold.
+      max_attention_inference_parallelism: the number of examples to run in
+        parallel in the attention blocks during inference. Set this limit to
+        reduce the peak memory usage. If None, use vectorized operations to run
+        the whole batch in parallel.
+      **kwargs: keyword arguments passed to super().__init__.
+    """
     super().__init__(*args, **kwargs)
     self._stochastic_depth_drop_rate = stochastic_depth_drop_rate
     self._return_attention_scores = return_attention_scores
     self._ffn_has_residual_connection = ffn_has_residual_connection
+    self._max_attention_inference_parallelism = (
+        max_attention_inference_parallelism
+    )
 
   def build(self, input_shape: Union[tf.TensorShape, List[int]]):
     if self._stochastic_depth_drop_rate:
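
A comparable sketch for the scaffold variant, again outside the diff. The attention_cls keyword and nn_layers.MultiHeadAttention follow the parent NLP TransformerScaffold's interface; treat the exact constructor arguments and values below as assumptions.

# Illustrative construction of the Vision TransformerScaffold; the capped
# attention layer is swapped in later by build(), not here.
scaffold = TransformerScaffold(
    num_attention_heads=4,
    inner_dim=256,
    inner_activation='gelu',
    attention_cls=nn_layers.MultiHeadAttention,
    stochastic_depth_drop_rate=0.1,
    max_attention_inference_parallelism=8,
)
# build() re-instantiates the attention layer from its config with
# 'max_inference_parallelism' overridden, so the cap takes effect during
# inference as described in the docstring above.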
@@ -1708,15 +1750,26 @@ def build(self, input_shape: Union[tf.TensorShape, List[int]]):
 
     super().build(input_shape)
 
+    if self._max_attention_inference_parallelism is not None:
+      attention_layer_config = self._attention_layer.get_config()
+      self._attention_layer = self._attention_cls.from_config({
+          **attention_layer_config,
+          'max_inference_parallelism': (
+              self._max_attention_inference_parallelism
+          ),
+      })
+
   def get_config(self):
-    config = {
+    config = super().get_config()
+    config.update({
         'stochastic_depth_drop_rate': self._stochastic_depth_drop_rate,
         'return_attention_scores': self._return_attention_scores,
-        'ffn_has_residual_connection': self._ffn_has_residual_connection
-    }
-    base_config = super().get_config()
-    base_config.update(config)
-    return base_config
+        'ffn_has_residual_connection': self._ffn_has_residual_connection,
+        'max_attention_inference_parallelism': (
+            self._max_attention_inference_parallelism
+        ),
+    })
+    return config
 
   def call(
       self,