@@ -777,15 +777,15 @@ def moe_layer(data_parallelism,
   xs_2d = dp(tf.reshape, xs, [[-1, model_hidden_size]] * dp.n)
   # Call the MoE
   moe_out_2d, importance, load, _, _ = moe.Eval(
-      dp.devices, xs_2d, train, identifiers=None, summaries=True)
+      dp.devices, xs_2d, train, identifiers=None)
   # Reshape the output to the original shape.
   moe_out = dp(tf.reshape, moe_out_2d, dp(tf.shape, xs))
   # These losses encourage equal load on the different experts.
   loss = loss_coef * (eu.CVSquared(importance) + eu.CVSquared(load))
   return moe_out, loss
 
 
-def simple_attention(target, source, bias=None, summaries=True):
+def simple_attention(target, source, bias=None):
   """A simple attention function.
 
   Args:
@@ -795,7 +795,6 @@ def simple_attention(target, source, bias=None, summaries=True):
        `[batch, source_timesteps_1, source_timesteps_2, depth]`
     bias: an optional `Tensor` with shape `[batch, timesteps, 1, 1]` used
        to mask the attention to not attend to padding of input.
-    summaries: Boolean, whether to output summaries.
 
   Returns:
     a `Tensor` with same shape as `target`
@@ -814,7 +813,7 @@ def simple_attention(target, source, bias=None, summaries=True):
     if bias is not None:
       attention += tf.expand_dims(tf.squeeze(bias, axis=[2, 3]), axis=1)
     attention = tf.nn.softmax(attention)
-    if summaries and not tf.get_variable_scope().reuse:
+    if not tf.get_variable_scope().reuse:
       tf.summary.image("attention", tf.expand_dims(attention, 3), max_outputs=5)
     attended = tf.matmul(attention, source)
     return tf.reshape(attended, target_shape)
@@ -861,8 +860,7 @@ def multiscale_conv_sum(inputs, output_size, dilation_rates_and_kernel_sizes,
 def multiscale_conv_and_attention(x,
                                   padding,
                                   hparams,
-                                  source=None,
-                                  summaries=True):
+                                  source=None):
   """A common part of t2t layers.
 
   First, do a linear multiscale convolution
@@ -875,7 +873,6 @@ def multiscale_conv_and_attention(x,
     padding: a padding type
     hparams: hyperparameters for model
     source: optional source tensor for attention. (encoder output)
-    summaries: Boolean, whether to output summaries.
 
   Returns:
     a Tensor.
@@ -893,7 +890,7 @@ def multiscale_conv_and_attention(x,
     x = conv(x, hparams.hidden_size, (1, 1))
   x = noam_norm(x + conv_sum)
   if source is not None:
-    x = noam_norm(x + simple_attention(x, source, summaries=summaries))
+    x = noam_norm(x + simple_attention(x, source))
   return x
 
 
@@ -930,8 +927,7 @@ def conv_with_pools(inputs, output_size, kernel_size, pool_sizes, pooling_type,
 def conv_with_pools_and_attention(x,
                                   padding,
                                   hparams,
-                                  source=None,
-                                  summaries=True):
+                                  source=None):
   """A common part of t2t layers.
 
   First, do conv_with_pools
@@ -944,7 +940,6 @@ def conv_with_pools_and_attention(x,
     padding: a padding type
     hparams: hyperparameters for model
     source: optional source tensor for attention. (encoder output)
-    summaries: Boolean, whether to output summaries.
 
   Returns:
     a Tensor.
@@ -959,7 +954,7 @@ def conv_with_pools_and_attention(x,
     conv_sum += x
   x = noam_norm(conv_sum)
   if source is not None:
-    x = noam_norm(x + simple_attention(x, source, summaries=summaries))
+    x = noam_norm(x + simple_attention(x, source))
   return x
 
 
@@ -1057,7 +1052,6 @@ def attention_1d_v0(source,
                     transform_source=True,
                     transform_target=True,
                     transform_output=True,
-                    summaries=True,
                     name=None):
   """multi-headed attention.
 
@@ -1075,7 +1069,6 @@ def attention_1d_v0(source,
     transform_source: a boolean
     transform_target: a boolean
     transform_output: a boolean
-    summaries: a boolean
     name: an optional string
 
   Returns:
@@ -1116,7 +1109,7 @@ def _maybe_transform(t, size, should_transform, name):
       mask = (1.0 - mask) * -1e9
       attention += mask
     attention = tf.nn.softmax(attention)
-    if summaries and not tf.get_variable_scope().reuse:
+    if not tf.get_variable_scope().reuse:
       # Compute a color image summary.
       image = tf.reshape(attention,
                          [batch, num_heads, target_length, source_length])
@@ -1162,7 +1155,6 @@ def conv_hidden_relu(inputs,
                      output_size,
                      kernel_size=(1, 1),
                      second_kernel_size=(1, 1),
-                     summaries=True,
                      dropout=0.0,
                      **kwargs):
   """Hidden layer with RELU activation followed by linear projection."""
@@ -1183,7 +1175,7 @@ def conv_hidden_relu(inputs,
         **kwargs)
     if dropout != 0.0:
       h = tf.nn.dropout(h, 1.0 - dropout)
-    if summaries and not tf.get_variable_scope().reuse:
+    if not tf.get_variable_scope().reuse:
       tf.summary.histogram("hidden_density_logit",
                            relu_density_logit(
                                h, list(range(inputs.shape.ndims - 1))))
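
With the summaries arguments removed, these layers emit their TensorBoard summaries whenever the enclosing variable scope is not reusing, i.e. on the first construction of the graph; a caller that previously passed summaries=False can get the same effect by building the layer under a reusing scope. The snippet below is a minimal caller-side sketch, assuming the updated simple_attention from this diff and TF1-style variable scopes; the shapes are illustrative, chosen to match its docstring.

import tensorflow as tf

# Toy 4-D inputs: [batch, timesteps_1, timesteps_2, depth].
target = tf.zeros([8, 10, 1, 64])
source = tf.zeros([8, 12, 1, 64])

# First call: tf.get_variable_scope().reuse is False, so the
# "attention" image summary is written.
out = simple_attention(target, source)

# Call under a reusing scope: the reuse flag is True, so the summary
# is skipped, which is the behavior summaries=False used to select.
with tf.variable_scope(tf.get_variable_scope(), reuse=True):
  out_reuse = simple_attention(target, source)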