
Commit 430ac93

Andy-Elizabeth-mouse authored and Oceania2018 committed
Add Attention support and test it
1 parent ce3ddb2 commit 430ac93

File tree

7 files changed: +800 -0 lines changed
Lines changed: 20 additions & 0 deletions
@@ -0,0 +1,20 @@
namespace Tensorflow.Keras.ArgsDefinition
{
    public class AttentionArgs : BaseDenseAttentionArgs
    {

        /// <summary>
        /// If `true`, will create a scalar variable to scale the attention scores.
        /// </summary>
        public bool use_scale { get; set; } = false;

        /// <summary>
        /// Function to use to compute attention scores, one of
        /// `{"dot", "concat"}`. `"dot"` refers to the dot product between the query
        /// and key vectors. `"concat"` refers to the hyperbolic tangent of the
        /// concatenation of the query and key vectors.
        /// </summary>
        public string score_mode { get; set; } = "dot";

    }
}
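
A minimal usage sketch (not part of this diff) showing how these args can be fed straight into the new layer. It relies only on the `Attention(AttentionArgs)` constructor and the tuple-based `Apply` that appear later in this commit; the `query` and `value` tensors are placeholders you would supply yourself:

// Build a scaled dot-product attention layer from an explicit args object.
var attention = new Tensorflow.Keras.Layers.Attention(new AttentionArgs
{
    use_scale = true,      // adds a trainable scalar multiplier on the scores
    score_mode = "dot"     // or "concat" for the tanh-of-sums variant
});
// query: [batch_size, Tq, dim], value: [batch_size, Tv, dim]
var context = attention.Apply((query, value));   // -> [batch_size, Tq, dim]
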
Lines changed: 20 additions & 0 deletions
@@ -0,0 +1,20 @@
namespace Tensorflow.Keras.ArgsDefinition
{
    public class BaseDenseAttentionArgs : LayerArgs
    {

        /// <summary>
        /// Boolean. Set to `true` for decoder self-attention. Adds a mask such
        /// that position `i` cannot attend to positions `j > i`. This prevents the
        /// flow of information from the future towards the past.
        /// </summary>
        public bool causal { get; set; } = false;

        /// <summary>
        /// Float between 0 and 1. Fraction of the units to drop for the
        /// attention scores.
        /// </summary>
        public float dropout { get; set; } = 0f;

    }
}
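
These base args are what `AttentionArgs` above inherits, so a decoder-style self-attention configuration is just a matter of setting them on the derived type. A hedged sketch (the layer wiring and tensor names are illustrative, not from this commit):

// Causal self-attention: query and value are the same tensor, and
// future positions are masked out of the attention scores.
var self_attention = new Tensorflow.Keras.Layers.Attention(new AttentionArgs
{
    causal = true,     // position i cannot attend to positions j > i
    dropout = 0.1f     // drop 10% of the attention scores during training
});
var encoded = self_attention.Apply((x, x));   // x: [batch_size, T, dim]
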

src/TensorFlowNET.Keras/Engine/Layer.cs

Lines changed: 9 additions & 0 deletions
@@ -275,6 +275,15 @@ public List<IVariableV1> weights
                 weights.AddRange(non_trainable_weights);
                 return weights;
             }
+            set
+            {
+                if (weights.Count() != value.Count()) throw new ValueError(
+                    $"You called `set_weights` on layer \"{this.name}\" " +
+                    $"with a weight list of length {len(value)}, but the layer was " +
+                    $"expecting {len(weights)} weights.");
+                foreach (var (this_w, v_w) in zip(weights, value))
+                    this_w.assign(v_w, read_value: true);
+            }
         }

         public virtual LayerArgs get_config()
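
The new setter mirrors Keras's `set_weights` semantics: the assigned list must match the layer's existing weight count, and each variable is overwritten in place via `assign`. A rough sketch of how it could be exercised; the Dense layers, the `tf.ones` shape overload, and the build-by-Apply step are assumptions, not part of this diff:

// Hedged sketch: copy the weights of one built layer into another with the same structure.
var source = keras.layers.Dense(10);
var target = keras.layers.Dense(10);
var x = tf.ones((1, 4));
source.Apply(x);   // building the layers creates their kernel/bias variables
target.Apply(x);
target.weights = source.weights;   // assigns variable-by-variable; throws ValueError on a count mismatch
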
Lines changed: 159 additions & 0 deletions
@@ -0,0 +1,159 @@
using static Tensorflow.Binding;
using static Tensorflow.KerasApi;
using System.Collections;
using System.Collections.Generic;
using System.Linq;
using Tensorflow.Keras.ArgsDefinition;

namespace Tensorflow.Keras.Layers
{
    /// <summary>
    /// Dot-product attention layer, a.k.a. Luong-style attention.
    /// Inputs are `query` tensor of shape `[batch_size, Tq, dim]`, `value` tensor of
    /// shape `[batch_size, Tv, dim]` and `key` tensor of shape
    /// `[batch_size, Tv, dim]`. The calculation follows the steps:
    /// <para>
    /// 1. Calculate scores with shape `[batch_size, Tq, Tv]` as a `query`-`key` dot
    /// product: `scores = tf.matmul(query, key, transpose_b=True)`.
    /// </para>
    /// <para>
    /// 2. Use scores to calculate a distribution with shape
    /// `[batch_size, Tq, Tv]`: `distribution = tf.nn.softmax(scores)`.
    /// </para>
    /// <para>
    /// 3. Use `distribution` to create a linear combination of `value` with
    /// shape `[batch_size, Tq, dim]`:
    /// `return tf.matmul(distribution, value)`.
    /// </para>
    /// </summary>
    /// <example>
    /// <code>
    /// // Variable-length int sequences.
    /// var query_input = keras.Input((1000), dtype: TF_DataType.TF_INT32);
    /// var value_input = keras.Input((1000), dtype: TF_DataType.TF_INT32);
    /// // Embedding lookup.
    /// var token_embedding = keras.layers.Embedding(input_dim: 1000, output_dim: 64);
    /// // Query embeddings of shape [batch_size, Tq, dimension].
    /// var query_embeddings = token_embedding.Apply(query_input);
    /// // Value embeddings of shape [batch_size, Tv, dimension].
    /// var value_embeddings = token_embedding.Apply(value_input);
    /// // CNN layer.
    /// var cnn_layer = keras.layers.Conv1D(
    ///     filters: 100,
    ///     kernel_size: 4,
    ///     // Use 'same' padding so outputs have the same shape as inputs.
    ///     padding: "same");
    /// var cnn_layer2 = keras.layers.Conv1D(
    ///     filters: 100,
    ///     kernel_size: 4,
    ///     // Use 'same' padding so outputs have the same shape as inputs.
    ///     padding: "same");
    /// // Query encoding of shape [batch_size, Tq, filters].
    /// var query_seq_encoding = cnn_layer.Apply(query_embeddings);
    /// // Value encoding of shape [batch_size, Tv, filters].
    /// var value_seq_encoding = cnn_layer.Apply(value_embeddings);
    /// // Query-value attention of shape [batch_size, Tq, filters].
    /// var query_value_attention_seq = keras.layers.Attention().Apply(
    ///     (query_seq_encoding, value_seq_encoding));
    /// // Reduce over the sequence axis to produce encodings of shape
    /// // [batch_size, filters].
    /// var query_encoding = keras.layers.GlobalAveragePooling1D().Apply(
    ///     query_seq_encoding);
    /// var query_value_attention = keras.layers.GlobalAveragePooling1D().Apply(
    ///     query_value_attention_seq);
    /// // Concatenate query and document encodings to produce a DNN input layer.
    /// var input_layer = keras.layers.Concatenate().Apply(
    ///     (query_encoding, query_value_attention));
    /// // Add DNN layers, and create Model.
    /// // ...
    /// </code>
    /// </example>
    public class Attention : BaseDenseAttention
    {

        public IVariableV1 concat_score_weight;

        public IVariableV1 scale;

        AttentionArgs args;

        string score_mode { get => args.score_mode; }

        bool use_scale { get => args.use_scale; }

        public Attention(AttentionArgs args) : base(args)
        {
            this.args = args;
            if (!new List<string> {
                    "dot",
                    "concat"
                }.Contains(this.score_mode))
                throw new ValueError($"Received: score_mode={score_mode}. Acceptable values are: [\"dot\", \"concat\"]");
        }

        // Creates variable when `use_scale` is True or `score_mode` is `concat`.
        protected override void build(Tensors inputs) {
            if (this.use_scale)
                this.scale = this.add_weight(name: "scale",
                    shape: 1,
                    initializer: tf.ones_initializer,
                    dtype: this.DType,
                    trainable: true);
            else
                this.scale = null;

            if (this.score_mode == "concat")
                this.concat_score_weight = this.add_weight(name: "concat_score_weight",
                    shape: 1,
                    initializer: tf.ones_initializer,
                    dtype: this.DType,
                    trainable: true);
            else
                this.concat_score_weight = null;
            base.build(inputs);
        }

        /// <summary>
        /// Calculates attention scores as a query-key dot product.
        /// </summary>
        /// <param name="query">Query tensor of shape `[batch_size, Tq, dim]`.</param>
        /// <param name="key">Key tensor of shape `[batch_size, Tv, dim]`.</param>
        /// <returns>Tensor of shape `[batch_size, Tq, Tv]`.</returns>
        public override Tensor _calculate_scores(Tensor query, Tensor key)
        {
            Tensor scores = null;
            if (this.score_mode == "dot")
            {
                //scores = tf.matmul(query, key, transpose_b: true);
                //scores = tf.matmul(tf.squeeze(query), tf.squeeze(key), transpose_b: true);
                scores = tf.linalg.einsum("bij,bkj->bik", (query, key));
                if (this.scale != null)
                    scores *= this.scale.AsTensor();
            } else if (this.score_mode == "concat") {
                // Reshape tensors to enable broadcasting.
                // Reshape into [batch_size, Tq, 1, dim].
                var q_reshaped = tf.expand_dims(query, axis: -2);
                // Reshape into [batch_size, 1, Tv, dim].
                var k_reshaped = tf.expand_dims(key, axis: -3);
                if (this.scale != null)
                    scores = this.concat_score_weight.AsTensor() *
                        tf.reduce_sum(tf.tanh(this.scale.AsTensor() * (q_reshaped + k_reshaped)), axis: -1);
                else
                    scores = this.concat_score_weight.AsTensor() *
                        tf.reduce_sum(tf.tanh(q_reshaped + k_reshaped), axis: -1);
            }
            return scores;
        }

        public override LayerArgs get_config() => this.args;
        //var config = new Dictionary<object, object> {
        //    {
        //        "use_scale",
        //        this.use_scale},
        //    {
        //        "score_mode",
        //        this.score_mode}};
        //var base_config = base.get_config();
        //return new dict(base_config.items().ToList() + config.items().ToList());
    }
}
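
To make the `_calculate_scores` contract concrete, here is a small trace of the "dot" path on toy values. It is not part of the commit; the nested-array `tf.constant` overload is an assumption, while the einsum call is the same one the layer uses, and `BaseDenseAttention` is what subsequently applies the softmax and the weighted sum over `value`:

// query: [batch_size=1, Tq=2, dim=2], key: [batch_size=1, Tv=2, dim=2]
var query = tf.constant(new float[,,] { { { 1f, 0f }, { 0f, 1f } } });
var key   = tf.constant(new float[,,] { { { 1f, 1f }, { 0f, 2f } } });
// Same contraction as the layer: scores[b, i, k] = sum_j query[b, i, j] * key[b, k, j]
var scores = tf.linalg.einsum("bij,bkj->bik", (query, key));   // [[[1, 0], [1, 2]]]
// The base class then softmaxes over the last axis and matmuls the result with `value`.
var distribution = tf.nn.softmax(scores, axis: -1);
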

0 commit comments
