
Commit c7e682b

Merge remote-tracking branch 'upstream/main'
2 parents: a4d5cc8 + ed8b340

26 files changed: +2230, -41 lines

CMakeLists.txt

Lines changed: 8 additions & 0 deletions
@@ -20,6 +20,7 @@ add_library(neural-fortran
   src/nf/nf_base_layer.f90
   src/nf/nf_conv2d_layer.f90
   src/nf/nf_conv2d_layer_submodule.f90
+  src/nf/nf_cross_attention_layer.f90
   src/nf/nf_datasets.f90
   src/nf/nf_datasets_submodule.f90
   src/nf/nf_datasets_mnist.f90
@@ -40,13 +41,17 @@ add_library(neural-fortran
   src/nf/nf_layer_submodule.f90
   src/nf/nf_locally_connected_1d_submodule.f90
   src/nf/nf_locally_connected_1d.f90
+  src/nf/nf_linear2d_layer.f90
+  src/nf/nf_linear2d_layer_submodule.f90
   src/nf/nf_loss.f90
   src/nf/nf_loss_submodule.f90
   src/nf/nf_maxpool1d_layer.f90
   src/nf/nf_maxpool1d_layer_submodule.f90
   src/nf/nf_maxpool2d_layer.f90
   src/nf/nf_maxpool2d_layer_submodule.f90
   src/nf/nf_metrics.f90
+  src/nf/nf_multihead_attention.f90
+  src/nf/nf_multihead_attention_submodule.f90
   src/nf/nf_network.f90
   src/nf/nf_network_submodule.f90
   src/nf/nf_optimizers.f90
@@ -57,8 +62,11 @@ add_library(neural-fortran
   src/nf/nf_reshape_layer_submodule.f90
   src/nf/nf_reshape2d_layer.f90
   src/nf/nf_reshape2d_layer_submodule.f90
+  src/nf/nf_self_attention_layer.f90
   src/nf/io/nf_io_binary.f90
   src/nf/io/nf_io_binary_submodule.f90
+  src/nf/nf_dropout_layer.f90
+  src/nf/nf_dropout_layer_submodule.f90
 )
 
 target_link_libraries(neural-fortran PRIVATE)

README.md

Lines changed: 4 additions & 1 deletion
@@ -30,9 +30,12 @@ Read the paper [here](https://arxiv.org/abs/1902.06714).
 | Layer type | Constructor name | Supported input layers | Rank of output array | Forward pass | Backward pass |
 |------------|------------------|------------------------|----------------------|--------------|---------------|
 | Input | `input` | n/a | 1, 2, 3 | n/a | n/a |
-| Dense (fully-connected) | `dense` | `input1d`, `flatten` | 1 | ✅ | ✅ |
+| Dense (fully-connected) | `dense` | `input1d`, `dense`, `dropout`, `flatten` | 1 | ✅ | ✅ |
+| Dropout | `dropout` | `dense`, `flatten`, `input1d` | 1 | ✅ | ✅ |
 | Convolutional (2-d) | `conv2d` | `input3d`, `conv2d`, `maxpool2d`, `reshape` | 3 | ✅ | ✅(*) |
 | Max-pooling (2-d) | `maxpool2d` | `input3d`, `conv2d`, `maxpool2d`, `reshape` | 3 | ✅ | ✅ |
+| Linear (2-d) | `linear2d` | `input2d`, `linear2d`, `self_attention` | 2 | ✅ | ✅ |
+| Self-attention | `self_attention` | `input2d`, `linear2d`, `self_attention` | 2 | ✅ | ✅ |
 | Flatten | `flatten` | `input2d`, `input3d`, `conv2d`, `maxpool2d`, `reshape` | 1 | ✅ | ✅ |
 | Reshape (1-d to 3-d) | `reshape` | `input1d`, `dense`, `flatten` | 3 | ✅ | ✅ |
 
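A minimal sketch (not part of this commit) of how the new constructors from the table compose into a network; it assumes only the `nf` user API exercised by the examples in this commit (`dropout`, `self_attention`, `flatten`, `relu`, `softmax`):

```fortran
program new_layers_sketch
  ! Hypothetical composition of the layers added in this merge:
  ! a rank-2 input feeds self-attention, which is flattened into a
  ! dense head with dropout regularization. Shapes are illustrative only.
  use nf, only: network, input, self_attention, flatten, dense, dropout, relu, softmax
  implicit none
  type(network) :: net

  net = network([ &
    input(3, 8), &          ! sequence of 3 steps, 8 features each (rank-2 input)
    self_attention(4), &    ! 4 attention heads, rank-2 output
    flatten(), &            ! collapse to rank 1 for the dense head
    dense(32, relu()), &
    dropout(0.2), &         ! drop roughly 20% of activations during training
    dense(10, softmax()) &
  ])

  call net % print_info()
end program new_layers_sketch
```

The `self_attention` output keeps rank 2, so `flatten()` is needed before the dense head, as the table's "Supported input layers" column indicates.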

example/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -7,6 +7,7 @@ foreach(execid
   simple
   sine
   quadratic
+  mha_simple
 )
   add_executable(${execid} ${execid}.f90)
   target_link_libraries(${execid} PRIVATE

example/dense_mnist.f90

Lines changed: 5 additions & 4 deletions
@@ -1,6 +1,6 @@
 program dense_mnist
 
-  use nf, only: dense, input, network, sgd, label_digits, load_mnist, corr
+  use nf, only: dense, input, network, sgd, label_digits, load_mnist, corr, relu, softmax, dropout
 
   implicit none
 
@@ -17,8 +17,9 @@ program dense_mnist
 
   net = network([ &
     input(784), &
-    dense(30), &
-    dense(10) &
+    dense(64, relu()), &
+    dropout(0.2), &
+    dense(10, softmax()) &
   ])
   num_epochs = 10
 
@@ -32,7 +33,7 @@ program dense_mnist
     call net % train( &
       training_images, &
      label_digits(training_labels), &
-      batch_size=100, &
+      batch_size=128, &
       epochs=1, &
       optimizer=sgd(learning_rate=3.) &
     )

example/mha_simple.f90

Lines changed: 37 additions & 0 deletions
@@ -0,0 +1,37 @@
+program mha_simple
+  use nf, only: dense, input, network, sgd, self_attention, flatten
+  implicit none
+  type(network) :: net
+  real, allocatable :: x(:, :), y(:)
+  integer, parameter :: num_iterations = 500
+  integer :: n
+
+  print '("Simple")'
+  print '(60("="))'
+
+  net = network([ &
+    input(3, 8), &
+    self_attention(4), &
+    flatten(), &
+    dense(2) &
+  ])
+
+  call net % print_info()
+
+  allocate(x(3, 8))
+  call random_number(x)
+
+  y = [0.123456, 0.246802]
+
+  do n = 0, num_iterations
+
+    call net % forward(x)
+    call net % backward(y)
+    call net % update(optimizer=sgd(learning_rate=1.))
+
+    if (mod(n, 50) == 0) &
+      print '(i4,2(3x,f8.6))', n, net % predict(x)
+
+  end do
+
+end program mha_simple

src/nf.f90

Lines changed: 11 additions & 0 deletions
@@ -4,6 +4,15 @@ module nf
   use nf_layer, only: layer
   use nf_layer_constructors, only: &
     conv2d, dense, flatten, input, maxpool1d, maxpool2d, reshape, reshape2d, locally_connected_1d
+    conv2d, &
+    dense, &
+    dropout, &
+    flatten, &
+    input, &
+    linear2d, &
+    maxpool2d, &
+    reshape, &
+    self_attention
   use nf_loss, only: mse, quadratic
   use nf_metrics, only: corr, maxabs
   use nf_network, only: network
@@ -12,4 +21,6 @@ module nf
     gaussian, linear, relu, leaky_relu, &
     sigmoid, softmax, softplus, step, tanhf, &
     celu
+  use nf_linear2d_layer, only: linear2d_layer
+  use nf_multihead_attention_layer, only: multihead_attention_layer
 end module nf

src/nf/nf_cross_attention_layer.f90

Lines changed: 66 additions & 0 deletions
@@ -0,0 +1,66 @@
+module nf_cross_attention_layer
+  use iso_fortran_env, only: stderr => error_unit
+  use nf_activation, only: softmax
+  use nf_linear2d_layer, only: linear2d_layer
+  use nf_multihead_attention_layer, only: multihead_attention_layer
+
+  implicit none
+
+  type, extends(multihead_attention_layer) :: cross_attention_layer
+    !! Cross Attention Layer
+    !! Source:
+    !! Bahdanau, D. (2014)
+    !! Neural machine translation by jointly learning to align and translate.
+    !! https://arxiv.org/pdf/1409.0473
+    real, allocatable :: gradient(:, :, :)
+  contains
+    procedure :: forward
+    procedure :: backward
+    procedure :: init
+  end type cross_attention_layer
+
+  interface cross_attention_layer
+    module function cross_attention_layer_cons(n_heads) result(res)
+      !! This function returns the `cross_attention_layer` instance.
+      integer, intent(in) :: sequence_length, model_dimension, n_heads
+      type(cross_attention_layer) :: res
+    end function cross_attention_layer_cons
+  end interface cross_attention_layer
+
+contains
+  module function cross_attention_layer_cons(n_heads) result(res)
+    !! This function returns the `cross_attention_layer` instance.
+    integer, intent(in) :: n_heads
+    type(cross_attention_layer) :: res
+    res % n_heads = n_heads
+  end function cross_attention_layer_cons
+
+  pure module subroutine backward(self, input, gradient)
+    !! Cross Attention Back propagation
+    class(cross_attention_layer), intent(in out) :: self
+    real, intent(in) :: input(:, :, :)
+    real, intent(in) :: gradient(:, :)
+
+    call self % common_backward(input(1, :, :), gradient)
+    self % gradient(1, :, :) = self % query_layer % gradient
+    self % gradient(2, :, :) = self % key_layer % gradient + self % value_layer % gradient
+  end subroutine backward
+
+  pure module subroutine forward(self, input)
+    !! Cross Attention Forward propagation
+    !! Input Shape (kind, sequence_length, model_dimension)
+    !! where kind is 1 for Query and 2 for Key-Value
+    class(cross_attention_layer), intent(in out) :: self
+    real, intent(in) :: input(:, :, :)
+
+    call self % common_forward(input(1, :, :), input(2, :, :), input(2, :, :))
+  end subroutine forward
+
+  module subroutine init(self, input_shape)
+    class(cross_attention_layer), intent(in out) :: self
+    integer, intent(in) :: input_shape(:)
+
+    call self % init_base(input_shape)
+    allocate(self % gradient(2, self % sequence_length, self % model_dimension))
+  end subroutine init
+end module nf_cross_attention_layer
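For readers new to attention layers, here is a standalone sketch, in plain Fortran and independent of neural-fortran's internals, of single-head scaled dot-product attention, the core operation that the `common_forward` call above builds on with learned projections and multiple heads:

```fortran
program scaled_dot_product_attention_sketch
  ! Illustration only: softmax(Q K^T / sqrt(d)) V for a single head,
  ! with random Q, K, V. It does not reflect neural-fortran's data layout.
  implicit none
  integer, parameter :: seq_len = 3, d_model = 4
  real :: q(seq_len, d_model), k(seq_len, d_model), v(seq_len, d_model)
  real :: scores(seq_len, seq_len), attn_out(seq_len, d_model)
  integer :: i

  call random_number(q)
  call random_number(k)
  call random_number(v)

  ! Similarity of every query position with every key position
  scores = matmul(q, transpose(k)) / sqrt(real(d_model))

  ! Row-wise softmax turns similarities into attention weights
  do i = 1, seq_len
    scores(i, :) = exp(scores(i, :) - maxval(scores(i, :)))
    scores(i, :) = scores(i, :) / sum(scores(i, :))
  end do

  ! Each output position is a weighted average of the value vectors
  attn_out = matmul(scores, v)

  print '("attention output: ", i0, " x ", i0)', size(attn_out, 1), size(attn_out, 2)
end program scaled_dot_product_attention_sketch
```

In the cross-attention case above, `q` would come from slot 1 of the packed input and `k`/`v` from slot 2, per the comment on the `forward` routine.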

src/nf/nf_dropout_layer.f90

Lines changed: 83 additions & 0 deletions
@@ -0,0 +1,83 @@
+module nf_dropout_layer
+
+  !! Dropout layer by Srivastava et al. (2014).
+  !!
+  !! Srivastava, N., Hinton, G., Krizhevsky, A., Sutskever, I. and
+  !! Salakhutdinov, R., 2014. Dropout: a simple way to prevent neural networks
+  !! from overfitting. The Journal of Machine Learning Research, 15(1),
+  !! pp.1929-1958.
+
+  use nf_base_layer, only: base_layer
+
+  implicit none
+
+  private
+  public :: dropout_layer
+
+  type, extends(base_layer) :: dropout_layer
+    !! Concrete implementation of a dropout layer type
+
+    integer :: input_size = 0
+
+    real, allocatable :: output(:)
+    real, allocatable :: gradient(:)
+    real, allocatable :: mask(:) ! binary mask for dropout
+
+    real :: dropout_rate ! probability of dropping a neuron
+    real :: scale ! scale factor to preserve the input sum
+    logical :: training = .true. ! set to .false. for inference
+
+  contains
+
+    procedure :: backward
+    procedure :: forward
+    procedure :: init
+
+  end type dropout_layer
+
+  interface dropout_layer
+    module function dropout_layer_cons(rate) &
+      result(res)
+      !! This function returns the `dropout_layer` instance.
+      real, intent(in) :: rate
+        !! Dropout rate
+      type(dropout_layer) :: res
+        !! dropout_layer instance
+    end function dropout_layer_cons
+  end interface dropout_layer
+
+  interface
+
+    pure module subroutine backward(self, gradient)
+      !! Apply the backward gradient descent pass.
+      !! Only weight and bias gradients are updated in this subroutine,
+      !! while the weights and biases themselves are untouched.
+      class(dropout_layer), intent(in out) :: self
+        !! Dropout layer instance
+      real, intent(in) :: gradient(:)
+        !! Gradient from the next layer
+    end subroutine backward
+
+    module subroutine forward(self, input)
+      !! Propagate forward the layer.
+      !! Calling this subroutine updates the values of a few data components
+      !! of `dropout_layer` that are needed for the backward pass.
+      class(dropout_layer), intent(in out) :: self
+        !! Dropout layer instance
+      real, intent(in) :: input(:)
+        !! Input from the previous layer
+    end subroutine forward
+
+    module subroutine init(self, input_shape)
+      !! Initialize the layer data structures.
+      !!
+      !! This is a deferred procedure from the `base_layer` abstract type.
+      class(dropout_layer), intent(in out) :: self
+        !! Dropout layer instance
+      integer, intent(in) :: input_shape(:)
+        !! Shape of the input layer
+    end subroutine init
+
+  end interface
+
+end module nf_dropout_layer

src/nf/nf_dropout_layer_submodule.f90

Lines changed: 68 additions & 0 deletions
@@ -0,0 +1,68 @@
+submodule (nf_dropout_layer) nf_dropout_layer_submodule
+  use nf_random, only: shuffle
+  !! This submodule implements the procedures defined in the
+  !! nf_dropout_layer module.
+
+contains
+
+  module function dropout_layer_cons(rate) result(res)
+    real, intent(in) :: rate
+    type(dropout_layer) :: res
+    res % dropout_rate = rate
+    res % scale = 1 / (1 - rate)
+  end function dropout_layer_cons
+
+
+  module subroutine init(self, input_shape)
+    class(dropout_layer), intent(in out) :: self
+    integer, intent(in) :: input_shape(:)
+
+    self % input_size = input_shape(1)
+
+    ! Allocate arrays
+    allocate(self % output(self % input_size))
+    allocate(self % gradient(self % input_size))
+    allocate(self % mask(self % input_size))
+
+    ! Initialize arrays
+    self % output = 0
+    self % gradient = 0
+    self % mask = 1 ! Default mask is all ones (no dropout)
+
+  end subroutine init
+
+
+  module subroutine forward(self, input)
+    class(dropout_layer), intent(in out) :: self
+    real, intent(in) :: input(:)
+
+    ! Generate random mask for dropout, training mode only
+    if (self % training) then
+
+      ! Set the first dropout_rate number of elements to 0, the rest to 1,
+      ! and shuffle. Note that the selection of the elements rounds down to
+      ! the nearest integer, so in cases where size(input) * dropout_rate is
+      ! not an integer, the actual dropout rate will be slightly lower.
+      self % mask = 1
+      self % mask(:int(size(self % mask) * self % dropout_rate)) = 0
+      call shuffle(self % mask)
+
+      ! Apply dropout mask
+      self % output = input * self % mask * self % scale
+
+    else
+      ! In inference mode, we don't apply dropout; simply pass through the input
+      self % output = input
+
+    end if
+
+  end subroutine forward
+
+
+  pure module subroutine backward(self, gradient)
+    class(dropout_layer), intent(in out) :: self
+    real, intent(in) :: gradient(:)
+    self % gradient = gradient * self % mask * self % scale
+  end subroutine backward
+
+end submodule nf_dropout_layer_submodule
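The mask logic above (zero out the first `floor(n * rate)` elements, shuffle, and scale the survivors by `1 / (1 - rate)`) can be seen in isolation in this standalone sketch, which does not use neural-fortran and approximates its `shuffle` helper with a simple Fisher-Yates pass:

```fortran
program dropout_mask_sketch
  ! Standalone illustration of the inverted-dropout scheme: drop roughly
  ! rate * n elements and rescale the rest so the expected sum is preserved.
  implicit none
  integer, parameter :: n = 10
  real, parameter :: rate = 0.3
  real :: x(n), mask(n), scale, u, tmp
  integer :: i, j

  call random_number(x)
  scale = 1 / (1 - rate)

  mask = 1
  mask(:int(n * rate)) = 0   ! roughly rate * n elements dropped (rounds down)

  ! Fisher-Yates shuffle of the mask (stand-in for nf's shuffle helper)
  do i = n, 2, -1
    call random_number(u)
    j = 1 + int(u * i)
    tmp = mask(i); mask(i) = mask(j); mask(j) = tmp
  end do

  print '("input sum:   ", f8.4)', sum(x)
  print '("dropout sum: ", f8.4)', sum(x * mask * scale)
end program dropout_mask_sketch
```

Because the surviving activations are rescaled at training time (inverted dropout), no adjustment is needed at inference, which is why the `else` branch in the layer's `forward` passes the input through unchanged.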

src/nf/nf_layer.f90

Lines changed: 1 addition & 1 deletion
@@ -91,7 +91,7 @@ end subroutine backward_3d
 
   interface
 
-    pure module subroutine forward(self, input)
+    module subroutine forward(self, input)
      !! Apply a forward pass on the layer.
      !! This changes the internal state of the layer.
      !! This is normally called internally by the `network % forward`
