program quadratic_fit
  ! This program fits a quadratic function with a small neural network,
  ! trained with stochastic gradient descent, batch gradient descent, and
  ! minibatch gradient descent.
  use nf, only: dense, input, network

  implicit none
  type(network) :: net_sgd, net_batch_sgd, net_minibatch_sgd

  ! Training parameters
  integer, parameter :: num_epochs = 1000
  integer, parameter :: train_size = 1000
  integer, parameter :: test_size = 30
  integer, parameter :: batch_size = 10
  real, parameter :: learning_rate = 0.01

  ! Input and output data
  real, allocatable :: x(:), y(:) ! training data
  real, allocatable :: xtest(:), ytest(:) ! testing data
  real, allocatable :: ypred_sgd(:), ypred_batch_sgd(:), ypred_minibatch_sgd(:)

  integer :: i, n

  print '("Fitting quadratic function")'
  print '(60("="))'

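  ! Create the test dataset: evenly spaced inputs on [0, 2) and the exact
  ! function values at those points.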
  allocate(xtest(test_size), ytest(test_size))
  xtest = [(real(i - 1) * 2 / test_size, i = 1, test_size)]
  ytest = quadratic(xtest)

  ! x and y as 1-D arrays
  allocate(x(train_size), y(train_size))

  ! Generate the training dataset: uniform random x on [0, 2)
  do i = 1, train_size
    call random_number(x(i))
    x(i) = x(i) * 2
  end do
  y = quadratic(x)

  ! Instantiate a separate network for each optimization method.
  net_sgd = network([input(1), dense(3), dense(1)])
  net_batch_sgd = network([input(1), dense(3), dense(1)])
  net_minibatch_sgd = network([input(1), dense(3), dense(1)])

  ! Print network info to stdout; this will be the same for all three networks.
  call net_sgd % print_info()

  ! SGD optimizer
  call sgd_optimizer(net_sgd, x, y, learning_rate, num_epochs)

  ! Batch SGD optimizer
  call batch_gd_optimizer(net_batch_sgd, x, y, learning_rate, num_epochs)

  ! Mini-batch SGD optimizer
  call minibatch_gd_optimizer(net_minibatch_sgd, x, y, learning_rate, num_epochs, batch_size)

  ! Calculate predictions on the test set
  ypred_sgd = [(net_sgd % predict([xtest(i)]), i = 1, test_size)]
  ypred_batch_sgd = [(net_batch_sgd % predict([xtest(i)]), i = 1, test_size)]
  ypred_minibatch_sgd = [(net_minibatch_sgd % predict([xtest(i)]), i = 1, test_size)]

  ! Print the mean squared error
  print '("Stochastic gradient descent MSE: ", f9.6)', sum((ypred_sgd - ytest)**2) / size(ytest)
  print '("     Batch gradient descent MSE: ", f9.6)', sum((ypred_batch_sgd - ytest)**2) / size(ytest)
  print '(" Minibatch gradient descent MSE: ", f9.6)', sum((ypred_minibatch_sgd - ytest)**2) / size(ytest)

contains

  real elemental function quadratic(x) result(y)
    ! Quadratic function
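    ! y = (x**2 + x + 2) / 4, which maps x in [0, 2] onto y in [0.5, 2].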
    real, intent(in) :: x
    y = (x**2 / 2 + x / 2 + 1) / 2
  end function quadratic

  subroutine sgd_optimizer(net, x, y, learning_rate, num_epochs)
    ! In the stochastic gradient descent (SGD) optimizer, we run the forward
    ! and backward passes and update the weights for each training sample,
    ! one at a time.
    type(network), intent(inout) :: net
    real, intent(in) :: x(:), y(:)
    real, intent(in) :: learning_rate
    integer, intent(in) :: num_epochs
    integer :: i, n

    print *, "Running SGD optimizer..."

    do n = 1, num_epochs
      do i = 1, size(x)
        call net % forward([x(i)])
        call net % backward([y(i)])
        call net % update(learning_rate)
      end do
    end do

  end subroutine sgd_optimizer

  subroutine batch_gd_optimizer(net, x, y, learning_rate, num_epochs)
    ! Like the stochastic gradient descent (SGD) optimizer, except that here we
    ! accumulate the weight gradients for all training samples and update the
    ! weights once per epoch.
    type(network), intent(inout) :: net
    real, intent(in) :: x(:), y(:)
    real, intent(in) :: learning_rate
    integer, intent(in) :: num_epochs
    integer :: i, n

    print *, "Running batch GD optimizer..."

    do n = 1, num_epochs
      do i = 1, size(x)
        call net % forward([x(i)])
        call net % backward([y(i)])
      end do
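      ! Gradients have accumulated over all samples; dividing the learning
      ! rate by size(x) applies the average gradient for the epoch.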
      call net % update(learning_rate / size(x))
    end do

  end subroutine batch_gd_optimizer

  subroutine minibatch_gd_optimizer(net, x, y, learning_rate, num_epochs, batch_size)
    ! Like the batch GD optimizer, except that here we accumulate the weight
    ! gradients over a number of mini-batches and update the weights once per
    ! mini-batch.
    !
    ! Note: -O3 on GFortran must be accompanied with -fno-frontend-optimize for
    ! this subroutine to converge to a solution.
    type(network), intent(inout) :: net
    real, intent(in) :: x(:), y(:)
    real, intent(in) :: learning_rate
    integer, intent(in) :: num_epochs, batch_size
    integer :: i, j, n, num_samples, num_batches, start_index, end_index
    real, allocatable :: batch_x(:), batch_y(:)
    integer, allocatable :: batch_indices(:)

    print *, "Running mini-batch GD optimizer..."

    num_samples = size(x)
    num_batches = num_samples / batch_size
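    ! Note: integer division; if batch_size does not evenly divide the number
    ! of samples, the remainder samples are never visited.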

    ! Generate shuffled indices for the mini-batches
    allocate(batch_x(batch_size), batch_y(batch_size))
    allocate(batch_indices(num_batches))

    do j = 1, num_batches
      batch_indices(j) = (j - 1) * batch_size + 1
    end do

    call shuffle(batch_indices)
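    ! The mini-batches remain contiguous slices of the training data; only the
    ! order in which they are visited is randomized, and only once, before the
    ! epoch loop.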

    do n = 1, num_epochs
      do j = 1, num_batches
        start_index = batch_indices(j)
        end_index = min(start_index + batch_size - 1, num_samples)

        do i = start_index, end_index
          call net % forward([x(i)])
          call net % backward([y(i)])
        end do

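        ! Average the gradients accumulated over this mini-batch.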
        call net % update(learning_rate / batch_size)
      end do
    end do
  end subroutine minibatch_gd_optimizer

  subroutine shuffle(arr)
    ! Shuffle an array using the Fisher-Yates algorithm.
    integer, intent(inout) :: arr(:)
    real :: a
    integer :: i, temp

    do i = size(arr), 2, -1
      call random_number(a)
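      ! Map the uniform draw on [0, 1) to a random index in 1..i, then swap
      ! that element with element i.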
      a = floor(a * real(i)) + 1
      temp = arr(i)
      arr(i) = arr(int(a))
      arr(int(a)) = temp
    end do
  end subroutine shuffle

end program quadratic_fit