@@ -3,21 +3,23 @@ program quadratic_fit
-  ! stochastic gradient descent, batch gradient descent, and minibatch gradient
-  ! descent.
+  ! stochastic gradient descent, batch gradient descent, minibatch gradient
+  ! descent, and RMSprop.
   use nf, only: dense, input, network
+  use nf_dense_layer, only: dense_layer
 
   implicit none
-  type(network) :: net_sgd, net_batch_sgd, net_minibatch_sgd
+  type(network) :: net_sgd, net_batch_sgd, net_minibatch_sgd, net_rms_prop
 
   ! Training parameters
   integer, parameter :: num_epochs = 1000
   integer, parameter :: train_size = 1000
   integer, parameter :: test_size = 30
   integer, parameter :: batch_size = 10
   real, parameter :: learning_rate = 0.01
+  real, parameter :: decay_rate = 0.9
 
   ! Input and output data
   real, allocatable :: x(:), y(:) ! training data
   real, allocatable :: xtest(:), ytest(:) ! testing data
-  real, allocatable :: ypred_sgd(:), ypred_batch_sgd(:), ypred_minibatch_sgd(:)
+  real, allocatable :: ypred_sgd(:), ypred_batch_sgd(:), ypred_minibatch_sgd(:), ypred_rms_prop(:)
 
   integer :: i, n
 
@@ -42,6 +44,7 @@ program quadratic_fit
   net_sgd = network([input(1), dense(3), dense(1)])
   net_batch_sgd = network([input(1), dense(3), dense(1)])
   net_minibatch_sgd = network([input(1), dense(3), dense(1)])
+  net_rms_prop = network([input(1), dense(3), dense(1)])
 
-  ! Print network info to stdout; this will be the same for all three networks.
+  ! Print network info to stdout; this will be the same for all four networks.
   call net_sgd % print_info()
@@ -55,15 +58,20 @@ program quadratic_fit
   ! Mini-batch SGD optimizer
   call minibatch_gd_optimizer(net_minibatch_sgd, x, y, learning_rate, num_epochs, batch_size)
 
+  ! RMSProp optimizer
+  call rmsprop_optimizer(net_rms_prop, x, y, learning_rate, num_epochs, decay_rate)
+
   ! Calculate predictions on the test set
   ypred_sgd = [(net_sgd % predict([xtest(i)]), i = 1, test_size)]
   ypred_batch_sgd = [(net_batch_sgd % predict([xtest(i)]), i = 1, test_size)]
   ypred_minibatch_sgd = [(net_minibatch_sgd % predict([xtest(i)]), i = 1, test_size)]
+  ypred_rms_prop = [(net_rms_prop % predict([xtest(i)]), i = 1, test_size)]
 
   ! Print the mean squared error
   print '("Stochastic gradient descent MSE:", f9.6)', sum((ypred_sgd - ytest)**2) / size(ytest)
   print '(" Batch gradient descent MSE: ", f9.6)', sum((ypred_batch_sgd - ytest)**2) / size(ytest)
   print '(" Minibatch gradient descent MSE: ", f9.6)', sum((ypred_minibatch_sgd - ytest)**2) / size(ytest)
+  print '(" RMSProp MSE: ", f9.6)', sum((ypred_rms_prop - ytest)**2) / size(ytest)
 
 contains
 
@@ -161,6 +169,84 @@ subroutine minibatch_gd_optimizer(net, x, y, learning_rate, num_epochs, batch_si
     end do
   end subroutine minibatch_gd_optimizer
 
+  subroutine rmsprop_optimizer(net, x, y, learning_rate, num_epochs, decay_rate)
+    ! RMSprop optimizer: updates weights using a running root mean square of the gradients.
+    type(network), intent(inout) :: net
+    real, intent(in) :: x(:), y(:)
+    real, intent(in) :: learning_rate, decay_rate
+    integer, intent(in) :: num_epochs
+    integer :: i, j, n
+    real, parameter :: epsilon = 1e-8 ! Small constant to avoid division by zero
+
+    ! Define a dedicated type to store the RMSprop gradients.
+    ! This is needed because array sizes vary between layers and we need to
+    ! keep track of gradients for each layer over time.
+    ! For now this works only for dense layers.
+    ! We will need to define a similar type for conv2d layers.
+    type :: rms_gradient_dense
+      real, allocatable :: dw(:,:)
+      real, allocatable :: db(:)
+    end type rms_gradient_dense
+
+    type(rms_gradient_dense), allocatable :: rms(:)
+
+    print *, " Running RMSprop optimizer..."
+
+    ! Here we allocate the array of RMS gradient derived types.
+    ! We need one for each dense layer; however, we allocate it to the length
+    ! of all layers, as that makes housekeeping easier.
+    allocate(rms(size(net % layers)))
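+    ! Entries for layers other than dense (e.g. the input layer) simply remain unallocated.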
+
+    do n = 1, num_epochs
+
+      do i = 1, size(x)
+        call net % forward([x(i)])
+        call net % backward([y(i)])
+      end do
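+      ! The dw and db gradients accumulate across these backward calls (they are
+      ! only zeroed after the update below), so each epoch applies one full-batch step.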
+
+      ! RMSprop update rule
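+      ! For each parameter with accumulated gradient g, this applies:
+      !   rms <- decay_rate * rms + (1 - decay_rate) * g**2
+      !   w   <- w - learning_rate * g / sqrt(rms + epsilon)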
+      do j = 1, size(net % layers)
+        select type (this_layer => net % layers(j) % p)
+          type is (dense_layer)
+
+            ! If this is our first time here for this layer, allocate the
+            ! internal RMS gradient arrays and initialize them to zero.
+            if (.not. allocated(rms(j) % dw)) then
+              allocate(rms(j) % dw, mold=this_layer % dw)
+              allocate(rms(j) % db, mold=this_layer % db)
+              rms(j) % dw = 0
+              rms(j) % db = 0
+            end if
+
+            ! Update the RMS gradients using the RMSprop moving average rule
+            rms(j) % dw = decay_rate * rms(j) % dw + (1 - decay_rate) * this_layer % dw**2
+            rms(j) % db = decay_rate * rms(j) % db + (1 - decay_rate) * this_layer % db**2
+
+            ! Update weights and biases using the RMSprop update rule
+            this_layer % weights = this_layer % weights - learning_rate &
+              / sqrt(rms(j) % dw + epsilon) * this_layer % dw
+            this_layer % biases = this_layer % biases - learning_rate &
+              / sqrt(rms(j) % db + epsilon) * this_layer % db
+
+            ! We have updated the weights and biases, so we need to reset the
+            ! gradients to zero for the next epoch.
+            this_layer % dw = 0
+            this_layer % db = 0
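+            ! The rms(j) arrays are not reset; they carry the moving average across epochs.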
+
+        end select
+      end do
+
+    end do
+
+  end subroutine rmsprop_optimizer
+
   subroutine shuffle(arr)
     ! Shuffle an array using the Fisher-Yates algorithm.
     integer, intent(inout) :: arr(:)