@@ -276,7 +276,7 @@ def apply_momentum(updates, params=None, momentum=0.9):
276276 for param in params :
277277 value = param .get_value (borrow = True )
278278 velocity = aesara .shared (
279- np .zeros (value .shape , dtype = value .dtype ), shape = param .broadcastable
279+ np .zeros (value .shape , dtype = value .dtype ), broadcastable = param .broadcastable
280280 )
281281 x = momentum * velocity + updates [param ]
282282 updates [velocity ] = x - param
@@ -391,7 +391,7 @@ def apply_nesterov_momentum(updates, params=None, momentum=0.9):
391391 for param in params :
392392 value = param .get_value (borrow = True )
393393 velocity = aesara .shared (
394- np .zeros (value .shape , dtype = value .dtype ), shape = param .broadcastable
394+ np .zeros (value .shape , dtype = value .dtype ), broadcastable = param .broadcastable
395395 )
396396 x = momentum * velocity + updates [param ] - param
397397 updates [velocity ] = x
@@ -534,7 +534,9 @@ def adagrad(loss_or_grads=None, params=None, learning_rate=1.0, epsilon=1e-6):
534534
535535 for param , grad in zip (params , grads ):
536536 value = param .get_value (borrow = True )
537- accu = aesara .shared (np .zeros (value .shape , dtype = value .dtype ), shape = param .broadcastable )
537+ accu = aesara .shared (
538+ np .zeros (value .shape , dtype = value .dtype ), broadcastable = param .broadcastable
539+ )
538540 accu_new = accu + grad ** 2
539541 updates [accu ] = accu_new
540542 updates [param ] = param - (learning_rate * grad / at .sqrt (accu_new + epsilon ))
@@ -660,7 +662,9 @@ def rmsprop(loss_or_grads=None, params=None, learning_rate=1.0, rho=0.9, epsilon
660662
661663 for param , grad in zip (params , grads ):
662664 value = param .get_value (borrow = True )
663- accu = aesara .shared (np .zeros (value .shape , dtype = value .dtype ), shape = param .broadcastable )
665+ accu = aesara .shared (
666+ np .zeros (value .shape , dtype = value .dtype ), broadcastable = param .broadcastable
667+ )
664668 accu_new = rho * accu + (one - rho ) * grad ** 2
665669 updates [accu ] = accu_new
666670 updates [param ] = param - (learning_rate * grad / at .sqrt (accu_new + epsilon ))
@@ -751,10 +755,12 @@ def adadelta(loss_or_grads=None, params=None, learning_rate=1.0, rho=0.95, epsil
751755 for param , grad in zip (params , grads ):
752756 value = param .get_value (borrow = True )
753757 # accu: accumulate gradient magnitudes
754- accu = aesara .shared (np .zeros (value .shape , dtype = value .dtype ), shape = param .broadcastable )
758+ accu = aesara .shared (
759+ np .zeros (value .shape , dtype = value .dtype ), broadcastable = param .broadcastable
760+ )
755761 # delta_accu: accumulate update magnitudes (recursively!)
756762 delta_accu = aesara .shared (
757- np .zeros (value .shape , dtype = value .dtype ), shape = param .broadcastable
763+ np .zeros (value .shape , dtype = value .dtype ), broadcastable = param .broadcastable
758764 )
759765
760766 # update accu (as in rmsprop)
@@ -844,8 +850,12 @@ def adam(
844850
845851 for param , g_t in zip (params , all_grads ):
846852 value = param .get_value (borrow = True )
847- m_prev = aesara .shared (np .zeros (value .shape , dtype = value .dtype ), shape = param .broadcastable )
848- v_prev = aesara .shared (np .zeros (value .shape , dtype = value .dtype ), shape = param .broadcastable )
853+ m_prev = aesara .shared (
854+ np .zeros (value .shape , dtype = value .dtype ), broadcastable = param .broadcastable
855+ )
856+ v_prev = aesara .shared (
857+ np .zeros (value .shape , dtype = value .dtype ), broadcastable = param .broadcastable
858+ )
849859
850860 m_t = beta1 * m_prev + (one - beta1 ) * g_t
851861 v_t = beta2 * v_prev + (one - beta2 ) * g_t ** 2
@@ -928,8 +938,12 @@ def adamax(
928938
929939 for param , g_t in zip (params , all_grads ):
930940 value = param .get_value (borrow = True )
931- m_prev = aesara .shared (np .zeros (value .shape , dtype = value .dtype ), shape = param .broadcastable )
932- u_prev = aesara .shared (np .zeros (value .shape , dtype = value .dtype ), shape = param .broadcastable )
941+ m_prev = aesara .shared (
942+ np .zeros (value .shape , dtype = value .dtype ), broadcastable = param .broadcastable
943+ )
944+ u_prev = aesara .shared (
945+ np .zeros (value .shape , dtype = value .dtype ), broadcastable = param .broadcastable
946+ )
933947
934948 m_t = beta1 * m_prev + (one - beta1 ) * g_t
935949 u_t = at .maximum (beta2 * u_prev , abs (g_t ))
0 commit comments