From bc5adc2063a8a19b964f956bb1eb132ebc505651 Mon Sep 17 00:00:00 2001
From: Jesse Livezey
Date: Sat, 21 Feb 2015 16:51:26 -0800
Subject: [PATCH 1/3] first go

---
 pylearn2/training_algorithms/learning_rule.py | 65 +++++++++++++++++++
 1 file changed, 65 insertions(+)

diff --git a/pylearn2/training_algorithms/learning_rule.py b/pylearn2/training_algorithms/learning_rule.py
index 72da94ebc5..033570ae23 100644
--- a/pylearn2/training_algorithms/learning_rule.py
+++ b/pylearn2/training_algorithms/learning_rule.py
@@ -498,3 +498,68 @@ def get_updates(self, learning_rate, grads, lr_scalers=None):
             updates[param] = param + delta_x_t
 
         return updates
+
+
+class Adam(LearningRule):
+    """
+    Implements the AdaDelta learning rule as described in:
+    "AdaDelta: An Adaptive Learning Rate Method", Matthew D. Zeiler.
+
+    Parameters
+    ----------
+    b1 : float, optional
+        Decay rate for first moment estimate.
+    b2 : float, optional
+        Decay rate for second moment estimate.
+    eps : float, optional
+        Denominator minimum value.
+    lamb : float, optional
+        Decay rate for first moment coefficient.
+    """
+
+    def __init__(self, b1=.1, b2=.001, eps=1.e-8, lamb=1.e-8):
+        assert b1 > 0. and b1 <= 1.
+        assert b2 > 0. and b2 <= 1.
+        self.b1 = b1
+        self.b2 = b2
+        self.eps = eps
+        self.lamb = lamb
+
+    def get_updates(self, learning_rate, grads, lr_scalers=None):
+        """
+        Compute the Adam updates
+
+        Parameters
+        ----------
+        learning_rate : float
+            Learning rate coefficient.
+        grads : dict
+            A dictionary mapping from the model's parameters to their
+            gradients.
+        lr_scalers : dict
+            A dictionary mapping from the model's parameters to a learning
+            rate multiplier.
+        """
+        updates = OrderedDict()
+        t = sharedX(0.)
+        t_p1 = t+1.
+        b1_cor = 1. - (1. - b1)**t_p1
+        b2_cor = 1. - (1. - b2)**t_p1
+        b1t = 1. - (1. - b1)*self.lamb**(t)
+        for param in grads.keys():
+            alpha = learning_rate *lr_scalers.get(param, 1.)
+            # m: first moment estimate
+            m = sharedX(param.get_value() * 0.)
+            # v: second moment estimate
+            v = sharedX(param.get_value() * 0.)
+            mt = b1t * grads[param] + (1. - b1t)*m
+            vt = b2 * T.sqr(grads[param]) + (1. - b2)*v
+            mt_hat = mt / b1_corr
+            vt_hat = vt / b2_corr
+            delta = -alpha * mt_hat / (T.sqrt(vt_hat) + self.eps)
+            updates[param] = param + delta
+            updates[m] = mt
+            updates[v] = vt
+            updates[t] = t_p1
+
+        return updates

From f15d0c90f9f6d02ea1954b58fe634113fdaec585 Mon Sep 17 00:00:00 2001
From: Jesse Livezey
Date: Tue, 3 Mar 2015 23:17:00 -0800
Subject: [PATCH 2/3] changes based on paper

---
 pylearn2/training_algorithms/learning_rule.py | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/pylearn2/training_algorithms/learning_rule.py b/pylearn2/training_algorithms/learning_rule.py
index 033570ae23..34d645d66c 100644
--- a/pylearn2/training_algorithms/learning_rule.py
+++ b/pylearn2/training_algorithms/learning_rule.py
@@ -502,8 +502,9 @@ def get_updates(self, learning_rate, grads, lr_scalers=None):
 
 class Adam(LearningRule):
     """
-    Implements the AdaDelta learning rule as described in:
-    "AdaDelta: An Adaptive Learning Rate Method", Matthew D. Zeiler.
+    Implements the Adam learning rule as described in:
+    "Adam: A Method for Stochastic Optimization",
+    Diederik P. Kingma, Jimmy Lei Ba.
 
     Parameters
     ----------
@@ -514,10 +515,10 @@ class Adam(LearningRule):
     eps : float, optional
         Denominator minimum value.
     lamb : float, optional
-        Decay rate for first moment coefficient.
+        Decay rate for first moment decay rate.
     """
 
-    def __init__(self, b1=.1, b2=.001, eps=1.e-8, lamb=1.e-8):
+    def __init__(self, b1=.9, b2=.999, eps=1.e-8, lamb=(1.-1.e-8)):
         assert b1 > 0. and b1 <= 1.
         assert b2 > 0. and b2 <= 1.
         self.b1 = b1
@@ -543,20 +544,19 @@ def get_updates(self, learning_rate, grads, lr_scalers=None):
         updates = OrderedDict()
         t = sharedX(0.)
         t_p1 = t+1.
-        b1_cor = 1. - (1. - b1)**t_p1
-        b2_cor = 1. - (1. - b2)**t_p1
-        b1t = 1. - (1. - b1)*self.lamb**(t)
+        b1t = self.b1*(self.lamb**t)
         for param in grads.keys():
             alpha = learning_rate *lr_scalers.get(param, 1.)
             # m: first moment estimate
             m = sharedX(param.get_value() * 0.)
             # v: second moment estimate
             v = sharedX(param.get_value() * 0.)
-            mt = b1t * grads[param] + (1. - b1t)*m
-            vt = b2 * T.sqr(grads[param]) + (1. - b2)*v
-            mt_hat = mt / b1_corr
-            vt_hat = vt / b2_corr
-            delta = -alpha * mt_hat / (T.sqrt(vt_hat) + self.eps)
+
+            mt = (1.-b1t)*grads[param] + b1t*m
+            vt = (1.-self.b2)*T.sqr(grads[param]) + self.b2*v
+            at = alpha*T.sqrt(1-self.b2**2)/(1-b1t)
+            delta = -at * mt / (T.sqrt(vt_hat) + self.eps)
+
             updates[param] = param + delta
             updates[m] = mt
             updates[v] = vt

From a6801cf71194b419b9ce04c05e533b394886d845 Mon Sep 17 00:00:00 2001
From: JesseLivezey
Date: Fri, 6 Mar 2015 15:20:42 -0800
Subject: [PATCH 3/3] fixed y_hat

---
 pylearn2/training_algorithms/learning_rule.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pylearn2/training_algorithms/learning_rule.py b/pylearn2/training_algorithms/learning_rule.py
index 34d645d66c..4883884d1d 100644
--- a/pylearn2/training_algorithms/learning_rule.py
+++ b/pylearn2/training_algorithms/learning_rule.py
@@ -555,7 +555,7 @@ def get_updates(self, learning_rate, grads, lr_scalers=None):
             mt = (1.-b1t)*grads[param] + b1t*m
             vt = (1.-self.b2)*T.sqr(grads[param]) + self.b2*v
             at = alpha*T.sqrt(1-self.b2**2)/(1-b1t)
-            delta = -at * mt / (T.sqrt(vt_hat) + self.eps)
+            delta = -at * mt / (T.sqrt(vt) + self.eps)
 
             updates[param] = param + delta
             updates[m] = mt