Commit 11bebeb

Removing updates of Beta1 and Beta2 power accumulators outside the op (#4925)
1 parent 3db5278 commit 11bebeb

File tree: 3 files changed (+15, -40 lines)

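The change narrows the Adam op's contract: Beta1Pow and Beta2Pow become read-only inputs, and advancing them each step is now the caller's responsibility. As a rough sketch of that contract, here is hypothetical NumPy code that mirrors the updated adam_step test helper and an illustrative driver loop; it is not the PaddlePaddle API itself.

import numpy as np

# Hypothetical NumPy mirror of the updated adam_step test helper: beta1_pow and
# beta2_pow are read-only inputs and are no longer returned by the step.
def adam_step(param, grad, moment1, moment2, lr, beta1_pow, beta2_pow,
              beta1=0.9, beta2=0.999, epsilon=1e-8):
    moment1_out = beta1 * moment1 + (1 - beta1) * grad
    moment2_out = beta2 * moment2 + (1 - beta2) * np.square(grad)
    lr_t = lr * np.sqrt(1 - beta2_pow) / (1 - beta1_pow)
    param_out = param - lr_t * (moment1_out / (np.sqrt(moment2_out) + epsilon))
    return param_out, moment1_out, moment2_out

# Illustrative driver loop: the caller now owns the power accumulators.
param = np.random.uniform(-1, 1, 32).astype("float32")
moment1 = np.zeros_like(param)
moment2 = np.zeros_like(param)
beta1, beta2, lr = 0.9, 0.999, 0.001
beta1_pow, beta2_pow = beta1, beta2  # powers for the first step

for _ in range(10):
    grad = np.random.uniform(-1, 1, 32).astype("float32")
    param, moment1, moment2 = adam_step(param, grad, moment1, moment2, lr,
                                        beta1_pow, beta2_pow)
    # Advance the accumulators outside the op, once per step.
    beta1_pow *= beta1
    beta2_pow *= beta2

The diffs below remove the Beta1PowOut/Beta2PowOut outputs from the op definition, the kernel, and the Python test accordingly.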

paddle/operators/adam_op.cc

Lines changed: 1 addition & 11 deletions
@@ -43,10 +43,6 @@ class AdamOp : public framework::OperatorWithKernel {
                    "Output(Moment1Out) of AdamOp should not be null.");
     PADDLE_ENFORCE(ctx->HasOutput("Moment2Out"),
                    "Output(Moment2Out) of AdamOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Beta1PowOut"),
-                   "Output(Beta1PowOut) of AdamOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Beta2PowOut"),
-                   "Output(Beta2PowOut) of AdamOp should not be null.");

     auto lr_dims = ctx->GetInputDim("LearningRate");
     PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1,
@@ -72,8 +68,6 @@ class AdamOp : public framework::OperatorWithKernel {
     ctx->SetOutputDim("ParamOut", param_dims);
     ctx->SetOutputDim("Moment1Out", param_dims);
     ctx->SetOutputDim("Moment2Out", param_dims);
-    ctx->SetOutputDim("Beta1PowOut", beta1_pow_dims);
-    ctx->SetOutputDim("Beta2PowOut", beta2_pow_dims);
   }
 };

@@ -92,8 +86,6 @@ class AdamOpMaker : public framework::OpProtoAndCheckerMaker {
     AddOutput("ParamOut", "(Tensor) Output parameter");
     AddOutput("Moment1Out", "(Tensor) Output first moment");
     AddOutput("Moment2Out", "(Tensor) Output second moment");
-    AddOutput("Beta1PowOut", "(Tensor) Output beta1 power accumulator");
-    AddOutput("Beta2PowOut", "(Tensor) Output beta2 power accumulator");

     AddAttr<float>("beta1",
                    "(float, default 0.9) "
@@ -121,10 +113,8 @@ Adam updates:

 moment1_out = beta1 * moment1 + (1 − beta1) * grad
 moment2_out = beta2 * moment2 + (1 − beta2) * grad * grad
-beta1_pow_out = beta1_pow * beta1
-beta2_pow_out = beta2_pow * beta2
 learning_rate_t = learning_rate_t *
-                  sqrt(1 - beta2_pow_out) / (1 - beta1_pow_out)
+                  sqrt(1 - beta2_pow) / (1 - beta1_pow)
 param_out = param - learning_rate_t * moment1/ (sqrt(moment2) + epsilon)

 References:
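For intuition on why the accumulators still have to advance every step even though the op no longer writes them: the bias-corrected step size depends on beta1_pow = beta1^t and beta2_pow = beta2^t. A tiny illustrative calculation, assuming the default beta1 = 0.9, beta2 = 0.999 and learning_rate = 0.001 (not part of the op):

# Illustrative only: effective step size over the first few iterations.
beta1, beta2, lr = 0.9, 0.999, 0.001
beta1_pow, beta2_pow = 1.0, 1.0
for t in range(1, 4):
    beta1_pow *= beta1  # beta1 ** t, maintained by the caller
    beta2_pow *= beta2  # beta2 ** t, maintained by the caller
    lr_t = lr * (1 - beta2_pow) ** 0.5 / (1 - beta1_pow)
    print(t, round(lr_t, 6))  # 1: ~0.000316, 2: ~0.000235, 3: ~0.000202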

paddle/operators/adam_op.h

Lines changed: 2 additions & 11 deletions
@@ -26,14 +26,10 @@ class AdamOpKernel : public framework::OpKernel<T> {
     auto param_out_tensor = ctx.Output<framework::Tensor>("ParamOut");
     auto moment1_out_tensor = ctx.Output<framework::Tensor>("Moment1Out");
     auto moment2_out_tensor = ctx.Output<framework::Tensor>("Moment2Out");
-    auto beta1_pow_out_tensor = ctx.Output<framework::Tensor>("Beta1PowOut");
-    auto beta2_pow_out_tensor = ctx.Output<framework::Tensor>("Beta2PowOut");

     param_out_tensor->mutable_data<T>(ctx.GetPlace());
     moment1_out_tensor->mutable_data<T>(ctx.GetPlace());
     moment2_out_tensor->mutable_data<T>(ctx.GetPlace());
-    beta1_pow_out_tensor->mutable_data<T>(ctx.GetPlace());
-    beta2_pow_out_tensor->mutable_data<T>(ctx.GetPlace());

     float beta1 = ctx.Attr<float>("beta1");
     float beta2 = ctx.Attr<float>("beta2");
@@ -56,18 +52,13 @@ class AdamOpKernel : public framework::OpKernel<T> {
     auto param_out = framework::EigenVector<T>::Flatten(*param_out_tensor);
     auto moment1_out = framework::EigenVector<T>::Flatten(*moment1_out_tensor);
     auto moment2_out = framework::EigenVector<T>::Flatten(*moment2_out_tensor);
-    auto beta1_pow_out =
-        framework::EigenVector<T>::Flatten(*beta1_pow_out_tensor);
-    auto beta2_pow_out =
-        framework::EigenVector<T>::Flatten(*beta2_pow_out_tensor);
     auto place = ctx.GetEigenDevice<Place>();

     moment1_out.device(place) = beta1 * moment1 + (1 - beta1) * grad;
     moment2_out.device(place) = beta2 * moment2 + (1 - beta2) * grad.square();
-    beta1_pow_out.device(place) = beta1_pow * beta1;
-    beta2_pow_out.device(place) = beta2_pow * beta2;
+
     // All of these are tensors of 1 element
-    auto lr_t = lr * (1 - beta2_pow_out).sqrt() / (1 - beta1_pow_out);
+    auto lr_t = lr * (1 - beta2_pow).sqrt() / (1 - beta1_pow);
     // Eigen does not support automatic broadcast
     // Get dimensions of moment vector to broadcast lr_t
     Eigen::DSizes<int, 1> m_dsize(moment1_out_tensor->numel());

python/paddle/v2/framework/tests/test_adam_op.py

Lines changed: 12 additions & 18 deletions
@@ -33,14 +33,12 @@ def setUp(self):

         self.attrs = {'epsilon': epsilon, 'beta1': beta1, 'beta2': beta2}

-        param_out, moment1_out, moment2_out, beta1_pow_out, \
-            beta2_pow_out = adam_step(self.inputs, self.attrs)
+        param_out, moment1_out, \
+            moment2_out = adam_step(self.inputs, self.attrs)

         self.outputs = {
             'Moment1Out': moment1_out,
             'Moment2Out': moment2_out,
-            'Beta1PowOut': beta1_pow_out,
-            'Beta2PowOut': beta2_pow_out,
             'ParamOut': param_out
         }

@@ -78,14 +76,12 @@ def setUp(self):

         attributes = {'epsilon': epsilon, 'beta1': beta1, 'beta2': beta2}

-        param_out, moment1_out, moment2_out, beta1_pow_out, \
-            beta2_pow_out = adam_step(self.inputs, attributes)
+        param_out, moment1_out, \
+            moment2_out = adam_step(self.inputs, attributes)

         self.outputs = {
             'Moment1Out': moment1_out,
             'Moment2Out': moment2_out,
-            'Beta1PowOut': beta1_pow_out,
-            'Beta2PowOut': beta2_pow_out,
             'ParamOut': param_out
         }

@@ -127,14 +123,12 @@ def setUp(self):

     def test_check_output(self):
         for _ in range(self.num_steps):
-            param_out, moment1_out, moment2_out, beta1_pow_out, \
-                beta2_pow_out = adam_step(self.inputs, self.attrs)
+            param_out, moment1_out, \
+                moment2_out = adam_step(self.inputs, self.attrs)

             self.outputs = {
                 'Moment1Out': moment1_out,
                 'Moment2Out': moment2_out,
-                'Beta1PowOut': beta1_pow_out,
-                'Beta2PowOut': beta2_pow_out,
                 'ParamOut': param_out
             }

@@ -145,8 +139,10 @@ def test_check_output(self):
             self.inputs['Param'] = param_out
             self.inputs['Moment1'] = moment1_out
             self.inputs['Moment2'] = moment2_out
-            self.inputs['Beta1Pow'] = beta1_pow_out
-            self.inputs['Beta2Pow'] = beta2_pow_out
+
+            # Update powers of Beta1 and Beta2 for next time step
+            self.inputs['Beta1Pow'] *= self.attrs['beta1']
+            self.inputs['Beta2Pow'] *= self.attrs['beta2']

             # Randomize gradient for next step
             self.inputs['Grad'] = np.random.uniform(
@@ -175,11 +171,9 @@ def adam_step(inputs, attributes):

     moment1_out = beta1 * moment1 + (1 - beta1) * grad
     moment2_out = beta2 * moment2 + (1 - beta2) * np.square(grad)
-    beta1_pow_out = beta1_pow * beta1
-    beta2_pow_out = beta2_pow * beta2
-    lr_t = lr * np.sqrt(1 - beta2_pow_out) / (1 - beta1_pow_out)
+    lr_t = lr * np.sqrt(1 - beta2_pow) / (1 - beta1_pow)
     param_out = param - lr_t * (moment1_out / (np.sqrt(moment2_out) + epsilon))
-    return param_out, moment1_out, moment2_out, beta1_pow_out, beta2_pow_out
+    return param_out, moment1_out, moment2_out


 if __name__ == "__main__":
