24 | 24 |                     help='number of epoch (default: 100)')
25 | 25 | parser.add_argument('--hidden_size', type=int, default=10, metavar='N',
26 | 26 |                     help='hidden size of the meta optimizer (default: 10)')
| 27 | +parser.add_argument('--no-cuda', action='store_true', default=False,
| 28 | +                    help='disables CUDA training')
27 | 29 | args = parser.parse_args() |
| 30 | +args.cuda = not args.no_cuda and torch.cuda.is_available() |
28 | 31 |
29 | 32 | assert args.optimizer_steps % args.truncated_bptt_step == 0 |
30 | 33 |
31 | | -# Create a meta optimizer that wraps a model into a meta model |
32 | | -# to keep track of the meta updates. |
33 | | -meta_optimizer = MetaOptimizer(MetaModel(Model()), args.hidden_size) |
34 | | -optimizer = optim.Adam(meta_optimizer.parameters(), lr=1e-3) |
35 | | -loss_fn = lambda f_x, y: (f_x - y).pow(2).mean() |
36 | 34 |
37 | | -for epoch in range(args.max_epoch):
38 | | -    decrease_in_loss = 0.0
39 | | -    for i in range(args.updates_per_epoch):
40 | | -
41 | | -        # Sample a new model
42 | | -        model = Model()
43 | | -
44 | | -        x, y = get_batch(args.batch_size)
45 | | -        x, y = Variable(x), Variable(y)
46 | | -
47 | | -        # Compute initial loss of the model
48 | | -        f_x = model(x)
49 | | -        initial_loss = loss_fn(f_x, y)
50 | | -
51 | | -        for k in range(args.optimizer_steps // args.truncated_bptt_step):
52 | | -            # Keep states for truncated BPTT
53 | | -            meta_optimizer.reset_lstm(keep_states=k > 0, model=model)
54 | | -
55 | | -            loss_sum = 0
56 | | -            for j in range(args.truncated_bptt_step):
57 | | -                x, y = get_batch(args.batch_size)
58 | | -                x, y = Variable(x), Variable(y)
59 | | -
60 | | -                # First we need to compute the gradients of the model
61 | | -                f_x = model(x)
62 | | -                loss = loss_fn(f_x, y)
63 | | -                model.zero_grad()
64 | | -                loss.backward()
65 | | -
66 | | -                # Perfom a meta update using gradients from model
67 | | -                # and return the current meta model saved in the optimizer
68 | | -                meta_model = meta_optimizer.meta_update(model)
69 | | -
70 | | -                # Compute a loss for a step the meta optimizer
71 | | -                f_x = meta_model(x)
72 | | -                loss = loss_fn(f_x, y)
73 | | -                loss_sum += loss
74 | | -
75 | | -            # Update the parameters of the meta optimizer
76 | | -            meta_optimizer.zero_grad()
77 | | -            loss_sum.backward()
78 | | -            optimizer.step()
79 | | -
80 | | -        # Compute relative decrease in the loss function w.r.t initial value
81 | | -        decrease_in_loss += loss.data[0] / initial_loss.data[0]
82 | | -
83 | | -    print("Epoch: {}, average final/initial loss ratio: {}".format(epoch,
84 | | -        decrease_in_loss / args.updates_per_epoch))
| 35 | +def main():
| 36 | +    # Create a meta optimizer that wraps a model into a meta model
| 37 | +    # to keep track of the meta updates.
| 38 | +    meta_model = Model()
| 39 | +    if args.cuda:
| 40 | +        meta_model.cuda()
| 41 | +
| 42 | +    meta_optimizer = MetaOptimizer(MetaModel(meta_model), args.hidden_size)
| 43 | +    if args.cuda:
| 44 | +        meta_optimizer.cuda()
| 45 | +
| 46 | +    optimizer = optim.Adam(meta_optimizer.parameters(), lr=1e-3)
| 47 | +    loss_fn = lambda f_x, y: (f_x - y).pow(2).mean()
| 48 | +
| 49 | +    for epoch in range(args.max_epoch):
| 50 | +        decrease_in_loss = 0.0
| 51 | +        for i in range(args.updates_per_epoch):
| 52 | +
| 53 | +            # Sample a new model
| 54 | +            model = Model()
| 55 | +            if args.cuda:
| 56 | +                model.cuda()
| 57 | +
| 58 | +            x, y = get_batch(args.batch_size)
| 59 | +            x, y = Variable(x), Variable(y)
| 60 | +            if args.cuda:
| 61 | +                x, y = x.cuda(), y.cuda()
| 62 | +
| 63 | +            # Compute initial loss of the model
| 64 | +            f_x = model(x)
| 65 | +            initial_loss = loss_fn(f_x, y)
| 66 | +
| 67 | +            for k in range(args.optimizer_steps // args.truncated_bptt_step):
| 68 | +                # Keep states for truncated BPTT
| 69 | +                meta_optimizer.reset_lstm(
| 70 | +                    keep_states=k > 0, model=model, use_cuda=args.cuda)
| 71 | +
| 72 | +                loss_sum = 0
| 73 | +                for j in range(args.truncated_bptt_step):
| 74 | +                    x, y = get_batch(args.batch_size)
| 75 | +                    x, y = Variable(x), Variable(y)
| 76 | +                    if args.cuda:
| 77 | +                        x, y = x.cuda(), y.cuda()
| 78 | +
| 79 | +                    # First we need to compute the gradients of the model
| 80 | +                    f_x = model(x)
| 81 | +                    loss = loss_fn(f_x, y)
| 82 | +                    model.zero_grad()
| 83 | +                    loss.backward()
| 84 | +
| 85 | +                    # Perform a meta update using gradients from the model
| 86 | +                    # and return the current meta model saved in the optimizer
| 87 | +                    meta_model = meta_optimizer.meta_update(model)
| 88 | +
| 89 | +                    # Compute a loss for a step of the meta optimizer
| 90 | +                    f_x = meta_model(x)
| 91 | +                    loss = loss_fn(f_x, y)
| 92 | +                    loss_sum += loss
| 93 | +
| 94 | +                # Update the parameters of the meta optimizer
| 95 | +                meta_optimizer.zero_grad()
| 96 | +                loss_sum.backward()
| 97 | +                optimizer.step()
| 98 | +
| 99 | +            # Compute relative decrease in the loss function w.r.t initial
| 100 | +            # value
| 101 | +            decrease_in_loss += loss.data[0] / initial_loss.data[0]
| 102 | +
| 103 | +        print("Epoch: {}, average final/initial loss ratio: {}".format(epoch,
| 104 | +            decrease_in_loss / args.updates_per_epoch))
| 105 | +
| 106 | +if __name__ == "__main__":
| 107 | +    main()
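
The new `main()` above relies on `Model`, `MetaModel`, `MetaOptimizer`, and `get_batch`, none of which are part of this diff; they are defined elsewhere in the repository. For readers who want to exercise the script in isolation, the sketch below shows one minimal, hypothetical way the two task helpers could look, consistent with the MSE loss used above. The architecture, the linear-regression task, and the `n_features` parameter are assumptions for illustration only, not the repository's actual definitions.

```python
# Hypothetical stand-ins for the task helpers imported by the training script.
# Illustrative only; the repository's real Model and get_batch should be used.
import torch
import torch.nn as nn


class Model(nn.Module):
    """A tiny regression network (assumed architecture, not the repo's)."""

    def __init__(self, n_features=10):
        super(Model, self).__init__()
        self.linear = nn.Linear(n_features, 1)

    def forward(self, x):
        return self.linear(x)


def get_batch(batch_size, n_features=10):
    """Sample a random linear-regression batch y = x.mm(w) for a fixed w."""
    w = torch.ones(n_features, 1)            # placeholder ground-truth weights
    x = torch.randn(batch_size, n_features)
    y = x.mm(w)
    return x, y
```

With stand-ins like these on the import path, the script can be run on the CPU with something like `python main.py --no-cuda` (assuming the file is saved as `main.py`), which is exactly the case the new `args.cuda` flag is meant to cover.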