                     help='the number of heads in the encoder/decoder of the transformer model')
 parser.add_argument('--dry-run', action='store_true',
                     help='verify the code and the model')
-parser.add_argument('--accel', action='store_true',help='Enables accelerated training')
+parser.add_argument('--accel', action='store_true',
+                    help='Enables accelerated training')
+parser.add_argument('--use-optimizer', action='store_true',
+                    help='Uses AdamW optimizer for gradient updating')
 args = parser.parse_args()

 # Set the random seed manually for reproducibility.
@@ -104,6 +107,8 @@ def batchify(data, bsz):
 model = RNNModel(args.model, ntokens, args.emsize, args.nhid, args.nlayers, args.dropout, args.tied).to(device)

 criterion = nn.NLLLoss()
+if args.use_optimizer:
+    optimizer = torch.optim.AdamW(model.parameters(), lr=args.lr)

 ###############################################################################
 # Training code
@@ -167,7 +172,10 @@ def train():
         data, targets = get_batch(train_data, i)
         # Starting each batch, we detach the hidden state from how it was previously produced.
         # If we didn't, the model would try backpropagating all the way to start of the dataset.
-        model.zero_grad()
+        if args.use_optimizer:
+            optimizer.zero_grad()
+        else:
+            model.zero_grad()
         if args.model == 'Transformer':
             output = model(data)
             output = output.view(-1, ntokens)
@@ -179,8 +187,11 @@ def train():

         # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
         torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
-        for p in model.parameters():
-            p.data.add_(p.grad, alpha=-lr)
+        if args.use_optimizer:
+            optimizer.step()
+        else:
+            for p in model.parameters():
+                p.data.add_(p.grad, alpha=-lr)

         total_loss += loss.item()

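For readers skimming the diff, the sketch below shows the two gradient-update paths side by side: the script's original manual SGD-style update and the new AdamW path guarded by the --use-optimizer flag. It is a minimal, self-contained example, assuming a toy nn.Linear model with made-up shapes and hyperparameter values in place of the RNNModel, args.lr, and args.clip used in the actual script.

# Standalone sketch of the two update paths this change introduces.
# The nn.Linear model, shapes, and hyperparameter values are illustrative
# only, not part of the patched script.
import torch
import torch.nn as nn
import torch.nn.functional as F

model = nn.Linear(10, 5)
criterion = nn.NLLLoss()
lr = 0.001            # illustrative value; the real script takes this from args.lr
clip = 0.25           # illustrative value; the real script uses args.clip
use_optimizer = True  # stands in for args.use_optimizer

if use_optimizer:
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)

data = torch.randn(8, 10)
targets = torch.randint(0, 5, (8,))

output = F.log_softmax(model(data), dim=-1)   # NLLLoss expects log-probabilities
loss = criterion(output, targets)

if use_optimizer:
    optimizer.zero_grad()
else:
    model.zero_grad()
loss.backward()
torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

if use_optimizer:
    optimizer.step()                          # AdamW performs the parameter update
else:
    for p in model.parameters():
        p.data.add_(p.grad, alpha=-lr)        # manual SGD-style update, as before the patch

With the patch applied, the AdamW path would be selected from the command line by passing --use-optimizer alongside the script's existing flags; omitting the flag keeps the original manual update behaviour.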