
Commit 2438065

Author: KaijuML
Commit message: update train.cfg
1 parent: a474372

File tree

1 file changed: +77 additions, -121 deletions

train.cfg

Lines changed: 77 additions & 121 deletions
@@ -1,138 +1,94 @@
-# commented lines are boolean values that defaults to None
-# uncomment to pass as True
-
 # Model/Embeddings
-# src_word_vec_size: 500 # Word embedding size for src.
-# tgt_word_vec_size: 500 # Word embedding size for tgt.
-word_vec_size: 600 # Word embedding size for src and tgt.
-
+word_vec_size: 600 # Word embedding size for src and tgt
+share_embeddings: True # Share embeddings from src and tgt
 
 # Model/Embedding Features
-feat_vec_size: 100 # feature embedding sizes as absolute
-#feat_vec_exponent: 0.7 # Not meant to be changed it suppresses a warning
-feat_merge: mlp # Merge action for incorporating features embeddings [concat|sum|mlp]
+feat_vec_size: 20 # Attribute embedding size
+feat_merge: mlp # Merge action for incorporating feature embeddings [concat|sum|mlp]
 feat_merge_activation: ReLU
 
-######################## MODEL STRUCTURE ############
-model_type: table # Type of source model to use. [text|table|img|audio].
-model_dtype: fp32 # Data type of the model.[rnn|brnn|mean|transformer|htransformer|cnn].
-decoder_type: hrnn # Type of decoder layer to use. [rnn|transformer|cnn].
-layers: -1 # Number of layers in enc/dec.
 
-######################## ENCODER ##################
-encoder_type: htransformer # Type of encoder layer to use.
-transformer_ff: 1024 # Size of hidden transformer feed-forward
+# Model Structure
+model_type: table # Type of source model to use [text|table|img|audio]
+model_dtype: fp32
+encoder_type: htransformer # Type of encoder [rnn|brnn|transformer|htransformer|cnn]
+decoder_type: hrnn # Type of decoder [rnn|transformer|cnn|hrnn]
+param_init: 0.1 # Uniform distribution with support (-param_init, +param_init)
 
-# over rides
-enc_layers: -1 # Number of layers in the encoder
-heads: -1 # Number of heads for transformer self-attention
+# We put sizes we wish to change manually at -1
+layers: -1
+enc_layers: -1
+heads: -1
 glu_depth: -1
 
-# customs
-units_layers: 6 # number of layers in the low level transformer
-chunks_layers: 6
-units_heads: 6
-chunks_heads: 6
-units_glu_depth: 1 # usually 2
-chunks_glu_depth: 1 # usually 2
-
-
-######################## DECODER ##################
-dec_layers: 2 # Number of layers in the decoder
-rnn_size: 600 # Size of rnn hidden states. Overwrites enc_rnn_size and dec_rnn_size
-# enc_rnn_size: 600 # Size of encoder rnn hidden states. Must be equal to dec_rnn_size
-# dec_rnn_size: 600 # Size of encoder rnn hidden states. Must be equal to enc_rnn_size
-cnn_kernel_width: 3 # Size of cnn windows, the kernel_size is (cnn_kernel_width, 1)
-input_feed: 1 # Feed the context vector at each time step as additional input
-bridge: True # additional layer between the last encoder state and the first decoder state
+# Encoder sizes
+transformer_ff: 1024 # Size of hidden transformer feed-forward
+units_layers: 2
+chunks_layers: 2
+units_head: 2
+chunks_head: 2
+units_glu_depth: 1
+chunks_glu_depth: 1
+
+# Decoder sizes
+dec_layers: 2
+rnn_size: 600
+input_feed: 1
+bridge: True
 rnn_type: LSTM
-#encoder_rnn_type: TagLSTM # [LSTM|GRU|SRU|TagLSTM]
-#decoder_rnn_type: LSTM # [LSTM|GRU|SRU]
-# context_gate: both # [source|target|both]
-
 
-######################### Model/Attention ########################
-global_attention: general # The attention type to use [dot|general|mlp|none]
-global_attention_function: softmax # [softmax|sparsemax]
-self_attn_type: scaled-dot # Self attention type in Transformer decoder layer [scaled-dot|average]
-max_relative_positions: 0 # Maximum distance between inputs in relative positions representations
-generator_function: softmax # to generate probabilities over the vocabulary
-loss_scale: 0 # For FP16 training, the static loss scale to use
-#use_pos: True # when hierarchical attention and table, use the pos instead of the full value
 
+# Model/Attention
+global_attention: general # Type of attn to use [dot|general|mlp|none]
+global_attention_function: softmax # [softmax|sparsemax]
+self_attn_type: scaled-dot # self attn type in transformer [scaled-dot|average]
+generator_function: softmax
+use_pos: True # whether using attributes in attention layers
 
-############ COPY ############
-copy_attn: True # Train copy attention layer
-copy_attn_type: none # [dot|general|mlp|none] # None to use the same as global_attention[softmax|sparsemax]
-copy_attn_force: True # When available, train to copy
+# Model/Copy
+copy_attn: True
 reuse_copy_attn: True # Reuse standard attention for copy
-# copy_loss_by_seqlength: True # Divide copy loss by length of sequence
-coverage_attn: False # Train a coverage attention layer.
-lambda_coverage: 1 # Lambda value for coverage.
+copy_attn_force: True # When available, train to copy
+
+
+# Files and logs
+data: experiments/exp-1/data/data # path to datafile from preprocess.py
+save_model: experiments/exp-1/models/model # path to store checkpoints
+log_file: experiments/exp-1/train-log.txt
+
+report_every: 50 # log current loss every X steps
+save_checkpoint_steps: 500 # save a cp every X steps
 
 
-# General
-data: experiments/exp-10/data/data # Path prefix to the “.train.pt” and “.valid.pt” file path from preprocess.py
-save_model: experiments/exp-10/models/model # the model will be saved as <save_model>_N.pt where N is the number of steps
-save_checkpoint_steps: 1000 # Save a checkpoint every X steps
-keep_checkpoint: -1 # Keep X checkpoints (negative: keep all)
-gpu_ranks: [0] # list of ranks of each process
+# Gpu related:
+gpu_ranks: [0] # ids of gpus to use
 world_size: 1 # total number of distributed processes
-gpu_backend: nccl # Type of torch distributed backend
-gpu_verbose_level: 0 # Gives more info on each process per GPU.
-master_ip: localhost # IP of master for torch.distributed training
-master_port: 10000 # Port of master for torch.distributed training
-seed: 456 # Random seed used for the experiments reproducibility
-
-
-# Initialization
-param_init: 0.1 # Uniform distribution with support (-param_init, param_init)
-# param_init_glorot: True # Init parameters with xavier_uniform instead of uniform
-# train_from: experiments/rotowire/model_step_10000.pt # path to pretrained model's state_dict
-reset_optim: none # Optimization resetter when train_from [none|all|states|keep_states]
-# pre_word_vecs_enc: <valid_path> # pretrained word embedding for encoder
-# pre_word_vecs_dec: <valid_path> # pretrained word embedding for decoder
-# fix_word_vecs_enc: True # Fix word embeddings on the encoder side
-# fix_word_vecs_dec: True # Fix word embeddings on the decoder side
-
-
-# Optimization/Type
-batch_size: 4 # Maximum batch size for training
-batch_type: sents # Batch grouping for batch_size [sents|tokens]
-normalization: sents # Normalization method of the gradient [sents|tokens]
-accum_count: [16] # Accumulate gradient this many times
-accum_steps: [0] # Steps at which accum_count values change
-valid_steps: 100000 # Perfom validation every X steps
-valid_batch_size: 32 # Maximum batch size for validation
-max_generator_batches: 512 # Maximum batches of words in a sequence to run generator on in parallel
-train_steps: 100000 # Number of training steps
-# single_pass: True # Make a single pass over the training dataset.
-optim: adam # Optimization method [sgd|adagrad|adadelta|adam|sparseadam|adafactor|fusedadam]
-adagrad_accumulator_init: 0 # Initializes the accumulator values in adagrad
-max_grad_norm: 5 # renormalize gradient to have norm equal to max_grad_norm if it exeeds it
-dropout: 0.5 # Dropout probability; applied in LSTM stacks
-truncated_decoder: 0 # Truncated bptt
-adam_beta1: 0.9 # The beta1 parameter used by Adam
-adam_beta2: 0.999 # The beta2 parameter used by Adam
-label_smoothing: 0.0 # probas for non-true labels will be smoothed by epsilon/(vocab_size-1)
-average_decay: 0 # Moving average decay
-average_every: 1 # Step for moving average
-
-
-# Optimization/Rate
-learning_rate: 0.001 # Starting learning rate
-learning_rate_decay: 0.5 # lr *= <learning_rate_decay>
-start_decay_steps: 10000 # Start decaying every <decay_steps> after <start_decay_steps>
-decay_steps: 20000 # Decay every decay_steps
-decay_method: none # Use a custom decay rate [noam|noamwd|rsqrt|none]
-warmup_steps: 0 # Number of warmup steps for custom decay
-
-
-# Logging
-report_every: 50 # Print stats at this interval
-log_file: 'experiments/exp-10/train-log.txt' # Output logs to a file under this path
-log_file_level: 0 # [ERROR|CRITICAL|INFO|NOTSET|WARNING|DEBUG|40|50|20|0|30|10]
-exp_host: '' # Send logs to this crayon server
-exp: '' # Name of the experiment for logging
-# tensorboard: True # Use tensorboardX for visualization during training
-tensorboard_log_dir: runs/onmt # Log directory for Tensorboard
+gpu_backend: nccl # type of torch distributed backend
+gpu_verbose_level: 0
+master_ip: localhost
+master_port: 10000
+seed: 123
+
+
+# Optimization & training
+batch_size: 32
+batch_type: sents
+normalization: sents
+accum_count: [2] # Update weights every X batches
+accum_steps: [0] # steps at which accum counts value changes
+valid_steps: 500 # run models on validation set every X steps
+train_steps: 30000
+optim: adam
+max_grad_norm: 5
+dropout: .5
+adam_beta1: 0.9
+adam_beta2: 0.999
+label_smoothing: 0.0
+average_decay: 0
+average_every: 1
+
+# Learning rate
+learning_rate: 0.001
+learning_rate_decay: 0.5 # lr *= lr_decay
+start_decay_step: 5000
+decay_steps: 10000
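
One practical note on the new settings: the old file accumulated gradients over 16 batches of 4 sentences, while the new one uses batches of 32 with accum_count [2], so the effective batch per optimizer update stays at 64 sentences. Because the file is a flat "key: value" mapping that PyYAML can read, a small sanity check like the sketch below can confirm such invariants before launching a run. This is only an illustrative sketch, not part of the repository: the hard-coded train.cfg path and the PyYAML dependency are assumptions, and this is not how the training script itself parses the config.

# Illustrative sanity check for train.cfg (assumes PyYAML is installed and the
# file sits in the current directory; not how the training entry point loads it).
import yaml

with open("train.cfg") as f:
    cfg = yaml.safe_load(f)  # the flat "key: value" lines parse as one YAML mapping

# Each optimizer update accumulates gradients over accum_count batches,
# so the effective batch is batch_size * accum_count.
effective_batch = cfg["batch_size"] * cfg["accum_count"][0]
print(f"effective batch size: {effective_batch} sentences")  # 32 * 2 = 64

# src/tgt embeddings are shared and sized by word_vec_size.
assert cfg["share_embeddings"] is True and cfg["word_vec_size"] == 600

print(f"checkpoint every {cfg['save_checkpoint_steps']} steps, "
      f"validate every {cfg['valid_steps']} steps, "
      f"train for {cfg['train_steps']} steps")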
