-# Commented lines are boolean values that default to None
-# uncomment to pass them as True
-
 # Model/Embeddings
-# src_word_vec_size: 500 # Word embedding size for src.
-# tgt_word_vec_size: 500 # Word embedding size for tgt.
-word_vec_size: 600 # Word embedding size for src and tgt.
-
+word_vec_size: 600 # Word embedding size for src and tgt
+share_embeddings: True # Share embeddings between src and tgt
 
 # Model/Embedding Features
-feat_vec_size: 100 # Feature embedding size, set as an absolute value
-# feat_vec_exponent: 0.7 # Not meant to be changed; only suppresses a warning
-feat_merge: mlp # Merge action for incorporating feature embeddings [concat|sum|mlp]
+feat_vec_size: 20 # Attribute embedding size
+feat_merge: mlp # Merge action for incorporating feature embeddings [concat|sum|mlp]
 feat_merge_activation: ReLU
 
-# ####################### MODEL STRUCTURE ############
-model_type: table # Type of source model to use [text|table|img|audio]
-model_dtype: fp32 # Data type of the model [fp32|fp16]
-decoder_type: hrnn # Type of decoder layer to use [rnn|transformer|cnn]
-layers: -1 # Number of layers in enc/dec
 
-# ####################### ENCODER ##################
-encoder_type: htransformer # Type of encoder layer to use
-transformer_ff: 1024 # Size of hidden transformer feed-forward
+# Model Structure
+model_type: table # Type of source model to use [text|table|img|audio]
+model_dtype: fp32
+encoder_type: htransformer # Type of encoder [rnn|brnn|transformer|htransformer|cnn]
+decoder_type: hrnn # Type of decoder [rnn|transformer|cnn|hrnn]
+param_init: 0.1 # Uniform distribution with support (-param_init, +param_init)
 
-# overrides
-enc_layers: -1 # Number of layers in the encoder
-heads: -1 # Number of heads for transformer self-attention
+# Sizes we override manually below are set to -1
+layers: -1
+enc_layers: -1
+heads: -1
 glu_depth: -1
 
-# custom sizes
-units_layers: 6 # Number of layers in the low-level transformer
-chunks_layers: 6
-units_heads: 6
-chunks_heads: 6
-units_glu_depth: 1 # usually 2
-chunks_glu_depth: 1 # usually 2
-
-
-# ####################### DECODER ##################
-dec_layers: 2 # Number of layers in the decoder
-rnn_size: 600 # Size of rnn hidden states. Overwrites enc_rnn_size and dec_rnn_size
-# enc_rnn_size: 600 # Size of encoder rnn hidden states. Must be equal to dec_rnn_size
-# dec_rnn_size: 600 # Size of decoder rnn hidden states. Must be equal to enc_rnn_size
-cnn_kernel_width: 3 # Size of cnn windows; the kernel_size is (cnn_kernel_width, 1)
-input_feed: 1 # Feed the context vector at each time step as additional input
-bridge: True # Additional layer between the last encoder state and the first decoder state
+# Encoder sizes
+transformer_ff: 1024 # Size of hidden transformer feed-forward
+units_layers: 2
+chunks_layers: 2
+units_head: 2
+chunks_head: 2
+units_glu_depth: 1
+chunks_glu_depth: 1
+
+# Decoder sizes
+dec_layers: 2
+rnn_size: 600
+input_feed: 1
+bridge: True
 rnn_type: LSTM
-# encoder_rnn_type: TagLSTM # [LSTM|GRU|SRU|TagLSTM]
-# decoder_rnn_type: LSTM # [LSTM|GRU|SRU]
-# context_gate: both # [source|target|both]
-
 
-# ######################## Model/Attention ########################
-global_attention: general # The attention type to use [dot|general|mlp|none]
-global_attention_function: softmax # [softmax|sparsemax]
-self_attn_type: scaled-dot # Self-attention type in the Transformer decoder layer [scaled-dot|average]
-max_relative_positions: 0 # Maximum distance between inputs in relative position representations
-generator_function: softmax # Used to generate probabilities over the vocabulary
-loss_scale: 0 # For FP16 training, the static loss scale to use
-# use_pos: True # With hierarchical attention on tables, use the pos instead of the full value
 
+# Model/Attention
+global_attention: general # Type of attn to use [dot|general|mlp|none]
+global_attention_function: softmax # [softmax|sparsemax]
+self_attn_type: scaled-dot # Self-attn type in transformer [scaled-dot|average]
+generator_function: softmax
+use_pos: True # Whether to use attributes in attention layers
 
-# ########### COPY ############
-copy_attn: True # Train a copy attention layer
-copy_attn_type: none # [dot|general|mlp|none]; none means use the same type as global_attention
-copy_attn_force: True # When available, train to copy
+# Model/Copy
+copy_attn: True
 reuse_copy_attn: True # Reuse standard attention for copy
-# copy_loss_by_seqlength: True # Divide copy loss by length of sequence
-coverage_attn: False # Train a coverage attention layer
-lambda_coverage: 1 # Lambda value for coverage
+copy_attn_force: True # When available, train to copy
+
+
+# Files and logs
+data: experiments/exp-1/data/data # Path prefix to the data files from preprocess.py
+save_model: experiments/exp-1/models/model # Path prefix under which checkpoints are saved
+log_file: experiments/exp-1/train-log.txt
+
+report_every: 50 # Log current loss every X steps
+save_checkpoint_steps: 500 # Save a checkpoint every X steps
 
 
-# General
-data: experiments/exp-10/data/data # Path prefix to the ".train.pt" and ".valid.pt" files from preprocess.py
-save_model: experiments/exp-10/models/model # The model will be saved as <save_model>_N.pt, where N is the number of steps
-save_checkpoint_steps: 1000 # Save a checkpoint every X steps
-keep_checkpoint: -1 # Keep X checkpoints (negative: keep all)
-gpu_ranks: [0] # List of ranks of each process
+# GPU related
+gpu_ranks: [0] # IDs of the GPUs to use
 world_size: 1 # total number of distributed processes
-gpu_backend: nccl # Type of torch distributed backend
-gpu_verbose_level: 0 # Gives more info on each process per GPU
-master_ip: localhost # IP of master for torch.distributed training
-master_port: 10000 # Port of master for torch.distributed training
-seed: 456 # Random seed used for reproducibility of the experiments
-
-
-# Initialization
-param_init: 0.1 # Uniform distribution with support (-param_init, param_init)
-# param_init_glorot: True # Init parameters with xavier_uniform instead of uniform
-# train_from: experiments/rotowire/model_step_10000.pt # Path to a pretrained model's state_dict
-reset_optim: none # Optimization resetter when using train_from [none|all|states|keep_states]
-# pre_word_vecs_enc: <valid_path> # Pretrained word embeddings for the encoder
-# pre_word_vecs_dec: <valid_path> # Pretrained word embeddings for the decoder
-# fix_word_vecs_enc: True # Fix word embeddings on the encoder side
-# fix_word_vecs_dec: True # Fix word embeddings on the decoder side
-
-
-# Optimization/Type
-batch_size: 4 # Maximum batch size for training
-batch_type: sents # Batch grouping for batch_size [sents|tokens]
-normalization: sents # Normalization method of the gradient [sents|tokens]
-accum_count: [16] # Accumulate gradients this many times
-accum_steps: [0] # Steps at which accum_count values change
-valid_steps: 100000 # Perform validation every X steps
-valid_batch_size: 32 # Maximum batch size for validation
-max_generator_batches: 512 # Maximum batches of words in a sequence to run the generator on in parallel
-train_steps: 100000 # Number of training steps
-# single_pass: True # Make a single pass over the training dataset
-optim: adam # Optimization method [sgd|adagrad|adadelta|adam|sparseadam|adafactor|fusedadam]
-adagrad_accumulator_init: 0 # Initializes the accumulator values in adagrad
-max_grad_norm: 5 # Renormalize the gradient to norm max_grad_norm if it exceeds it
-dropout: 0.5 # Dropout probability; applied in LSTM stacks
-truncated_decoder: 0 # Truncated BPTT
-adam_beta1: 0.9 # The beta1 parameter used by Adam
-adam_beta2: 0.999 # The beta2 parameter used by Adam
-label_smoothing: 0.0 # Probabilities of non-true labels are smoothed by epsilon/(vocab_size-1)
-average_decay: 0 # Moving average decay
-average_every: 1 # Step interval for the moving average
-
-
-# Optimization/Rate
-learning_rate: 0.001 # Starting learning rate
-learning_rate_decay: 0.5 # lr *= <learning_rate_decay>
-start_decay_steps: 10000 # Start decaying every <decay_steps> steps after <start_decay_steps> steps
-decay_steps: 20000 # Decay every <decay_steps> steps
-decay_method: none # Use a custom decay rate [noam|noamwd|rsqrt|none]
-warmup_steps: 0 # Number of warmup steps for custom decay
-
-
-# Logging
-report_every: 50 # Print stats at this interval
-log_file: 'experiments/exp-10/train-log.txt' # Output logs to a file under this path
-log_file_level: 0 # [ERROR|CRITICAL|INFO|NOTSET|WARNING|DEBUG|40|50|20|0|30|10]
-exp_host: '' # Send logs to this crayon server
-exp: '' # Name of the experiment for logging
-# tensorboard: True # Use tensorboardX for visualization during training
-tensorboard_log_dir: runs/onmt # Log directory for TensorBoard
+gpu_backend: nccl # Type of torch distributed backend
+gpu_verbose_level: 0
+master_ip: localhost
+master_port: 10000
+seed: 123
+
+
+# Optimization & training
+batch_size: 32
+batch_type: sents
+normalization: sents
+accum_count: [2] # Update weights every X batches
+accum_steps: [0] # Steps at which accum_count values change
+valid_steps: 500 # Run the model on the validation set every X steps
+train_steps: 30000
+optim: adam
+max_grad_norm: 5
+dropout: 0.5
+adam_beta1: 0.9
+adam_beta2: 0.999
+label_smoothing: 0.0
+average_decay: 0
+average_every: 1
+
+# Learning rate
+learning_rate: 0.001
+learning_rate_decay: 0.5 # lr *= learning_rate_decay
+start_decay_steps: 5000
+decay_steps: 10000
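
For a quick check of the new schedule: the snippet below is a minimal sketch of the step decay these values imply, assuming the classical OpenNMT-py behaviour for the default decay_method "none" (the learning rate is multiplied by learning_rate_decay every decay_steps steps once start_decay_steps is reached); the exact boundary handling in this code base may differ.

# Simplified sketch of the step decay implied by the values above;
# not the exact library implementation.
LEARNING_RATE = 0.001      # learning_rate
LR_DECAY = 0.5             # learning_rate_decay
START_DECAY_STEPS = 5000   # start_decay_steps
DECAY_STEPS = 10000        # decay_steps

def lr_at(step: int) -> float:
    """Learning rate used at a given optimizer step under simple step decay."""
    if step < START_DECAY_STEPS:
        return LEARNING_RATE
    n_decays = 1 + (step - START_DECAY_STEPS) // DECAY_STEPS
    return LEARNING_RATE * LR_DECAY ** n_decays

for step in (0, 5000, 15000, 25000, 30000):
    print(f"step {step:>6}: lr = {lr_at(step):.6f}")

With batch_size 32 and accum_count [2], each of the 30000 training steps is one parameter update over 64 sentences, so the run sees roughly 1.9M (possibly repeated) training sentences in total.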
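
A config in this form is normally passed straight to the training entry point; in OpenNMT-py 1.x style code bases that is typically something like "python train.py -config train.yaml" (the script name and flag are assumptions and may differ in this fork), with the data files under experiments/exp-1/data/ produced beforehand by preprocess.py.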