-# Commented lines are boolean values that default to None
-# uncomment to pass them as True
-
 # Model/Embeddings
-# src_word_vec_size: 500 # Word embedding size for src.
-# tgt_word_vec_size: 500 # Word embedding size for tgt.
-word_vec_size: 600 # Word embedding size for src and tgt.
-
+word_vec_size: 600 # Word embedding size for src and tgt
+share_embeddings: True # Share embeddings between src and tgt
 
 # Model/Embedding Features
-feat_vec_size: 100 # Feature embedding size, set as an absolute value
-# feat_vec_exponent: 0.7 # Not meant to be changed; only suppresses a warning
-feat_merge: mlp # Merge action for incorporating feature embeddings [concat|sum|mlp]
+feat_vec_size: 20 # Attribute embedding size
+feat_merge: mlp # Merge action for incorporating feature embeddings [concat|sum|mlp]
 feat_merge_activation: ReLU
 
-# ####################### MODEL STRUCTURE ############
-model_type: table # Type of source model to use [text|table|img|audio]
-model_dtype: fp32 # Data type of the model [fp32|fp16]
-decoder_type: hrnn # Type of decoder layer to use [rnn|transformer|cnn]
-layers: -1 # Number of layers in enc/dec
 
-# ####################### ENCODER ##################
-encoder_type: htransformer # Type of encoder layer to use
-transformer_ff: 1024 # Size of hidden transformer feed-forward
+# Model Structure
+model_type: table # Type of source model to use [text|table|img|audio]
+model_dtype: fp32
+encoder_type: htransformer # Type of encoder [rnn|brnn|transformer|htransformer|cnn]
+decoder_type: hrnn # Type of decoder [rnn|transformer|cnn|hrnn]
+param_init: 0.1 # Uniform distribution with support (-param_init, +param_init)
 
-# overrides
-enc_layers: -1 # Number of layers in the encoder
-heads: -1 # Number of heads for transformer self-attention
+# Sizes we override manually below are set to -1
+layers: -1
+enc_layers: -1
+heads: -1
 glu_depth: -1
 
-# custom sizes
-units_layers: 6 # Number of layers in the low-level transformer
-chunks_layers: 6
-units_heads: 6
-chunks_heads: 6
-units_glu_depth: 1 # usually 2
-chunks_glu_depth: 1 # usually 2
-
-
-# ####################### DECODER ##################
-dec_layers: 2 # Number of layers in the decoder
-rnn_size: 600 # Size of rnn hidden states. Overwrites enc_rnn_size and dec_rnn_size
-# enc_rnn_size: 600 # Size of encoder rnn hidden states. Must be equal to dec_rnn_size
-# dec_rnn_size: 600 # Size of decoder rnn hidden states. Must be equal to enc_rnn_size
-cnn_kernel_width: 3 # Size of cnn windows; the kernel_size is (cnn_kernel_width, 1)
-input_feed: 1 # Feed the context vector at each time step as additional input
-bridge: True # Additional layer between the last encoder state and the first decoder state
+# Encoder sizes
+transformer_ff: 1024 # Size of hidden transformer feed-forward
+units_layers: 2
+chunks_layers: 2
+units_head: 2
+chunks_head: 2
+units_glu_depth: 1
+chunks_glu_depth: 1
+
+# Decoder sizes
+dec_layers: 2
+rnn_size: 600
+input_feed: 1
+bridge: True
 rnn_type: LSTM
-# encoder_rnn_type: TagLSTM # [LSTM|GRU|SRU|TagLSTM]
-# decoder_rnn_type: LSTM # [LSTM|GRU|SRU]
-# context_gate: both # [source|target|both]
-
 
-# ######################## Model/Attention ########################
-global_attention: general # The attention type to use [dot|general|mlp|none]
-global_attention_function: softmax # [softmax|sparsemax]
-self_attn_type: scaled-dot # Self-attention type in the Transformer decoder layer [scaled-dot|average]
-max_relative_positions: 0 # Maximum distance between inputs in relative position representations
-generator_function: softmax # Used to generate probabilities over the vocabulary
-loss_scale: 0 # For FP16 training, the static loss scale to use
-# use_pos: True # With hierarchical attention on tables, use the pos instead of the full value
 
+# Model/Attention
+global_attention: general # Type of attn to use [dot|general|mlp|none]
+global_attention_function: softmax # [softmax|sparsemax]
+self_attn_type: scaled-dot # Self-attn type in transformer [scaled-dot|average]
+generator_function: softmax
+use_pos: True # Whether to use attributes in attention layers
 
-# ########### COPY ############
-copy_attn: True # Train a copy attention layer
-copy_attn_type: none # [dot|general|mlp|none]; none means use the same type as global_attention
-copy_attn_force: True # When available, train to copy
+# Model/Copy
+copy_attn: True
 reuse_copy_attn: True # Reuse standard attention for copy
-# copy_loss_by_seqlength: True # Divide copy loss by length of sequence
-coverage_attn: False # Train a coverage attention layer
-lambda_coverage: 1 # Lambda value for coverage
+copy_attn_force: True # When available, train to copy
+
+
+# Files and logs
+data: experiments/exp-1/data/data # Path prefix to the data files from preprocess.py
+save_model: experiments/exp-1/models/model # Path prefix under which checkpoints are saved
+log_file: experiments/exp-1/train-log.txt
+
+report_every: 50 # Log current loss every X steps
+save_checkpoint_steps: 500 # Save a checkpoint every X steps
 
 
-# General
-data: experiments/exp-10/data/data # Path prefix to the ".train.pt" and ".valid.pt" files from preprocess.py
-save_model: experiments/exp-10/models/model # The model will be saved as <save_model>_N.pt, where N is the number of steps
-save_checkpoint_steps: 1000 # Save a checkpoint every X steps
-keep_checkpoint: -1 # Keep X checkpoints (negative: keep all)
-gpu_ranks: [0] # List of ranks of each process
+# GPU related
+gpu_ranks: [0] # IDs of the GPUs to use
 world_size: 1 # total number of distributed processes
-gpu_backend: nccl # Type of torch distributed backend
-gpu_verbose_level: 0 # Gives more info on each process per GPU
-master_ip: localhost # IP of master for torch.distributed training
-master_port: 10000 # Port of master for torch.distributed training
-seed: 456 # Random seed used for reproducibility of the experiments
-
-
-# Initialization
-param_init: 0.1 # Uniform distribution with support (-param_init, param_init)
-# param_init_glorot: True # Init parameters with xavier_uniform instead of uniform
-# train_from: experiments/rotowire/model_step_10000.pt # Path to a pretrained model's state_dict
-reset_optim: none # Optimization resetter when using train_from [none|all|states|keep_states]
-# pre_word_vecs_enc: <valid_path> # Pretrained word embeddings for the encoder
-# pre_word_vecs_dec: <valid_path> # Pretrained word embeddings for the decoder
-# fix_word_vecs_enc: True # Fix word embeddings on the encoder side
-# fix_word_vecs_dec: True # Fix word embeddings on the decoder side
-
-
-# Optimization/Type
-batch_size: 4 # Maximum batch size for training
-batch_type: sents # Batch grouping for batch_size [sents|tokens]
-normalization: sents # Normalization method of the gradient [sents|tokens]
-accum_count: [16] # Accumulate gradients this many times
-accum_steps: [0] # Steps at which accum_count values change
-valid_steps: 100000 # Perform validation every X steps
-valid_batch_size: 32 # Maximum batch size for validation
-max_generator_batches: 512 # Maximum batches of words in a sequence to run the generator on in parallel
-train_steps: 100000 # Number of training steps
-# single_pass: True # Make a single pass over the training dataset
-optim: adam # Optimization method [sgd|adagrad|adadelta|adam|sparseadam|adafactor|fusedadam]
-adagrad_accumulator_init: 0 # Initializes the accumulator values in adagrad
-max_grad_norm: 5 # Renormalize the gradient to norm max_grad_norm if it exceeds it
-dropout: 0.5 # Dropout probability; applied in LSTM stacks
-truncated_decoder: 0 # Truncated BPTT
-adam_beta1: 0.9 # The beta1 parameter used by Adam
-adam_beta2: 0.999 # The beta2 parameter used by Adam
-label_smoothing: 0.0 # Probabilities of non-true labels are smoothed by epsilon/(vocab_size-1)
-average_decay: 0 # Moving average decay
-average_every: 1 # Step interval for the moving average
-
-
-# Optimization/Rate
-learning_rate: 0.001 # Starting learning rate
-learning_rate_decay: 0.5 # lr *= <learning_rate_decay>
-start_decay_steps: 10000 # Start decaying every <decay_steps> steps after <start_decay_steps> steps
-decay_steps: 20000 # Decay every <decay_steps> steps
-decay_method: none # Use a custom decay rate [noam|noamwd|rsqrt|none]
-warmup_steps: 0 # Number of warmup steps for custom decay
-
-
-# Logging
-report_every: 50 # Print stats at this interval
-log_file: 'experiments/exp-10/train-log.txt' # Output logs to a file under this path
-log_file_level: 0 # [ERROR|CRITICAL|INFO|NOTSET|WARNING|DEBUG|40|50|20|0|30|10]
-exp_host: '' # Send logs to this crayon server
-exp: '' # Name of the experiment for logging
-# tensorboard: True # Use tensorboardX for visualization during training
-tensorboard_log_dir: runs/onmt # Log directory for TensorBoard
+gpu_backend: nccl # Type of torch distributed backend
+gpu_verbose_level: 0
+master_ip: localhost
+master_port: 10000
+seed: 123
+
+
+# Optimization & training
+batch_size: 32
+batch_type: sents
+normalization: sents
+accum_count: [2] # Update weights every X batches
+accum_steps: [0] # Steps at which accum_count values change
+valid_steps: 500 # Run the model on the validation set every X steps
+train_steps: 30000
+optim: adam
+max_grad_norm: 5
+dropout: 0.5
+adam_beta1: 0.9
+adam_beta2: 0.999
+label_smoothing: 0.0
+average_decay: 0
+average_every: 1
+
+# Learning rate
+learning_rate: 0.001
+learning_rate_decay: 0.5 # lr *= learning_rate_decay
+start_decay_steps: 5000
+decay_steps: 10000
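
For a quick check of the new schedule: the snippet below is a minimal sketch of the step decay these values imply, assuming the classical OpenNMT-py behaviour for the default decay_method "none" (the learning rate is multiplied by learning_rate_decay every decay_steps steps once start_decay_steps is reached); the exact boundary handling in this code base may differ.

# Simplified sketch of the step decay implied by the values above;
# not the exact library implementation.
LEARNING_RATE = 0.001      # learning_rate
LR_DECAY = 0.5             # learning_rate_decay
START_DECAY_STEPS = 5000   # start_decay_steps
DECAY_STEPS = 10000        # decay_steps

def lr_at(step: int) -> float:
    """Learning rate used at a given optimizer step under simple step decay."""
    if step < START_DECAY_STEPS:
        return LEARNING_RATE
    n_decays = 1 + (step - START_DECAY_STEPS) // DECAY_STEPS
    return LEARNING_RATE * LR_DECAY ** n_decays

for step in (0, 5000, 15000, 25000, 30000):
    print(f"step {step:>6}: lr = {lr_at(step):.6f}")

With batch_size 32 and accum_count [2], each of the 30000 training steps is one parameter update over 64 sentences, so the run sees roughly 1.9M (possibly repeated) training sentences in total.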
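
A config in this form is normally passed straight to the training entry point; in OpenNMT-py 1.x style code bases that is typically something like "python train.py -config train.yaml" (the script name and flag are assumptions and may differ in this fork), with the data files under experiments/exp-1/data/ produced beforehand by preprocess.py.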