#!/bin/bash

# Copyright 2017 Hossein Hadian
#           2017 Chun Chieh Chang
#           2017 Ashish Arora

# steps/info/chain_dir_info.pl exp/chain/cnn_1a/
# exp/chain/cnn_1a/: num-iters=21 nj=2..4 num-params=4.4M dim=40->364 combine=-0.021->-0.015 xent:train/valid[13,20,final]=(-1.05,-0.701,-0.591/-1.30,-1.08,-1.00) logprob:train/valid[13,20,final]=(-0.061,-0.034,-0.030/-0.107,-0.101,-0.098)

# cat exp/chain/cnn_1a/decode_test/scoring_kaldi/best_*
# %WER 5.94 [ 3913 / 65921, 645 ins, 1466 del, 1802 sub ] exp/chain/cnn_1a/decode_test/cer_11_0.0
# %WER 9.13 [ 1692 / 18542, 162 ins, 487 del, 1043 sub ] exp/chain/cnn_1a/decode_test/wer_11_0.0
set -e -o pipefail

stage=0

nj=30
train_set=train
gmm=tri3  # this is the source gmm-dir that we'll use for alignments; it
          # should have alignments for the specified training data.
nnet3_affix=  # affix for exp dirs, e.g. it was _cleaned in tedlium.
affix=_1a  # affix for the chain CNN-TDNN directory, e.g. "_1a" or "_1b", in case we change the configuration.
ali=tri3_ali
common_egs_dir=
reporting_email=

# chain options
train_stage=-10
xent_regularize=0.1
frame_subsampling_factor=4
alignment_subsampling_factor=1
# training chunk-options
chunk_width=340,300,200,100
num_leaves=500
# we don't need extra left/right context for TDNN systems.
chunk_left_context=0
chunk_right_context=0
tdnn_dim=450
# training options
srand=0
remove_egs=false
lang_test=lang_test
# End configuration section.
echo "$0 $@"  # Print the command line for logging


. ./cmd.sh
. ./path.sh
. ./utils/parse_options.sh
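
# Any of the options defined above can be overridden from the command line
# (they are parsed by utils/parse_options.sh). A hypothetical invocation,
# assuming this script lives at local/chain/run_cnn_1a.sh, might be:
#   local/chain/run_cnn_1a.sh --stage 4 --nj 30 --affix _1a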


if ! cuda-compiled; then
  cat <<EOF && exit 1
This script is intended to be used with GPUs, but you have not compiled Kaldi with CUDA.
If you want to use GPUs (and have them), go to src/, and configure and make on a machine
where "nvcc" is installed.
EOF
fi

gmm_dir=exp/${gmm}
ali_dir=exp/${ali}
lat_dir=exp/chain${nnet3_affix}/${gmm}_${train_set}_lats
dir=exp/chain${nnet3_affix}/cnn${affix}
train_data_dir=data/${train_set}
tree_dir=exp/chain${nnet3_affix}/tree

# the 'lang' directory is created by this script.
# If you create such a directory with a non-standard topology
# you should probably name it differently.
lang=data/lang_chain

for f in $train_data_dir/feats.scp $gmm_dir/final.mdl \
    $ali_dir/ali.1.gz; do
  [ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1
done


if [ $stage -le 1 ]; then
  echo "$0: creating lang directory $lang with chain-type topology"
  # Create a version of the lang/ directory that has one state per phone in the
  # topo file. [note, it really has two states.. the first one is only repeated
  # once, the second one has zero or more repeats.]
  if [ -d $lang ]; then
    if [ $lang/L.fst -nt data/$lang_test/L.fst ]; then
      echo "$0: $lang already exists, not overwriting it; continuing"
    else
      echo "$0: $lang already exists and seems to be older than data/$lang_test ..."
      echo " ... not sure what to do. Exiting."
      exit 1;
    fi
  else
    cp -r data/$lang_test $lang
    silphonelist=$(cat $lang/phones/silence.csl) || exit 1;
    nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1;
    # Use our special topology... note that we may have to tune this
    # topology later on.
    steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang/topo
  fi
fi

if [ $stage -le 2 ]; then
  # Get the alignments as lattices (gives the chain training more freedom).
  # use the same num-jobs as the alignments
  steps/align_fmllr_lats.sh --nj $nj --cmd "$cmd" ${train_data_dir} \
    data/$lang_test $gmm_dir $lat_dir
  rm $lat_dir/fsts.*.gz # save space
fi

if [ $stage -le 3 ]; then
  # Build a tree using our new topology. We already have alignments for the
  # training data in $ali_dir, so use those. The num-leaves is always somewhat
  # less than the num-leaves from the GMM baseline.
  if [ -f $tree_dir/final.mdl ]; then
    echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it."
    exit 1;
  fi
  steps/nnet3/chain/build_tree.sh \
    --frame-subsampling-factor $frame_subsampling_factor \
    --context-opts "--context-width=2 --central-position=1" \
    --cmd "$cmd" $num_leaves ${train_data_dir} \
    $lang $ali_dir $tree_dir
fi
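
# The number of leaves (pdfs) the tree actually ends up with can be checked
# with "tree-info $tree_dir/tree"; its num-pdfs value is what gets read into
# num_targets in the network config below.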


if [ $stage -le 4 ]; then
  mkdir -p $dir
  echo "$0: creating neural net configs using the xconfig parser";

  num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}')
  learning_rate_factor=$(echo "print(0.5/$xent_regularize)" | python)
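  # With the default xent_regularize=0.1 this works out to 0.5/0.1 = 5.0,
  # which is the learning-rate factor applied to the xent output layer (see
  # the longer explanation above the xent branch below).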
  common1="height-offsets=-2,-1,0,1,2 num-filters-out=36"
  common2="height-offsets=-2,-1,0,1,2 num-filters-out=70"
  mkdir -p $dir/configs
  cat <<EOF > $dir/configs/network.xconfig
  input dim=40 name=input

  conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1
  conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2
  conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2
  conv-relu-batchnorm-layer name=cnn4 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2
  relu-batchnorm-layer name=tdnn1 input=Append(-4,-2,0,2,4) dim=$tdnn_dim
  relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim
  relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim
  relu-batchnorm-layer name=tdnn4 input=Append(-4,0,4) dim=$tdnn_dim

  ## adding the layers for chain branch
  relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5
  output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5

  # adding the layers for xent branch
  # This block prints the configs for a separate output that will be
  # trained with a cross-entropy objective in the 'chain' models... this
  # has the effect of regularizing the hidden parts of the model. we use
  # 0.5 / args.xent_regularize as the learning rate factor; the factor of
  # 0.5 / args.xent_regularize is suitable as it means the xent
  # final-layer learns at a rate independent of the regularization
  # constant; and the 0.5 was tuned so as to make the relative progress
  # similar in the xent and regular final layers.
  relu-batchnorm-layer name=prefinal-xent input=tdnn4 dim=$tdnn_dim target-rms=0.5
  output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5
EOF
  steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/
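  # The generated configs can be sanity-checked before training starts; the
  # fully expanded network is typically written to $dir/configs/final.config.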
fi


if [ $stage -le 5 ]; then
  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
    utils/create_split_dir.pl \
      /export/b0{3,4,5,6}/$USER/kaldi-data/egs/iam-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage
  fi

  steps/nnet3/chain/train.py --stage=$train_stage \
    --cmd="$cmd" \
    --feat.cmvn-opts="--norm-means=false --norm-vars=false" \
    --chain.xent-regularize $xent_regularize \
    --chain.leaky-hmm-coefficient=0.1 \
    --chain.l2-regularize=0.00005 \
    --chain.apply-deriv-weights=false \
    --chain.lm-opts="--num-extra-lm-states=500" \
    --chain.frame-subsampling-factor=$frame_subsampling_factor \
    --chain.alignment-subsampling-factor=$alignment_subsampling_factor \
    --trainer.srand=$srand \
    --trainer.max-param-change=2.0 \
    --trainer.num-epochs=4 \
    --trainer.frames-per-iter=1000000 \
    --trainer.optimization.num-jobs-initial=2 \
    --trainer.optimization.num-jobs-final=4 \
    --trainer.optimization.initial-effective-lrate=0.001 \
    --trainer.optimization.final-effective-lrate=0.0001 \
    --trainer.optimization.shrink-value=1.0 \
    --trainer.num-chunk-per-minibatch=64,32 \
    --trainer.optimization.momentum=0.0 \
    --egs.chunk-width=$chunk_width \
    --egs.chunk-left-context=$chunk_left_context \
    --egs.chunk-right-context=$chunk_right_context \
    --egs.chunk-left-context-initial=0 \
    --egs.chunk-right-context-final=0 \
    --egs.dir="$common_egs_dir" \
    --egs.opts="--frames-overlap-per-eg 0" \
    --cleanup.remove-egs=$remove_egs \
    --use-gpu=true \
    --reporting.email="$reporting_email" \
    --feat-dir=$train_data_dir \
    --tree-dir=$tree_dir \
    --lat-dir=$lat_dir \
    --dir=$dir || exit 1;
fi
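
# Once training has finished, a summary like the one quoted at the top of this
# script can be produced with:
#   steps/info/chain_dir_info.pl $dir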

if [ $stage -le 6 ]; then
  # The reason we are using data/$lang_test here, instead of $lang, is just to
  # emphasize that it's not actually important to give mkgraph.sh the
  # lang directory with the matched topology (since it gets the
  # topology file from the model). So you could give it a different
  # lang directory, one that contained a wordlist and LM of your choice,
  # as long as phones.txt was compatible.

  utils/mkgraph.sh \
    --self-loop-scale 1.0 data/$lang_test \
    $dir $dir/graph || exit 1;
fi
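
# mkgraph.sh writes the decoding graph (HCLG.fst) and the associated words.txt
# into $dir/graph; that directory is what the decoding stage below consumes.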

if [ $stage -le 7 ]; then
  frames_per_chunk=$(echo $chunk_width | cut -d, -f1)
  steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
    --extra-left-context $chunk_left_context \
    --extra-right-context $chunk_right_context \
    --extra-left-context-initial 0 \
    --extra-right-context-final 0 \
    --frames-per-chunk $frames_per_chunk \
    --nj $nj --cmd "$cmd" \
    $dir/graph data/test $dir/decode_test || exit 1;
fi
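
# Once decoding (and the scoring it runs by default) has finished, the best
# WER/CER values, like those quoted at the top of this script, can be
# collected with:
#   cat $dir/decode_test/scoring_kaldi/best_*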