1 change: 1 addition & 0 deletions egs/swbd/s5c/local/nnet3/run_tdnn_lfr_disc.sh
210 changes: 210 additions & 0 deletions egs/swbd/s5c/local/nnet3/tuning/run_tdnn_lfr1c_disc.sh
@@ -0,0 +1,210 @@
#!/bin/bash

# This script does discriminative training on top of the CE nnet3 LFR system
# from run_tdnn_lfr1c. To simplify things, this assumes you are using the
# "speed-perturbed" data (--speed_perturb true, which is the default) in the
# baseline run_tdnn_lfr1c.sh script.
#
# note: this relies on having a cluster that has plenty of CPUs as well as GPUs,
# since the lattice generation runs at about real time: with roughly 900 hours
# of speed-perturbed data, it takes on the order of 1000 hours of CPU time.

# Comparing effect of shift:
# System                tdnn_lfr1c_sp_smbr:1  tdnn_lfr1c_sp_smbr:2  tdnn_lfr1c_sp_smbr:3  tdnn_lfr1c_sp_fs_smbr:1  tdnn_lfr1c_sp_fs_smbr:2  tdnn_lfr1c_sp_fs_smbr:3
# WER on train_dev(tg)  16.26                 16.11                 16.02                 16.02                    15.77                    15.78
# WER on train_dev(fg)  15.01                 14.91                 14.80                 14.79                    14.58                    14.50
# WER on eval2000(tg)   18.9                  18.7                  18.6                  18.6                     18.5                     18.5
# WER on eval2000(fg)   17.4                  17.2                  17.1                  17.1                     17.0                     16.9


set -e
set -uo pipefail

stage=0
train_stage=-10 # can be used to start training in the middle.
get_egs_stage=0
use_gpu=true # for training
cleanup=false # run with --cleanup true --stage 5 to clean up (remove large things like
# alignments and degs).
degs_dir= # set this to use preexisting degs.
nj=65 # have a high number of jobs because this could take a while, and we might
# have some stragglers.

## Objective options
criterion=smbr
one_silence_class=true

# you can set --disc-affix if you run different configurations, e.g. --disc-affix "_b"
# originally ran with no affix, with effective_learning_rate=0.0000125;
# reran by mistake with no affix with effective_learning_rate=0.000005 [this was
# a bit better, see NOTES, but the best result was still after the 1st epoch].
# reran again with affix=slow and effective_learning_rate=0.0000025
# reran again with affix=slow2 and effective_learning_rate=0.00000125 (this was
# about the best).
# before checking in the script, removed the slow2 affix but left with
# the lowest learning rate.
disc_affix=

## Egs options. Give quite a few choices of chunk length,
## so it can split utterances without much gap or overlap.
frames_per_eg=300,280,150,120,100
frames_overlap_per_eg=0
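# For example (illustrative): a 420-frame utterance can be cut exactly into
# chunks of 300 + 120 frames, and a 450-frame one into 300 + 150, so most
# utterance lengths are covered with little gap or overlap.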
frames_per_chunk_decoding=200
## These context options should match the training condition
## (chunk_left_context, chunk_right_context).
## We set --extra-left-context-initial 0 and --extra-right-context-final 0
## directly in the script below, but this should also match the training condition.
## Note: extra-left-context and extra-right-context are 0 because this is a
## TDNN, not a recurrent model like an LSTM or BLSTM.
extra_left_context=0
extra_right_context=0


## Nnet training options
effective_learning_rate=0.00000125
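# (as we understand the --effective-lrate option of train_discriminative.sh,
# the actual per-job learning rate is this value times num_jobs_nnet, so
# results stay comparable if num_jobs_nnet is changed)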
max_param_change=1
num_jobs_nnet=4
num_epochs=3
regularization_opts= # for providing the --xent-regularize and --l2-regularize
# options in chain models.
minibatch_size="300=32,16/150=64,32" # rule says: if chunk size is closer to 300, use minibatch size 32 (or 16 for mop-up);
# if chunk size is closer to 150, use minibatch size of 64 (or 32 for mop-up).
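# For example (an illustrative reading of the rule above): a 280-frame chunk
# falls in the "300" group and goes into minibatches of 32 chunks, while a
# 120-frame chunk falls in the "150" group and goes into minibatches of 64.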
shift_feats=false

## Decode options
decode_start_epoch=1 # can be used to avoid decoding all epochs, e.g. if we decided to run more.


. ./cmd.sh
. ./path.sh
. ./utils/parse_options.sh

srcdir=exp/nnet3/tdnn_lfr1c_sp
graph_dir=$srcdir/graph_sw1_tg
train_data_dir=data/train_nodup_sp_hires
online_ivector_dir=exp/nnet3/ivectors_train_nodup_sp
dir=${srcdir}_${criterion}${disc_affix}


if $use_gpu; then
if ! cuda-compiled; then
cat <<EOF && exit 1
This script is intended to be used with GPUs, but you have not compiled Kaldi
with CUDA. If you want to use GPUs (and have them), go to src/ and run
"configure" and "make" on a machine where "nvcc" is installed. Otherwise,
call this script with --use-gpu false.
EOF
fi
num_threads=1
else
# In the CPU case we still use 4 nnet jobs (so the results should be almost
# the same as the GPU run), but with 16 threads each; this may be a little slower.
num_threads=16
fi

if [ ! -f ${srcdir}/final.mdl ]; then
echo "$0: expected ${srcdir}/final.mdl to exist"
exit 1;
fi


frame_subsampling_factor=1
if [ -f $srcdir/frame_subsampling_factor ]; then
frame_subsampling_factor=$(cat $srcdir/frame_subsampling_factor)
fi

affix= # Will be set if doing input frame shift
if [[ "$shift_feats" = true && $frame_subsampling_factor -ne 1 ]]; then
if [ $stage -le 0 ]; then
mkdir -p $dir  # $dir must exist before utt2orig can be written into it.
utils/data/shift_and_combine_feats.sh --write-utt2orig $dir/utt2orig \
$frame_subsampling_factor $train_data_dir ${train_data_dir}_fs
steps/online/nnet2/copy_ivector_dir.sh --utt2orig $dir/utt2orig \
$online_ivector_dir ${online_ivector_dir}_fs
rm $dir/utt2orig
fi
online_ivector_dir=${online_ivector_dir}_fs
train_data_dir=${train_data_dir}_fs
affix=_fs
fi
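# Note: when shift_feats is true and frame_subsampling_factor is 3, the block
# above creates copies of the data shifted by -1 and +1 input frames
# (utterance ids get prefixes like "fs-1-" and "fs1-") and combines them with
# the unshifted copy, so training sees all three possible offsets of the
# input frames relative to the low-frame-rate output.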

if [ $stage -le 1 ]; then
# hardcode no-GPU for alignment; you could use a GPU, but you wouldn't
# get excellent GPU utilization.
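# The 0.333 scales here (and in the degs and decoding stages below) appear to
# correspond to 1/frame_subsampling_factor for this LFR system.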
steps/nnet3/align.sh --cmd "$decode_cmd" --use-gpu false \
--scale-opts '--transition-scale=1.0 --acoustic-scale=0.333 --self-loop-scale=0.333' \
--frames-per-chunk $frames_per_chunk_decoding \
--extra-left-context $extra_left_context --extra-right-context $extra_right_context \
--extra-left-context-initial 0 --extra-right-context-final 0 \
--online-ivector-dir $online_ivector_dir \
--nj $nj $train_data_dir data/lang $srcdir ${srcdir}_ali${affix}
fi


if [ -z "$degs_dir" ]; then

if [ $stage -le 2 ]; then
if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d ${srcdir}_degs${affix}/storage ]; then
utils/create_split_dir.pl \
/export/b{09,10,11,12}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5/${srcdir}_degs${affix}/storage ${srcdir}_degs${affix}/storage
fi
if [ -d ${srcdir}_degs${affix}/storage ]; then max_copy_jobs=10; else max_copy_jobs=5; fi

steps/nnet3/get_degs.sh \
--cmd "$decode_cmd --mem 10G" --num-threads 3 \
--self-loop-scale 0.333 --acwt 0.333 \
--max-copy-jobs $max_copy_jobs \
--extra-left-context $extra_left_context \
--extra-right-context $extra_right_context \
--extra-left-context-initial 0 --extra-right-context-final 0 \
--frames-per-chunk-decoding "$frames_per_chunk_decoding" \
--stage $get_egs_stage \
--online-ivector-dir $online_ivector_dir \
--frames-per-eg $frames_per_eg --frames-overlap-per-eg $frames_overlap_per_eg \
$train_data_dir data/lang ${srcdir} ${srcdir}_ali${affix} ${srcdir}_degs${affix} || exit 1
fi
fi

if [ $stage -le 3 ]; then
[ -z "$degs_dir" ] && degs_dir=${srcdir}_degs${affix}
steps/nnet3/train_discriminative.sh --cmd "$decode_cmd" \
--stage $train_stage \
--acoustic-scale 0.333 \
--effective-lrate $effective_learning_rate --max-param-change $max_param_change \
--criterion $criterion --drop-frames true \
--num-epochs $num_epochs --one-silence-class $one_silence_class --minibatch-size "$minibatch_size" \
--num-jobs-nnet $num_jobs_nnet --num-threads $num_threads \
--regularization-opts "$regularization_opts" \
${degs_dir} $dir
fi

if [ $stage -le 4 ]; then
for x in $(seq $decode_start_epoch $num_epochs); do
for decode_set in train_dev eval2000; do
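# use one decoding job per speaker in the test set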
num_jobs=$(cut -d' ' -f2 data/${decode_set}_hires/utt2spk | sort -u | wc -l)
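# decode both the raw epoch model and the adjusted one (epoch${x}_adj),
# which train_discriminative.sh writes with recomputed priors.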
for iter in epoch$x epoch${x}_adj; do
(
steps/nnet3/decode.sh --nj $num_jobs --cmd "$decode_cmd" --iter $iter \
--acwt 0.333 --post-decode-acwt 3.0 \
--online-ivector-dir exp/nnet3/ivectors_${decode_set} \
$graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_sw1_tg_${iter} || exit 1;

steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \
$dir/decode_${decode_set}_sw1_{tg,fsh_fg}_${iter} || exit 1;
) &
done
done
done
fi
wait;

if [ $stage -le 5 ] && $cleanup; then
# if you run with "--cleanup true --stage 5" you can clean up.
# actually, keep the alignments in case we need them later: they're slow to
# create, and quite big.
# rm ${srcdir}_ali${affix}/ali.*.gz || true

steps/nnet2/remove_egs.sh ${srcdir}_degs${affix} || true
fi

wait;
exit 0;
48 changes: 48 additions & 0 deletions egs/wsj/s5/steps/online/nnet2/copy_ivector_dir.sh
@@ -0,0 +1,48 @@
#!/bin/bash

# Copyright 2017 Johns Hopkins University (author: Hossein Hadian)
# Apache 2.0

# This script copies the necessary parts of an online ivector directory,
# optionally applying an utterance-id mapping to the ivector_online.scp file.

utt2orig=

. utils/parse_options.sh

if [ $# != 2 ]; then
echo "Usage: "
echo " $0 [options] <srcdir> <destdir>"
echo "e.g.:"
echo " $0 exp/nnet3/online_ivector_train exp/nnet3/online_ivector_train_fs"
echo "Options"
echo " --utt2orig=<file> # utterance id mapping to use"
exit 1;
fi


srcdir=$1
destdir=$2

if [ ! -f $srcdir/ivector_period ]; then
echo "$0: no such file $srcdir/ivector_period"
exit 1;
fi

if [ "$destdir" == "$srcdir" ]; then
echo "$0: this script requires <srcdir> and <destdir> to be different."
exit 1
fi

set -e;

mkdir -p $destdir
cp -r $srcdir/{conf,ivector_period} $destdir
if [ -z "$utt2orig" ]; then
cp $srcdir/ivector_online.scp $destdir
else
# utt2orig has lines of the form "<new-utt-id> <orig-utt-id>"; apply_map.pl
# replaces field 2 (the original id) with its entry from the source scp,
# producing "<new-utt-id> <ivector rspecifier>".
utils/apply_map.pl -f 2 $srcdir/ivector_online.scp < $utt2orig > $destdir/ivector_online.scp
fi
cp $srcdir/final.ie.id $destdir

echo "$0: Copied necessary parts of online ivector directory $srcdir to $destdir"
12 changes: 12 additions & 0 deletions egs/wsj/s5/utils/data/shift_and_combine_feats.sh
@@ -4,6 +4,11 @@

# Apache 2.0

write_utt2orig= # if provided, this script will write
# a mapping of shifted utterance ids
# to the original ones into the file
# specified by this option
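# For example (hypothetical utterance ids), with frame_subsampling_factor=3
# the resulting file would contain lines like:
#   sw02001-A_000098-001156 sw02001-A_000098-001156
#   fs-1-sw02001-A_000098-001156 sw02001-A_000098-001156
#   fs1-sw02001-A_000098-001156 sw02001-A_000098-001156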

echo "$0 $@" # Print the command line for logging
if [ -f path.sh ]; then . ./path.sh; fi
. utils/parse_options.sh
@@ -34,6 +39,18 @@ if [ -f $destdir/feats.scp ]; then
exit 1
fi

if [ ! -z "$write_utt2orig" ]; then
awk '{print $1 " " $1}' $srcdir/feats.scp >$write_utt2orig
fi

tmp_shift_destdirs=()
for frame_shift in $(seq $(( -(frame_subsampling_factor/2) )) $(( -(frame_subsampling_factor/2) + frame_subsampling_factor - 1 ))); do
if [ "$frame_shift" == 0 ]; then continue; fi
utils/data/shift_feats.sh $frame_shift $srcdir ${destdir}_fs$frame_shift || exit 1
tmp_shift_destdirs+=("${destdir}_fs$frame_shift")
if [ ! -z "$write_utt2orig" ]; then
awk -v prefix="fs$frame_shift-" '{printf("%s%s %s\n", prefix, $1, $1);}' $srcdir/feats.scp >>$write_utt2orig
fi
done
utils/data/combine_data.sh $destdir $srcdir "${tmp_shift_destdirs[@]}" || exit 1
rm -r "${tmp_shift_destdirs[@]}"