
Commit 0b49cba

Ke Li committed
add rnnlm recipe for librispeech

1 parent 5b23ace · commit 0b49cba

File tree: 2 files changed (+145, -0 lines)
Lines changed: 144 additions & 0 deletions
@@ -0,0 +1,144 @@
#!/bin/bash

# Copyright 2012  Johns Hopkins University (author: Daniel Povey)
#           2018  Ke Li

# This script trains an RNN language model on the LibriSpeech 960-hour
# training data.

# rnnlm/train_rnnlm.sh: best iteration (out of 26) was 21, linking it to final iteration.
# rnnlm/train_rnnlm.sh: train/dev perplexity was 118.4 / 152.6.
# Train objf: -5.74 -5.51 -5.38 -5.29 -5.22 -5.16 -5.12 -5.08 -5.05 -5.02 -4.99 -4.97 -4.97 -4.93 -4.90 -4.87 -4.84 -4.82 -4.79 -4.77 -4.75 -4.73 -4.71 -4.69 -4.67
# Dev objf:   -6.00 -5.61 -5.45 -5.36 -5.29 -5.24 -5.20 -5.18 -5.16 -5.13 -5.12 -5.11 -5.11 -5.09 -5.07 -5.06 -5.05 -5.04 -5.03 -5.03 -5.03 -5.03 -5.03 -5.03 -5.03 -5.03

# Begin configuration section.

dir=exp/rnnlm_lstm_1a
embedding_dim=1024
lstm_rpd=256
lstm_nrpd=256
stage=-10
train_stage=-10
epochs=20
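
# (lstm_rpd / lstm_nrpd above are the recurrent and non-recurrent projection
# dims of the fast-lstmp layers configured in stage 1 below.)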

# variables for lattice rescoring
run_lat_rescore=true
run_nbest_rescore=true
run_backward_rnnlm=false
ac_model_dir=exp/chain_cleaned/tdnn_1d_sp
decode_dir_suffix=rnnlm_1a
ngram_order=4 # approximate lattice rescoring by limiting the max ngram order:
              # if set, histories in the lattice that share the same n-gram
              # history are merged, which prevents the lattice from expanding
              # exponentially.
pruned_rescore=true

. ./cmd.sh
. ./utils/parse_options.sh
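
# Any variable defined before the parse_options.sh call above can be
# overridden from the command line, e.g. by passing "--stage 4 --epochs 10"
# when invoking this script.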

# text of the 960-hour training transcriptions
text=data/train_960/text
lexicon=data/lang_nosp/words.txt
text_dir=data/rnnlm/text_960_1a
mkdir -p $dir/config
set -e

for f in $text $lexicon; do
  [ ! -f $f ] && \
    echo "$0: expected file $f to exist; run the earlier stages of run.sh first" && exit 1
done
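
# Each line of $text is expected to be "<utterance-id> <transcript>"; stage 0
# below strips the utterance-id field with 'cut -d " " -f2-'.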

if [ $stage -le 0 ]; then
  mkdir -p $text_dir
  echo -n >$text_dir/dev.txt
  # hold out one in every 50 lines as dev data.
  cat $text | cut -d ' ' -f2- | awk -v text_dir=$text_dir '{if(NR%50 == 0) { print >text_dir"/dev.txt"; } else {print;}}' >$text_dir/librispeech.txt
fi
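
# (Illustrative check, not part of the recipe: after stage 0,
#    wc -l $text_dir/librispeech.txt $text_dir/dev.txt
#  should show dev.txt holding roughly 1/50 of the lines of $text.)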

if [ $stage -le 1 ]; then
  cp $lexicon $dir/config/
  n=`cat $dir/config/words.txt | wc -l`
  echo "<brk> $n" >> $dir/config/words.txt

  # words that are in the training or dev data but not in words.txt
  # will be mapped to <UNK> during training.
  echo "<UNK>" >$dir/config/oov.txt

  cat > $dir/config/data_weights.txt <<EOF
librispeech   1   1.0
EOF
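
  # (Format note: each line of data_weights.txt is, per the rnnlm tooling's
  # convention, "<corpus-name> <repeat-count> <weight>"; here the single
  # librispeech corpus is used once per epoch with weight 1.0.)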

  rnnlm/get_unigram_probs.py --vocab-file=$dir/config/words.txt \
                             --unk-word="<UNK>" \
                             --data-weights-file=$dir/config/data_weights.txt \
                             $text_dir | awk 'NF==2' >$dir/config/unigram_probs.txt

  # choose features
  rnnlm/choose_features.py --unigram-probs=$dir/config/unigram_probs.txt \
                           --top-word-features=5000 \
                           --use-constant-feature=true \
                           --special-words='<s>,</s>,<brk>,<UNK>,<SPOKEN_NOISE>' \
                           $dir/config/words.txt > $dir/config/features.txt
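
  # (features.txt maps each word to sparse features, including a constant
  # feature, features for the special symbols listed above, and per-word
  # features for the 5000 most frequent words; the RNNLM builds its word
  # embeddings from these features.)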

  cat >$dir/config/xconfig <<EOF
input dim=$embedding_dim name=input
relu-renorm-layer name=tdnn1 dim=$embedding_dim input=Append(0, IfDefined(-1))
fast-lstmp-layer name=lstm1 cell-dim=$embedding_dim recurrent-projection-dim=$lstm_rpd non-recurrent-projection-dim=$lstm_nrpd
relu-renorm-layer name=tdnn2 dim=$embedding_dim input=Append(0, IfDefined(-3))
fast-lstmp-layer name=lstm2 cell-dim=$embedding_dim recurrent-projection-dim=$lstm_rpd non-recurrent-projection-dim=$lstm_nrpd
relu-renorm-layer name=tdnn3 dim=$embedding_dim input=Append(0, IfDefined(-3))
output-layer name=output include-log-softmax=false dim=$embedding_dim
EOF
  rnnlm/validate_config_dir.sh $text_dir $dir/config
fi
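
# A note on the xconfig above: TDNN layers splicing over previous time steps
# (Append/IfDefined) are interleaved with two projected LSTM layers.  The
# output layer's dim is $embedding_dim rather than the vocabulary size
# because this rnnlm setup shares the word-embedding matrix between the input
# and output sides; per-word log-probabilities are computed against that
# embedding outside the nnet itself.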

if [ $stage -le 2 ]; then
  # the --unigram-factor option is set larger than the default (100)
  # in order to reduce the size of the sampling LM, because rnnlm-get-egs
  # was taking up too much CPU (as much as 10 cores).
  rnnlm/prepare_rnnlm_dir.sh --unigram-factor 400 \
                             $text_dir $dir/config $dir
fi
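
# (For context: besides validating the config directory, prepare_rnnlm_dir.sh
# builds the sampling LM mentioned in the comment above, which rnnlm-get-egs
# uses for importance sampling over the output vocabulary; a larger
# --unigram-factor makes that LM smaller.)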

if [ $stage -le 3 ]; then
  rnnlm/train_rnnlm.sh --num-jobs-final 2 \
                       --stage $train_stage \
                       --num-epochs $epochs \
                       --cmd "$train_cmd" $dir
fi
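
# The "Train objf" / "Dev objf" numbers at the top of this file come from
# this step: train_rnnlm.sh evaluates each iteration on dev.txt and links the
# best one (here iteration 21 of 26) to the final model.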

LM=fglarge
if [ $stage -le 4 ] && $run_lat_rescore; then
  echo "$0: Perform lattice-rescoring on $ac_model_dir"
  # LM=tgsmall # if using the original 3-gram G.fst as the old LM
  pruned=
  if $pruned_rescore; then
    pruned=_pruned
  fi
  for decode_set in test_clean test_other dev_clean dev_other; do
    decode_dir=${ac_model_dir}/decode_${decode_set}_${LM}

    # Lattice rescoring
    rnnlm/lmrescore$pruned.sh \
      --cmd "$decode_cmd --mem 8G" \
      --weight 0.4 --max-ngram-order $ngram_order \
      data/lang_test_$LM $dir \
      data/${decode_set}_hires ${decode_dir} \
      exp/decode_${decode_set}_${LM}_${decode_dir_suffix}_rescore
  done
fi
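
# (--weight 0.4 is the interpolation weight given to the RNNLM, so the
# rescored LM scores are roughly 0.4 * RNNLM + 0.6 * original n-gram LM,
# and --max-ngram-order bounds the history length used when merging lattice
# states, as described near the top of this script.)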

if [ $stage -le 5 ] && $run_nbest_rescore; then
  echo "$0: Perform nbest-rescoring on $ac_model_dir"
  for decode_set in test_clean test_other dev_clean dev_other; do
    decode_dir=${ac_model_dir}/decode_${decode_set}_${LM}

    # N-best rescoring
    rnnlm/lmrescore_nbest.sh \
      --cmd "$decode_cmd --mem 8G" --N 20 \
      0.4 data/lang_test_$LM $dir \
      data/${decode_set}_hires ${decode_dir} \
      exp/decode_${decode_set}_${LM}_${decode_dir_suffix}_nbest_rescore
  done
fi
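
# (Unlike lattice rescoring, n-best rescoring extracts the top --N (here 20)
# hypotheses per utterance and rescores each one with the RNNLM exactly; the
# positional 0.4 plays the same interpolation role as --weight above.  It
# cannot select a path outside the n-best list, but it needs no ngram-order
# approximation.)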

exit 0

egs/librispeech/s5/rnnlm (new symlink)

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
../../../scripts/rnnlm/
