Skip to content

Commit f49b3fc

Browse files
xiaohui-zhang and danpovey
authored and committed
[scripts,src] added a script to scale arcs which output <unk> in HCLG.fst; other small fixes (#2499)
1 parent b4eda57 commit f49b3fc

File tree

6 files changed

+66
-11
lines changed

6 files changed

+66
-11
lines changed

egs/multi_en/s5/local/g2p/apply_g2p.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ mkdir -p $workdir
2929
echo 'Gathering missing words...'
3030
cat data/*/train/text | \
3131
local/count_oovs.pl $lexicon | \
32-
awk '{for(i=4; i<NF; i++) printf "%s",$i OFS; if(NF) printf "%s",$NF; printf ORS}' | \
32+
awk '{if (NF > 3 ) {for(i=4; i<NF; i++) printf "%s ",$i; print $NF;}}' | \
3333
perl -ape 's/\s/\n/g;' | \
3434
sort | uniq > $workdir/missing.txt
3535
cat $workdir/missing.txt | \

egs/wsj/s5/utils/lang/adjust_unk_arpa.pl

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -8,13 +8,15 @@
88
use Getopt::Long;
99

1010
my $Usage = <<EOU;
11-
# This is a simple script to set/scale the unigram prob of the OOV dict entry in an ARPA lm file.
11+
# This is a simple script to set/scale the prob of n-grams where the OOV dict entry is the predicted word, in an ARPA lm file.
1212
Usage: utils/lang/adjust_unk_arpa.pl [options] <oov-dict-entry> <unk-scale> <input-arpa >output-arpa
1313
1414
Allowed options:
1515
--fixed-value (true|false) : If true, interpret the unk-scale as a fixed value we'll set to
1616
the unigram prob of the OOV dict entry, rather than using it to
17-
scale the unigram prob.
17+
scale the probs. In this case higher order n-grams containing
18+
the OOV dict entry remain untouched. This is useful when the OOV
19+
dict entry doesn't appear in n-grams (n>1) as the predicted word.
1820
EOU
1921

2022
my $fixed_value = "false";
@@ -37,18 +39,21 @@
3739
if ( $fixed_value eq "true" ) {
3840
print STDERR "$0: Setting the unigram prob of $unk_word in LM file as $unk_scale.\n";
3941
} else {
40-
print STDERR "$0: Scaling the unigram prob of $unk_word in LM file by $unk_scale.\n";
42+
print STDERR "$0: Scaling the probs of ngrams where $unk_word is the predicted word in LM file by $unk_scale.\n";
4143
}
4244

43-
my $unigram = 0; # wether we are visiting the unigram field or not.
45+
my $ngram = 0; # the order of ngram we are visiting
4446

4547
# Change the unigram prob of the unk-word in the ARPA LM.
4648
while(<STDIN>) {
47-
if (m/^\\1-grams:$/) { $unigram = 1; }
48-
if (m/^\\2-grams:$/) { $unigram = 0; }
49+
if (m/^\\1-grams:$/) { $ngram = 1; }
50+
if (m/^\\2-grams:$/) { $ngram = 2; }
51+
if (m/^\\3-grams:$/) { $ngram = 3; }
52+
if (m/^\\4-grams:$/) { $ngram = 4; }
53+
if (m/^\\5-grams:$/) { $ngram = 5; }
4954
my @col = split(" ", $_);
50-
if ( $unigram == 1 && @col > 1 && $col[1] eq $unk_word ) {
51-
if ( $fixed_value eq "true" ) {
55+
if ( @col > 1 && $ngram > 0 && $col[$ngram] eq $unk_word ) {
56+
if ( $fixed_value eq "true" && $ngram == 1 ) {
5257
$col[0] = (log($unk_scale) / log(10.0));
5358
} else {
5459
$col[0] += (log($unk_scale) / log(10.0));
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
#!/bin/bash
2+
# Copyright 2018 Xiaohui Zhang
3+
# Apache 2.0
4+
5+
# This script copies a fully expanded decoding graph (HCLG.fst) and scales the scores
6+
# of all arcs whose output symbol is a user-specified OOV symbol (or any other word).
7+
# This achieves an equivalent effect of utils/lang/adjust_unk_arpa.pl, which scales
8+
# the LM prob of all ngrams predicting an OOV symbol, while avoiding re-creating the graph.
9+
10+
set -o pipefail
11+
12+
if [ $# != 4 ]; then
13+
echo "Usage: utils/adjust_unk_graph.sh <oov-dict-entry> <scale> <in-graph-dir> <out-graph-dir>"
14+
echo "e.g.: utils/adjust_unk_graph.sh \"<unk>\" 0.1 exp/tri1/graph exp/tri1/graph_unk_scale_0.1"
15+
exit 1;
16+
fi
17+
18+
if [ -f path.sh ]; then . ./path.sh; fi
19+
20+
oov_word=$1
21+
unk_scale=$2
22+
graphdir_in=$3
23+
graphdir_out=$4
24+
25+
mkdir -p $graphdir_out
26+
27+
required="HCLG.fst words.txt disambig_tid.int num_pdfs phones phones.txt words.txt"
28+
for f in $required; do
29+
[ ! -f $graphdir_in/$f ] && echo "adjust_unk_graph.sh: expected $graphdir_in/$f to exist" && exit 1;
30+
cp -r $graphdir_in/$f $graphdir_out
31+
done
32+
33+
cp -r $graphdir_in/{disambig_tid.int,num_pdfs,phones,phones.txt,words.txt} $graphdir_out
34+
35+
oov_id=`echo $oov_word | utils/sym2int.pl $graphdir_in/words.txt`
36+
[ -z $oov_id ] && echo "adjust_unk_graph.sh: the specified oov symbol $oov_word is out of the vocabulary." && exit 1;
37+
fstprint $graphdir_in/HCLG.fst | awk -v oov=$oov_id -v unk_scale=$unk_scale '{if($4==oov) $5=$5-log(unk_scale);print $0}' | \
38+
fstcompile > $graphdir_out/HCLG.fst || exit 1;

src/lat/sausages.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,10 +83,18 @@ class MinimumBayesRisk {
8383
MinimumBayesRiskOptions opts = MinimumBayesRiskOptions());
8484

8585
// Uses the provided <words> as <R_> instead of using the lattice best path.
86+
// Note that the default value of opts.decode_mbr is true. If you provide 1-best
87+
// hypothesis from MAP decoding, the output ctm from MBR decoding may be
88+
// mismatched with the provided <words> (<words> would be used as the starting
89+
// point of optimization).
8690
MinimumBayesRisk(const CompactLattice &clat,
8791
const std::vector<int32> &words,
8892
MinimumBayesRiskOptions opts = MinimumBayesRiskOptions());
8993
// Uses the provided <words> as <R_> and <times> of bins instead of using the lattice best path.
94+
// Note that the default value of opts.decode_mbr is true. If you provide 1-best
95+
// hypothesis from MAP decoding, the output ctm from MBR decoding may be
96+
// mismatched with the provided <words> (<words> would be used as the starting
97+
// point of optimization).
9098
MinimumBayesRisk(const CompactLattice &clat,
9199
const std::vector<int32> &words,
92100
const std::vector<std::pair<BaseFloat,BaseFloat> > &times,

src/latbin/lattice-to-ctm-conf.cc

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,10 @@ int main(int argc, char *argv[]) {
3535
"sequence. In the 3-argument form, we read it from the\n"
3636
"<1best-rspecifier> input; otherwise it is the 1-best of the lattice.\n"
3737
"Then, if --decode-mbr=true, we iteratively refine the hypothesis\n"
38-
"using Minimum Bayes Risk decoding. If you don't need confidences,\n"
38+
"using Minimum Bayes Risk decoding. (Note that the default value of decode_mbr\n"
39+
"is true. If you provide <1best-rspecifier> from MAP decoding, the output ctm\n"
40+
"from MBR decoding may be mismatched with the provided 1best hypothesis (the\n"
41+
"starting point of optimization). If you don't need confidences,\n"
3942
"you can do lattice-1best and pipe to nbest-to-ctm. The ctm this\n"
4043
"program produces will be relative to the utterance-id; a standard\n"
4144
"ctm relative to the filename can be obtained using\n"

src/latbin/lattice-to-nbest.cc

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,8 @@ int main(int argc, char *argv[]) {
5151
po.Register("lm-scale", &lm_scale, "Scaling factor for language model scores.");
5252
po.Register("n", &n, "Number of distinct paths");
5353
po.Register("random", &random,
54-
"If true, generate n random paths instead of n-best paths");
54+
"If true, generate n random paths instead of n-best paths"
55+
"In this case, all costs in generated paths will be zero.");
5556
po.Register("srand", &srand_seed, "Seed for random number generator "
5657
"(only relevant if --random=true)");
5758

0 commit comments

Comments
 (0)