Skip to content

Commit f49b3fc

Browse files
xiaohui-zhang and danpovey
authored and committed
[scripts,src] added a script to scale arcs which output <unk> in HCLG.fst; other small fixes (#2499)
1 parent b4eda57 commit f49b3fc

File tree

6 files changed

+66
-11
lines changed

6 files changed

+66
-11
lines changed

egs/multi_en/s5/local/g2p/apply_g2p.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ mkdir -p $workdir
2929
echo 'Gathering missing words...'
3030
cat data/*/train/text | \
3131
local/count_oovs.pl $lexicon | \
32-
awk '{for(i=4; i<NF; i++) printf "%s",$i OFS; if(NF) printf "%s",$NF; printf ORS}' | \
32+
awk '{if (NF > 3 ) {for(i=4; i<NF; i++) printf "%s ",$i; print $NF;}}' | \
3333
perl -ape 's/\s/\n/g;' | \
3434
sort | uniq > $workdir/missing.txt
3535
cat $workdir/missing.txt | \

egs/wsj/s5/utils/lang/adjust_unk_arpa.pl

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -8,13 +8,15 @@
88
use Getopt::Long;
99

1010
my $Usage = <<EOU;
11-
# This is a simple script to set/scale the unigram prob of the OOV dict entry in an ARPA lm file.
11+
# This is a simple script to set/scale the prob of n-grams where the OOV dict entry is the predicted word, in an ARPA lm file.
1212
Usage: utils/lang/adjust_unk_arpa.pl [options] <oov-dict-entry> <unk-scale> <input-arpa >output-arpa
1313
1414
Allowed options:
1515
--fixed-value (true|false) : If true, interpret the unk-scale as a fixed value we'll set to
1616
the unigram prob of the OOV dict entry, rather than using it to
17-
scale the unigram prob.
17+
scale the probs. In this case higher order n-grams containing
18+
the OOV dict entry remain untouched. This is useful when the OOV
19+
dict entry doesn't appear in n-grams (n>1) as the predicted word.
1820
EOU
1921

2022
my $fixed_value = "false";
@@ -37,18 +39,21 @@
3739
if ( $fixed_value eq "true" ) {
3840
print STDERR "$0: Setting the unigram prob of $unk_word in LM file as $unk_scale.\n";
3941
} else {
40-
print STDERR "$0: Scaling the unigram prob of $unk_word in LM file by $unk_scale.\n";
42+
print STDERR "$0: Scaling the probs of ngrams where $unk_word is the predicted word in LM file by $unk_scale.\n";
4143
}
4244

43-
my $unigram = 0; # wether we are visiting the unigram field or not.
45+
my $ngram = 0; # the order of ngram we are visiting
4446

4547
# Change the unigram prob of the unk-word in the ARPA LM.
4648
while(<STDIN>) {
47-
if (m/^\\1-grams:$/) { $unigram = 1; }
48-
if (m/^\\2-grams:$/) { $unigram = 0; }
49+
if (m/^\\1-grams:$/) { $ngram = 1; }
50+
if (m/^\\2-grams:$/) { $ngram = 2; }
51+
if (m/^\\3-grams:$/) { $ngram = 3; }
52+
if (m/^\\4-grams:$/) { $ngram = 4; }
53+
if (m/^\\5-grams:$/) { $ngram = 5; }
4954
my @col = split(" ", $_);
50-
if ( $unigram == 1 && @col > 1 && $col[1] eq $unk_word ) {
51-
if ( $fixed_value eq "true" ) {
55+
if ( @col > 1 && $ngram > 0 && $col[$ngram] eq $unk_word ) {
56+
if ( $fixed_value eq "true" && $ngram == 1 ) {
5257
$col[0] = (log($unk_scale) / log(10.0));
5358
} else {
5459
$col[0] += (log($unk_scale) / log(10.0));
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
#!/bin/bash
2+
# Copyright 2018 Xiaohui Zhang
3+
# Apache 2.0
4+
5+
# This script copies a fully expanded decoding graph (HCLG.fst) and scales the scores
6+
# of all arcs whose output symbol is a user-specified OOV symbol (or any other word).
7+
# This achieves an equivalent effect of utils/lang/adjust_unk_arpa.pl, which scales
8+
# the LM prob of all ngrams predicting an OOV symbol, while avoiding re-creating the graph.
9+
10+
set -o pipefail
11+
12+
if [ $# != 4 ]; then
13+
echo "Usage: utils/adjust_unk_graph.sh <oov-dict-entry> <scale> <in-graph-dir> <out-graph-dir>"
14+
echo "e.g.: utils/adjust_unk_graph.sh \"<unk>\" 0.1 exp/tri1/graph exp/tri1/graph_unk_scale_0.1"
15+
exit 1;
16+
fi
17+
18+
if [ -f path.sh ]; then . ./path.sh; fi
19+
20+
oov_word=$1
21+
unk_scale=$2
22+
graphdir_in=$3
23+
graphdir_out=$4
24+
25+
mkdir -p $graphdir_out
26+
27+
required="HCLG.fst words.txt disambig_tid.int num_pdfs phones phones.txt words.txt"
28+
for f in $required; do
29+
[ ! -f $graphdir_in/$f ] && echo "adjust_unk_graph.sh: expected $graphdir_in/$f to exist" && exit 1;
30+
cp -r $graphdir_in/$f $graphdir_out
31+
done
32+
33+
cp -r $graphdir_in/{disambig_tid.int,num_pdfs,phones,phones.txt,words.txt} $graphdir_out
34+
35+
oov_id=`echo $oov_word | utils/sym2int.pl $graphdir_in/words.txt`
36+
[ -z $oov_id ] && echo "adjust_unk_graph.sh: the specified oov symbol $oov_word is out of the vocabulary." && exit 1;
37+
fstprint $graphdir_in/HCLG.fst | awk -v oov=$oov_id -v unk_scale=$unk_scale '{if($4==oov) $5=$5-log(unk_scale);print $0}' | \
38+
fstcompile > $graphdir_out/HCLG.fst || exit 1;

src/lat/sausages.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,10 +83,18 @@ class MinimumBayesRisk {
8383
MinimumBayesRiskOptions opts = MinimumBayesRiskOptions());
8484

8585
// Uses the provided <words> as <R_> instead of using the lattice best path.
86+
// Note that the default value of opts.decode_mbr is true. If you provide 1-best
87+
// hypothesis from MAP decoding, the output ctm from MBR decoding may be
88+
// mismatched with the provided <words> (<words> would be used as the starting
89+
// point of optimization).
8690
MinimumBayesRisk(const CompactLattice &clat,
8791
const std::vector<int32> &words,
8892
MinimumBayesRiskOptions opts = MinimumBayesRiskOptions());
8993
// Uses the provided <words> as <R_> and <times> of bins instead of using the lattice best path.
94+
// Note that the default value of opts.decode_mbr is true. If you provide 1-best
95+
// hypothesis from MAP decoding, the output ctm from MBR decoding may be
96+
// mismatched with the provided <words> (<words> would be used as the starting
97+
// point of optimization).
9098
MinimumBayesRisk(const CompactLattice &clat,
9199
const std::vector<int32> &words,
92100
const std::vector<std::pair<BaseFloat,BaseFloat> > &times,

src/latbin/lattice-to-ctm-conf.cc

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,10 @@ int main(int argc, char *argv[]) {
3535
"sequence. In the 3-argument form, we read it from the\n"
3636
"<1best-rspecifier> input; otherwise it is the 1-best of the lattice.\n"
3737
"Then, if --decode-mbr=true, we iteratively refine the hypothesis\n"
38-
"using Minimum Bayes Risk decoding. If you don't need confidences,\n"
38+
"using Minimum Bayes Risk decoding. (Note that the default value of decode_mbr\n"
39+
"is true. If you provide <1best-rspecifier> from MAP decoding, the output ctm\n"
40+
"from MBR decoding may be mismatched with the provided 1best hypothesis (the\n"
41+
"starting point of optimization). If you don't need confidences,\n"
3942
"you can do lattice-1best and pipe to nbest-to-ctm. The ctm this\n"
4043
"program produces will be relative to the utterance-id; a standard\n"
4144
"ctm relative to the filename can be obtained using\n"

src/latbin/lattice-to-nbest.cc

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,8 @@ int main(int argc, char *argv[]) {
5151
po.Register("lm-scale", &lm_scale, "Scaling factor for language model scores.");
5252
po.Register("n", &n, "Number of distinct paths");
5353
po.Register("random", &random,
54-
"If true, generate n random paths instead of n-best paths");
54+
"If true, generate n random paths instead of n-best paths"
55+
"In this case, all costs in generated paths will be zero.");
5556
po.Register("srand", &srand_seed, "Seed for random number generator "
5657
"(only relevant if --random=true)");
5758

0 commit comments

Comments
 (0)