|
8 | 8 | use Getopt::Long; |
9 | 9 |
|
10 | 10 | my $Usage = <<EOU; |
11 | | -# This is a simple script to set/scale the unigram prob of the OOV dict entry in an ARPA lm file. |
| 11 | +# This is a simple script to set/scale the prob of n-grams where the OOV dict entry is the predicted word, in an ARPA lm file. |
12 | 12 | Usage: utils/lang/adjust_unk_arpa.pl [options] <oov-dict-entry> <unk-scale> <input-arpa >output-arpa |
13 | 13 |
|
14 | 14 | Allowed options: |
15 | 15 | --fixed-value (true|false) : If true, interpret the unk-scale as a fixed value we'll set to |
16 | 16 | the unigram prob of the OOV dict entry, rather than using it to |
17 | | - scale the unigram prob. |
| 17 | + scale the probs. In this case higher order n-grams containing |
| 18 | + the OOV dict entry remain untouched. This is useful when the OOV |
| 19 | + dict entry doesn't appear in n-grams (n>1) as the predicted word. |
18 | 20 | EOU |
19 | 21 |
|
20 | 22 | my $fixed_value = "false"; |
|
37 | 39 | if ( $fixed_value eq "true" ) { |
38 | 40 | print STDERR "$0: Setting the unigram prob of $unk_word in LM file as $unk_scale.\n"; |
39 | 41 | } else { |
40 | | - print STDERR "$0: Scaling the unigram prob of $unk_word in LM file by $unk_scale.\n"; |
| 42 | + print STDERR "$0: Scaling the probs of ngrams where $unk_word is the predicted word in LM file by $unk_scale.\n"; |
41 | 43 | } |
42 | 44 |
|
43 | | -my $unigram = 0; # wether we are visiting the unigram field or not. |
| 45 | +my $ngram = 0; # the order of ngram we are visiting |
44 | 46 |
|
45 | 47 | # Adjust the prob of each n-gram (orders 1-5) whose predicted (last) word is the unk-word in the ARPA LM. |
46 | 48 | while(<STDIN>) { |
47 | | - if (m/^\\1-grams:$/) { $unigram = 1; } |
48 | | - if (m/^\\2-grams:$/) { $unigram = 0; } |
| 49 | + if (m/^\\1-grams:$/) { $ngram = 1; } |
| 50 | + if (m/^\\2-grams:$/) { $ngram = 2; } |
| 51 | + if (m/^\\3-grams:$/) { $ngram = 3; } |
| 52 | + if (m/^\\4-grams:$/) { $ngram = 4; } |
| 53 | + if (m/^\\5-grams:$/) { $ngram = 5; } |
49 | 54 | my @col = split(" ", $_); |
50 | | - if ( $unigram == 1 && @col > 1 && $col[1] eq $unk_word ) { |
51 | | - if ( $fixed_value eq "true" ) { |
| 55 | + if ( @col > 1 && $ngram > 0 && $col[$ngram] eq $unk_word ) { |
| 56 | + if ( $fixed_value eq "true" && $ngram == 1 ) { |
52 | 57 | $col[0] = (log($unk_scale) / log(10.0)); |
53 | 58 | } else { |
54 | 59 | $col[0] += (log($unk_scale) / log(10.0)); |
|
0 commit comments