Skip to content

Commit d218412

Browse files
jtrmal authored and danpovey committed
[egs] clean up the HKUST scripts and add scoring filters (#1436)
Some fixes to scoring (e.g. don't split English words into characters, only Chinese ones). Modify scoring to produce CER and WER numbers.
1 parent 8458587 commit d218412

File tree

6 files changed

+74
-8
lines changed

6 files changed

+74
-8
lines changed
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
#!/usr/bin/env perl
# Copyright 2012-2014 Johns Hopkins University (Author: Yenda Trmal)
# Apache 2.0

# Text filter: reads lines of the form "<first-field> <token> <token> ..."
# from the files named on the command line (or STDIN) and writes them back
# with a space inserted in front of every CJK unified ideograph, so that each
# such character becomes its own whitespace-separated token.  Characters
# outside that Unicode block (e.g. Latin letters) are left joined together.
# Tokens containing "[...]", "<...>" or "!SIL" are passed through unsplit.
use strict;
use warnings;
use utf8;

use open qw(:encoding(utf8));
binmode STDIN, ":utf8";
binmode STDOUT, ":utf8";
binmode STDERR, ":utf8";

# Returns the output form of a single token:
#   - special tokens (containing [...], <...> or !SIL) come back unchanged,
#     prefixed with one space;
#   - any other token comes back with a space inserted before each character
#     in the CJK Unified Ideographs block; all other characters are untouched.
sub split_cjk_chars {
    my ($token) = @_;
    if ($token =~ /\[.*\]/ || $token =~ /\<.*\>/ || $token =~ /!SIL/) {
        return " $token";
    }
    (my $spaced = $token) =~ s/(\p{InCJK_Unified_Ideographs})/ $1/g;
    return $spaced;
}

while (my $line = <>) {
    my @fields = split " ", $line;
    my $first = shift @fields;
    # A blank input line has no first field; emit an empty one (as before)
    # instead of tripping an "uninitialized value" warning.
    $first = "" unless defined $first;
    my $out = "$first ";
    # Every token is followed by a single space, matching the historical
    # output format byte for byte.
    $out .= split_cjk_chars($_) . " " for @fields;
    print "$out\n";
}
31+
32+

egs/hkust/s5/local/hkust_data_prep.sh

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -104,8 +104,8 @@ awk '{ segment=$1; split(segment,S,"-"); side=S[2]; audioname=S[1];startf=S[3];e
104104
print segment " " audioname "-" side " " startf/100 " " endf/100}' <$dev_dir/text > $dev_dir/segments
105105
awk '{name = $0; gsub(".sph$","",name); gsub(".*/","",name); print(name " " $0)}' $dev_dir/sph.flist > $dev_dir/sph.scp
106106

107-
sph2pipe=`cd ../../..; echo $PWD/tools/sph2pipe_v2.5/sph2pipe`
108-
[ ! -f $sph2pipe ] && echo "Could not find the sph2pipe program at $sph2pipe" && exit 1;
107+
sph2pipe=`which sph2pipe` || sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe
108+
[ ! -x $sph2pipe ] && echo "Could not find the sph2pipe program at $sph2pipe" && exit 1;
109109

110110
cat $train_dir/sph.scp | awk -v sph2pipe=$sph2pipe '{printf("%s-A %s -f wav -p -c 1 %s |\n", $1, sph2pipe, $2);
111111
printf("%s-B %s -f wav -p -c 2 %s |\n", $1, sph2pipe, $2);}' | \
@@ -136,5 +136,4 @@ cat $dev_dir/segments | awk '{spk=substr($1,1,33); print $1 " " spk}' > $dev_dir
136136
cat $dev_dir/utt2spk | sort -k 2 | utils/utt2spk_to_spk2utt.pl > $dev_dir/spk2utt || exit 1;
137137

138138
echo "$0: HKUST data preparation succeeded"
139-
140-
exit;
139+
exit 0

egs/hkust/s5/local/hkust_prepare_dict.sh

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -312,5 +312,4 @@ cat $dict_dir/nonsilence_phones.txt | perl -e 'while(<>){ foreach $p (split(" ",
312312
cat - $dict_dir/lexicon1.txt > $dict_dir/lexicon.txt || exit 1;
313313

314314
echo "$0: HKUST dict preparation succeeded"
315-
316-
exit;
315+
exit 0;

egs/hkust/s5/local/hkust_train_lms.sh

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,9 +19,13 @@ done
1919
dir=data/local/lm
2020
mkdir -p $dir
2121

22+
export LC_ALL=C # You'll get errors about things being not sorted, if you
23+
# have a different locale.
2224
kaldi_lm=`which train_lm.sh`
2325
# Quote the path: when `which` finds nothing, $kaldi_lm is empty and an
# unquoted test collapses to `[ ! -x ]`, which is false, so the error branch
# below would never run.
if [ ! -x "$kaldi_lm" ]; then
24-
echo "train_lm.sh is not found. Checkout tools/extra/install_kaldi_lm.sh"
26+
echo "$0: train_lm.sh is not found. That might mean it's not installed"
27+
echo "$0: or it is not added to PATH"
28+
echo "$0: Use the script tools/extra/install_kaldi_lm.sh to install it"
2529
exit 1
2630
fi
2731

egs/hkust/s5/local/score.sh

Lines changed: 0 additions & 1 deletion
This file was deleted.

egs/hkust/s5/local/score.sh

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
#!/bin/bash

# Runs steps/score_kaldi.sh and then steps/score_kaldi_cer.sh (from stage 2)
# with the same command-line arguments.
# NOTE(review): per the commit message this is meant to yield both WER and
# CER numbers -- confirm against the two steps/ scripts.

# Abort on the first failing command or pipeline stage, and trace commands.
set -o errexit -o pipefail -o xtrace

steps/score_kaldi.sh "$@"
steps/score_kaldi_cer.sh --stage 2 "$@"

echo "$0: Done"
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
#!/usr/bin/env perl
# Copyright 2012-2014 Johns Hopkins University (Author: Yenda Trmal)
# Apache 2.0

# Text filter: reads lines of the form "<first-field> <token> <token> ..."
# from the files named on the command line (or STDIN) and writes them back
# with special tokens removed.  A token is special when it contains "[...]",
# "<...>" or "!SIL".  The space that followed a removed token is still
# written, so the output may contain consecutive spaces.
use strict;
use warnings;
use utf8;

use open qw(:encoding(utf8));
binmode STDIN, ":utf8";
binmode STDOUT, ":utf8";
binmode STDERR, ":utf8";

# Returns the empty string for a special token (one containing [...], <...>
# or !SIL) and the token itself, unchanged, otherwise.
sub blank_special_token {
    my ($token) = @_;
    return "" if $token =~ /\[.*\]/ || $token =~ /\<.*\>/ || $token =~ /!SIL/;
    return $token;
}

while (my $line = <>) {
    my @fields = split " ", $line;
    my $first = shift @fields;
    # A blank input line has no first field; emit an empty one (as before)
    # instead of tripping an "uninitialized value" warning.
    $first = "" unless defined $first;
    my $out = "$first ";
    # Every token slot, kept or blanked, is followed by a single space so the
    # output matches the historical format byte for byte.
    $out .= blank_special_token($_) . " " for @fields;
    print "$out\n";
}
24+
25+

0 commit comments

Comments
 (0)