kaldi-asr
diff --git a/‎egs/hkust/s5/local/character_tokenizer‎
Lines changed: 32 additions & 0 deletions b/‎egs/hkust/s5/local/character_tokenizer‎
Lines changed: 32 additions & 0 deletions
diff --git a/‎egs/hkust/s5/local/hkust_data_prep.sh‎
Lines changed: 3 additions & 4 deletions b/‎egs/hkust/s5/local/hkust_data_prep.sh‎
Lines changed: 3 additions & 4 deletions
diff --git a/‎egs/hkust/s5/local/hkust_prepare_dict.sh‎
Lines changed: 1 addition & 2 deletions b/‎egs/hkust/s5/local/hkust_prepare_dict.sh‎
Lines changed: 1 addition & 2 deletions
diff --git a/‎egs/hkust/s5/local/hkust_train_lms.sh‎
Lines changed: 5 additions & 1 deletion b/‎egs/hkust/s5/local/hkust_train_lms.sh‎
Lines changed: 5 additions & 1 deletion
diff --git a/‎egs/hkust/s5/local/score.sh‎
Lines changed: 0 additions & 1 deletion b/‎egs/hkust/s5/local/score.sh‎
Lines changed: 0 additions & 1 deletion
diff --git a/‎egs/hkust/s5/local/score.sh‎
Lines changed: 8 additions & 0 deletions b/‎egs/hkust/s5/local/score.sh‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎egs/hkust/s5/local/wer_output_filter‎
Lines changed: 25 additions & 0 deletions b/‎egs/hkust/s5/local/wer_output_filter‎
Lines changed: 25 additions & 0 deletions
@@ -0,0 +1,32 @@
+#!/usr/bin/env perl
+# Copyright 2012-2014 Johns Hopkins University (Author: Yenda Trmal)
+# Apache 2.0
+#
+# Copies the first field of each input line unchanged and puts a space before
+# every CJK Unified Ideograph of the later tokens; [..], <..>, !SIL stay whole.
+use strict;
+use warnings;
+use utf8;
+
+use open qw(:encoding(utf8));
+binmode STDIN, ":utf8";
+binmode STDOUT, ":utf8";
+binmode STDERR, ":utf8";
+
+while (my $line = <>) {
+  my ($utt, @words) = split " ", $line;
+  $utt = "" unless defined $utt;  # blank line: print only the separator, no undef warning
+  print $utt . " ";
+  foreach my $word (@words) {
+    if ($word =~ /\[.*\]/ || $word =~ /\<.*\>/ || $word =~ /!SIL/) {
+      print " $word";             # special token: never split into characters
+    } else {
+      # Same output as a per-character loop: ideographs gain a leading space.
+      $word =~ s/(\p{InCJK_Unified_Ideographs})/ $1/g;
+      print $word;
+    }
+    print " ";
+  }
+  print "\n";
+}
+
@@ -104,8 +104,8 @@ awk '{ segment=$1; split(segment,S,"-"); side=S[2]; audioname=S[1];startf=S[3];e
  print segment " " audioname "-" side " " startf/100 " " endf/100}' <$dev_dir/text > $dev_dir/segments
 awk '{name = $0; gsub(".sph$","",name); gsub(".*/","",name); print(name " " $0)}' $dev_dir/sph.flist > $dev_dir/sph.scp
 
-sph2pipe=`cd ../../..; echo $PWD/tools/sph2pipe_v2.5/sph2pipe`
-[ ! -f $sph2pipe ] && echo "Could not find the sph2pipe program at $sph2pipe" && exit 1;
+sph2pipe=`which sph2pipe` || sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe  # prefer the one on PATH, else the copy under $KALDI_ROOT
+[ ! -x "$sph2pipe" ] && echo "Could not find the sph2pipe program at $sph2pipe" && exit 1;  # quoted so an empty or space-containing value is tested as one word
 
 cat $train_dir/sph.scp | awk -v sph2pipe=$sph2pipe '{printf("%s-A %s -f wav -p -c 1 %s |\n", $1, sph2pipe, $2);
  printf("%s-B %s -f wav -p -c 2 %s |\n", $1, sph2pipe, $2);}' | \
@@ -136,5 +136,4 @@ cat $dev_dir/segments | awk '{spk=substr($1,1,33); print $1 " " spk}' > $dev_dir
 cat $dev_dir/utt2spk | sort -k 2 | utils/utt2spk_to_spk2utt.pl > $dev_dir/spk2utt || exit 1;
 
 echo "$0: HKUST data preparation succeeded"
-
-exit;
+exit 0
@@ -312,5 +312,4 @@ cat $dict_dir/nonsilence_phones.txt | perl -e 'while(<>){ foreach $p (split(" ",
  cat - $dict_dir/lexicon1.txt > $dict_dir/lexicon.txt || exit 1;
 
 echo "$0: HKUST dict preparation succeeded"
-
-exit;
+exit 0;
@@ -19,9 +19,13 @@ done
 dir=data/local/lm
 mkdir -p $dir
 
+export LC_ALL=C # You'll get errors about things being not sorted, if you
+ # have a different locale.
 kaldi_lm=`which train_lm.sh`
 if [ ! -x $kaldi_lm ]; then
- echo "train_lm.sh is not found. Checkout tools/extra/install_kaldi_lm.sh"
+ echo "$0: train_lm.sh is not found. That might mean it's not installed"
+ echo "$0: or it is not added to PATH"
+ echo "$0: Use the script tools/extra/install_kaldi_lm.sh to install it"
  exit 1
 fi
 
 
@@ -0,0 +1,8 @@
+#!/bin/bash
+# Runs steps/score_kaldi.sh, then steps/score_kaldi_cer.sh with --stage 2, on the same arguments.
+set -exo pipefail
+
+steps/score_kaldi.sh "$@"
+steps/score_kaldi_cer.sh --stage 2 "$@"
+
+echo "$0: Done"
@@ -0,0 +1,25 @@
+#!/usr/bin/env perl
+# Copyright 2012-2014 Johns Hopkins University (Author: Yenda Trmal)
+# Apache 2.0
+# Copies the first field of each input line and blanks out every later token
+# that contains [..], <..> or !SIL; all other tokens are printed unchanged.
+use strict;
+use warnings;
+use utf8;
+
+use open qw(:encoding(utf8));
+binmode STDIN, ":utf8";
+binmode STDOUT, ":utf8";
+binmode STDERR, ":utf8";
+
+while (my $line = <>) {
+  my ($utt, @words) = split " ", $line;
+  $utt = "" unless defined $utt;  # blank line: print only the separator, no undef warning
+  print $utt . " ";
+  foreach my $word (@words) {
+    # A dropped token still emits its trailing space, so output spacing is unchanged.
+    my $drop = $word =~ /\[.*\]/ || $word =~ /\<.*\>/ || $word =~ /!SIL/;
+    print(($drop ? "" : $word) . " ");
+  }
+  print "\n";
+}