kaldi-asr
diff --git a/‎egs/wsj/s5/utils/scoring/wer_ops_details.pl‎
Lines changed: 50 additions & 9 deletions b/‎egs/wsj/s5/utils/scoring/wer_ops_details.pl‎
Lines changed: 50 additions & 9 deletions
diff --git a/‎egs/wsj/s5/utils/scoring/wer_per_spk_details.pl‎
Lines changed: 52 additions & 8 deletions b/‎egs/wsj/s5/utils/scoring/wer_per_spk_details.pl‎
Lines changed: 52 additions & 8 deletions
diff --git a/‎egs/wsj/s5/utils/scoring/wer_per_utt_details.pl‎
Lines changed: 50 additions & 7 deletions b/‎egs/wsj/s5/utils/scoring/wer_per_utt_details.pl‎
Lines changed: 50 additions & 7 deletions
diff --git a/‎egs/yesno/s5/local/score.sh‎
Lines changed: 0 additions & 57 deletions b/‎egs/yesno/s5/local/score.sh‎
Lines changed: 0 additions & 57 deletions
diff --git a/‎egs/yesno/s5/local/score.sh‎
Lines changed: 1 addition & 0 deletions b/‎egs/yesno/s5/local/score.sh‎
Lines changed: 1 addition & 0 deletions
@@ -24,23 +24,58 @@
 
 use strict;
 use warnings;
-use utf8;
-#use List::Util qw[max];
-use Data::Dumper;
 use Getopt::Long;
 use Pod::Usage;
 
 
-binmode STDIN, ":utf8";
-binmode STDOUT, ":utf8";
-
 my $help;
 my $special_symbol= "<eps>";
 my $separator=";";
 my $extra_size=4;
 my $max_size=16;
 
+# Reads all lines from the opened filehandle passed as the first
+# argument. If every line is valid UTF-8, returns (1, @lines) with
+# the lines decoded to characters; otherwise the input is assumed to
+# be in some single-byte encoding (ISO-8859-x, Windows CP-x, ...) and
+# (0, @lines) is returned with the raw bytes untouched.
+# We do not care about the actual encoding; we only need string
+# lengths to be right so that the output formatting lines up.
+# A note telling which mode was chosen is printed to STDERR.
+sub get_utf8_or_bytestream {
+  use Encode qw(decode encode);
+  my $file = shift;
+  my $is_utf_compatible = 1;
+  my @unicode_lines;
+  my @raw_lines;
+
+  # Test definedness rather than truth so that a final line "0" with
+  # no trailing newline is not mistaken for end of input.
+  while (defined(my $raw_text = <$file>)) {
+    push @raw_lines, $raw_text;
+    next unless $is_utf_compatible;
+
+    # LEAVE_SRC matters: given a CHECK value without it, decode()
+    # overwrites its input in place, which would blank the raw copy
+    # of every line that decoded successfully.
+    my $decoded_text = eval {
+      decode("UTF-8", $raw_text, Encode::FB_CROAK | Encode::LEAVE_SRC);
+    };
+    if (defined $decoded_text) {
+      push @unicode_lines, $decoded_text;
+    } else {
+      $is_utf_compatible = 0;  # one bad line makes it a byte stream
+    }
+  }
+
+  if ($is_utf_compatible) {
+    print STDERR "$0: Note: handling as utf-8 text\n";
+    return (1, @unicode_lines);
+  }
+
+  print STDERR "$0: Note: handling as byte stream\n";
+  return (0, @raw_lines);
+}
 sub print_line {
  my $op = $_[0];
  my $rewf = $_[1];
@@ -64,9 +99,15 @@ sub max {
 
 my %EDIT_OPS;
 my %UTT;
-while (<STDIN>) {
- chomp;
- my @entries = split(" ", $_);
+# Slurp STDIN; $is_utf8 says whether all of it decoded as UTF-8.
+my ($is_utf8, @text) = get_utf8_or_bytestream(\*STDIN);
+# Emit UTF-8 on STDOUT only when the input was decoded to characters.
+binmode(STDOUT, ":utf8") if $is_utf8;
+
+while (@text) {
+ my $line = shift @text;
+ chomp $line;
+ my @entries = split(" ", $line);
  next if @entries < 2;
  next if ($entries[1] ne "hyp") and ($entries[1] ne "ref") ;
  if (scalar @entries <= 2 ) {
 
@@ -24,18 +24,13 @@
 
 use strict;
 use warnings;
-use utf8;
 use List::Util qw[max];
 use Getopt::Long;
 use Pod::Usage;
-use open qw(:std :encoding(UTF-8));
 
 
 #use Data::Dumper;
 
-binmode STDIN, ":utf8";
-binmode STDOUT, ":utf8";
-
 my $WIDTH=10;
 my $SPK_WIDTH=15;
 my $help;
@@ -50,6 +45,49 @@
 my %UTTMAP;
 my %PERSPK_STATS;
 
+# Reads all lines from the opened filehandle passed as the first
+# argument. If every line is valid UTF-8, returns (1, @lines) with
+# the lines decoded to characters; otherwise the input is assumed to
+# be in some single-byte encoding (ISO-8859-x, Windows CP-x, ...) and
+# (0, @lines) is returned with the raw bytes untouched.
+# We do not care about the actual encoding; we only need string
+# lengths to be right so that the output formatting lines up.
+# A note telling which mode was chosen is printed to STDERR.
+sub get_utf8_or_bytestream {
+  use Encode qw(decode encode);
+  my $file = shift;
+  my $is_utf_compatible = 1;
+  my @unicode_lines;
+  my @raw_lines;
+
+  # Test definedness rather than truth so that a final line "0" with
+  # no trailing newline is not mistaken for end of input.
+  while (defined(my $raw_text = <$file>)) {
+    push @raw_lines, $raw_text;
+    next unless $is_utf_compatible;
+
+    # LEAVE_SRC matters: given a CHECK value without it, decode()
+    # overwrites its input in place, which would blank the raw copy
+    # of every line that decoded successfully.
+    my $decoded_text = eval {
+      decode("UTF-8", $raw_text, Encode::FB_CROAK | Encode::LEAVE_SRC);
+    };
+    if (defined $decoded_text) {
+      push @unicode_lines, $decoded_text;
+    } else {
+      $is_utf_compatible = 0;  # one bad line makes it a byte stream
+    }
+  }
+
+  if ($is_utf_compatible) {
+    print STDERR "$0: Note: handling as utf-8 text\n";
+    return (1, @unicode_lines);
+  }
+
+  print STDERR "$0: Note: handling as byte stream\n";
+  return (0, @raw_lines);
+}
+
 sub print_header {
 
  my $f="%${WIDTH}s";
@@ -102,9 +140,15 @@ sub format_sys {
 }
 close(UTT2SPK);
 
-while (<STDIN>) {
- chomp;
- my @entries = split(" ", $_);
+# Slurp STDIN; $is_utf8 says whether all of it decoded as UTF-8.
+my ($is_utf8, @text) = get_utf8_or_bytestream(\*STDIN);
+# Emit UTF-8 on STDOUT only when the input was decoded to characters.
+binmode(STDOUT, ":utf8") if $is_utf8;
+
+while (@text) {
+ my $line = shift @text;
+ chomp $line;
+ my @entries = split(" ", $line);
  next if @entries < 2;
  next if $entries[1] ne "#csid" ; 
  die "Incompatible entry $_ " if @entries != 6;
 
@@ -24,17 +24,13 @@
 #
 use strict;
 use warnings;
-use utf8;
 use List::Util qw[max];
 use Getopt::Long;
 use Pod::Usage;
 
 
 #use Data::Dumper;
 
-binmode STDIN, ":utf8";
-binmode STDOUT, ":utf8";
-
 my $special_symbol= "<eps>";
 my $separator=";";
 my $output_hyp = 1;
@@ -72,9 +68,56 @@ sub cjustify {
  return sprintf("%s%s%s", " " x $left_spaces, $str, " " x $right_spaces);
 }
 
-while (<STDIN>) {
- chomp;
- (my $utt_id, my $alignment) = split (" ", $_, 2);
+# Reads all lines from the opened filehandle passed as the first
+# argument. If every line is valid UTF-8, returns (1, @lines) with
+# the lines decoded to characters; otherwise the input is assumed to
+# be in some single-byte encoding (ISO-8859-x, Windows CP-x, ...) and
+# (0, @lines) is returned with the raw bytes untouched.
+# We do not care about the actual encoding; we only need string
+# lengths to be right so that the output formatting lines up.
+sub get_utf8_or_bytestream {
+  use Encode qw(decode encode);
+  my $file = shift;
+  my $is_utf_compatible = 1;
+  my @unicode_lines;
+  my @raw_lines;
+
+  # Test definedness rather than truth so that a final line "0" with
+  # no trailing newline is not mistaken for end of input.
+  while (defined(my $raw_text = <$file>)) {
+    push @raw_lines, $raw_text;
+    next unless $is_utf_compatible;
+
+    # LEAVE_SRC matters: given a CHECK value without it, decode()
+    # overwrites its input in place, which would blank the raw copy
+    # of every line that decoded successfully.
+    my $decoded_text = eval {
+      decode("UTF-8", $raw_text, Encode::FB_CROAK | Encode::LEAVE_SRC);
+    };
+    if (defined $decoded_text) {
+      push @unicode_lines, $decoded_text;
+    } else {
+      $is_utf_compatible = 0;  # one bad line makes it a byte stream
+    }
+  }
+
+  if ($is_utf_compatible) {
+    print STDERR "$0: Note: handling as utf-8 text\n";
+    return (1, @unicode_lines);
+  }
+  print STDERR "$0: Note: handling as byte stream\n";
+  return (0, @raw_lines);
+}
+
+# Slurp STDIN; $is_utf8 says whether all of it decoded as UTF-8.
+my ($is_utf8, @text) = get_utf8_or_bytestream(\*STDIN);
+# Emit UTF-8 on STDOUT only when the input was decoded to characters.
+binmode(STDOUT, ":utf8") if $is_utf8;
+
+while (@text) {
+ my $line = shift @text;
+ chomp $line;
+ (my $utt_id, my $alignment) = split (" ", $line, 2);
  my @alignment_pairs = split(" ", $alignment); #splits on spaces, does not create empty fields
 
  my @HYP;
 
@@ -0,0 +1 @@
+../steps/score_kaldi.sh