Skip to content

Commit 9a6e1c0

Browse files
authored
[egs,scripts] For scoring analysis scripts, guess if it's utf-8 compatible stream or not (#1903)
Relates to the scripts providing detailed error analysis after scoring.
1 parent 13e3a0b commit 9a6e1c0

File tree

4 files changed

+153
-81
lines changed

4 files changed

+153
-81
lines changed

egs/wsj/s5/utils/scoring/wer_ops_details.pl

Lines changed: 50 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -24,23 +24,58 @@
2424

2525
use strict;
2626
use warnings;
27-
use utf8;
28-
#use List::Util qw[max];
29-
use Data::Dumper;
3027
use Getopt::Long;
3128
use Pod::Usage;
3229

3330

34-
binmode STDIN, ":utf8";
35-
binmode STDOUT, ":utf8";
36-
3731
my $help;
3832
my $special_symbol= "<eps>";
3933
my $separator=";";
4034
my $extra_size=4;
4135
my $max_size=16;
4236

37+
# this function reads the opened file (supplied as a first
38+
# parameter) into an array of lines. For each
39+
# line, it tests whether it's a valid utf-8 compatible
40+
# line. If all lines are valid utf-8, it returns the lines
41+
# decoded as utf-8, otherwise it assumes the file's encoding
42+
# is one of those 1-byte encodings, such as ISO-8859-x
43+
# or Windows CP-X.
44+
# Please recall we do not really care about
45+
# the actual encoding, we just need to
46+
# make sure the length of the (decoded) string
47+
# is correct (to make the output formatting look right).
48+
sub get_utf8_or_bytestream {
  # Reads all lines from the already-opened filehandle given as the first
  # argument and returns a list whose first element is a flag:
  #   (1, @decoded_lines)  if every line is valid UTF-8 (lines are decoded),
  #   (0, @raw_lines)      otherwise (lines are returned untouched, as bytes).
  # A one-line note about the chosen mode is printed to STDERR.
  use Encode qw(decode encode);
  my $file = shift;
  my $is_utf_compatible = 1;
  my @unicode_lines;
  my @raw_lines;

  # "while (my $x = <$fh>)" carries an implicit defined() test, so a final
  # line consisting of just "0" (no trailing newline) is not dropped.
  while (my $raw_text = <$file>) {
    # Store the raw bytes before decoding so the byte-stream fallback always
    # sees the original, unmodified input.
    push @raw_lines, $raw_text;
    next unless $is_utf_compatible;

    # When a CHECK value is given, decode() overwrites its input in place
    # unless LEAVE_SRC is set; keep the source intact.
    my $decoded_text = eval {
      decode("UTF-8", $raw_text, Encode::FB_CROAK() | Encode::LEAVE_SRC())
    };
    if (defined $decoded_text) {
      push @unicode_lines, $decoded_text;
    } else {
      # One malformed line is enough: stop decoding, keep collecting bytes.
      $is_utf_compatible = 0;
    }
  }

  if (!$is_utf_compatible) {
    print STDERR "$0: Note: handling as byte stream\n";
    return (0, @raw_lines);
  }

  print STDERR "$0: Note: handling as utf-8 text\n";
  return (1, @unicode_lines);
}
4479
sub print_line {
4580
my $op = $_[0];
4681
my $rewf = $_[1];
@@ -64,9 +99,15 @@ sub max {
6499

65100
my %EDIT_OPS;
66101
my %UTT;
67-
while (<STDIN>) {
68-
chomp;
69-
my @entries = split(" ", $_);
102+
(my $is_utf8, my @text) = get_utf8_or_bytestream(\*STDIN);
103+
if ($is_utf8) {
104+
binmode(STDOUT, ":utf8");
105+
}
106+
107+
while (@text) {
108+
my $line = shift @text;
109+
chomp $line;
110+
my @entries = split(" ", $line);
70111
next if @entries < 2;
71112
next if ($entries[1] ne "hyp") and ($entries[1] ne "ref") ;
72113
if (scalar @entries <= 2 ) {

egs/wsj/s5/utils/scoring/wer_per_spk_details.pl

Lines changed: 52 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -24,18 +24,13 @@
2424

2525
use strict;
2626
use warnings;
27-
use utf8;
2827
use List::Util qw[max];
2928
use Getopt::Long;
3029
use Pod::Usage;
31-
use open qw(:std :encoding(UTF-8));
3230

3331

3432
#use Data::Dumper;
3533

36-
binmode STDIN, ":utf8";
37-
binmode STDOUT, ":utf8";
38-
3934
my $WIDTH=10;
4035
my $SPK_WIDTH=15;
4136
my $help;
@@ -50,6 +45,49 @@
5045
my %UTTMAP;
5146
my %PERSPK_STATS;
5247

48+
# this function reads the opened file (supplied as a first
49+
# parameter) into an array of lines. For each
50+
# line, it tests whether it's a valid utf-8 compatible
51+
# line. If all lines are valid utf-8, it returns the lines
52+
# decoded as utf-8, otherwise it assumes the file's encoding
53+
# is one of those 1-byte encodings, such as ISO-8859-x
54+
# or Windows CP-X.
55+
# Please recall we do not really care about
56+
# the actual encoding, we just need to
57+
# make sure the length of the (decoded) string
58+
# is correct (to make the output formatting look right).
59+
sub get_utf8_or_bytestream {
  # Reads all lines from the already-opened filehandle given as the first
  # argument and returns a list whose first element is a flag:
  #   (1, @decoded_lines)  if every line is valid UTF-8 (lines are decoded),
  #   (0, @raw_lines)      otherwise (lines are returned untouched, as bytes).
  # A one-line note about the chosen mode is printed to STDERR.
  use Encode qw(decode encode);
  my $file = shift;
  my $is_utf_compatible = 1;
  my @unicode_lines;
  my @raw_lines;

  # "while (my $x = <$fh>)" carries an implicit defined() test, so a final
  # line consisting of just "0" (no trailing newline) is not dropped.
  while (my $raw_text = <$file>) {
    # Store the raw bytes before decoding so the byte-stream fallback always
    # sees the original, unmodified input.
    push @raw_lines, $raw_text;
    next unless $is_utf_compatible;

    # When a CHECK value is given, decode() overwrites its input in place
    # unless LEAVE_SRC is set; keep the source intact.
    my $decoded_text = eval {
      decode("UTF-8", $raw_text, Encode::FB_CROAK() | Encode::LEAVE_SRC())
    };
    if (defined $decoded_text) {
      push @unicode_lines, $decoded_text;
    } else {
      # One malformed line is enough: stop decoding, keep collecting bytes.
      $is_utf_compatible = 0;
    }
  }

  if (!$is_utf_compatible) {
    print STDERR "$0: Note: handling as byte stream\n";
    return (0, @raw_lines);
  }

  print STDERR "$0: Note: handling as utf-8 text\n";
  return (1, @unicode_lines);
}
90+
5391
sub print_header {
5492

5593
my $f="%${WIDTH}s";
@@ -102,9 +140,15 @@ sub format_sys {
102140
}
103141
close(UTT2SPK);
104142

105-
while (<STDIN>) {
106-
chomp;
107-
my @entries = split(" ", $_);
143+
(my $is_utf8, my @text) = get_utf8_or_bytestream(\*STDIN);
144+
if ($is_utf8) {
145+
binmode(STDOUT, ":utf8");
146+
}
147+
148+
while (@text) {
149+
my $line = shift @text;
150+
chomp $line;
151+
my @entries = split(" ", $line);
108152
next if @entries < 2;
109153
next if $entries[1] ne "#csid" ;
110154
die "Incompatible entry $_ " if @entries != 6;

egs/wsj/s5/utils/scoring/wer_per_utt_details.pl

Lines changed: 50 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -24,17 +24,13 @@
2424
#
2525
use strict;
2626
use warnings;
27-
use utf8;
2827
use List::Util qw[max];
2928
use Getopt::Long;
3029
use Pod::Usage;
3130

3231

3332
#use Data::Dumper;
3433

35-
binmode STDIN, ":utf8";
36-
binmode STDOUT, ":utf8";
37-
3834
my $special_symbol= "<eps>";
3935
my $separator=";";
4036
my $output_hyp = 1;
@@ -72,9 +68,56 @@ sub cjustify {
7268
return sprintf("%s%s%s", " " x $left_spaces, $str, " " x $right_spaces);
7369
}
7470

75-
while (<STDIN>) {
76-
chomp;
77-
(my $utt_id, my $alignment) = split (" ", $_, 2);
71+
# this function reads the opened file (supplied as a first
72+
# parameter) into an array of lines. For each
73+
# line, it tests whether it's a valid utf-8 compatible
74+
# line. If all lines are valid utf-8, it returns the lines
75+
# decoded as utf-8, otherwise it assumes the file's encoding
76+
# is one of those 1-byte encodings, such as ISO-8859-x
77+
# or Windows CP-X.
78+
# Please recall we do not really care about
79+
# the actual encoding, we just need to
80+
# make sure the length of the (decoded) string
81+
# is correct (to make the output formatting look right).
82+
sub get_utf8_or_bytestream {
  # Reads all lines from the already-opened filehandle given as the first
  # argument and returns a list whose first element is a flag:
  #   (1, @decoded_lines)  if every line is valid UTF-8 (lines are decoded),
  #   (0, @raw_lines)      otherwise (lines are returned untouched, as bytes).
  # A one-line note about the chosen mode is printed to STDERR.
  use Encode qw(decode encode);
  my $file = shift;
  my $is_utf_compatible = 1;
  my @unicode_lines;
  my @raw_lines;

  # "while (my $x = <$fh>)" carries an implicit defined() test, so a final
  # line consisting of just "0" (no trailing newline) is not dropped.
  while (my $raw_text = <$file>) {
    # Store the raw bytes before decoding so the byte-stream fallback always
    # sees the original, unmodified input.
    push @raw_lines, $raw_text;
    next unless $is_utf_compatible;

    # When a CHECK value is given, decode() overwrites its input in place
    # unless LEAVE_SRC is set; keep the source intact.
    my $decoded_text = eval {
      decode("UTF-8", $raw_text, Encode::FB_CROAK() | Encode::LEAVE_SRC())
    };
    if (defined $decoded_text) {
      push @unicode_lines, $decoded_text;
    } else {
      # One malformed line is enough: stop decoding, keep collecting bytes.
      $is_utf_compatible = 0;
    }
  }

  if (!$is_utf_compatible) {
    print STDERR "$0: Note: handling as byte stream\n";
    return (0, @raw_lines);
  }

  print STDERR "$0: Note: handling as utf-8 text\n";
  return (1, @unicode_lines);
}
111+
112+
(my $is_utf8, my @text) = get_utf8_or_bytestream(\*STDIN);
113+
if ($is_utf8) {
114+
binmode(STDOUT, ":utf8");
115+
}
116+
117+
while (@text) {
118+
my $line = shift @text;
119+
chomp $line;
120+
(my $utt_id, my $alignment) = split (" ", $line, 2);
78121
my @alignment_pairs = split(" ", $alignment); #splits on spaces, does not create empty fields
79122

80123
my @HYP;

egs/yesno/s5/local/score.sh

Lines changed: 0 additions & 57 deletions
This file was deleted.

egs/yesno/s5/local/score.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
../steps/score_kaldi.sh

0 commit comments

Comments
 (0)