Skip to content

Commit 04b1f7d

Browse files
babakrkb and danpovey
authored and committed
[egs,scripts] Byte-pair encoding (BPE) applied MADCAT Arabic OCR (#2434)
1 parent c8db7a9 commit 04b1f7d

File tree

9 files changed

+693
-5
lines changed

9 files changed

+693
-5
lines changed

egs/madcat_ar/v1/local/prepare_dict.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ mkdir -p $dir
1212

1313
local/prepare_lexicon.py $dir
1414

15-
cut -d' ' -f2- $dir/lexicon.txt | tr ' ' '\n' | sort -u >$dir/nonsilence_phones.txt || exit 1;
15+
cut -d' ' -f2- $dir/lexicon.txt | sed 's/SIL//g' | tr ' ' '\n' | sort -u | sed '/^$/d' >$dir/nonsilence_phones.txt || exit 1;
1616

1717
echo '<sil> SIL' >> $dir/lexicon.txt
1818

egs/madcat_ar/v1/local/prepare_lexicon.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,8 @@
1919
line_vect = line.strip().split(' ')
2020
for i in range(1, len(line_vect)):
2121
characters = list(line_vect[i])
22-
characters = " ".join(characters)
22+
# Replace "|" with SIL: every "|" at the beginning of a word stands for the initial space of that word.
23+
characters = " ".join([ 'SIL' if char == '|' else char for char in characters])
2324
lex[line_vect[i]] = characters
2425
if line_vect[i] == '#':
2526
lex[line_vect[i]] = "<HASH>"
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Prepend '|' to every word of the transcript read from stdin and write the
# result to stdout.  The '|' marks the beginning of each word so that the
# initial space of every word can be found again after decoding.
#
# Requires Python 3: the byte streams are reached through sys.stdin.buffer /
# sys.stdout.buffer, which do not exist on Python 2.

import io
import sys


def prepend_word_marker(line):
    """Return `line` with '|' prefixed to each whitespace-separated word.

    Words are re-joined with single spaces; leading/trailing whitespace and
    the line terminator are dropped (an empty or blank line yields '').
    """
    return ' '.join('|' + word for word in line.split())


def main():
    """Filter stdin to stdout, one line at a time, forcing UTF-8 on both."""
    infile = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8')
    output = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
    for line in infile:
        output.write(prepend_word_marker(line) + '\n')
    # Flush explicitly instead of relying on the wrapper's finalizer at
    # interpreter shutdown.
    output.flush()


if __name__ == '__main__':
    main()
14+
15+

egs/madcat_ar/v1/local/reverse.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Reverse every run of Latin letters and digits (including words like MP3)
# in the text read from stdin, and write the result to stdout.  According to
# the recipe this puts those sequences in the order in which they appear in
# the images.

import io
import re
import sys

# A run starts and ends with a Latin letter or digit and may contain
# whitespace, '.' and ':' in between, so e.g. "12:30" or "abc def" is reversed
# as one unit.  A lone letter/digit is not matched (reversing it is a no-op).
LATIN_RUN = re.compile(r'[a-zA-Z0-9][a-zA-Z0-9\s\.\:]*[a-zA-Z0-9]')


def reverse_latin_runs(line):
    """Return `line` with each Latin/digit run reversed character by character.

    Everything outside the matched runs, including the line terminator, is
    left untouched.
    """
    return LATIN_RUN.sub(lambda m: m.group(0)[::-1], line)


def main():
    """Filter stdin to stdout, one line at a time, forcing UTF-8 on both."""
    in_stream = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8')
    out_stream = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
    for line in in_stream:
        out_stream.write(reverse_latin_runs(line))
    # Flush explicitly instead of relying on the wrapper's finalizer at
    # interpreter shutdown.
    out_stream.flush()


if __name__ == '__main__':
    main()
13+

egs/madcat_ar/v1/local/wer_output_filter

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ binmode STDERR, ":utf8";
1212
# Arabic-specific normalization
1313
while (<>) {
1414
@F = split " ";
15-
print $F[0] . " ";
15+
print $F[0];
1616
foreach $s (@F[1..$#F]) {
1717
$s =~ s/\x{0623}/\x{0627}/g;
1818
$s =~ s/\x{0625}/\x{0627}/g;
@@ -34,12 +34,12 @@ while (<>) {
3434
$s =~ s/\x{0621}//g;
3535
$s =~ s/[\x{064b}-\x{0655}]//g;
3636
$s =~ s/\x{0640}//g;
37+
$s =~ s/\|/ /g;
3738
if ($s ne "") {
3839
print "$s";
3940
} else {
4041
print "";
4142
}
42-
print " ";
4343
}
4444
print "\n";
4545
}

egs/madcat_ar/v1/run_end2end.sh

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -80,9 +80,19 @@ fi
8080

8181
if [ $stage -le 5 ]; then
8282
echo "$0: Preparing dictionary and lang..."
83+
cut -d' ' -f2- data/train/text | python3 local/reverse.py | python3 local/prepend_words.py | python3 utils/lang/bpe/learn_bpe.py -s 700 > data/train/bpe.out
84+
for set in test train dev ; do
85+
cut -d' ' -f1 data/$set/text > data/$set/ids
86+
cut -d' ' -f2- data/$set/text | python3 local/reverse.py | python3 local/prepend_words.py | python3 utils/lang/bpe/apply_bpe.py -c data/train/bpe.out | sed 's/@@//g' > data/$set/bpe_text
87+
mv data/$set/text data/$set/text.old
88+
paste -d' ' data/$set/ids data/$set/bpe_text > data/$set/text
89+
done
8390
local/prepare_dict.sh
84-
utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 8 --sil-prob 0.9999 \
91+
# This recipe uses byte-pair encoding, in which the silences are part of the words' pronunciations.
92+
# So we set --sil-prob to 0.0
93+
utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 8 --sil-prob 0.0 --position-dependent-phones false \
8594
data/local/dict "<sil>" data/lang/temp data/lang
95+
utils/lang/bpe/add_final_optional_silence.sh --final-sil-prob 0.5 data/lang
8696
fi
8797

8898
if [ $stage -le 6 ]; then
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
#!/bin/bash
# Add an optional final silence to the lexicon FSTs (L.fst and L_disambig.fst)
# of a lang directory.
#
# Usage: add_final_optional_silence.sh [--final-sil-prob <p>] <lang>
#
# A small two-state FST is compiled into <lang>/final_sil.fst and concatenated
# to the end of both lexicon FSTs.  The previous FSTs are kept as
# L.fst.orig / L_disambig.fst.orig, and the probability that was used is
# recorded in <lang>/phones/final_sil_prob.
#
# Requires ./path.sh and utils/parse_options.sh relative to the working
# directory, plus perl, fstcompile, fstconcat and fstarcsort on PATH.
. ./path.sh

final_sil_prob=0.5

echo "$0 $@"  # Print the command line for logging

. utils/parse_options.sh

if [ $# -ne 1 ]; then
  echo "Usage: $0 <lang>"
  echo " Add final optional silence to lexicon FSTs (L.fst and L_disambig.fst) in "
  echo " lang/ directory <lang>."
  echo " This can be useful in systems with byte-pair encoded (BPE) lexicons, in which"
  echo " the word-initial silence is part of the lexicon, so we turn off the standard"
  echo " optional silence in the lexicon"
  echo "options:"
  echo "   --final-sil-prob <final silence probability>   # default 0.5"
  exit 1;
fi

lang=$1

# A failure in any stage of the FST pipelines below must not go unnoticed.
set -o pipefail

# phones/final_sil_prob is the marker written at the end of a successful run.
# If it is newer than phones/nonsilence.txt, refuse to run again (presumably
# to avoid concatenating the final silence a second time).
if [ "$lang/phones/final_sil_prob" -nt "$lang/phones/nonsilence.txt" ]; then
  echo "$0 $lang/phones/final_sil_prob exists. Exiting..."
  exit 1;
fi

# Reject anything that is not a plain decimal number up front, so that the
# value is never evaluated as Perl code or silently treated as 0.
number_re='^[+-]?([0-9]+\.?[0-9]*|\.[0-9]+)([eE][+-]?[0-9]+)?$'
if ! [[ "$final_sil_prob" =~ $number_re ]]; then
  echo "$0 final-sil-prob should be between 0.0 and 1.0. Final silence was not added." >&2
  exit 1;
fi

# Classify the probability with floating-point comparisons.  The value is
# handed to perl through the environment rather than spliced into the program.
prob_class=$(FINAL_SIL_PROB="$final_sil_prob" perl -e '
  my $p = $ENV{FINAL_SIL_PROB};
  if ($p < 0.0 || $p > 1.0) { print "out_of_range"; }
  elsif ($p == 0.0)         { print "zero"; }
  elsif ($p == 1.0)         { print "one"; }
  else                      { print "fraction"; }') || exit 1

# Text format fed to fstcompile: "src dst ilabel olabel [weight]" for an arc,
# "state [weight]" for a final state.
# NOTE(review): input label 1 is assumed to be the optional-silence phone id
# in this lang directory - confirm against phones.txt.
case "$prob_class" in
  out_of_range)
    echo "$0 final-sil-prob should be between 0.0 and 1.0. Final silence was not added." >&2
    exit 1;
    ;;
  zero)
    echo "$0 final-sil-prob = 0 => Final silence was not added."
    exit 0;
    ;;
  one)
    # Silence is mandatory: only state 1 (after the silence arc) is final.
    printf '0\t1\t1\t0\n1\n' | fstcompile > "$lang/final_sil.fst" || exit 1
    ;;
  fraction)
    # FST weights are costs (negative log probabilities): taking the silence
    # arc costs -log(p), stopping without it costs -log(1-p).  Using log(p)
    # for both, as this script used to, yields identical negative costs and
    # makes the requested probability irrelevant.
    costs=$(FINAL_SIL_PROB="$final_sil_prob" perl -e '
      my $p = $ENV{FINAL_SIL_PROB};
      print -log($p), " ", -log(1.0 - $p);') || exit 1
    read -r sil_cost nosil_cost <<< "$costs"
    printf '0\t1\t1\t0\t%s\n0\t%s\n1\t0.0\n' "$sil_cost" "$nosil_cost" \
      | fstcompile > "$lang/final_sil.fst" || exit 1
    ;;
  *)
    echo "$0 could not evaluate final-sil-prob '$final_sil_prob'." >&2
    exit 1;
    ;;
esac

# Build both new lexicons first and only then swap them in, so that a failing
# OpenFst command cannot leave the lang directory without a usable L.fst.
for name in L L_disambig; do
  if ! fstconcat "$lang/$name.fst" "$lang/final_sil.fst" \
      | fstarcsort --sort_type=olabel > "$lang/$name.fst.new"; then
    echo "$0 failed to add the final silence to $lang/$name.fst" >&2
    rm -f "$lang/L.fst.new" "$lang/L_disambig.fst.new"
    exit 1;
  fi
done

for name in L L_disambig; do
  mv "$lang/$name.fst" "$lang/$name.fst.orig" || exit 1
  mv "$lang/$name.fst.new" "$lang/$name.fst" || exit 1
done

# Record the probability; this file doubles as the "already applied" marker
# checked near the top of the script.
echo "$final_sil_prob" > "$lang/phones/final_sil_prob" || exit 1
53+

0 commit comments

Comments
 (0)