Skip to content
40 changes: 0 additions & 40 deletions egs/madcat_ar/v1/local/download_data.sh

This file was deleted.

69 changes: 69 additions & 0 deletions egs/madcat_ar/v1/local/prepare_data.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
#!/bin/bash

# Copyright 2017 Chun Chieh Chang
#           2017 Ashish Arora
#           2017 Hossein Hadian
# Apache 2.0

# This script downloads the data splits for the MADCAT Arabic dataset and
# prepares the training, validation, and test data (i.e. text, images.scp,
# utt2spk and spk2utt) by calling process_data.py.
# It also uses the Arabic Gigaword text corpus for language modeling.

# Eg. local/prepare_data.sh
# Eg. text file: LDC0001_000399_NHR_ARB_20070113.0052_11_LDC0001_0z11
#                وهناك تداخل بين الرأسمالية الإسرائيلية
#     utt2spk file: LDC0001_000397_NHR_ARB_20070113.0052_11_LDC0001_00z1 LDC0001
#     images.scp file: LDC0001_000397_NHR_ARB_20070113.0052_11_LDC0001_00z1
#                      data/local/train/1/NHR_ARB_20070113.0052_11_LDC0001_00z1.png

# Defaults below can be overridden on the command line via parse_options.sh.
download_dir1=/export/corpora/LDC/LDC2012T15/data
download_dir2=/export/corpora/LDC/LDC2013T09/data
download_dir3=/export/corpora/LDC/LDC2013T15/data
train_split_url=http://www.openslr.org/resources/48/madcat.train.raw.lineid
test_split_url=http://www.openslr.org/resources/48/madcat.test.raw.lineid
dev_split_url=http://www.openslr.org/resources/48/madcat.dev.raw.lineid
data_splits=data/download/data_splits
stage=0
download_dir=data/download
gigacorpus=data/local/gigawordcorpus
gigaword_loc=/export/corpora5/LDC/LDC2011T11
use_extra_corpus_text=true

. ./cmd.sh
. ./path.sh
. ./utils/parse_options.sh || exit 1;

# Fetch the official train/test/dev line-id splits from OpenSLR unless a
# previous run already created the directory.
if [ -d "$data_splits" ]; then
  echo "$0: Not downloading the data splits as it is already there."
else
  if [ ! -f "$data_splits/madcat.train.raw.lineid" ]; then
    mkdir -p "$data_splits"
    echo "$0: Downloading the data splits..."
    wget -P "$data_splits" "$train_split_url" || exit 1;
    wget -P "$data_splits" "$test_split_url" || exit 1;
    wget -P "$data_splits" "$dev_split_url" || exit 1;
  fi
  echo "$0: Done downloading the data splits"
fi

# The MADCAT LDC corpora must be obtained separately; we only check presence.
if [ -d "$download_dir1" ]; then
  echo "$0: madcat arabic data directory is present."
else
  # NOTE: the original test '[ ! -f $dir/madcat/*.madcat.xml ]' errors out
  # ("binary operator expected") whenever the glob matches more than one
  # file; compgen -G is a glob-aware existence check.
  if ! compgen -G "$download_dir1/madcat/*.madcat.xml" > /dev/null; then
    echo "$0: please download madcat data..."
  fi
fi

mkdir -p "$download_dir" data/local
if $use_extra_corpus_text; then
  # Copy the Arabic Gigaword corpus locally, decompress each newswire's
  # files, strip SGML markup and normalize ``/'' quote pairs to ", and
  # concatenate every newswire into a single <newswire>_combined.txt.
  mkdir -p "$gigacorpus"
  cp -r "$gigaword_loc"/. "$gigacorpus"
  for newswire in aaw_arb afp_arb ahr_arb asb_arb hyt_arb nhr_arb qds_arb umh_arb xin_arb; do
    for file in "$gigacorpus/arb_gw_5/data/$newswire"/*.gz; do
      [ -e "$file" ] || continue  # glob matched nothing; skip
      gzip -d "$file"
    done
    for file in "$gigacorpus/arb_gw_5/data/$newswire"/*; do
      [ -e "$file" ] || continue
      sed -e '/^<[^>]*>$/d; s/``/"/g; s/\x27\x27/"/g' "$file" \
        >> "$gigacorpus/arb_gw_5/data/${newswire}_combined.txt"
    done
  done
fi
9 changes: 5 additions & 4 deletions egs/madcat_ar/v1/run_end2end.sh
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ images_scp_dir=data/local
overwrite=false
subset=false
augment=false
use_extra_corpus_text=true
. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system.
## This relates to the queue.
. ./path.sh
Expand All @@ -35,9 +36,9 @@ if [ $stage -le 0 ]; then
echo "Exiting with status 1 to avoid data corruption"
exit 1;
fi
echo "$0: Downloading data splits...$(date)"
local/download_data.sh --data_splits $data_splits_dir --download_dir1 $download_dir1 \
--download_dir2 $download_dir2 --download_dir3 $download_dir3
local/prepare_data.sh --data_splits $data_splits_dir --download_dir1 $download_dir1 \
--download_dir2 $download_dir2 --download_dir3 $download_dir3 \
--use_extra_corpus_text $use_extra_corpus_text

for set in test train dev; do
data_split_file=$data_splits_dir/madcat.$set.raw.lineid
Expand All @@ -48,7 +49,7 @@ if [ $stage -le 0 ]; then
--data data/local/$set --subset $subset --augment $augment || exit 1
done

echo "$0: Preparing data..."
echo "$0: Processing data..."
for set in dev train test; do
local/process_data.py $download_dir1 $download_dir2 $download_dir3 \
$data_splits_dir/madcat.$set.raw.lineid data/$set $images_scp_dir/$set/images.scp \
Expand Down
13 changes: 13 additions & 0 deletions egs/rimes/README.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
Rimes is a French handwriting recognition database created by A2iA.
The database was created by asking individuals to write letters on a given scenario like
a change of personal information, payment difficulty, damage declaration. The
dataset has been used in several international research competitions,
including the ICFHR 2008, ICDAR 2009, and ICDAR 2011 competitions, for
isolated word-level and line-level recognition tasks.

It contains 11333 training lines and 788 test lines. It does not include
a validation split, but in a recent publication a 10% sample of the
training lines was held out for validation purposes
(http://www.jpuigcerver.net/pubs/jpuigcerver_icdar2017.pdf).
We have used a similar train, test and validation split.
More info: http://www.a2ialab.com/doku.php?id=rimes_database:start
13 changes: 13 additions & 0 deletions egs/rimes/v1/cmd.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# you can change cmd.sh depending on what type of queue you are using.
# If you have no queueing system and want to run on a local machine, you
# can change all instances 'queue.pl' to run.pl (but be careful and run
# commands one by one: most recipes will exhaust the memory on your
# machine). queue.pl works with GridEngine (qsub). slurm.pl works
# with slurm. Different queues are configured differently, with different
# queue names and different ways of specifying things like memory;
# to account for these differences you can create and edit the file
# conf/queue.conf to match your queue's configuration. Search for
# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information,
# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl.

# NOTE(review): retry.pl appears to wrap queue.pl so that failed jobs are
# resubmitted — semantics assumed from the script name; confirm in utils/.
export cmd="retry.pl queue.pl"
1 change: 1 addition & 0 deletions egs/rimes/v1/image
88 changes: 88 additions & 0 deletions egs/rimes/v1/local/chain/compare_wer.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
#!/bin/bash

# this script is used for comparing decoding results between systems.
# e.g. local/chain/compare_wer.sh exp/chain/cnn{1a,1b}
# It prints one table row (WER/CER on test and val, train/valid probs,
# parameter count) per experiment directory given on the command line.

# Copyright 2017 Chun Chieh Chang
#           2017 Ashish Arora

if [ $# == 0 ]; then
  echo "Usage: $0: <dir1> [<dir2> ... ]"
  echo "e.g.: $0 exp/chain/cnn{1a,1b}"
  exit 1
fi
. ./path.sh

echo "# $0 $*"
used_epochs=false

# Note: "$@" (not bare $*) is used in every loop below so that experiment
# directories containing spaces are kept as single arguments.
echo -n "# System "
for x in "$@"; do printf "% 10s" " $(basename "$x")"; done
echo

echo -n "# WER "
for x in "$@"; do
  # awk reads the scoring file directly (no 'cat file |' needed).
  wer=$(awk '{print $2}' "$x/decode_test/scoring_kaldi/best_wer")
  printf "% 10s" "$wer"
done
echo

echo -n "# CER "
for x in "$@"; do
  cer=$(awk '{print $2}' "$x/decode_test/scoring_kaldi/best_cer")
  printf "% 10s" "$cer"
done
echo

echo -n "# WER val "
for x in "$@"; do
  wer=$(awk '{print $2}' "$x/decode_val/scoring_kaldi/best_wer")
  printf "% 10s" "$wer"
done
echo

echo -n "# CER val "
for x in "$@"; do
  cer=$(awk '{print $2}' "$x/decode_val/scoring_kaldi/best_cer")
  printf "% 10s" "$cer"
done
echo

if $used_epochs; then
  exit 0; # the diagnostics aren't comparable between regular and discriminatively trained systems.
fi

echo -n "# Final train prob "
for x in "$@"; do
  prob=$(grep Overall "$x/log/compute_prob_train.final.log" | grep -v xent | awk '{printf("%.4f", $8)}')
  printf "% 10s" "$prob"
done
echo

echo -n "# Final valid prob "
for x in "$@"; do
  prob=$(grep Overall "$x/log/compute_prob_valid.final.log" | grep -v xent | awk '{printf("%.4f", $8)}')
  printf "% 10s" "$prob"
done
echo

echo -n "# Final train prob (xent) "
for x in "$@"; do
  prob=$(grep Overall "$x/log/compute_prob_train.final.log" | grep -w xent | awk '{printf("%.4f", $8)}')
  printf "% 10s" "$prob"
done
echo

echo -n "# Final valid prob (xent) "
for x in "$@"; do
  prob=$(grep Overall "$x/log/compute_prob_valid.final.log" | grep -w xent | awk '{printf("%.4f", $8)}')
  printf "% 10s" "$prob"
done
echo

echo -n "# Parameters "
for x in "$@"; do
  params=$(nnet3-info "$x/final.mdl" 2>/dev/null | grep num-parameters | cut -d' ' -f2 | awk '{printf "%0.2fM\n",$1/1000000}')
  printf "% 10s" "$params"
done
echo
1 change: 1 addition & 0 deletions egs/rimes/v1/local/chain/run_cnn_e2eali.sh
1 change: 1 addition & 0 deletions egs/rimes/v1/local/chain/run_e2e_cnn.sh
Loading