Skip to content
40 changes: 0 additions & 40 deletions egs/madcat_ar/v1/local/download_data.sh

This file was deleted.

69 changes: 69 additions & 0 deletions egs/madcat_ar/v1/local/prepare_data.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
#!/bin/bash

# Copyright 2017 Chun Chieh Chang
#           2017 Ashish Arora
#           2017 Hossein Hadian
# Apache 2.0

# This script downloads the data splits for the MADCAT Arabic dataset and
# prepares the training, validation, and test data (i.e. text, images.scp,
# utt2spk and spk2utt) by calling process_data.py.
# It also uses the Arabic Gigaword text corpus for language modeling.

# Eg. local/prepare_data.sh
# Eg. text file: LDC0001_000399_NHR_ARB_20070113.0052_11_LDC0001_0z11
#                وهناك تداخل بين الرأسمالية الإسرائيلية
#     utt2spk file: LDC0001_000397_NHR_ARB_20070113.0052_11_LDC0001_00z1 LDC0001
#     images.scp file: LDC0001_000397_NHR_ARB_20070113.0052_11_LDC0001_00z1
#                      data/local/train/1/NHR_ARB_20070113.0052_11_LDC0001_00z1.png

# Defaults below can be overridden on the command line via parse_options.sh.
download_dir1=/export/corpora/LDC/LDC2012T15/data
download_dir2=/export/corpora/LDC/LDC2013T09/data
download_dir3=/export/corpora/LDC/LDC2013T15/data
train_split_url=http://www.openslr.org/resources/48/madcat.train.raw.lineid
test_split_url=http://www.openslr.org/resources/48/madcat.test.raw.lineid
dev_split_url=http://www.openslr.org/resources/48/madcat.dev.raw.lineid
data_splits=data/download/data_splits
stage=0
download_dir=data/download
gigacorpus=data/local/gigawordcorpus
gigaword_loc=/export/corpora5/LDC/LDC2011T11
use_extra_corpus_text=true

. ./cmd.sh
. ./path.sh
. ./utils/parse_options.sh || exit 1;

# Fetch the official train/test/dev line-id splits from OpenSLR unless a
# previous run already created the directory.
if [ -d "$data_splits" ]; then
  echo "$0: Not downloading the data splits as it is already there."
else
  if [ ! -f "$data_splits/madcat.train.raw.lineid" ]; then
    mkdir -p "$data_splits"
    echo "$0: Downloading the data splits..."
    wget -P "$data_splits" "$train_split_url" || exit 1;
    wget -P "$data_splits" "$test_split_url" || exit 1;
    wget -P "$data_splits" "$dev_split_url" || exit 1;
  fi
  echo "$0: Done downloading the data splits"
fi

# The MADCAT LDC corpora must be obtained separately; we only check presence.
if [ -d "$download_dir1" ]; then
  echo "$0: madcat arabic data directory is present."
else
  # NOTE: the original test '[ ! -f $dir/madcat/*.madcat.xml ]' errors out
  # ("binary operator expected") whenever the glob matches more than one
  # file; compgen -G is a glob-aware existence check.
  if ! compgen -G "$download_dir1/madcat/*.madcat.xml" > /dev/null; then
    echo "$0: please download madcat data..."
  fi
fi

mkdir -p "$download_dir" data/local
if $use_extra_corpus_text; then
  # Copy the Arabic Gigaword corpus locally, decompress each newswire's
  # files, strip SGML markup and normalize ``/'' quote pairs to ", and
  # concatenate every newswire into a single <newswire>_combined.txt.
  mkdir -p "$gigacorpus"
  cp -r "$gigaword_loc"/. "$gigacorpus"
  for newswire in aaw_arb afp_arb ahr_arb asb_arb hyt_arb nhr_arb qds_arb umh_arb xin_arb; do
    for file in "$gigacorpus/arb_gw_5/data/$newswire"/*.gz; do
      [ -e "$file" ] || continue  # glob matched nothing; skip
      gzip -d "$file"
    done
    for file in "$gigacorpus/arb_gw_5/data/$newswire"/*; do
      [ -e "$file" ] || continue
      sed -e '/^<[^>]*>$/d; s/``/"/g; s/\x27\x27/"/g' "$file" \
        >> "$gigacorpus/arb_gw_5/data/${newswire}_combined.txt"
    done
  done
fi
9 changes: 5 additions & 4 deletions egs/madcat_ar/v1/run_end2end.sh
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ images_scp_dir=data/local
overwrite=false
subset=false
augment=false
use_extra_corpus_text=true
. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system.
## This relates to the queue.
. ./path.sh
Expand All @@ -35,9 +36,9 @@ if [ $stage -le 0 ]; then
echo "Exiting with status 1 to avoid data corruption"
exit 1;
fi
echo "$0: Downloading data splits...$(date)"
local/download_data.sh --data_splits $data_splits_dir --download_dir1 $download_dir1 \
--download_dir2 $download_dir2 --download_dir3 $download_dir3
local/prepare_data.sh --data_splits $data_splits_dir --download_dir1 $download_dir1 \
--download_dir2 $download_dir2 --download_dir3 $download_dir3 \
--use_extra_corpus_text $use_extra_corpus_text

for set in test train dev; do
data_split_file=$data_splits_dir/madcat.$set.raw.lineid
Expand All @@ -48,7 +49,7 @@ if [ $stage -le 0 ]; then
--data data/local/$set --subset $subset --augment $augment || exit 1
done

echo "$0: Preparing data..."
echo "$0: Processing data..."
for set in dev train test; do
local/process_data.py $download_dir1 $download_dir2 $download_dir3 \
$data_splits_dir/madcat.$set.raw.lineid data/$set $images_scp_dir/$set/images.scp \
Expand Down
13 changes: 13 additions & 0 deletions egs/rimes/README.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
Rimes is a French handwriting recognition database created by A2iA.
The database was created by asking individuals to write letters on a given scenario like
a change of personal information, payment difficulty, damage declaration. The
dataset has been used in several international research competitions,
including the ICFHR 2008, ICDAR 2009, and ICDAR 2011 competitions, for
isolated word-level and line-level recognition tasks.

It contains 11333 training lines and 788 test lines. It does not include
a validation split, but in a recent publication a 10% sample of the
training lines was held out for validation purposes
(http://www.jpuigcerver.net/pubs/jpuigcerver_icdar2017.pdf).
We have used a similar train, test and validation split.
More info: http://www.a2ialab.com/doku.php?id=rimes_database:start
13 changes: 13 additions & 0 deletions egs/rimes/v1/cmd.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# you can change cmd.sh depending on what type of queue you are using.
# If you have no queueing system and want to run on a local machine, you
# can change all instances 'queue.pl' to run.pl (but be careful and run
# commands one by one: most recipes will exhaust the memory on your
# machine). queue.pl works with GridEngine (qsub). slurm.pl works
# with slurm. Different queues are configured differently, with different
# queue names and different ways of specifying things like memory;
# to account for these differences you can create and edit the file
# conf/queue.conf to match your queue's configuration. Search for
# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information,
# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl.

# NOTE(review): retry.pl appears to wrap queue.pl so that failed jobs are
# resubmitted — semantics assumed from the script name; confirm in utils/.
export cmd="retry.pl queue.pl"
1 change: 1 addition & 0 deletions egs/rimes/v1/image
88 changes: 88 additions & 0 deletions egs/rimes/v1/local/chain/compare_wer.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
#!/bin/bash

# this script is used for comparing decoding results between systems.
# e.g. local/chain/compare_wer.sh exp/chain/cnn{1a,1b}
# It prints one table row (WER/CER on test and val, train/valid probs,
# parameter count) per experiment directory given on the command line.

# Copyright 2017 Chun Chieh Chang
#           2017 Ashish Arora

if [ $# == 0 ]; then
  echo "Usage: $0: <dir1> [<dir2> ... ]"
  echo "e.g.: $0 exp/chain/cnn{1a,1b}"
  exit 1
fi
. ./path.sh

echo "# $0 $*"
used_epochs=false

# Note: "$@" (not bare $*) is used in every loop below so that experiment
# directories containing spaces are kept as single arguments.
echo -n "# System "
for x in "$@"; do printf "% 10s" " $(basename "$x")"; done
echo

echo -n "# WER "
for x in "$@"; do
  # awk reads the scoring file directly (no 'cat file |' needed).
  wer=$(awk '{print $2}' "$x/decode_test/scoring_kaldi/best_wer")
  printf "% 10s" "$wer"
done
echo

echo -n "# CER "
for x in "$@"; do
  cer=$(awk '{print $2}' "$x/decode_test/scoring_kaldi/best_cer")
  printf "% 10s" "$cer"
done
echo

echo -n "# WER val "
for x in "$@"; do
  wer=$(awk '{print $2}' "$x/decode_val/scoring_kaldi/best_wer")
  printf "% 10s" "$wer"
done
echo

echo -n "# CER val "
for x in "$@"; do
  cer=$(awk '{print $2}' "$x/decode_val/scoring_kaldi/best_cer")
  printf "% 10s" "$cer"
done
echo

if $used_epochs; then
  exit 0; # the diagnostics aren't comparable between regular and discriminatively trained systems.
fi

echo -n "# Final train prob "
for x in "$@"; do
  prob=$(grep Overall "$x/log/compute_prob_train.final.log" | grep -v xent | awk '{printf("%.4f", $8)}')
  printf "% 10s" "$prob"
done
echo

echo -n "# Final valid prob "
for x in "$@"; do
  prob=$(grep Overall "$x/log/compute_prob_valid.final.log" | grep -v xent | awk '{printf("%.4f", $8)}')
  printf "% 10s" "$prob"
done
echo

echo -n "# Final train prob (xent) "
for x in "$@"; do
  prob=$(grep Overall "$x/log/compute_prob_train.final.log" | grep -w xent | awk '{printf("%.4f", $8)}')
  printf "% 10s" "$prob"
done
echo

echo -n "# Final valid prob (xent) "
for x in "$@"; do
  prob=$(grep Overall "$x/log/compute_prob_valid.final.log" | grep -w xent | awk '{printf("%.4f", $8)}')
  printf "% 10s" "$prob"
done
echo

echo -n "# Parameters "
for x in "$@"; do
  params=$(nnet3-info "$x/final.mdl" 2>/dev/null | grep num-parameters | cut -d' ' -f2 | awk '{printf "%0.2fM\n",$1/1000000}')
  printf "% 10s" "$params"
done
echo
1 change: 1 addition & 0 deletions egs/rimes/v1/local/chain/run_cnn_e2eali.sh
1 change: 1 addition & 0 deletions egs/rimes/v1/local/chain/run_e2e_cnn.sh
Loading