kaldi-asr
diff --git a/‎egs/fame/README.txt‎
Lines changed: 15 additions & 0 deletions b/‎egs/fame/README.txt‎
Lines changed: 15 additions & 0 deletions
diff --git a/‎egs/fame/s5/RESULTS‎
Lines changed: 28 additions & 0 deletions b/‎egs/fame/s5/RESULTS‎
Lines changed: 28 additions & 0 deletions
diff --git a/‎egs/fame/s5/cmd.sh‎
Lines changed: 1 addition & 0 deletions b/‎egs/fame/s5/cmd.sh‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎egs/fame/s5/conf/decode_dnn.config‎
Lines changed: 2 additions & 0 deletions b/‎egs/fame/s5/conf/decode_dnn.config‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎egs/fame/s5/conf/fbank.conf‎
Lines changed: 2 additions & 0 deletions b/‎egs/fame/s5/conf/fbank.conf‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎egs/fame/s5/conf/mfcc.conf‎
Lines changed: 1 addition & 0 deletions b/‎egs/fame/s5/conf/mfcc.conf‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎egs/fame/s5/conf/mfcc_hires.conf‎
Lines changed: 10 additions & 0 deletions b/‎egs/fame/s5/conf/mfcc_hires.conf‎
Lines changed: 10 additions & 0 deletions
diff --git a/‎egs/fame/s5/conf/online_cmvn.conf‎
Lines changed: 1 addition & 0 deletions b/‎egs/fame/s5/conf/online_cmvn.conf‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎egs/fame/s5/local/fame_data_prep.sh‎
Lines changed: 53 additions & 0 deletions b/‎egs/fame/s5/local/fame_data_prep.sh‎
Lines changed: 53 additions & 0 deletions
diff --git a/‎egs/fame/s5/local/fame_dict_prep.sh‎
Lines changed: 36 additions & 0 deletions b/‎egs/fame/s5/local/fame_dict_prep.sh‎
Lines changed: 36 additions & 0 deletions
@@ -0,0 +1,15 @@
+The FAME! Speech Corpus
+
+The components of the Frisian data collection are speech and language resources gathered for building a large vocabulary ASR system for the Frisian language. Firstly, a new broadcast database is created by collecting recordings from the archives of the regional broadcaster Omrop Fryslân, and annotating them with various information such as the language switches and speaker details. The second component of this collection is a language model created on a text corpus with diverse vocabulary. Thirdly, a Frisian phonetic dictionary with the mappings between the Frisian words and phones is built to make the ASR viable for this under-resourced language. Finally, an ASR recipe is provided which uses all previous resources to perform recognition and present the recognition performances.
+
+The Corpus consists of short utterances extracted from 203 audio segments, each approximately 5 minutes long, which are parts of various radio programs covering a time span of almost 50 years (1966-2015), adding a longitudinal dimension to the database. The content of the recordings is very diverse, including radio programs about culture, history, literature, sports, nature, agriculture, politics, society and languages. The total duration of the manually annotated radio broadcasts sums up to 18 hours, 33 minutes and 57 seconds. The stereo audio data has a sampling frequency of 48 kHz and 16-bit resolution per sample. The available meta-information helped the annotators to identify these speakers and mark them either using their names or the same label (if the name is not known). There are 309 identified speakers in the FAME! Speech Corpus, 21 of whom appear at least 3 times in the database. These speakers are mostly program presenters and celebrities appearing multiple times in different recordings over the years. There are 233 unidentified speakers due to lack of meta-information. The total number of word- and sentence-level code-switching cases in the FAME! Speech Corpus is equal to 3837. Music portions have been removed, except where these overlap with speech.
+
+A full description of the FAME! Speech Corpus is provided in:
+
+Yilmaz, E., Heuvel, H. van den, Van de Velde, H., Kampstra, F., Algra, J., Leeuwen, D. van:
+
+Open Source Speech and Language Resources for Frisian Language.
+
+In: Proceedings Interspeech 2016, pp. 1536--1540, 8-12 September 2016, San Francisco
+
+Please check http://www.ru.nl/clst/datasets/ to get the FAME! Speech Corpus
@@ -0,0 +1,28 @@
+%WER 41.10 [ 4974 / 12101, 522 ins, 1223 del, 3229 sub ] exp/dnn4b_pretrain-dbn_dnn/decode_devel/wer_11_0.0
+%WER 38.10 [ 4909 / 12886, 527 ins, 1220 del, 3162 sub ] exp/dnn4b_pretrain-dbn_dnn/decode_test/wer_11_0.0
+%WER 41.06 [ 4969 / 12101, 514 ins, 1277 del, 3178 sub ] exp/dnn4b_pretrain-dbn_dnn_smbr/decode_devel_it1/wer_11_0.0
+%WER 40.38 [ 4886 / 12101, 515 ins, 1225 del, 3146 sub ] exp/dnn4b_pretrain-dbn_dnn_smbr/decode_devel_it3/wer_11_0.0
+%WER 40.15 [ 4859 / 12101, 514 ins, 1177 del, 3168 sub ] exp/dnn4b_pretrain-dbn_dnn_smbr/decode_devel_it6/wer_10_0.5
+%WER 37.86 [ 4879 / 12886, 596 ins, 1083 del, 3200 sub ] exp/dnn4b_pretrain-dbn_dnn_smbr/decode_test_it1/wer_10_0.0
+%WER 37.16 [ 4789 / 12886, 592 ins, 1056 del, 3141 sub ] exp/dnn4b_pretrain-dbn_dnn_smbr/decode_test_it3/wer_10_0.0
+%WER 36.92 [ 4757 / 12886, 618 ins, 1010 del, 3129 sub ] exp/dnn4b_pretrain-dbn_dnn_smbr/decode_test_it6/wer_10_0.0
+%WER 42.38 [ 5129 / 12101, 576 ins, 1171 del, 3382 sub ] exp/dnn4d-fbank_pretrain-dbn_dnn/decode_devel/wer_11_0.0
+%WER 39.14 [ 5043 / 12886, 536 ins, 1172 del, 3335 sub ] exp/dnn4d-fbank_pretrain-dbn_dnn/decode_test/wer_11_0.0
+%WER 42.05 [ 5088 / 12101, 525 ins, 1282 del, 3281 sub ] exp/dnn4d-fbank_pretrain-dbn_dnn_smbr/decode_devel_it1/wer_11_0.0
+%WER 41.41 [ 5011 / 12101, 461 ins, 1345 del, 3205 sub ] exp/dnn4d-fbank_pretrain-dbn_dnn_smbr/decode_devel_it3/wer_11_0.5
+%WER 40.97 [ 4958 / 12101, 485 ins, 1279 del, 3194 sub ] exp/dnn4d-fbank_pretrain-dbn_dnn_smbr/decode_devel_it6/wer_11_0.5
+%WER 38.79 [ 4998 / 12886, 512 ins, 1194 del, 3292 sub ] exp/dnn4d-fbank_pretrain-dbn_dnn_smbr/decode_test_it1/wer_11_0.0
+%WER 38.16 [ 4917 / 12886, 544 ins, 1128 del, 3245 sub ] exp/dnn4d-fbank_pretrain-dbn_dnn_smbr/decode_test_it3/wer_11_0.0
+%WER 37.68 [ 4856 / 12886, 564 ins, 1068 del, 3224 sub ] exp/dnn4d-fbank_pretrain-dbn_dnn_smbr/decode_test_it6/wer_11_0.0
+%WER 70.85 [ 8574 / 12101, 414 ins, 2596 del, 5564 sub ] exp/mono/decode_devel/wer_9_0.0
+%WER 68.17 [ 8785 / 12886, 413 ins, 2704 del, 5668 sub ] exp/mono/decode_test/wer_9_0.0
+%WER 44.05 [ 5330 / 12101, 560 ins, 1467 del, 3303 sub ] exp/sgmm2/decode_devel/wer_10_0.0
+%WER 40.22 [ 5183 / 12886, 680 ins, 1142 del, 3361 sub ] exp/sgmm2/decode_test/wer_9_0.0
+%WER 54.39 [ 6582 / 12101, 695 ins, 1595 del, 4292 sub ] exp/tri1/decode_devel/wer_10_0.0
+%WER 51.60 [ 6649 / 12886, 630 ins, 1706 del, 4313 sub ] exp/tri1/decode_test/wer_11_0.0
+%WER 51.53 [ 6236 / 12101, 659 ins, 1675 del, 3902 sub ] exp/tri2/decode_devel/wer_11_0.0
+%WER 48.32 [ 6226 / 12886, 643 ins, 1669 del, 3914 sub ] exp/tri2/decode_test/wer_12_0.0
+%WER 47.15 [ 5706 / 12101, 580 ins, 1537 del, 3589 sub ] exp/tri3/decode_devel/wer_13_0.0
+%WER 52.13 [ 6308 / 12101, 623 ins, 1706 del, 3979 sub ] exp/tri3/decode_devel.si/wer_11_0.5
+%WER 43.71 [ 5632 / 12886, 594 ins, 1538 del, 3500 sub ] exp/tri3/decode_test/wer_14_0.0
+%WER 48.21 [ 6212 / 12886, 825 ins, 1358 del, 4029 sub ] exp/tri3/decode_test.si/wer_10_0.0
@@ -0,0 +1 @@
+../../wsj/s5/cmd.sh
@@ -0,0 +1,2 @@
+beam=18.0 # beam for decoding. Was 13.0 in the scripts.
+lattice_beam=10.0 # this has most effect on size of the lattices.
@@ -0,0 +1,2 @@
+# No non-default options for now.
+
@@ -0,0 +1 @@
+--use-energy=false # only non-default option.
@@ -0,0 +1,10 @@
+# config for high-resolution MFCC features, intended for neural network training
+# Note: we keep all cepstra, so it has the same info as filterbank features,
+# but MFCC is more easily compressible (because less correlated) which is why 
+# we prefer this method.
+--use-energy=false # use average of log energy, not energy.
+--num-mel-bins=40 # similar to Google's setup.
+--num-ceps=40 # there is no dimensionality reduction.
+--low-freq=20 # low cutoff frequency for mel bins... this is high-bandwidth data, so
+ # there might be some information at the low end.
+--high-freq=-400 # high cutoff frequency, relative to Nyquist of 8000 (=7600)
@@ -0,0 +1 @@
+# configuration file for apply-cmvn-online, used in the script ../local/online/run_online_decoding_nnet2.sh
@@ -0,0 +1,53 @@
+#!/bin/bash
+# Copyright 2015-2016 Sarah Flora Juan
+# Copyright 2016 Johns Hopkins University (Author: Yenda Trmal)
+# Copyright 2016 Radboud University (Author: Emre Yilmaz)
+
+# Apache 2.0
+
+# Prepares data/{train,devel,test} from the FAME! speech database.
+#
+# Usage: local/fame_data_prep.sh <corpus-dir>
+#
+# For each of train, devel and test this script
+#   * copies text, spk2utt and utt2spk from <corpus-dir>/data/<set>/,
+#   * regenerates wav.scp so that it points at the local copy of the audio
+#     under <corpus-dir>/fame/wav/<set>/,
+#   * runs utils/fix_data_dir.sh on the resulting directory.
+# If <corpus-dir>/lm/LM_FR_IKN3G exists it is gzipped to data/local/LM.gz.
+#
+# Must be run from the recipe directory (the one that contains utils/).
+# Exits with status 1 on a missing argument, a missing corpus directory or
+# a missing audio file.
+
+corpus=$1
+set -e -o pipefail
+if [ -z "$corpus" ] ; then
+  echo >&2 "The script $0 expects one parameter -- the location of the FAME! speech database"
+  exit 1
+fi
+if [ ! -d "$corpus" ] ; then
+  echo >&2 "The directory $corpus does not exist"
+  # Without this exit the script would carry on and fail later in cp with a
+  # far less helpful message.
+  exit 1
+fi
+
+echo "Preparing train, development and test data"
+mkdir -p data data/local data/train data/devel data/test
+
+for x in train devel test; do
+  echo "Copy spk2utt, utt2spk, wav.scp, text for $x"
+  cp "$corpus/data/$x/text" "data/$x/text" || exit 1;
+  cp "$corpus/data/$x/spk2utt" "data/$x/spk2utt" || exit 1;
+  cp "$corpus/data/$x/utt2spk" "data/$x/utt2spk" || exit 1;
+
+  # the corpus wav.scp contains physical paths, so we just re-generate
+  # the file again from scratch instead of figuring out how to edit it.
+  # The first field of every line of the text file is the utterance ID;
+  # reading it with 'read' avoids word-splitting/globbing of the IDs.
+  while read -r rec _ || [ -n "$rec" ]; do
+    # skip blank lines
+    [ -z "$rec" ] && continue
+    # The wav file is named after the utterance ID with its first 8
+    # characters removed.
+    # NOTE(review): presumably those 8 characters are a speaker-ID prefix --
+    # confirm against the corpus layout.
+    filename=$corpus/fame/wav/${x}/${rec:8}.wav
+    if [ ! -f "$filename" ] ; then
+      echo >&2 "The file $filename could not be found ($rec)"
+      exit 1
+    fi
+    # we might want to store physical paths as a general rule
+    filename=$(readlink -f "$filename")
+    echo "$rec $filename"
+  done < "$corpus/data/$x/text" > "data/$x/wav.scp"
+
+  # fix_data_dir.sh fixes common mistakes (unsorted entries in wav.scp,
+  # duplicate entries and so on). Also, it regenerates the spk2utt from
+  # utt2spk
+  utils/fix_data_dir.sh "data/$x"
+done
+
+echo "Copying language model"
+if [ -f "$corpus/lm/LM_FR_IKN3G" ] ; then
+  gzip -c "$corpus/lm/LM_FR_IKN3G" > data/local/LM.gz
+else
+  # Kept non-fatal (as before), but no longer silent.
+  echo >&2 "Warning: $corpus/lm/LM_FR_IKN3G not found, data/local/LM.gz was not created"
+fi
+
+echo "Data preparation completed."
+
@@ -0,0 +1,36 @@
+#!/bin/bash
+# Copyright 2015-2016 Sarah Flora Juan
+# Copyright 2016 Johns Hopkins University (Author: Yenda Trmal)
+# Copyright 2016 Radboud University (Author: Emre Yilmaz)
+
+# Apache 2.0
+
+# Prepares the dictionary directory data/local/dict from the FAME! speech
+# database.
+#
+# Usage: local/fame_dict_prep.sh <corpus-dir>
+#
+# Reads  <corpus-dir>/lexicon/lex.asr and <corpus-dir>/lexicon/lex.oov.
+# Writes lexicon.txt, nonsilence_phones.txt, silence_phones.txt,
+#        optional_silence.txt, extra_questions.txt (empty) and oov.txt
+#        into data/local/dict.
+# Exits with status 1 on a missing argument or a missing corpus directory;
+# any failing command aborts the script (set -e -o pipefail).
+
+corpus=$1
+set -e -o pipefail
+if [ -z "$corpus" ] ; then
+  echo >&2 "The script $0 expects one parameter -- the location of the FAME! speech database"
+  exit 1
+fi
+if [ ! -d "$corpus" ] ; then
+  echo >&2 "The directory $corpus does not exist"
+  # Stop here instead of building a dictionary from files that cannot exist.
+  exit 1
+fi
+
+dict=data/local/dict
+mkdir -p data/lang "$dict"
+
+# Merge the corpus lexicons and add the silence and unknown-word entries.
+# printf makes the tab between word and pronunciation explicit.
+cat "$corpus/lexicon/lex.asr" "$corpus/lexicon/lex.oov" > "$dict/lexicon.txt"
+printf '!SIL\tSIL\n' >> "$dict/lexicon.txt"
+printf '<UNK>\tSPN\n' >> "$dict/lexicon.txt"
+env LC_ALL=C sort -u -o "$dict/lexicon.txt" "$dict/lexicon.txt"
+
+# Every field after the first one of a lexicon line is a phone; the unique
+# phones, minus those whose name contains SIL, form the non-silence set.
+# Note that SPN (the pronunciation of <UNK>) therefore stays in this list.
+perl -ane 'print join("\n", @F[1..$#F]) . "\n"; ' < "$dict/lexicon.txt" | \
+  sort -u | grep -v 'SIL' > "$dict/nonsilence_phones.txt"
+
+# No extra questions; the file only has to exist.
+touch "$dict/extra_questions.txt"
+
+echo "SIL" > "$dict/optional_silence.txt"
+echo "SIL" > "$dict/silence_phones.txt"
+echo "<UNK>" > "$dict/oov.txt"
+
+echo "Dictionary preparation succeeded"
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+beam=18.0 # beam for decoding. Was 13.0 in the scripts.`
	`2`	`+lattice_beam=10.0 # this has most effect on size of the lattices.`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+# No non-default options for now.`
	`2`	`+`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+--use-energy=false # only non-default option.`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+# configuration file for apply-cmvn-online, used in the script ../local/online/run_online_decoding_nnet2.sh`