kaldi-asr
diff --git a/‎egs/spanish_dimex100/README.txt‎
Lines changed: 22 additions & 0 deletions b/‎egs/spanish_dimex100/README.txt‎
Lines changed: 22 additions & 0 deletions
diff --git a/‎egs/spanish_dimex100/s5/.gitignore‎
Lines changed: 5 additions & 0 deletions b/‎egs/spanish_dimex100/s5/.gitignore‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎egs/spanish_dimex100/s5/RESULTS‎
Lines changed: 7 additions & 0 deletions b/‎egs/spanish_dimex100/s5/RESULTS‎
Lines changed: 7 additions & 0 deletions
diff --git a/‎egs/spanish_dimex100/s5/cmd.sh‎
Lines changed: 15 additions & 0 deletions b/‎egs/spanish_dimex100/s5/cmd.sh‎
Lines changed: 15 additions & 0 deletions
diff --git a/‎egs/spanish_dimex100/s5/conf/decode.config‎
Lines changed: 3 additions & 0 deletions b/‎egs/spanish_dimex100/s5/conf/decode.config‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎egs/spanish_dimex100/s5/conf/mfcc.conf‎
Lines changed: 1 addition & 0 deletions b/‎egs/spanish_dimex100/s5/conf/mfcc.conf‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎egs/spanish_dimex100/s5/local/data_prep.sh‎
Lines changed: 286 additions & 0 deletions b/‎egs/spanish_dimex100/s5/local/data_prep.sh‎
Lines changed: 286 additions & 0 deletions
diff --git a/‎egs/spanish_dimex100/s5/local/lang_prep.sh‎
Lines changed: 53 additions & 0 deletions b/‎egs/spanish_dimex100/s5/local/lang_prep.sh‎
Lines changed: 53 additions & 0 deletions
@@ -0,0 +1,22 @@
+About the DIMEx100 corpus:
+ Mexican Spanish clean speech corpus introduced in Pineda, et al. (2001).
+ "DIMEx100: A New Phonetic and Speech Corpus for Mexican Spanish".
+
+ > Studio recorded audio with a total of 6000 phrases by 100 speakers.
+ > Mono/16 bit/44.1 kHz
+ > Three different levels of transcription
+ > For additional information about the corpus design and
+ characteristics refer to (Pineda, 2001)
+
+
+ Created by the computer science department of the "Investigaciones en
+ Matemáticas Aplicadas y en Sistemas (IIMAS)" institute at the "National
+ Autonomous University of Mexico (UNAM)".
+
+ The DIMEx100 corpus is available free of charge for academic purposes
+ exclusively. For commercial use a formal agreement with UNAM is required.
+ For more information refer to
+ http://turing.iimas.unam.mx/~luis/DIME/CORPUS-DIMEX.html
+
+Example author:
+ Daniel A. Campoverde <alx@sillybytes.net>
@@ -0,0 +1,5 @@
+DVDCorpusDimex100.zip
+CorpusDimex100
+
+data
+*.wav
@@ -0,0 +1,7 @@
+#!/bin/bash
+
+# Print the best WER found in each decode directory under exp/.
+for x in exp/*/decode*; do
+    # An unmatched glob stays a literal pattern, hence the directory test.
+    # Expansions are quoted so unusual directory names are not word-split.
+    [ -d "$x" ] && grep WER "$x"/wer_* | utils/best_wer.sh
+done
+exit 0
+
+# Result on decode_test (tri2b_mmi_b0.05)
+%WER 7.58 [ 72 / 950, 50 ins, 0 del, 22 sub ] exp/tri2b_mmi_b0.05/decode_test/wer_15:2
@@ -0,0 +1,15 @@
+# you can change cmd.sh depending on what type of queue you are using.
+# If you have no queueing system and want to run on a local machine, you
+# can change all instances of 'queue.pl' to run.pl (but be careful and run
+# commands one by one: most recipes will exhaust the memory on your
+# machine). queue.pl works with GridEngine (qsub). slurm.pl works
+# with slurm. Different queues are configured differently, with different
+# queue names and different ways of specifying things like memory;
+# to account for these differences you can create and edit the file
+# conf/queue.conf to match your queue's configuration. Search for
+# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information,
+# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl.
+
+# Job wrappers and their memory requests, one per stage of the recipe.
+train_cmd="queue.pl --mem 2G"
+decode_cmd="queue.pl --mem 4G"
+mkgraph_cmd="queue.pl --mem 8G"
+export train_cmd decode_cmd mkgraph_cmd
@@ -0,0 +1,3 @@
+first_beam=10.0
+beam=13.0
+lattice_beam=6.0
@@ -0,0 +1 @@
+--use-energy=false
@@ -0,0 +1,286 @@
+#!/bin/bash
+
+## Only run this file from the example root directory
+## $ ./local/data_prep.sh <corpus-dir>
+
+mkdir -p "data/train" "data/test" "data/local"
+
+source ./path.sh
+
+# Dimex100 unziped corpus root directory
+CORPUS_DIR="$1"
+
+# Corpus data
+#
+# Number of Different speakers: 100
+# Speakers common utterances: 10
+# Speakers individual utterances: 50
+#
+# Training/testing split
+#
+# Common utterances for training: 10 (100%)
+# Individual utterances for training: 40 (80%)
+# Individual utterances for testing: 10 (20%)
+# Corpus layout
+N_SPEAKERS=100
+N_COMMON_UTTERANCES=10
+N_INDIVIDUAL_UTTERANCES=50
+
+# Split of the individual utterances, the testing share is the remainder
+N_INDIVIDUAL_UTTERANCES_TRAINING=40
+N_INDIVIDUAL_UTTERANCES_TESTING=$((N_INDIVIDUAL_UTTERANCES - N_INDIVIDUAL_UTTERANCES_TRAINING))
+
+# speakerId-utteranceId-[c|i]
+# c = speaker common utterances (10)
+# i = speaker individual utterances (50)
+#
+# e.g.:
+# s001-01-c
+# ...
+# s001-10-c
+# ...
+# s001-01-i
+# ...
+# s001-50-i
+
+## 80-20 train-test split
+## Only individual utterances are used in testing
+# 10/10 common utterances go into training
+# 40/50 individual utterances go into training
+# 10/50 individual utterances go into testing
+
+function make_speaker_id {
+    # Print the speaker id for speaker number $1: the letter s followed by
+    # the number zero-padded to three digits, e.g. 7 gives s007.
+    local speaker_number="$1"
+    printf 's%03d' "$speaker_number"
+}
+
+function make_sentence_id {
+    # Print the sentence id for sentence number $1, zero-padded to two
+    # digits, e.g. 3 gives 03.
+    local sentence_number="$1"
+    printf '%02d' "$sentence_number"
+}
+
+#####################################
+# Convert wave audio to 16-bit, 16kHz
+#####################################
+
+function convert_to_16khz
+{
+    # Resample the corpus recordings to 16 kHz with sox. For every speaker
+    # the files under audio_editado/{comunes,individuales} are written to
+    # the same relative path under audio_16k.
+    #
+    # Reads: CORPUS_DIR, N_SPEAKERS, N_COMMON_UTTERANCES,
+    #        N_INDIVIDUAL_UTTERANCES
+    #
+    # All individual utterances are converted, not only the training ones:
+    # the test wav.scp generated below points into audio_16k as well.
+    local i j speaker_id sentence_id subdir count src dst
+
+    for i in $(seq 1 "$N_SPEAKERS"); do
+        speaker_id=$(make_speaker_id "$i")
+
+        for subdir in comunes individuales; do
+            if [[ "$subdir" == comunes ]]; then
+                count=$N_COMMON_UTTERANCES
+            else
+                count=$N_INDIVIDUAL_UTTERANCES
+            fi
+
+            mkdir -p "$CORPUS_DIR/$speaker_id/audio_16k/$subdir"
+
+            for j in $(seq 1 "$count"); do
+                sentence_id=$(make_sentence_id "$j")
+                src="$CORPUS_DIR/$speaker_id/audio_editado/$subdir/$speaker_id$sentence_id.wav"
+                dst="$CORPUS_DIR/$speaker_id/audio_16k/$subdir/$speaker_id$sentence_id.wav"
+
+                # Skip recordings missing from the corpus and files that an
+                # earlier run already converted.
+                if [[ -f "$src" && ! -f "$dst" ]]; then
+                    sox "$src" -r 16k "$dst"
+                fi
+            done
+        done
+    done
+}
+
+# Always run the conversion: files that already exist are skipped, so a
+# repeated run is cheap and an interrupted or partial conversion is completed.
+echo
+echo Converting audio from 44.1kHz to 16kHz
+echo
+convert_to_16khz
+
+
+
+#################
+# data/train/text
+# data/test/text
+# data/train/wav.scp
+# data/test/wav.scp
+#################
+
+# Utterance ids have the form speakerId-utteranceId-[c|i]
+#   c = speaker common utterances (10), all used for training
+#   i = speaker individual utterances (50), 01-40 training, 41-50 testing
+
+function clean
+{
+    # Normalise the transcription given in $1 and print it: carriage returns
+    # are removed, ASCII letters are lower-cased, accented vowels, n with
+    # tilde and u with diaeresis (either case) are replaced by their plain
+    # lower-case letter, and finally every character other than ASCII
+    # letters, digits, space and newline is dropped.
+    printf '%s\n' "$1" \
+        | tr -d '\r' \
+        | tr '[:upper:]' '[:lower:]' \
+        | sed \
+            -e 's/á/a/g' -e 's/é/e/g' -e 's/í/i/g' -e 's/ó/o/g' -e 's/ú/u/g' \
+            -e 's/Á/a/g' -e 's/É/e/g' -e 's/Í/i/g' -e 's/Ó/o/g' -e 's/Ú/u/g' \
+            -e 's/ñ/n/g' -e 's/Ñ/n/g' -e 's/ü/u/g' -e 's/Ü/u/g' \
+        | tr -d -c "a-zA-Z0-9 \r\n"
+}
+
+function add_transcription
+{
+    # Append "<utterance-id> <cleaned transcription>" to a Kaldi text file.
+    #   $1 - utterance id
+    #   $2 - ISO-8859-1 transcription file from the corpus
+    #   $3 - text file to append to
+    # A UTF-8 copy of the transcription is left next to the original as
+    # "$2.utf8". Utterances without a transcription file are skipped.
+    local utterance_id="$1" trans_file="$2" text_file="$3" transcription
+
+    # Test the source file, not the .utf8 copy: the redirection below
+    # creates the copy even when iconv has nothing to read.
+    [[ -f "$trans_file" ]] || return 0
+
+    if ! iconv -f iso-8859-1 -t utf-8 "$trans_file" > "$trans_file.utf8"; then
+        echo "WARNING: could not convert $trans_file, skipping $utterance_id" >&2
+        return 0
+    fi
+
+    transcription=$(clean "$(< "$trans_file.utf8")")
+    echo "$utterance_id $transcription" >> "$text_file"
+}
+
+function add_wav
+{
+    # Append "<utterance-id> <wav path>" to a Kaldi wav.scp file.
+    #   $1 - utterance id
+    #   $2 - 16 kHz wav file
+    #   $3 - wav.scp file to append to
+    # Utterances whose wav file does not exist are skipped.
+    local utterance_id="$1" wav_file="$2" scp_file="$3"
+
+    if [[ -f "$wav_file" ]]; then
+        echo "$utterance_id $wav_file" >> "$scp_file"
+    fi
+}
+
+# Start from empty files so that running the script again does not append
+# every utterance a second time.
+for subset in train test; do
+    : > "data/$subset/text"
+    : > "data/$subset/wav.scp"
+done
+
+for i in $(seq 1 "$N_SPEAKERS"); do
+    speaker_id=$(make_speaker_id "$i")
+    speaker_dir="$CORPUS_DIR/$speaker_id"
+
+    # Common utterances: all of them go into training
+    for j in $(seq 1 "$N_COMMON_UTTERANCES"); do
+        sentence_id=$(make_sentence_id "$j")
+        utterance_id="$speaker_id-$sentence_id-c"
+
+        add_transcription "$utterance_id" \
+            "$speaker_dir/texto/comunes/$speaker_id$sentence_id.txt" \
+            "data/train/text"
+        add_wav "$utterance_id" \
+            "$speaker_dir/audio_16k/comunes/$speaker_id$sentence_id.wav" \
+            "data/train/wav.scp"
+    done
+
+    # Individual utterances: the first N_INDIVIDUAL_UTTERANCES_TRAINING go
+    # into training, the remaining ones into testing. The two ranges do not
+    # overlap, so no utterance is used for both.
+    for k in $(seq 1 "$N_INDIVIDUAL_UTTERANCES"); do
+        if (( k <= N_INDIVIDUAL_UTTERANCES_TRAINING )); then
+            subset=train
+        else
+            subset=test
+        fi
+
+        sentence_id=$(make_sentence_id "$k")
+        utterance_id="$speaker_id-$sentence_id-i"
+
+        add_transcription "$utterance_id" \
+            "$speaker_dir/texto/individuales/$speaker_id$sentence_id.txt" \
+            "data/$subset/text"
+        add_wav "$utterance_id" \
+            "$speaker_dir/audio_16k/individuales/$speaker_id$sentence_id.wav" \
+            "data/$subset/wav.scp"
+    done
+done
+
+
+
+
+####################
+# data/train/utt2spk
+# data/test/utt2spk
+####################
+
+# Take IDs from 'text' file to avoid including missing data's IDs
+
+### Generate data/train/utt2spk
+# The utterance id is the first field of each text line and the speaker id
+# is the part of the utterance id before the first dash. The files are
+# rewritten from scratch, so running the script again does not duplicate
+# entries, and an empty text file gives an empty utt2spk.
+
+### Generate data/train/utt2spk
+cut -d' ' -f1 "data/train/text" \
+    | awk -F'-' '{ print $0, $1 }' \
+    > "data/train/utt2spk"
+
+
+### Generate data/test/utt2spk
+cut -d' ' -f1 "data/test/text" \
+    | awk -F'-' '{ print $0, $1 }' \
+    > "data/test/utt2spk"
+
+
+############
+# Sort files
+############
+
+function sort_in_place {
+    # Sort the file $1 in place using byte order.
+    LC_ALL=C sort -o "$1" "$1"
+}
+
+sort_in_place "data/train/text"
+sort_in_place "data/test/text"
+sort_in_place "data/train/wav.scp"
+sort_in_place "data/test/wav.scp"
+sort_in_place "data/train/utt2spk"
+sort_in_place "data/test/utt2spk"
+
+
+####################
+# data/train/spk2utt
+# data/test/spk2utt
+####################
+function write_spk2utt {
+    # Derive spk2utt from the utt2spk file of the data directory $1.
+    utils/utt2spk_to_spk2utt.pl "$1/utt2spk" > "$1/spk2utt"
+}
+
+write_spk2utt "data/train"
+write_spk2utt "data/test"
@@ -0,0 +1,53 @@
+#!/bin/bash
+
+## Only run this file from the example root directory
+## $ ./local/lang_prep.sh <corpus-dir>
+
+CORPUS_DIR="$1"
+
+mkdir -p "data/local/dict"
+
+source ./path.sh
+
+#############################
+# data/local/dict/lexicon.txt
+#############################
+
+# Byte-wise collation for every tool below
+export LC_ALL=C
+
+# Seed the lexicon with the silence and unknown-word entries.
+printf '%s\n' '!SIL sil' '<UNK> spn' > data/local/dict/lexicon.txt
+
+# Append the corpus dictionary. Each line is lower-cased and rewritten by
+# the sed rules in the order listed: bracketed numeric suffixes are removed,
+# the _7, -7 and _ marks after vowels are stripped, a few malformed entries
+# are repaired by hand, the remaining n~ and r-with-bracket symbols are
+# renamed to ni and rh, tabs become spaces and empty lines are dropped.
+# The result is sorted with duplicates removed.
+tr '[:upper:]' '[:lower:]' < "$CORPUS_DIR/diccionarios/T22.full.dic" \
+    | sed \
+        -e 's/([0123456789]*)//g' \
+        -e 's/\([^ ]\)n\~/\1n/g' \
+        -e 's/a_7/a/g' -e 's/e_7/e/g' -e 's/i_7/i/g' -e 's/o_7/o/g' -e 's/u_7/u/g' \
+        -e 's/a-7/a/g' -e 's/e-7/e/g' -e 's/i-7/i/g' -e 's/o-7/o/g' -e 's/u-7/u/g' \
+        -e 's/a_/a/g' -e 's/e_/e/g' -e 's/i_/i/g' -e 's/o_/o/g' -e 's/u_/u/g' \
+        -e 's/_7n.*$//' \
+        -e 's/atl_7tica/atletica/' \
+        -e 's/biol_7gicas/biologicas/' \
+        -e 's/elec_7ctrico/electrico/' \
+        -e 's/gr_7afico/grafico/' \
+        -e 's/s_7lo/solo/' \
+        -e 's/n~/ni/g' -e 's/r(/rh/g' \
+        -e 's/\t/ /g' -e '/^$/d' \
+    | sort -u \
+    >> data/local/dict/lexicon.txt
+
+
+#######################################
+# data/local/dict/silence_phones.txt
+# data/local/dict/optional_silence.txt
+# data/local/dict/nonsilence_phones.txt
+# data/local/dict/extra_questions.txt
+#######################################
+
+printf '%s\n' sil spn > data/local/dict/silence_phones.txt
+printf '%s\n' sil > data/local/dict/optional_silence.txt
+
+# Collect every phone used by a real word, one per line.
+# cut -s skips entries that have no pronunciation, and tr -s together with
+# the blank-line filter keeps repeated or trailing spaces from producing an
+# empty phone. Only POSIX options are used, so this also runs with BSD tools.
+grep -v -e '<UNK>' -e '!SIL' data/local/dict/lexicon.txt \
+    | cut -s -d' ' -f2- \
+    | tr -s ' ' '\n' \
+    | sed '/^$/d' \
+    | sort -u \
+    > data/local/dict/nonsilence_phones.txt
-Original file line number
+Diff line change
@@ @@ -0,0 +1,5 @@ @@
 +DVDCorpusDimex100.zip
 +CorpusDimex100
++
 +data
 +*.wav
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+first_beam=10.0`
	`2`	`+beam=13.0`
	`3`	`+lattice_beam=6.0`