Skip to content

Commit e3a9844

Browse files
alx741 and danpovey authored and committed
[egs] Add Spanish dimex100 example (#3254)
1 parent 286e8af commit e3a9844

File tree

14 files changed

+628
-0
lines changed

14 files changed

+628
-0
lines changed

egs/spanish_dimex100/README.txt

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
About the DIMEx100 corpus:
2+
Mexican Spanish clean speech corpus introduced in Pineda et al. (2001).
3+
"DIMEx100: A New Phonetic and Speech Corpus for Mexican Spanish".
4+
5+
> Studio recorded audio with a total of 6000 phrases by 100 speakers.
6+
> Mono / 16-bit / 44.1 kHz
7+
> Three different levels of transcription
8+
> For additional information about the corpus design and
9+
characteristics refer to (Pineda, 2001)
10+
11+
12+
Created by the computer science department of the "Investigaciones en
13+
Matemáticas Aplicadas y en Sistemas (IIMAS)" institute at the "National
14+
Autonomous University of Mexico (UNAM)".
15+
16+
DIMEx100 corpus is available free of charge for academic purposes
17+
exclusively. For commercial use a formal agreement with UNAM is required.
18+
For more information refer to
19+
http://turing.iimas.unam.mx/~luis/DIME/CORPUS-DIMEX.html
20+
21+
Example author:
22+
Daniel A. Campoverde <alx@sillybytes.net>

egs/spanish_dimex100/s5/.gitignore

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
# Corpus archive and the directory it unzips to
DVDCorpusDimex100.zip
CorpusDimex100

# Generated data directories and converted audio
data
*.wav

egs/spanish_dimex100/s5/RESULTS

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
#!/bin/bash

# Print the best WER for every decoding directory under exp/.
# "$x" is quoted to guard against word-splitting/globbing (SC2086).
for x in exp/*/decode*; do [ -d "$x" ] && grep WER "$x"/wer_* | utils/best_wer.sh; done
exit 0

# Result on decode_test (tri2b_mmi_b0.05)
%WER 7.58 [ 72 / 950, 50 ins, 0 del, 22 sub ] exp/tri2b_mmi_b0.05/decode_test/wer_15:2

egs/spanish_dimex100/s5/cmd.sh

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
# Queue configuration for this recipe.
#
# To run everything on a local machine with no queueing system, replace
# every 'queue.pl' below with 'run.pl' -- but then run the stages one at
# a time, since most recipes will otherwise exhaust the machine's memory.
# queue.pl works with GridEngine (qsub); slurm.pl works with Slurm.
# Queues differ in their names and in how resources such as memory are
# requested; to account for that, create and edit conf/queue.conf to
# match your queue's configuration.  Search for conf/queue.conf in
# http://kaldi-asr.org/doc/queue.html, or for the string
# 'default_config' in utils/queue.pl or utils/slurm.pl.

export train_cmd="queue.pl --mem 2G"
export decode_cmd="queue.pl --mem 4G"
export mkgraph_cmd="queue.pl --mem 8G"
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
first_beam=10.0
2+
beam=13.0
3+
lattice_beam=6.0
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
# MFCC extraction options for compute-mfcc-feats.
# Only non-default option: use C0 rather than raw log-energy.
--use-energy=false
Lines changed: 286 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,286 @@
1+
#!/bin/bash

# Data preparation for the DIMEx100 corpus: builds the Kaldi data files
# (text, wav.scp, utt2spk, spk2utt) under data/train and data/test from
# an unzipped corpus directory.
#
## Only run this file from the example root directory
## $ ./local/data_prep.sh <corpus-dir>

# Dimex100 unzipped corpus root directory (first positional argument).
CORPUS_DIR="$1"

# Fail early instead of silently preparing empty data when the corpus
# directory is missing or not given (the original did no validation).
if [ ! -d "$CORPUS_DIR" ]; then
    echo "Usage: $0 <dimex100-corpus-dir>" >&2
    exit 1
fi

mkdir -p "data/train" "data/test" "data/local"

source ./path.sh

# Corpus data
#
# Number of different speakers: 100
# Common utterances per speaker: 10
# Individual utterances per speaker: 50
#
# Training/testing split
#
# Common utterances for training: 10 (100%)
# Individual utterances for training: 40 (80%)
# Individual utterances for testing: 10 (20%)
N_SPEAKERS=100
N_COMMON_UTTERANCES=10
N_INDIVIDUAL_UTTERANCES=50
N_INDIVIDUAL_UTTERANCES_TRAINING=40
N_INDIVIDUAL_UTTERANCES_TESTING=10

# Utterance ID scheme: speakerId-utteranceId-[c|i]
#   c = speaker common utterances (10)
#   i = speaker individual utterances (50)
#
# e.g.:
#   s001-01-c ... s001-10-c
#   s001-01-i ... s001-50-i

## 80-20 train-test split
## Only individual utterances are used in testing
#   10/10 common utterances go into training
#   40/50 individual utterances go into training
#   10/50 individual utterances go into testing
49+
# Format a 1-based speaker index as the corpus speaker ID,
# e.g. 1 -> "s001", 100 -> "s100".
make_speaker_id() {
    printf 's%03d' "$1"
}
53+
54+
# Zero-pad a 1-based utterance index to two digits,
# e.g. 3 -> "03", 50 -> "50".
make_sentence_id() {
    printf '%02d' "$1"
}
58+
59+
#####################################
# Convert wave audio to 16-bit, 16kHz
#####################################

# Resample the corpus audio from 44.1 kHz to 16 kHz with sox, writing
# the output under each speaker's audio_16k/ next to audio_editado/.
#
# FIX: the individual-utterance loop now runs over all
# $N_INDIVIDUAL_UTTERANCES (50) instead of only the 40 training ones.
# Previously utterances 41-50 were never resampled, so the test-set
# wav.scp generation below (which checks [ -f "$wav_file" ]) silently
# dropped almost the whole test set.
function convert_to_16khz
{
    for i in $(seq 1 $N_SPEAKERS); do
        speaker_id=$(make_speaker_id "$i")

        mkdir -p "$CORPUS_DIR/$speaker_id/audio_16k/comunes"
        mkdir -p "$CORPUS_DIR/$speaker_id/audio_16k/individuales"

        # Common utterances
        for j in $(seq 1 $N_COMMON_UTTERANCES); do
            sentence_id=$(make_sentence_id "$j")
            old_wav_file="$CORPUS_DIR/$speaker_id/audio_editado/comunes/$speaker_id$sentence_id.wav"
            new_wav_file="$CORPUS_DIR/$speaker_id/audio_16k/comunes/$speaker_id$sentence_id.wav"
            sox "$old_wav_file" -r 16k "$new_wav_file"
        done

        # Individual utterances (all of them: training and test splits)
        for k in $(seq 1 $N_INDIVIDUAL_UTTERANCES); do
            sentence_id=$(make_sentence_id "$k")
            old_wav_file="$CORPUS_DIR/$speaker_id/audio_editado/individuales/$speaker_id$sentence_id.wav"
            new_wav_file="$CORPUS_DIR/$speaker_id/audio_16k/individuales/$speaker_id$sentence_id.wav"
            sox "$old_wav_file" -r 16k "$new_wav_file"
        done
    done
}

# Only convert once: skip if the first speaker already has audio_16k/.
if [[ ! -d "$CORPUS_DIR/s001/audio_16k" ]]; then
    echo
    echo "Converting audio from 44.1kHz to 16kHz"
    echo
    convert_to_16khz
fi
95+
96+
97+
98+
#################
99+
# data/train/text
100+
# data/test/text
101+
#################
102+
103+
# speakerId-utteranceId-[c|i]
104+
# c = speaker common utterances (10)
105+
# i = speaker individual utterances (50)
106+
#
107+
# e.g.:
108+
# s001-01-c
109+
# ...
110+
# s001-10-c
111+
# ...
112+
# s001-01-i
113+
# ...
114+
# s001-50-i
115+
116+
## 80-20 train-test split
117+
## Only individual utterances are used in testing
118+
# 10/10 common utterances go into training
119+
# 40/50 individual utterances go into training
120+
# 10/50 individual utterances go into testing
121+
122+
123+
124+
# Normalize a raw transcription: remove carriage returns, lowercase,
# fold Spanish diacritics (accented vowels, n-tilde, u-diaeresis) to
# plain ASCII, and delete every character that is not an ASCII
# alphanumeric, space or newline.
function clean
{
    echo "$1" \
        | tr '[:upper:]' '[:lower:]' \
        | tr -d '\r' \
        | sed \
            -e 's/á/a/g' -e 's/é/e/g' -e 's/í/i/g' -e 's/ó/o/g' -e 's/ú/u/g' \
            -e 's/Á/a/g' -e 's/É/e/g' -e 's/Í/i/g' -e 's/Ó/o/g' -e 's/Ú/u/g' \
            -e 's/ñ/n/g' -e 's/Ñ/n/g' -e 's/ü/u/g' -e 's/Ü/u/g' \
        | tr -cd "a-zA-Z0-9 \r\n"
}
136+
137+
### Generate data/train/text
# One line per utterance: "<utterance-id> <normalized transcription>".
# Truncate first so that re-running data_prep.sh does not append
# duplicate entries (the original only ever used >>).
: > "data/train/text"
for i in $(seq 1 $N_SPEAKERS); do
    speaker_id=$(make_speaker_id "$i")

    # Common utterances (all 10 go to training)
    for j in $(seq 1 $N_COMMON_UTTERANCES); do
        sentence_id=$(make_sentence_id "$j")
        utterance_id="$speaker_id-$sentence_id-c"
        trans_file="$CORPUS_DIR/$speaker_id/texto/comunes/$speaker_id$sentence_id.txt"
        # Corpus transcriptions are ISO-8859-1; re-encode to UTF-8.
        iconv -f iso-8859-1 -t utf-8 "$trans_file" > "$trans_file.utf8"
        if [ -f "$trans_file.utf8" ]; then
            transcription=$(cat "$trans_file.utf8")
            transcription=$(clean "$transcription")
            echo "$utterance_id $transcription" >> "data/train/text"
        fi
    done

    # Individual utterances 1..40 (training split)
    for k in $(seq 1 $N_INDIVIDUAL_UTTERANCES_TRAINING); do
        sentence_id=$(make_sentence_id "$k")
        utterance_id="$speaker_id-$sentence_id-i"
        trans_file="$CORPUS_DIR/$speaker_id/texto/individuales/$speaker_id$sentence_id.txt"
        iconv -f iso-8859-1 -t utf-8 "$trans_file" > "$trans_file.utf8"
        if [ -f "$trans_file.utf8" ]; then
            transcription=$(cat "$trans_file.utf8")
            transcription=$(clean "$transcription")
            echo "$utterance_id $transcription" >> "data/train/text"
        fi
    done

done
168+
169+
170+
### Generate data/test/text
# Individual utterances 41..50 form the test split.
# FIX: the original 'seq $N_INDIVIDUAL_UTTERANCES_TRAINING
# $N_INDIVIDUAL_UTTERANCES' started at 40, which put utterance 40 in
# BOTH the training and the test set (11 utterances instead of the
# documented 10 -- a train/test leak).  Start one past the last
# training index.  Also truncate the output so re-runs don't duplicate.
: > "data/test/text"
for i in $(seq 1 $N_SPEAKERS); do
    speaker_id=$(make_speaker_id "$i")

    # Individual utterances 41..50 (test split)
    for k in $(seq $((N_INDIVIDUAL_UTTERANCES_TRAINING + 1)) $N_INDIVIDUAL_UTTERANCES); do
        sentence_id=$(make_sentence_id "$k")
        utterance_id="$speaker_id-$sentence_id-i"
        trans_file="$CORPUS_DIR/$speaker_id/texto/individuales/$speaker_id$sentence_id.txt"
        # Corpus transcriptions are ISO-8859-1; re-encode to UTF-8.
        iconv -f iso-8859-1 -t utf-8 "$trans_file" > "$trans_file.utf8"
        if [ -f "$trans_file.utf8" ]; then
            transcription=$(cat "$trans_file.utf8")
            transcription=$(clean "$transcription")
            echo "$utterance_id $transcription" >> "data/test/text"
        fi
    done

done
188+
189+
190+
191+
192+
####################
# data/train/wav.scp
# data/test/wav.scp
####################


### Generate data/train/wav.scp
# One line per utterance: "<utterance-id> <path to 16 kHz wav>".
# Truncate first so re-running the script does not duplicate entries;
# utterances whose audio file is missing are silently skipped.
: > "data/train/wav.scp"
for i in $(seq 1 $N_SPEAKERS); do
    speaker_id=$(make_speaker_id "$i")

    # Common utterances
    for j in $(seq 1 $N_COMMON_UTTERANCES); do
        sentence_id=$(make_sentence_id "$j")
        utterance_id="$speaker_id-$sentence_id-c"
        wav_file="$CORPUS_DIR/$speaker_id/audio_16k/comunes/$speaker_id$sentence_id.wav"
        if [ -f "$wav_file" ]; then
            echo "$utterance_id $wav_file" >> "data/train/wav.scp"
        fi
    done

    # Individual utterances 1..40 (training split)
    for k in $(seq 1 $N_INDIVIDUAL_UTTERANCES_TRAINING); do
        sentence_id=$(make_sentence_id "$k")
        utterance_id="$speaker_id-$sentence_id-i"
        wav_file="$CORPUS_DIR/$speaker_id/audio_16k/individuales/$speaker_id$sentence_id.wav"
        if [ -f "$wav_file" ]; then
            echo "$utterance_id $wav_file" >> "data/train/wav.scp"
        fi
    done

done
223+
224+
225+
### Generate data/test/wav.scp
# Individual utterances 41..50 form the test split.
# FIX: the original 'seq $N_INDIVIDUAL_UTTERANCES_TRAINING
# $N_INDIVIDUAL_UTTERANCES' started at 40 and therefore listed
# utterance 40 in both the training and the test set; start one past
# the last training index.  Truncate so re-runs don't duplicate.
: > "data/test/wav.scp"
for i in $(seq 1 $N_SPEAKERS); do
    speaker_id=$(make_speaker_id "$i")

    # Individual utterances 41..50 (test split)
    for k in $(seq $((N_INDIVIDUAL_UTTERANCES_TRAINING + 1)) $N_INDIVIDUAL_UTTERANCES); do
        sentence_id=$(make_sentence_id "$k")
        utterance_id="$speaker_id-$sentence_id-i"
        wav_file="$CORPUS_DIR/$speaker_id/audio_16k/individuales/$speaker_id$sentence_id.wav"
        if [ -f "$wav_file" ]; then
            echo "$utterance_id $wav_file" >> "data/test/wav.scp"
        fi
    done

done
240+
241+
242+
243+
244+
####################
# data/train/utt2spk
# data/test/utt2spk
####################

# Take IDs from the 'text' files so that utterances with missing data
# are not included.  The speaker ID is the first '-'-separated field of
# the utterance ID (e.g. "s001-01-c" -> "s001").
#
# Improvements over the original:
#   - truncate the output files so re-runs don't append duplicates
#   - parameter expansion instead of forking 'echo | cut' per line
#   - reading from cut directly avoids the '<<<' here-string, which
#     emitted one bogus " " line when the text file was empty

### Generate data/train/utt2spk
: > "data/train/utt2spk"
while read -r utterance_id; do
    speaker_id=${utterance_id%%-*}
    echo "$utterance_id $speaker_id" >> "data/train/utt2spk"
done < <(cut -d' ' -f1 "data/train/text")

### Generate data/test/utt2spk
: > "data/test/utt2spk"
while read -r utterance_id; do
    speaker_id=${utterance_id%%-*}
    echo "$utterance_id $speaker_id" >> "data/test/utt2spk"
done < <(cut -d' ' -f1 "data/test/text")
267+
268+
269+
############
270+
# Sort files
271+
############
272+
273+
LC_ALL=C sort -o "data/train/text" "data/train/text"
274+
LC_ALL=C sort -o "data/test/text" "data/test/text"
275+
LC_ALL=C sort -o "data/train/wav.scp" "data/train/wav.scp"
276+
LC_ALL=C sort -o "data/test/wav.scp" "data/test/wav.scp"
277+
LC_ALL=C sort -o "data/train/utt2spk" "data/train/utt2spk"
278+
LC_ALL=C sort -o "data/test/utt2spk" "data/test/utt2spk"
279+
280+
281+
####################
282+
# data/train/spk2utt
283+
# data/test/spk2utt
284+
####################
285+
utils/utt2spk_to_spk2utt.pl "data/train/utt2spk" > "data/train/spk2utt"
286+
utils/utt2spk_to_spk2utt.pl "data/test/utt2spk" > "data/test/spk2utt"
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
#!/bin/bash

# Dictionary preparation for the DIMEx100 corpus: builds the files under
# data/local/dict from the corpus T22 pronunciation dictionary.
#
## Only run this file from the example root directory
## $ ./local/dict_prep.sh <corpus-dir>
## NOTE(review): the original usage comment said data_prep.sh, which is
## a different script -- confirm this file's actual name under local/.

# Dimex100 unzipped corpus root directory (first positional argument).
CORPUS_DIR="$1"

# Fail early when the corpus directory is missing or not given
# (the original did no validation).
if [ ! -d "$CORPUS_DIR" ]; then
    echo "Usage: $0 <dimex100-corpus-dir>" >&2
    exit 1
fi

mkdir -p "data/local/dict"

source ./path.sh
11+
12+
#############################
# data/local/dict/lexicon.txt
#############################

# Byte-order collation so the sort/uniq below are reproducible.
export LC_ALL=C

# Seed the lexicon with the silence and unknown-word entries, then
# append the corpus T22 pronunciation dictionary, normalized:
#   sed #1: strip "(N)" pronunciation-variant numbering; fold in-word
#           "n~" to "n"; fold the "_7" / "-7" / trailing "_" vowel
#           markers down to plain vowels (presumably stress/length
#           marks in the T22 notation -- verify against the corpus docs)
#   sed #2: truncate lines at a stray "_7n" sequence, and ad-hoc fixes
#           for a few entries still carrying "_7" (likely typos in the
#           source dictionary)
#   sed #3: phone renames: "n~" -> "ni", "r(" -> "rh"
#   sed #4: tabs -> spaces; drop empty lines
# Finally sort | uniq to de-duplicate entries.
echo -e '!SIL sil\n<UNK> spn' > data/local/dict/lexicon.txt
cat "$CORPUS_DIR/diccionarios/T22.full.dic" \
| tr '[:upper:]' '[:lower:]' \
| sed -e 's/([0123456789]*)//g' \
-e 's/\([^ ]\)n\~/\1n/g' \
-e 's/a_7/a/g' -e 's/e_7/e/g' -e 's/i_7/i/g' -e 's/o_7/o/g' -e 's/u_7/u/g' \
-e 's/a-7/a/g' -e 's/e-7/e/g' -e 's/i-7/i/g' -e 's/o-7/o/g' -e 's/u-7/u/g' \
-e 's/a_/a/g' -e 's/e_/e/g' -e 's/i_/i/g' -e 's/o_/o/g' -e 's/u_/u/g' \
| sed -e 's/_7n.*$//' \
-e 's/atl_7tica/atletica/' \
-e 's/biol_7gicas/biologicas/' \
-e 's/elec_7ctrico/electrico/' \
-e 's/gr_7afico/grafico/' \
-e 's/s_7lo/solo/' \
| sed -e 's/n~/ni/g' -e 's/r(/rh/g' \
| sed -e 's/\t/ /g' -e '/^$/d' \
| sort | uniq \
>> data/local/dict/lexicon.txt
36+
37+
38+
#######################################
# data/local/dict/silence_phones.txt
# data/local/dict/optional_silence.txt
# data/local/dict/nonsilence_phones.txt
#######################################
# NOTE(review): the original banner also listed extra_questions.txt,
# but this script does not create it -- confirm it is produced
# elsewhere in the recipe.

# Silence/unknown phones, and the single optional-silence phone.
printf 'sil\nspn\n' > data/local/dict/silence_phones.txt
printf 'sil\n' > data/local/dict/optional_silence.txt

# Non-silence phones: every phone appearing in the lexicon except the
# !SIL / <UNK> entries -- drop the word column, one phone per line,
# de-duplicated.
grep -v -e '<UNK>' -e '!SIL' data/local/dict/lexicon.txt \
    | cut -d' ' -f1 --complement \
    | tr ' ' '\n' \
    | sort -u \
    > data/local/dict/nonsilence_phones.txt

0 commit comments

Comments
 (0)