Skip to content

Commit 1cd6d2a

Browse files
galv authored and danpovey committed
[egs] mini-librispeech: download data from openslr if it's not present (#1598)
1 parent e527572 commit 1cd6d2a

File tree

2 files changed

+102
-5
lines changed

2 files changed

+102
-5
lines changed
Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
#!/bin/bash

# Copyright 2014 Johns Hopkins University (author: Daniel Povey)
#           2017 Luminar Technologies, Inc. (author: Daniel Galvez)
# Apache 2.0

# Downloads one part of the mini-librispeech corpus and un-tars it.
#
# Usage: download_and_untar.sh [--remove-archive] <data-base> <url-base> <corpus-part>
#   <data-base>     existing directory; the archive is stored here and is
#                   extracted to <data-base>/LibriSpeech/<corpus-part>
#   <url-base>      URL prefix; the archive is fetched from
#                   <url-base>/<corpus-part>.tar.gz
#   <corpus-part>   one of the names in valid_parts below
#   --remove-archive  delete the .tar.gz after a successful extraction
#
# A marker file <data-base>/LibriSpeech/<corpus-part>/.complete is written
# after a successful extraction; if it already exists the script does nothing.
# Exits 0 on success, 1 on any usage, download or extraction error.

# Corpus parts this script accepts.
valid_parts=(dev-clean-2 train-clean-5)

# Byte sizes of the known archives.  An existing archive is accepted if its
# size equals ANY of these; the size is not tied to a specific part.
archive_sizes=(126046265 332747356)

remove_archive=false

if [[ "${1:-}" == --remove-archive ]]; then
  remove_archive=true
  shift
fi

if [[ $# -ne 3 ]]; then
  {
    echo "Usage: $0 [--remove-archive] <data-base> <url-base> <corpus-part>"
    echo "e.g.: $0 /export/a05/dgalvez/ www.openslr.org/resources/31 dev-clean-2"
    echo "With --remove-archive it will remove the archive after successfully un-tarring it."
    echo "<corpus-part> can be one of: ${valid_parts[*]}."
  } >&2
  # Without this exit the script would carry on with missing/extra arguments.
  exit 1
fi

data=$1
url=$2
part=$3

if [[ ! -d "$data" ]]; then
  echo "$0: no such directory $data" >&2
  exit 1
fi

part_ok=false
for candidate in "${valid_parts[@]}"; do
  if [[ "$part" == "$candidate" ]]; then
    part_ok=true
  fi
done
if [[ "$part_ok" != true ]]; then
  echo "$0: expected <corpus-part> to be one of ${valid_parts[*]}, but got '$part'" >&2
  exit 1
fi

if [[ -z "$url" ]]; then
  echo "$0: empty URL base." >&2
  exit 1
fi

archive="$data/$part.tar.gz"
complete_marker="$data/LibriSpeech/$part/.complete"

if [[ -f "$complete_marker" ]]; then
  echo "$0: data part $part was already successfully extracted, nothing to do."
  exit 0
fi

# If an archive is already present, keep it only when its size matches a
# known archive size; otherwise assume a truncated download and remove it.
if [[ -f "$archive" ]]; then
  # wc -c on redirected input prints just the byte count (some
  # implementations pad it with blanks, hence the tr).
  size=$(wc -c <"$archive" | tr -d '[:space:]')
  size_ok=false
  for expected in "${archive_sizes[@]}"; do
    if [[ "$expected" == "$size" ]]; then
      size_ok=true
    fi
  done
  if [[ "$size_ok" != true ]]; then
    echo "$0: removing existing file $archive because its size in bytes $size"
    echo "does not equal the size of one of the archives."
    rm -- "$archive"
  else
    echo "$archive exists and appears to be complete."
  fi
fi

if [[ ! -f "$archive" ]]; then
  if ! command -v wget >/dev/null; then
    echo "$0: wget is not installed." >&2
    exit 1
  fi
  full_url="$url/$part.tar.gz"
  echo "$0: downloading data from $full_url. This may take some time, please be patient."

  # Download straight into $data instead of cd-ing there: changing directory
  # would invalidate every later use of a relative $data.
  if ! wget -P "$data" --no-check-certificate "$full_url"; then
    echo "$0: error executing wget $full_url" >&2
    exit 1
  fi
fi

# -C makes tar extract inside $data without changing this script's own
# working directory, so the paths below stay valid for relative $data too.
if ! tar -C "$data" -xvzf "$archive"; then
  echo "$0: error un-tarring archive $archive" >&2
  exit 1
fi

touch -- "$complete_marker"

echo "$0: Successfully downloaded and un-tarred $archive"

if [[ "$remove_archive" == true ]]; then
  echo "$0: removing $archive file since --remove-archive option was supplied."
  rm -- "$archive"
fi

egs/mini_librispeech/s5/run.sh

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
# Note: this works only on pre-downloaded data on the CLSP servers
44
data=/export/a05/dgalvez/
55

6-
data_url=www.openslr.org/resources/TODO # TODO
6+
data_url=www.openslr.org/resources/31
77
lm_url=www.openslr.org/resources/11
88

99
. ./cmd.sh
@@ -15,10 +15,11 @@ stage=0
1515
# TODO(galv): Reconsider this
1616
set -euxo pipefail
1717

18-
# TODO(galv): Modify openslr.org to contain the minified training dataset.
19-
# for part in dev-clean-2 train-clean-5; do
20-
# local/download_and_untar.sh $data $data_url $part
21-
# done
18+
mkdir -p $data
19+
20+
for part in dev-clean-2 train-clean-5; do
21+
local/download_and_untar.sh $data $data_url $part
22+
done
2223

2324
if [ $stage -le 0 ]; then
2425
local/download_lm.sh $lm_url data/local/lm

0 commit comments

Comments
 (0)