#!/bin/bash

# Copyright   2014  Johns Hopkins University (author: Daniel Povey)
#             2017  Luminar Technologies, Inc. (author: Daniel Galvez)
# Apache 2.0

# Downloads <url-base>/<corpus-part>.tar.gz into <data-base> (unless an
# archive of a known size is already there), unpacks it inside <data-base>,
# and creates <data-base>/LibriSpeech/<corpus-part>/.complete so that later
# invocations for the same part return immediately.
#
# Usage: $0 [--remove-archive] <data-base> <url-base> <corpus-part>
#   --remove-archive  delete the .tar.gz once it has been unpacked
#   <data-base>       existing directory that receives archive and data
#   <url-base>        URL prefix; "/<corpus-part>.tar.gz" is appended to it
#   <corpus-part>     one of the names in $list below
#
# Exit status: 0 on success or when the part was already extracted,
#              1 on bad arguments or any download/extraction failure.

# Corpus parts this script accepts.
list="dev-clean-2 train-clean-5"

# Byte sizes of the known archives.  An existing archive is trusted when its
# size equals ANY of these values (the check is not per-part).
sizes="126046265 332747356"

# Print an error prefixed with the script name to stderr and exit 1.
die() {
  echo "$0: $*" >&2
  exit 1
}

# Print the usage text to stderr and exit 1.
usage() {
  {
    echo "Usage: $0 [--remove-archive] <data-base> <url-base> <corpus-part>"
    echo "e.g.: $0 /export/a05/dgalvez/ www.openslr.org/resources/31 dev-clean-2"
    echo "With --remove-archive it will remove the archive after successfully un-tarring it."
    echo "<corpus-part> can be one of: ${list// /, }."
  } >&2
  exit 1
}

remove_archive=false

if [[ "${1:-}" == --remove-archive ]]; then
  remove_archive=true
  shift
fi

# Without this exit the script would carry on with empty arguments.
if [[ $# -ne 3 ]]; then
  usage
fi

data=$1
url=$2
part=$3

if [[ ! -d "$data" ]]; then
  die "no such directory $data"
fi

part_ok=false
for x in $list; do
  if [[ "$part" == "$x" ]]; then part_ok=true; fi
done
if ! $part_ok; then
  die "expected <corpus-part> to be one of $list, but got '$part'"
fi

if [[ -z "$url" ]]; then
  die "empty URL base."
fi

archive=$data/$part.tar.gz
complete_marker=$data/LibriSpeech/$part/.complete

if [[ -f "$complete_marker" ]]; then
  echo "$0: data part $part was already successfully extracted, nothing to do."
  exit 0
fi

# Discard an existing archive whose size matches none of the known sizes
# (e.g. an interrupted download) so that it is fetched again below.
if [[ -f "$archive" ]]; then
  size=$(wc -c < "$archive")
  size=${size//[[:space:]]/}  # some wc implementations pad the count
  size_ok=false
  for s in $sizes; do
    if [[ "$s" == "$size" ]]; then size_ok=true; fi
  done
  if ! $size_ok; then
    echo "$0: removing existing file $archive because its size in bytes $size"
    echo "does not equal the size of one of the archives."
    rm -- "$archive" || die "could not remove $archive"
  else
    echo "$archive exists and appears to be complete."
  fi
fi

if [[ ! -f "$archive" ]]; then
  if ! command -v wget >/dev/null 2>&1; then
    die "wget is not installed."
  fi
  full_url=$url/$part.tar.gz
  echo "$0: downloading data from $full_url. This may take some time, please be patient."

  # -P saves into $data without changing directory, so a relative
  # <data-base> keeps working for the steps that follow.
  # NOTE(review): --no-check-certificate disables TLS certificate
  # verification; kept for compatibility, consider dropping it.
  if ! wget --no-check-certificate -P "$data" "$full_url"; then
    die "error executing wget $full_url"
  fi
fi

# -C unpacks inside $data, again without a cd.
if ! tar -xvzf "$archive" -C "$data"; then
  die "error un-tarring archive $archive"
fi

# Only report success if the marker could actually be written; it fails when
# the archive did not contain the expected LibriSpeech/<corpus-part> directory.
if ! touch "$complete_marker"; then
  die "could not create $complete_marker"
fi

echo "$0: Successfully downloaded and un-tarred $archive"

if $remove_archive; then
  echo "$0: removing $archive file since --remove-archive option was supplied."
  rm -- "$archive"
fi
0 commit comments