@@ -151,36 +151,40 @@ mkdir -p $dir/log $dir/info
151151# Get list of validation utterances.
# Ask the helper script for the frame shift of the data in $data; abort the
# whole script if the helper fails.
frame_shift=$(utils/data/get_frame_shift.sh "$data") || exit 1
153153
# Choose the held-out (validation) utterances and write their ids, one per
# line, to $dir/valid_uttlist; the resulting line count is kept in
# len_valid_uttlist for the size check further down.
if [ -f "$data/utt2uniq" ]; then
  # Must hold out all augmented versions of the same utterance, so whole
  # groups are selected rather than individual utterances.
  # NOTE(review): presumably each line produced by utt2spk_to_spk2utt.pl is
  # "<unique-id> <utt1> <utt2> ..." -- the awk below relies on fields 2..NF
  # being utterance ids.
  echo "$0: File $data/utt2uniq exists, so ensuring the hold-out set" \
       "includes all perturbed versions of the same source utterance."
  # Groups are shuffled first (as the non-utt2uniq branch does for single
  # utterances) so the hold-out set is not simply the first groups in map
  # order.  awk stops with 'exit' as soon as enough utterances were printed;
  # this is equivalent to 'nextfile' on a single stdin stream but works with
  # awk implementations that lack 'nextfile'.
  utils/utt2spk_to_spk2utt.pl "$data/utt2uniq" 2>/dev/null |
    utils/shuffle_list.pl 2>/dev/null |
    awk -v max_utt="$num_utts_subset" '{
        for (n = 2; n <= NF; n++) print $n;
        printed += NF - 1;
        if (printed >= max_utt) exit; }' |
    sort > "$dir/valid_uttlist"
else
  # No augmentation map: take a shuffled sample of utterance ids.
  awk '{print $1}' "$data/utt2spk" | \
    utils/shuffle_list.pl 2>/dev/null | \
    head -n "$num_utts_subset" > "$dir/valid_uttlist"
fi
len_valid_uttlist=$(wc -l < "$dir/valid_uttlist")
170+
# Choose the training-diagnostic subset: utterance ids from utt2spk that are
# NOT in the held-out list, shuffled, truncated to $num_utts_subset entries.
# The resulting line count is kept in len_trainsub_uttlist.
awk '{print $1}' "$data/utt2spk" | \
  utils/filter_scp.pl --exclude "$dir/valid_uttlist" | \
  utils/shuffle_list.pl 2>/dev/null | \
  head -n "$num_utts_subset" > "$dir/train_subset_uttlist"
len_trainsub_uttlist=$(wc -l < "$dir/train_subset_uttlist")
156176
# Refuse to continue when either the validation list or the training
# diagnostic list has fewer than $num_utts_subset entries.
if [[ "$len_valid_uttlist" -lt "$num_utts_subset" ||
      "$len_trainsub_uttlist" -lt "$num_utts_subset" ]]; then
  echo "$0: Number of utterances is very small. Please check your data."
  # exit is a separate statement so it happens even if the echo itself fails.
  exit 1
fi
161181
162- if [ -f $data /utt2uniq ]; then # this matters if you use data augmentation.
163- # because of this stage we can again have utts with lengths less than
164- # frames_per_eg
165- echo " File $data /utt2uniq exists, so augmenting valid_uttlist to"
166- echo " include all perturbed versions of the same 'real' utterances."
167- mv $dir /valid_uttlist $dir /valid_uttlist.tmp
168- utils/utt2spk_to_spk2utt.pl $data /utt2uniq > $dir /uniq2utt
169- cat $dir /valid_uttlist.tmp | utils/apply_map.pl $data /utt2uniq | \
170- sort | uniq | utils/apply_map.pl $dir /uniq2utt | \
171- awk ' {for(n=1;n<=NF;n++) print $n;}' | sort > $dir /valid_uttlist
172- rm $dir /uniq2utt $dir /valid_uttlist.tmp
173- fi
# Report how many utterances went into each list, next to the total number of
# lines in utt2spk.
echo "$0: Holding out $len_valid_uttlist utterances in validation set and" \
     "$len_trainsub_uttlist in training diagnostic set, out of total" \
     "$(wc -l < "$data/utt2spk")."
174185
175- echo " $0 : creating egs. To ensure they are not deleted later you can do: touch $dir /.nodelete"
176186
177- awk ' {print $1}' $data /utt2spk | \
178- utils/filter_scp.pl --exclude $dir /valid_uttlist | \
179- utils/shuffle_list.pl 2> /dev/null | head -$num_utts_subset > $dir /train_subset_uttlist
180- len_uttlist=$( wc -l < $dir /train_subset_uttlist)
181- if [ $len_uttlist -lt $num_utts_subset ]; then
182- echo " Number of utterances is very small. Please check your data." && exit 1;
183- fi
# Progress message; also tells the user which marker file to touch to keep the egs.
echo "$0: creating egs. To ensure they are not deleted later you can do: touch $dir/.nodelete"
184188
185189# # Set up features.
186190echo " $0 : feature type is raw"
@@ -342,9 +346,8 @@ if [ $stage -le 2 ]; then
342346 $egs_opts --normalization-fst-scale=$normalization_fst_scale \
343347 $trans_mdl_opt $chaindir /normalization.fst \
344348 " $train_subset_feats " ark,s,cs:- " ark:$dir /train_subset_all.cegs" || exit 1
345- wait
346349 sleep 5 # wait for file system to sync.
347- echo " ... Getting subsets of validation examples for diagnostics and combination."
350+ echo " $0 : Getting subsets of validation examples for diagnostics and combination."
348351 if $generate_egs_scp ; then
349352 valid_diagnostic_output=" ark,scp:$dir /valid_diagnostic.cegs,$dir /valid_diagnostic.scp"
350353 train_diagnostic_output=" ark,scp:$dir /train_diagnostic.cegs,$dir /train_diagnostic.scp"
@@ -365,7 +368,6 @@ if [ $stage -le 2 ]; then
365368 $cmd $dir /log/create_train_subset_diagnostic.log \
366369 nnet3-chain-subset-egs --n=$num_egs_diagnostic ark:$dir /train_subset_all.cegs \
367370 $train_diagnostic_output || exit 1
368- wait
369371 sleep 5 # wait for file system to sync.
370372 if $generate_egs_scp ; then
371373 cat $dir /valid_combine.cegs $dir /train_combine.cegs | \
@@ -375,7 +377,7 @@ if [ $stage -le 2 ]; then
375377 fi
376378
377379 for f in $dir /{combine,train_diagnostic,valid_diagnostic}.cegs; do
378- [ ! -s $f ] && echo " No examples in file $f " && exit 1;
380+ [ ! -s $f ] && echo " $0 : No examples in file $f " && exit 1;
379381 done
380382 rm $dir /valid_all.cegs $dir /train_subset_all.cegs $dir /{train,valid}_combine.cegs
381383 ) || touch $dir /.error &
@@ -412,7 +414,7 @@ if [ $stage -le 4 ]; then
412414fi
413415
# The background train/valid egs job touches $dir/.error when it fails; stop
# here if that marker file exists.
if [ -f "$dir/.error" ]; then
  echo "$0: Error detected while creating train/valid egs" && exit 1
fi
417419
418420if [ $stage -le 5 ]; then
485487
486488wait
# All background jobs have been waited for at this point; stop if any of them
# left the $dir/.error marker behind.
if [ -f "$dir/.error" ]; then
  echo "$0: Error detected while creating train/valid egs" && exit 1
fi
490492
491493if [ $stage -le 6 ]; then
492- echo " $0 : removing temporary archives"
494+ echo " $0 : Removing temporary archives, alignments and lattices "
493495 (
494496 cd $dir
495497 for f in $( ls -l . | grep ' cegs_orig' | awk ' { X=NF-1; Y=NF-2; if ($X == "->") print $Y, $NF; }' ) ; do rm $f ; done
@@ -501,7 +503,6 @@ if [ $stage -le 6 ]; then
501503 # there are some extra soft links that we should delete.
502504 for f in $dir /cegs.* .* .ark; do rm $f ; done
503505 fi
504- echo " $0 : removing temporary alignments, lattices and transforms"
505506 rm $dir /ali.{ark,scp} 2> /dev/null
506507 rm $dir /lat_special.* .{ark,scp} 2> /dev/null
507508fi
0 commit comments