Skip to content

Commit 213ae52

Browse files
kkm000 authored and danpovey committed
[scripts] Avoid holding out more data than the requested num-utts (due to utt2uniq) (#3141)
1 parent aead118 commit 213ae52

File tree

1 file changed

+33
-32
lines changed

1 file changed

+33
-32
lines changed

egs/wsj/s5/steps/nnet3/chain/get_egs.sh

Lines changed: 33 additions & 32 deletions
Original file line number | Diff line number | Diff line change
@@ -151,36 +151,40 @@ mkdir -p $dir/log $dir/info
151151
# Get list of validation utterances.
152152
frame_shift=$(utils/data/get_frame_shift.sh $data) || exit 1
153153

154+
if [ -f $data/utt2uniq ]; then
155+
# Must hold out all augmented versions of the same utterance.
156+
echo "$0: File $data/utt2uniq exists, so ensuring the hold-out set" \
157+
"includes all perturbed versions of the same source utterance."
158+
utils/utt2spk_to_spk2utt.pl $data/utt2uniq 2>/dev/null |
159+
awk -v max_utt=$num_utts_subset '{
160+
for (n=2;n<=NF;n++) print $n;
161+
printed += NF-1;
162+
if (printed >= max_utt) nextfile; }' |
163+
sort > $dir/valid_uttlist
164+
else
165+
awk '{print $1}' $data/utt2spk | \
166+
utils/shuffle_list.pl 2>/dev/null | \
167+
head -$num_utts_subset > $dir/valid_uttlist
168+
fi
169+
len_valid_uttlist=$(wc -l < $dir/valid_uttlist)
170+
154171
awk '{print $1}' $data/utt2spk | \
155-
utils/shuffle_list.pl 2>/dev/null | head -$num_utts_subset > $dir/valid_uttlist
172+
utils/filter_scp.pl --exclude $dir/valid_uttlist | \
173+
utils/shuffle_list.pl 2>/dev/null | \
174+
head -$num_utts_subset > $dir/train_subset_uttlist
175+
len_trainsub_uttlist=$(wc -l <$dir/train_subset_uttlist)
156176

157-
len_uttlist=$(wc -l < $dir/valid_uttlist)
158-
if [ $len_uttlist -lt $num_utts_subset ]; then
159-
echo "Number of utterances is very small. Please check your data." && exit 1;
177+
if [[ $len_valid_uttlist -lt $num_utts_subset ||
178+
$len_trainsub_uttlist -lt $num_utts_subset ]]; then
179+
echo "$0: Number of utterances is very small. Please check your data." && exit 1;
160180
fi
161181

162-
if [ -f $data/utt2uniq ]; then # this matters if you use data augmentation.
163-
# because of this stage we can again have utts with lengths less than
164-
# frames_per_eg
165-
echo "File $data/utt2uniq exists, so augmenting valid_uttlist to"
166-
echo "include all perturbed versions of the same 'real' utterances."
167-
mv $dir/valid_uttlist $dir/valid_uttlist.tmp
168-
utils/utt2spk_to_spk2utt.pl $data/utt2uniq > $dir/uniq2utt
169-
cat $dir/valid_uttlist.tmp | utils/apply_map.pl $data/utt2uniq | \
170-
sort | uniq | utils/apply_map.pl $dir/uniq2utt | \
171-
awk '{for(n=1;n<=NF;n++) print $n;}' | sort > $dir/valid_uttlist
172-
rm $dir/uniq2utt $dir/valid_uttlist.tmp
173-
fi
182+
echo "$0: Holding out $len_valid_uttlist utterances in validation set and" \
183+
"$len_trainsub_uttlist in training diagnostic set, out of total" \
184+
"$(wc -l < $data/utt2spk)."
174185

175-
echo "$0: creating egs. To ensure they are not deleted later you can do: touch $dir/.nodelete"
176186

177-
awk '{print $1}' $data/utt2spk | \
178-
utils/filter_scp.pl --exclude $dir/valid_uttlist | \
179-
utils/shuffle_list.pl 2>/dev/null | head -$num_utts_subset > $dir/train_subset_uttlist
180-
len_uttlist=$(wc -l <$dir/train_subset_uttlist)
181-
if [ $len_uttlist -lt $num_utts_subset ]; then
182-
echo "Number of utterances is very small. Please check your data." && exit 1;
183-
fi
187+
echo "$0: creating egs. To ensure they are not deleted later you can do: touch $dir/.nodelete"
184188

185189
## Set up features.
186190
echo "$0: feature type is raw"
@@ -342,9 +346,8 @@ if [ $stage -le 2 ]; then
342346
$egs_opts --normalization-fst-scale=$normalization_fst_scale \
343347
$trans_mdl_opt $chaindir/normalization.fst \
344348
"$train_subset_feats" ark,s,cs:- "ark:$dir/train_subset_all.cegs" || exit 1
345-
wait
346349
sleep 5 # wait for file system to sync.
347-
echo "... Getting subsets of validation examples for diagnostics and combination."
350+
echo "$0: Getting subsets of validation examples for diagnostics and combination."
348351
if $generate_egs_scp; then
349352
valid_diagnostic_output="ark,scp:$dir/valid_diagnostic.cegs,$dir/valid_diagnostic.scp"
350353
train_diagnostic_output="ark,scp:$dir/train_diagnostic.cegs,$dir/train_diagnostic.scp"
@@ -365,7 +368,6 @@ if [ $stage -le 2 ]; then
365368
$cmd $dir/log/create_train_subset_diagnostic.log \
366369
nnet3-chain-subset-egs --n=$num_egs_diagnostic ark:$dir/train_subset_all.cegs \
367370
$train_diagnostic_output || exit 1
368-
wait
369371
sleep 5 # wait for file system to sync.
370372
if $generate_egs_scp; then
371373
cat $dir/valid_combine.cegs $dir/train_combine.cegs | \
@@ -375,7 +377,7 @@ if [ $stage -le 2 ]; then
375377
fi
376378

377379
for f in $dir/{combine,train_diagnostic,valid_diagnostic}.cegs; do
378-
[ ! -s $f ] && echo "No examples in file $f" && exit 1;
380+
[ ! -s $f ] && echo "$0: No examples in file $f" && exit 1;
379381
done
380382
rm $dir/valid_all.cegs $dir/train_subset_all.cegs $dir/{train,valid}_combine.cegs
381383
) || touch $dir/.error &
@@ -412,7 +414,7 @@ if [ $stage -le 4 ]; then
412414
fi
413415

414416
if [ -f $dir/.error ]; then
415-
echo "Error detected while creating train/valid egs" && exit 1
417+
echo "$0: Error detected while creating train/valid egs" && exit 1
416418
fi
417419

418420
if [ $stage -le 5 ]; then
@@ -485,11 +487,11 @@ fi
485487

486488
wait
487489
if [ -f $dir/.error ]; then
488-
echo "Error detected while creating train/valid egs" && exit 1
490+
echo "$0: Error detected while creating train/valid egs" && exit 1
489491
fi
490492

491493
if [ $stage -le 6 ]; then
492-
echo "$0: removing temporary archives"
494+
echo "$0: Removing temporary archives, alignments and lattices"
493495
(
494496
cd $dir
495497
for f in $(ls -l . | grep 'cegs_orig' | awk '{ X=NF-1; Y=NF-2; if ($X == "->") print $Y, $NF; }'); do rm $f; done
@@ -501,7 +503,6 @@ if [ $stage -le 6 ]; then
501503
# there are some extra soft links that we should delete.
502504
for f in $dir/cegs.*.*.ark; do rm $f; done
503505
fi
504-
echo "$0: removing temporary alignments, lattices and transforms"
505506
rm $dir/ali.{ark,scp} 2>/dev/null
506507
rm $dir/lat_special.*.{ark,scp} 2>/dev/null
507508
fi

0 commit comments

Comments
 (0)