kaldi-asr
diff --git a/‎egs/wsj/s5/steps/nnet3/chain/get_egs.sh‎
Lines changed: 33 additions & 32 deletions b/‎egs/wsj/s5/steps/nnet3/chain/get_egs.sh‎
Lines changed: 33 additions & 32 deletions
@@ -151,36 +151,40 @@ mkdir -p $dir/log $dir/info
 # Get list of validation utterances.
 frame_shift=$(utils/data/get_frame_shift.sh $data) || exit 1
 
+if [ -f $data/utt2uniq ]; then
+ # Must hold out all augmented versions of the same utterance.
+ echo "$0: File $data/utt2uniq exists, so ensuring the hold-out set" \
+ "includes all perturbed versions of the same source utterance."
+ utils/utt2spk_to_spk2utt.pl $data/utt2uniq 2>/dev/null |
+ awk -v max_utt=$num_utts_subset '{
+ for (n=2;n<=NF;n++) print $n;
+ printed += NF-1;
+ if (printed >= max_utt) nextfile; }' |
+ sort > $dir/valid_uttlist
+else
+ awk '{print $1}' $data/utt2spk | \
+ utils/shuffle_list.pl 2>/dev/null | \
+ head -$num_utts_subset > $dir/valid_uttlist
+fi
+len_valid_uttlist=$(wc -l < $dir/valid_uttlist)
+
 awk '{print $1}' $data/utt2spk | \
- utils/shuffle_list.pl 2>/dev/null | head -$num_utts_subset > $dir/valid_uttlist
+ utils/filter_scp.pl --exclude $dir/valid_uttlist | \
+ utils/shuffle_list.pl 2>/dev/null | \
+ head -$num_utts_subset > $dir/train_subset_uttlist
+len_trainsub_uttlist=$(wc -l <$dir/train_subset_uttlist)
 
-len_uttlist=$(wc -l < $dir/valid_uttlist)
-if [ $len_uttlist -lt $num_utts_subset ]; then
- echo "Number of utterances is very small. Please check your data." && exit 1;
+if [[ $len_valid_uttlist -lt $num_utts_subset ||
+ $len_trainsub_uttlist -lt $num_utts_subset ]]; then
+ echo "$0: Number of utterances is very small. Please check your data." && exit 1;
 fi
 
-if [ -f $data/utt2uniq ]; then # this matters if you use data augmentation.
- # because of this stage we can again have utts with lengths less than
- # frames_per_eg
- echo "File $data/utt2uniq exists, so augmenting valid_uttlist to"
- echo "include all perturbed versions of the same 'real' utterances."
- mv $dir/valid_uttlist $dir/valid_uttlist.tmp
- utils/utt2spk_to_spk2utt.pl $data/utt2uniq > $dir/uniq2utt
- cat $dir/valid_uttlist.tmp | utils/apply_map.pl $data/utt2uniq | \
- sort | uniq | utils/apply_map.pl $dir/uniq2utt | \
- awk '{for(n=1;n<=NF;n++) print $n;}' | sort > $dir/valid_uttlist
- rm $dir/uniq2utt $dir/valid_uttlist.tmp
-fi
+echo "$0: Holding out $len_valid_uttlist utterances in validation set and" \
+ "$len_trainsub_uttlist in training diagnostic set, out of total" \
+ "$(wc -l < $data/utt2spk)."
 
-echo "$0: creating egs. To ensure they are not deleted later you can do: touch $dir/.nodelete"
 
-awk '{print $1}' $data/utt2spk | \
- utils/filter_scp.pl --exclude $dir/valid_uttlist | \
- utils/shuffle_list.pl 2>/dev/null | head -$num_utts_subset > $dir/train_subset_uttlist
-len_uttlist=$(wc -l <$dir/train_subset_uttlist)
-if [ $len_uttlist -lt $num_utts_subset ]; then
- echo "Number of utterances is very small. Please check your data." && exit 1;
-fi
+echo "$0: creating egs. To ensure they are not deleted later you can do: touch $dir/.nodelete"
 
 ## Set up features.
 echo "$0: feature type is raw"
@@ -342,9 +346,8 @@ if [ $stage -le 2 ]; then
  $egs_opts --normalization-fst-scale=$normalization_fst_scale \
  $trans_mdl_opt $chaindir/normalization.fst \
  "$train_subset_feats" ark,s,cs:- "ark:$dir/train_subset_all.cegs" || exit 1
- wait
  sleep 5 # wait for file system to sync.
- echo "... Getting subsets of validation examples for diagnostics and combination."
+ echo "$0: Getting subsets of validation examples for diagnostics and combination."
  if $generate_egs_scp; then
  valid_diagnostic_output="ark,scp:$dir/valid_diagnostic.cegs,$dir/valid_diagnostic.scp"
  train_diagnostic_output="ark,scp:$dir/train_diagnostic.cegs,$dir/train_diagnostic.scp"
@@ -365,7 +368,6 @@ if [ $stage -le 2 ]; then
  $cmd $dir/log/create_train_subset_diagnostic.log \
  nnet3-chain-subset-egs --n=$num_egs_diagnostic ark:$dir/train_subset_all.cegs \
  $train_diagnostic_output || exit 1
- wait
  sleep 5 # wait for file system to sync.
  if $generate_egs_scp; then
  cat $dir/valid_combine.cegs $dir/train_combine.cegs | \
@@ -375,7 +377,7 @@ if [ $stage -le 2 ]; then
  fi
 
  for f in $dir/{combine,train_diagnostic,valid_diagnostic}.cegs; do
- [ ! -s $f ] && echo "No examples in file $f" && exit 1;
+ [ ! -s $f ] && echo "$0: No examples in file $f" && exit 1;
  done
  rm $dir/valid_all.cegs $dir/train_subset_all.cegs $dir/{train,valid}_combine.cegs
  ) || touch $dir/.error &
@@ -412,7 +414,7 @@ if [ $stage -le 4 ]; then
 fi
 
 if [ -f $dir/.error ]; then
- echo "Error detected while creating train/valid egs" && exit 1
+ echo "$0: Error detected while creating train/valid egs" && exit 1
 fi
 
 if [ $stage -le 5 ]; then
@@ -485,11 +487,11 @@ fi
 
 wait
 if [ -f $dir/.error ]; then
- echo "Error detected while creating train/valid egs" && exit 1
+ echo "$0: Error detected while creating train/valid egs" && exit 1
 fi
 
 if [ $stage -le 6 ]; then
- echo "$0: removing temporary archives"
+ echo "$0: Removing temporary archives, alignments and lattices"
  (
  cd $dir
  for f in $(ls -l . | grep 'cegs_orig' | awk '{ X=NF-1; Y=NF-2; if ($X == "->") print $Y, $NF; }'); do rm $f; done
@@ -501,7 +503,6 @@ if [ $stage -le 6 ]; then
  # there are some extra soft links that we should delete.
  for f in $dir/cegs.*.*.ark; do rm $f; done
  fi
- echo "$0: removing temporary alignments, lattices and transforms"
  rm $dir/ali.{ark,scp} 2>/dev/null
  rm $dir/lat_special.*.{ark,scp} 2>/dev/null
 fi