Skip to content

Commit

Permalink
[scripts] Trust frame_shift and utt2num_frames if found (kaldi-asr#3313)
Browse files Browse the repository at this point in the history
Getting utt2dur involves accessing wave files, and potentially
running full pipelines in wav.scp, which may take hours for a
large data set. If utt2num_frames exists and the frame shift is
known, compute utt2dur from them instead.

Issue: kaldi-asr#3303
Fixes: kaldi-asr#3297 "cat: broken pipe"
  • Loading branch information
kkm (aka Kirill Katsnelson) authored and danpovey committed Jun 19, 2019
1 parent 9569384 commit 94aef8d
Show file tree
Hide file tree
Showing 2 changed files with 48 additions and 33 deletions.
63 changes: 33 additions & 30 deletions egs/wsj/s5/utils/data/get_frame_shift.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,19 +14,32 @@
. ./path.sh

if [ $# != 1 ]; then
echo "Usage: $0 <datadir>"
echo "e.g.:"
echo " $0 data/train"
echo "This script prints the frame-shift (e.g. 0.01) to the standard out."
echo "If <datadir> does not contain utt2dur, this script may call utils/data/get_utt2dur.sh,"
echo "which will require write permission to <datadir>"
cat >&2 <<EOF
Usage: frame_shift=\$($0 <datadir>)
e.g.: frame_shift=\$($0 data/train)
This script prints the frame-shift in seconds (e.g. 0.01) to the standard out.
Its output is intended to be captured in a shell variable.
If <datadir> does not contain the file utt2dur, this script may invoke
utils/data/get_utt2dur.sh, which will require write permission to <datadir>.
EOF
exit 1
fi

export LC_ALL=C

dir=$1

# Trust a previously stored frame shift: if $dir/frame_shift exists and is
# non-empty, print its contents and exit with the status of cat, without
# looking at feats.scp or utt2dur.
# The path is quoted so that a <datadir> containing whitespace is read
# correctly instead of being split into several arguments to cat.
if [[ -s "$dir/frame_shift" ]]; then
  cat "$dir/frame_shift"
  exit
fi

# feats.scp is read later to count frames; stop with a diagnostic on stderr
# and exit status 1 if it is not a regular file.
# "$dir" is quoted: unquoted, a path containing whitespace expands to several
# words, `[` fails with "too many arguments", and this check is skipped.
if [ ! -f "$dir/feats.scp" ]; then
  echo "$0: $dir/feats.scp does not exist" 1>&2
  exit 1
fi

if [ ! -s $dir/utt2dur ]; then
if [ ! -e $dir/wav.scp ] && [ ! -s $dir/segments ]; then
Expand All @@ -35,37 +48,27 @@ if [ ! -s $dir/utt2dur ]; then
exit 0
fi
echo "$0: $dir/utt2dur does not exist: creating it" 1>&2
utils/data/get_utt2dur.sh $dir 1>&2
utils/data/get_utt2dur.sh 1>&2 $dir || exit 1
fi

if [ ! -s $dir/frame_shift ]; then
if [ ! -f $dir/feats.scp ]; then
echo "$0: $dir/feats.scp does not exist" 1>&2
exit 1
fi

temp=$(mktemp /tmp/tmp.XXXX)
temp=$(mktemp /tmp/tmp.XXXX) || exit 1

feat-to-len "scp:head -n 10 $dir/feats.scp|" ark,t:- > $temp
feat-to-len --print-args=false "scp:head -n 10 $dir/feats.scp|" ark,t:- > $temp

if [ -z $temp ]; then
echo "$0: error running feat-to-len" 1>&2
exit 1
fi

frame_shift=$(head -n 10 $dir/utt2dur | paste - $temp | \
awk '{ dur += $2; frames += $4; } END { shift = dur / frames; if (shift > 0.01 && shift < 0.0102) shift = 0.01; print shift; }') || exit 1;

echo $frame_shift > $dir/frame_shift
if [[ ! -s $temp ]]; then
rm $temp
fi

frame_shift=$(cat $dir/frame_shift)
if [ -z "$frame_shift" ]; then
echo "$0: Could not read get frame shift from directory $dir" 1>&2
echo "$0: error running feat-to-len" 1>&2
exit 1
fi

echo $frame_shift
# Estimate the frame shift as (total duration) / (total frame count) over the
# first 10 lines of utt2dur pasted side by side with the feat-to-len output
# in $temp. Assuming both files have two fields per line, field 2 is the
# duration and field 4 the number of frames.
# An estimate strictly between 0.01 and 0.0102 is snapped to 0.01.
# If no frames were counted, awk exits non-zero rather than dividing by zero
# (which, depending on the awk implementation, may be a fatal error or may
# print inf/nan that would then be stored as the frame shift).
# On any failure the temporary file is removed before the script exits.
frame_shift=$(head -n 10 "$dir/utt2dur" | paste - "$temp" | awk '
  { dur += $2; frames += $4; }
  END { if (frames <= 0) exit 1;
        shift = dur / frames;
        if (shift > 0.01 && shift < 0.0102) shift = 0.01;
        print shift; }') || { rm -f "$temp"; exit 1; }

rm $temp

echo $frame_shift > $dir/frame_shift
echo $frame_shift
exit 0
18 changes: 15 additions & 3 deletions egs/wsj/s5/utils/data/get_utt2dur.sh
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,8 @@ if [ $# != 1 ]; then
echo " $0 data/train"
echo " Options:"
echo " --frame-shift # frame shift in seconds. Only relevant when we are"
echo " # getting duration from feats.scp (default: 0.01). "
echo " # getting duration from feats.scp, and only if the "
echo " # file frame_shift does not exist (default: 0.01). "
exit 1
fi

Expand All @@ -40,12 +41,17 @@ fi
if [ -s $data/segments ]; then
echo "$0: working out $data/utt2dur from $data/segments"
awk '{len=$4-$3; print $1, len;}' < $data/segments > $data/utt2dur
elif [[ -s $data/frame_shift && -f $data/utt2num_frames ]]; then
echo "$0: computing $data/utt2dur from $data/{frame_shift,utt2num_frames}."
frame_shift=$(cat $data/frame_shift) || exit 1
# The 1.5 correction is the typical value of (frame_length-frame_shift)/frame_shift.
awk -v fs=$frame_shift '{ $2=($2+1.5)*fs; print }' <$data/utt2num_frames >$data/utt2dur
elif [ -f $data/wav.scp ]; then
echo "$0: segments file does not exist so getting durations from wave files"

# if the wav.scp contains only lines of the form
# utt1 /foo/bar/sph2pipe -f wav /baz/foo.sph |
if cat $data/wav.scp | perl -e '
if perl <$data/wav.scp -e '
while (<>) { s/\|\s*$/ |/; # make sure final | is preceded by space.
@A = split; if (!($#A == 5 && $A[1] =~ m/sph2pipe$/ &&
$A[2] eq "-f" && $A[3] eq "wav" && $A[5] eq "|")) { exit(1); }
Expand Down Expand Up @@ -102,7 +108,13 @@ elif [ -f $data/wav.scp ]; then
fi
elif [ -f $data/feats.scp ]; then
echo "$0: wave file does not exist so getting durations from feats files"
feat-to-len scp:$data/feats.scp ark,t:- | awk -v frame_shift=$frame_shift '{print $1, $2*frame_shift;}' >$data/utt2dur
# If $data/frame_shift exists and is non-empty, its contents replace the
# current value of $frame_shift, and the choice is reported on stdout.
# The path is quoted so that a data directory containing whitespace is read
# correctly; a failing cat aborts the script with status 1.
if [[ -s "$data/frame_shift" ]]; then
  frame_shift=$(cat "$data/frame_shift") || exit 1
  echo "$0: using frame_shift=$frame_shift from file $data/frame_shift"
fi
# The 1.5 correction is the typical value of (frame_length-frame_shift)/frame_shift.
feat-to-len scp:$data/feats.scp ark,t:- |
awk -v frame_shift=$frame_shift '{print $1, ($2+1.5)*frame_shift}' >$data/utt2dur
else
echo "$0: Expected $data/wav.scp, $data/segments or $data/feats.scp to exist"
exit 1
Expand Down

0 comments on commit 94aef8d

Please sign in to comment.