-
Notifications
You must be signed in to change notification settings - Fork 5.3k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[scripts] Extend combine_ali_dirs.sh to combine alignment lattices (#…
…3315) Relevant discussion: https://groups.google.com/forum/#!topic/kaldi-help/2uxfByEAmfw
- Loading branch information
Showing
2 changed files
with
180 additions
and
75 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,105 +1,209 @@ | ||
#!/bin/bash | ||
# Copyright 2016 Xiaohui Zhang Apache 2.0. | ||
# Copyright 2019 SmartAction (kkm) | ||
|
||
# This srcipt operates on alignment directories, such as exp/tri4a_ali | ||
# the output is a new ali dir which has alignments from all the input ali dirs | ||
# This script combines alignment directories, such as exp/tri4a_ali, and | ||
# validates matching of the utterances and alignments after combining. | ||
|
||
# Begin configuration section. | ||
cmd=run.pl | ||
extra_files= | ||
num_jobs=4 | ||
nj=4 | ||
combine_lat=true | ||
combine_ali=true | ||
tolerance=10 | ||
# End configuration section. | ||
echo "$0 $@" # Print the command line for logging | ||
echo "$0 $@" # Print the command line for logging. | ||
|
||
if [ -f path.sh ]; then . ./path.sh; fi | ||
. parse_options.sh || exit 1; | ||
[[ -f path.sh ]] && . ./path.sh | ||
. parse_options.sh || exit 1 | ||
|
||
export LC_ALL=C | ||
|
||
if [[ $# -lt 3 ]]; then | ||
echo "Usage: $0 [options] <data> <dest-ali-dir> <src-ali-dir1> <src-ali-dir2> ..." | ||
echo "e.g.: $0 --num-jobs 32 data/train exp/tri3_ali_combined exp/tri3_ali_1 exp_tri3_ali_2" | ||
echo "Options:" | ||
echo " --extra-files <file1 file2...> # specify addtional files in 'src-ali-dir1' to copy" | ||
echo " --num-jobs <nj> # number of jobs used to split the data directory." | ||
echo " Note, files that don't appear in the first source dir will not be added even if they appear in later ones." | ||
echo " Other than alignments, only files from the first src ali dir are copied." | ||
cat >&2 <<EOF | ||
Usage: $0 [options] <data> <dest-dir> <src-dir1> <src-dir2> ... | ||
e.g.: $0 --nj 32 data/train exp/tri3_ali_combined exp/tri3_ali_1 exp_tri3_ali_2 | ||
Options: | ||
--nj <nj> # number of jobs to split combined archives [4] | ||
--combine_ali false # merge ali.*.gz if present [true] | ||
--combine_lat false # merge lat.*.gz if present [true] | ||
--tolerance <int,%> # maximum percentage of missing alignments or lattices | ||
# w.r.t. total utterances in <data> before error is | ||
# reported [10] | ||
The script checks that certain important files are present and compatible in all | ||
source directories (phones.txt, tree); other are copied from the first source | ||
(cmvn_opts, final.mdl) without much checking. | ||
Both --combine_ali and --combine_lat are true by default, but the script | ||
proceeds with a warning if directories do not contain either alignments or | ||
alignment lattices. Check for files ali.1.gz and/or lat.1.gz in the <dest-dir> | ||
after the script completes if additional programmatic check is required. | ||
EOF | ||
exit 1; | ||
fi | ||
|
||
data=$1; | ||
shift; | ||
dest=$1; | ||
shift; | ||
first_src=$1; | ||
|
||
mkdir -p $dest; | ||
rm $dest/{ali.*.gz,num_jobs} 2>/dev/null | ||
|
||
cp $first_src/phones.txt $dest 2>/dev/null | ||
|
||
export LC_ALL=C | ||
if [[ ! $combine_lat && ! $combine_ali ]]; then | ||
echo "$0: at least one of --combine_lat and --combine_ali must be true" | ||
exit 1 | ||
fi | ||
|
||
for dir in $*; do | ||
if [ ! -f $dir/ali.1.gz ]; then | ||
echo "$0: check if alignments (ali.*.gz) are present in $dir." | ||
exit 1; | ||
data=$1 | ||
dest=$2 | ||
shift 2 | ||
first_src=$1 | ||
|
||
do_ali=$combine_ali | ||
do_lat=$combine_lat | ||
|
||
# Check if alignments and/or lattices are present. Since we combine whichever | ||
# of the two is present, a missing kind only triggers a warning. Also verify | ||
# that the target is different from any source; we cannot combine in-place, | ||
# and a lot of damage could result. | ||
for src in $@; do | ||
if [[ "$(cd 2>/dev/null -P -- "$src" && pwd)" = \ | ||
"$(cd 2>/dev/null -P -- "$dest" && pwd)" ]]; then | ||
echo "$0: error: Source $src is same as target $dest." | ||
exit 1 | ||
fi | ||
if $do_ali && [[ ! -f $src/ali.1.gz ]]; then | ||
echo "$0: warning: Alignments (ali.*.gz) are not present in $src, not" \ | ||
"combining. Consider '--combine_ali false' to suppress this warning." | ||
do_ali=false | ||
fi | ||
if $do_lat && [[ ! -f $src/lat.1.gz ]]; then | ||
echo "$0: warning: Alignment lattices (lat.*.gz) are not present in $src,"\ | ||
"not combining. Consider '--combine_lat false' to suppress this warning." | ||
do_lat=false | ||
fi | ||
done | ||
|
||
for dir in $*; do | ||
for f in tree; do | ||
diff $first_src/$f $dir/$f 1>/dev/null 2>&1 | ||
if [ $? -ne 0 ]; then | ||
echo "$0: Cannot combine alignment directories with different $f files." | ||
fi | ||
done | ||
done | ||
if ! $do_ali && ! $do_lat; then | ||
echo "$0: error: Cannot combine directories." | ||
exit 1 | ||
fi | ||
|
||
for f in final.mdl tree cmvn_opts num_jobs $extra_files; do | ||
# Verify that required files are present in the first directory. | ||
for f in cmvn_opts final.mdl num_jobs phones.txt tree; do | ||
if [ ! -f $first_src/$f ]; then | ||
echo "combine_ali_dir.sh: no such file $first_src/$f" | ||
exit 1; | ||
echo "$0: error: Required source file $first_src/$f is missing." | ||
exit 1 | ||
fi | ||
cp $first_src/$f $dest/ | ||
done | ||
|
||
src_id=0 | ||
temp_dir=$dest/temp | ||
[ -d $temp_dir ] && rm -r $temp_dir; | ||
mkdir -p $temp_dir | ||
echo "$0: dumping alignments in each source directory as single archive and index." | ||
for dir in $*; do | ||
src_id=$((src_id + 1)) | ||
cur_num_jobs=$(cat $dir/num_jobs) || exit 1; | ||
alis=$(for n in $(seq $cur_num_jobs); do echo -n "$dir/ali.$n.gz "; done) | ||
$cmd $dir/log/copy_alignments.log \ | ||
copy-int-vector "ark:gunzip -c $alis|" \ | ||
ark,scp:$temp_dir/ali.$src_id.ark,$temp_dir/ali.$src_id.scp || exit 1; | ||
# Verify that phones and trees are compatible in all directories, and that | ||
# num_jobs files are present, too. | ||
for src in $@; do | ||
if [[ $src != $first_src ]]; then | ||
if [[ ! -f $src/num_jobs ]]; then | ||
echo "$0: error: Required source file $src/num_jobs is missing." | ||
exit 1 | ||
fi | ||
if ! cmp -s $first_src/tree $src/tree; then | ||
echo "$0: error: tree $src/tree is either missing or not the" \ | ||
"same as $first_src/tree." | ||
exit 1 | ||
fi | ||
if [[ ! -f $src/phones.txt ]]; then | ||
echo "$0: error: Required source file $src/phones.txt is missing." | ||
exit 1 | ||
fi | ||
utils/lang/check_phones_compatible.sh $first_src/phones.txt \ | ||
$src/phones.txt || exit 1 | ||
fi | ||
done | ||
sort -m $temp_dir/ali.*.scp > $temp_dir/ali.scp || exit 1; | ||
|
||
echo "$0: splitting data to get reference utt2spk for individual ali.JOB.gz files." | ||
utils/split_data.sh $data $num_jobs || exit 1; | ||
# All checks passed, ok to prepare directory. Copy model and other files from | ||
# the first source, as they are either checked to be compatible, or we do not | ||
# care if they are. | ||
mkdir -p $dest || exit 1 | ||
rm -f $dest/{cmvn_opts,final.mdl,num_jobs,phones.txt,tree} | ||
$do_ali && rm -f $dest/ali.*.{gz,scp} | ||
$do_lat && rm -f $dest/lat.*.{gz,scp} | ||
cp $first_src/{cmvn_opts,final.mdl,phones.txt,tree} $dest/ || exit 1 | ||
cp $first_src/frame_subsampling_factor $dest/ 2>/dev/null # If present. | ||
echo $nj > $dest/num_jobs || exit 1 | ||
|
||
# Make temporary directory, delete on signal, but not on 'exit 1'. | ||
temp_dir=$(mktemp -d $dest/temp.XXXXXX) || exit 1 | ||
cleanup() { rm -rf "$temp_dir"; } | ||
trap cleanup HUP INT TERM | ||
echo "$0: note: Temporary directory $temp_dir will not be deleted in case of" \ | ||
"script failure, so you could examine it for troubleshooting." | ||
|
||
|
||
# This function may be called twice, once to combine alignments and the second | ||
# time to combine lattices. The two invocations are as follows: | ||
# do_combine ali alignments copy-int-vector $@ | ||
# do_combine lat lattices lattice-copy $@ | ||
# where 'ali'/'lat' is a prefix to archive name, 'alignments'/'lattices' go into | ||
# log messages and logfile names, and 'copy-int-vector'/'lattice-copy' is the | ||
# program used to copy corresponding objects. | ||
do_combine() { | ||
local ark=$1 entities=$2 copy_program=$3 | ||
shift 3 | ||
|
||
echo "$0: Gathering $entities from each source directory." | ||
# Assign all source gzipped archive names to an exported variable, one each | ||
# per source directory, so that we can copy archives in a job per source. | ||
src_id=0 | ||
for src in $@; do | ||
src_id=$((src_id + 1)) | ||
nj_src=$(cat $src/num_jobs) || exit 1 | ||
# Create and export variable src_arks_${src_id} for the job runner. | ||
# Each numbered variable will contain the list of archives, e. g.: | ||
# src_arks_1="exp/tri3_ali/ali.1.gz exp/tri3_ali/ali.2.gz ..." | ||
# ('printf' repeats its format as long as there are more arguments). | ||
printf -v src_arks_${src_id} "$src/$ark.%d.gz " $(seq $nj_src) | ||
export src_arks_${src_id} | ||
done | ||
|
||
echo "$0: splitting the alignments to appropriate chunks according to the reference utt2spk files." | ||
utils/filter_scps.pl JOB=1:$num_jobs \ | ||
$data/split$num_jobs/JOB/utt2spk $temp_dir/ali.scp $temp_dir/ali.JOB.scp | ||
# Gather archives in parallel jobs. | ||
$cmd JOB=1:$src_id $dest/log/gather_$entities.JOB.log \ | ||
$copy_program \ | ||
"ark:gunzip -c \${src_arks_JOB} |" \ | ||
"ark,scp:$temp_dir/$ark.JOB.ark,$temp_dir/$ark.JOB.scp" || exit 1 | ||
|
||
# Merge (presumed already sorted) scp's into a single script. | ||
sort -m $temp_dir/$ark.*.scp > $temp_dir/$ark.scp || exit 1 | ||
|
||
echo "$0: Splitting combined $entities into $nj archives on speaker boundary." | ||
$cmd JOB=1:$nj $dest/log/chop_combined_$entities.JOB.log \ | ||
$copy_program \ | ||
"scp:utils/split_scp.pl --utt2spk=$data/utt2spk --one-based -j $nj JOB $temp_dir/$ark.scp |" \ | ||
"ark:| gzip -c > $dest/$ark.JOB.gz" || exit 1 | ||
|
||
# Get some interesting stats, and signal an error if error threshold exceeded. | ||
n_utt=$(wc -l <$data/utt2spk) | ||
n_ali=$(wc -l <$temp_dir/$ark.scp) | ||
n_ali_no_utt=$(join -j1 -v2 $data/utt2spk $temp_dir/$ark.scp | wc -l) | ||
n_utt_no_ali=$(join -j1 -v1 $data/utt2spk $temp_dir/$ark.scp | wc -l) | ||
n_utt_no_ali_pct=$(perl -e "print int($n_utt_no_ali/$n_utt * 100 + .5);") | ||
echo "$0: Combined $n_ali $entities for $n_utt utterances." \ | ||
"There were $n_utt_no_ali utterances (${n_utt_no_ali_pct}%) without" \ | ||
"$entities, and $n_ali_no_utt $entities not matching any utterance." | ||
|
||
if (( $n_utt_no_ali_pct >= $tolerance )); then | ||
echo "$0: error: Percentage of utterances missing $entities," \ | ||
"${n_utt_no_ali_pct}%, is at or above error tolerance ${tolerance}%." | ||
exit 1 | ||
fi | ||
|
||
for i in `seq 1 $num_jobs`; do | ||
copy-int-vector scp:$temp_dir/ali.${i}.scp "ark:|gzip -c >$dest/ali.$i.gz" || exit 1; | ||
done | ||
return 0 | ||
} | ||
|
||
echo $num_jobs > $dest/num_jobs || exit 1 | ||
# Do the actual combining. Do not check returned exit code, as | ||
# the function always calls 'exit 1' on failure. | ||
$do_ali && do_combine ali 'alignments' copy-int-vector "$@" | ||
$do_lat && do_combine lat 'lattices' lattice-copy "$@" | ||
|
||
echo "$0: checking the alignment files generated have at least 90% of the utterances." | ||
for i in `seq 1 $num_jobs`; do | ||
num_lines=`cat $temp_dir/ali.$i.scp | wc -l` || exit 1; | ||
num_lines_tot=`cat $data/split$num_jobs/$i/utt2spk | wc -l` || exit 1; | ||
python -c "import sys; | ||
percent = 100.0 * float($num_lines) / $num_lines_tot | ||
if percent < 90 : | ||
print ('$dest/ali.$i.gz {0}% utterances missing.'.format(percent))" || exit 1; | ||
done | ||
rm -r $temp_dir 2>/dev/null | ||
# Delete the temporary directory on success. | ||
cleanup | ||
|
||
echo "Combined alignments and stored in $dest" | ||
what= | ||
$do_ali && what+='alignments ' | ||
$do_ali && $do_lat && what+='and ' | ||
$do_lat && what+='lattices ' | ||
echo "$0: Stored combined ${what}in $dest" # No period, interferes with | ||
# copy/paste from tty emulator. | ||
exit 0 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
combine_ali_dirs.sh |