Skip to content

Commit

Permalink
[scripts] Make extend_lang.sh support lexiconp_silprob.txt (kaldi-asr…
Browse files Browse the repository at this point in the history
  • Loading branch information
DongjiGao authored and Bar-BY committed Jan 21, 2020
1 parent 676d4b4 commit e87d013
Show file tree
Hide file tree
Showing 2 changed files with 97 additions and 28 deletions.
124 changes: 96 additions & 28 deletions egs/wsj/s5/utils/lang/extend_lang.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#!/bin/bash
# Copyright 2018 Johns Hopkins University (Author: Daniel Povey);
# 2019 Dongji Gao

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand All @@ -18,6 +19,7 @@

# Begin configuration section.
sil_prob=0.5
silprob_file=
# end configuration section

echo "$0 $@" # Print the command line for logging
Expand All @@ -43,6 +45,7 @@ if [ $# -ne 3 ]; then
echo ""
echo "Options"
echo " --sil-prob <probability of silence> # default: 0.5 [must have 0 <= silprob < 1]"
echo " --silprob-file <file contains silence probability> # must be provided if lexicon is lexiconp_silprob.txt"
exit 1;
fi

Expand All @@ -52,6 +55,7 @@ dir=$3

[ -f path.sh ] && . ./path.sh


for f in $srcdir/phones.txt $lexicon; do
if [ ! -f $f ]; then
echo "$0: expected file $f to exist"
Expand Down Expand Up @@ -79,11 +83,26 @@ tmpdir=$dir/temp
rm -r $tmpdir 2>/dev/null
mkdir -p $tmpdir


# TODO: more checking.
if [ $(basename $lexicon) != lexiconp.txt ]; then
echo "$0: currently this script only supports the lexiconp.txt format; your lexicon"
echo " ... has to have that filename."
# Work out which lexicon format we were given from its file name.
# $silprob holds the command name "true" or "false"; later code runs it
# directly via `if "$silprob"; then`.
silprob=false
lexicon_name=$(basename "$lexicon")

if [ "$lexicon_name" = "lexiconp_silprob.txt" ]; then
  silprob=true
  if [ -z "$silprob_file" ]; then
    # No --silprob-file given: fall back to silprob.txt in the source lang dir.
    echo "silprob_file not provided, checking $srcdir"
    if [ -f "$srcdir/silprob.txt" ]; then
      silprob_file=$srcdir/silprob.txt
      echo "silprob_file found in $srcdir"
    else
      echo "silprob_file not found in $srcdir"
      exit 1
    fi
  elif [ ! -f "$silprob_file" ]; then
    echo "$silprob_file does not exist"
    exit 1
  fi
elif [ "$lexicon_name" != lexiconp.txt ]; then
  # NOTE(review): this only prints a message and carries on; confirm whether an
  # unsupported file name should abort the script instead.
  echo "$0: currently this script only supports the lexiconp.txt or lexiconp_silprob.txt format;"
  echo " ... your lexicon has to have that filename."
fi

# Get the list of extra words.
Expand All @@ -105,22 +124,45 @@ fi

if [ -f $dir/phones/word_boundary.txt ]; then
# was `if $position_dependent_phones; then..` in prepare_lang.sh
# TODO: add support for silprobs
perl -ane '@A=split(" ",$_); $w = shift @A; $p = shift @A; @A>0||die;
if(@A==1) { print "$w $p $A[0]_S\n"; } else { print "$w $p $A[0]_B ";
if "$silprob"; then
perl -ane '@A=split(" ",$_); $w = shift @A; $p = shift @A; $silword_p = shift @A;
$wordsil_f = shift @A; $wordnonsil_f = shift @A; @A>0||die;
if(@A==1) { print "$w $p $silword_p $wordsil_f $wordnonsil_f $A[0]_S\n"; }
else { print "$w $p $silword_p $wordsil_f $wordnonsil_f $A[0]_B ";
for($n=1;$n<@A-1;$n++) { print "$A[$n]_I "; } print "$A[$n]_E\n"; } ' \
< $lexicon > $tmpdir/lexiconp.txt || exit 1;
< $lexicon > $tmpdir/lexiconp_silprob.txt || exit 1; # abort if perl die()s on an entry with no phones, as the lexiconp.txt branch does
else
perl -ane '@A=split(" ",$_); $w = shift @A; $p = shift @A; @A>0||die;
if(@A==1) { print "$w $p $A[0]_S\n"; } else { print "$w $p $A[0]_B ";
for($n=1;$n<@A-1;$n++) { print "$A[$n]_I "; } print "$A[$n]_E\n"; } ' \
< $lexicon > $tmpdir/lexiconp.txt || exit 1;
fi
else
cp $lexicon $tmpdir/lexiconp.txt
if "$silprob"; then
cp $lexicon $tempdir/lexiconp_silprob.txt
else
cp $lexicon $tmpdir/lexiconp.txt
fi
fi

# Check that there are no unseen phones in the lexicon.
if ! utils/sym2int.pl -f 3- $srcdir/phones.txt $tmpdir/lexiconp.txt >/dev/null; then
echo "$0: it looks like there are unseen phones in your lexicon $lexicon"
exit 1
# Choose the lexicon to validate and the field where its phones begin:
# lexiconp_silprob.txt has five columns before the phones (word, pron-prob and
# three silprob columns), lexiconp.txt has two (word and pron-prob).
if "$silprob"; then
  first_phone_field=6
  lexicon_to_check=$tmpdir/lexiconp_silprob.txt
else
  first_phone_field=3
  lexicon_to_check=$tmpdir/lexiconp.txt
fi

# sym2int.pl fails if any phone is missing from phones.txt.
if ! utils/sym2int.pl -f ${first_phone_field}- $srcdir/phones.txt $lexicon_to_check >/dev/null; then
  echo "$0: it looks like there are unseen phones in your lexicon $lexicon"
  exit 1
fi

ndisambig=$(utils/add_lex_disambig.pl --pron-probs $tmpdir/lexiconp.txt $tmpdir/lexiconp_disambig.txt)
# Add disambiguation symbols to whichever lexicon is in use; the silprob
# lexicon additionally needs --sil-probs. The command's output is kept in
# ndisambig.
if "$silprob"; then
  disambig_opts="--pron-probs --sil-probs"
  lex_stem=lexiconp_silprob
else
  disambig_opts="--pron-probs"
  lex_stem=lexiconp
fi
ndisambig=$(utils/add_lex_disambig.pl $disambig_opts $tmpdir/${lex_stem}.txt $tmpdir/${lex_stem}_disambig.txt)

ndisambig=$[ndisambig+1] # Add one to disambiguate silence.

Expand All @@ -146,6 +188,15 @@ silphone=`cat $srcdir/phones/optional_silence.txt` || exit 1;
echo "but you may use the option --sil-prob 0.0 to stop it being used." ) && \
exit 1;

if "$silprob"; then
  # Derive a plain lexiconp.txt from the silprob lexicon by dropping
  # columns 3-5. Each kept field is written followed by a tab, and the
  # output line is terminated once the last input field has been seen.
  cat $tmpdir/lexiconp_silprob.txt |\
    awk '{
      for (col = 1; col <= NF; col++) {
        if (col < 3 || col > 5) printf("%s\t", $col);
        if (col == NF) print "";
      }
    }' > $tmpdir/lexiconp.txt
fi

# First remove pron-probs from the lexicon.
perl -ape 's/(\S+\s+)\S+\s+(.+)/$1$2/;' <$tmpdir/lexiconp.txt >$tmpdir/align_lexicon.txt
Expand All @@ -161,7 +212,7 @@ if [ -f $dir/phones/nonterminals.txt ]; then
for w in "#nonterm_begin" "#nonterm_end" $(cat $dir/phones/nonterminals.txt); do
echo $w $w # These are words without pronunciations, so leave those prons
# empty.
done >> $dir/phones/align_lexicon.txt
done >> $dir/phones/align_lexicon.txt
fi

# create phones/align_lexicon.int from phones/align_lexicon.txt
Expand All @@ -170,22 +221,39 @@ cat $dir/phones/align_lexicon.txt | utils/sym2int.pl -f 3- $dir/phones.txt | \

# Create the basic L.fst without disambiguation symbols, for use
# in training.

utils/lang/make_lexicon_fst.py $grammar_opts --sil-prob=$sil_prob --sil-phone=$silphone \
$tmpdir/lexiconp.txt | \
fstcompile --isymbols=$dir/phones.txt --osymbols=$dir/words.txt \
--keep_isymbols=false --keep_osymbols=false | \
fstarcsort --sort_type=olabel > $dir/L.fst || exit 1;
# Build $dir/L.fst from whichever lexicon format was supplied.
# Both branches feed the generated FST text through fstcompile (using
# $dir/phones.txt as input symbols and $dir/words.txt as output symbols) and
# then fstarcsort on output labels.
# NOTE(review): unless pipefail is enabled elsewhere (not visible here), the
# trailing `|| exit 1` only reacts to the exit status of the last pipeline stage.
if "$silprob"; then
# silprob lexicon: make_lexicon_fst_silprob.py takes the silprob file as a
# second positional argument and is not given --sil-prob.
utils/lang/make_lexicon_fst_silprob.py $grammar_opts --sil-phone=$silphone \
$tmpdir/lexiconp_silprob.txt $silprob_file | \
fstcompile --isymbols=$dir/phones.txt --osymbols=$dir/words.txt \
--keep_isymbols=false --keep_osymbols=false | \
fstarcsort --sort_type=olabel > $dir/L.fst || exit 1;
else
# plain lexiconp.txt: a single silence probability is passed via --sil-prob.
utils/lang/make_lexicon_fst.py $grammar_opts --sil-prob=$sil_prob --sil-phone=$silphone \
$tmpdir/lexiconp.txt | \
fstcompile --isymbols=$dir/phones.txt --osymbols=$dir/words.txt \
--keep_isymbols=false --keep_osymbols=false | \
fstarcsort --sort_type=olabel > $dir/L.fst || exit 1;
fi


# and create the version that has disambiguation symbols.
utils/lang/make_lexicon_fst.py $grammar_opts \
--sil-prob=$sil_prob --sil-phone=$silphone --sil-disambig='#'$ndisambig \
$tmpdir/lexiconp_disambig.txt | \
fstcompile --isymbols=$dir/phones.txt --osymbols=$dir/words.txt \
--keep_isymbols=false --keep_osymbols=false | \
fstaddselfloops $dir/phones/wdisambig_phones.int $dir/phones/wdisambig_words.int | \
fstarcsort --sort_type=olabel > $dir/L_disambig.fst || exit 1;
# Build $dir/L_disambig.fst from the lexicon that had disambiguation symbols
# added. Compared with the L.fst pipeline, this one passes
# --sil-disambig='#'$ndisambig to the lexicon-FST script and inserts an
# fstaddselfloops stage driven by phones/wdisambig_phones.int and
# phones/wdisambig_words.int.
# NOTE(review): unless pipefail is enabled elsewhere (not visible here), the
# trailing `|| exit 1` only reacts to the exit status of the last pipeline stage.
if "$silprob"; then
# silprob lexicon: the silprob file is the second positional argument and
# --sil-prob is not passed.
utils/lang/make_lexicon_fst_silprob.py $grammar_opts \
--sil-phone=$silphone --sil-disambig='#'$ndisambig \
$tmpdir/lexiconp_silprob_disambig.txt $silprob_file | \
fstcompile --isymbols=$dir/phones.txt --osymbols=$dir/words.txt \
--keep_isymbols=false --keep_osymbols=false | \
fstaddselfloops $dir/phones/wdisambig_phones.int $dir/phones/wdisambig_words.int | \
fstarcsort --sort_type=olabel > $dir/L_disambig.fst || exit 1;
else
# plain lexiconp.txt: a single silence probability is passed via --sil-prob.
utils/lang/make_lexicon_fst.py $grammar_opts \
--sil-prob=$sil_prob --sil-phone=$silphone --sil-disambig='#'$ndisambig \
$tmpdir/lexiconp_disambig.txt | \
fstcompile --isymbols=$dir/phones.txt --osymbols=$dir/words.txt \
--keep_isymbols=false --keep_osymbols=false | \
fstaddselfloops $dir/phones/wdisambig_phones.int $dir/phones/wdisambig_words.int | \
fstarcsort --sort_type=olabel > $dir/L_disambig.fst || exit 1;
fi


echo "$(basename $0): validating output directory"
Expand Down
1 change: 1 addition & 0 deletions egs/wsj/s5/utils/prepare_extended_lang.sh
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ word_list= # if a word list (mapping words from the srcdict to IDs) is provided,
# we'll make sure the IDs of these words are kept as before.
# end configuration sections

# Point users at the replacement script before doing any work.
echo "$0: warning: This script is now deprecated. You may want to use utils/lang/extend_lang.sh"
echo "$0 $@" # Print the command line for logging

. utils/parse_options.sh
Expand Down

0 comments on commit e87d013

Please sign in to comment.