Skip to content

Commit

Permalink
[egs] minor fixes to multi_en recipe (#2013)
Browse files Browse the repository at this point in the history
  • Loading branch information
xiaohui-zhang authored and danpovey committed Nov 14, 2017
1 parent 640a8e3 commit ac8adf2
Show file tree
Hide file tree
Showing 4 changed files with 15 additions and 18 deletions.
14 changes: 11 additions & 3 deletions egs/multi_en/s5/local/prepare_dict.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,15 @@ replace_cmudict_symbols=( "ah" "ah l" "ah n" )
. ./cmd.sh
. ./path.sh

# Check the number of command-line arguments (one required, one optional)
if [ $# -lt 1 ] || [ $# -gt 2 ]; then
echo "Usage: prepare_dict.sh /path/to/SWBD [/path/to/TEDLIUM_r2]"
exit 1;
fi

SWBD_DIR=$1
TEDLIUM_DIR=$2

# This function prints the lines of the second file that also appear in the first file
function filter_common {
awk 'NR==FNR{arr[$0]++;next} arr[$0] {print}' $1 $2
Expand All @@ -38,12 +47,11 @@ for i in "${replace_swbd_symbols[@]}"; do
done

# Prepare switchboard lexicon
local/swbd1_data_download.sh /export/corpora3/LDC/LDC97S62

local/swbd1_data_download.sh $SWBD_DIR
local/swbd1_prepare_dict.sh

# Prepare cmudict + tedlium lexicon
local/cmu_tedlium_prepare_dict.sh /export/corpora5/TEDLIUM_release2
local/cmu_tedlium_prepare_dict.sh $TEDLIUM_DIR

dir=data/local/dict_combined
swbd_dir=data/local/dict_swbd
Expand Down
2 changes: 1 addition & 1 deletion egs/multi_en/s5/local/swbd1_data_download.sh
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ mkdir -p $dir

# Audio data directory check
if [ ! -d $SWBD_DIR ]; then
echo "Error: run.sh requires a directory argument"
echo "Error: swbd1_data_download.sh requires a valid directory argument"
exit 1;
fi

Expand Down
2 changes: 2 additions & 0 deletions egs/multi_en/s5/local/swbd1_prepare_dict.sh
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,9 @@ srcdict=$srcdir/swb_ms98_transcriptions/sw-ms98-dict.text
# assume swbd_p1_data_prep.sh was done already.
[ ! -f "$srcdict" ] && echo "$0: No such file $srcdict" && exit 1;

rm -f $dir/lexicon0.txt
cp $srcdict $dir/lexicon0.txt || exit 1;
chmod +w $dir/lexicon0.txt
patch <local/swbd_dict.patch $dir/lexicon0.txt || exit 1;

#(2a) Dictionary preparation:
Expand Down
15 changes: 1 addition & 14 deletions egs/multi_en/s5/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ fi
# across all training transcripts.
if [ $stage -le 4 ]; then
# We prepare the dictionary in data/local/dict_combined.
local/prepare_dict.sh
local/prepare_dict.sh $swbd $tedlium2
local/g2p/train_g2p.sh --stage 0 --silence-phones "data/local/dict_combined/silence_phones.txt" data/local/dict_combined exp/g2p
dict_dir=data/local/dict_nosp
mkdir -p $dict_dir
Expand Down Expand Up @@ -148,19 +148,6 @@ fi

# fix and validate training data directories
if [ $stage -le 8 ]; then
# create segments file for wsj
awk '{print $1, $1, 0, -1}' data/wsj/train/utt2spk > data/wsj/train/segments
for f in `awk '{print $5}' data/wsj/train/wav.scp`; do
head -c 1024 $f | grep sample_count | awk '{print $3/16000}'
done > wsj_durations
paste -d' ' <(cut -d' ' -f1-3 data/wsj/train/segments) wsj_durations > wsj_segments
mv data/wsj/train/segments{,.bkp}
mv wsj_segments data/wsj/train/segments
rm -f wsj_segments wsj_durations
# create segments files for librispeech
for c in librispeech_100 librispeech_360 librispeech_500; do
awk '{print $1, $1, 0, $2}' data/$c/train/utt2dur > data/$c/train/segments;
done
# get rid of spk2gender files because not all corpora have them
rm -f data/*/train/spk2gender
# create reco2channel_and_file files for wsj and librispeech
Expand Down

0 comments on commit ac8adf2

Please sign in to comment.