Skip to content

Commit

Permalink
[egs] Minor fixes to x-vector based recipes (kaldi-asr#2426)
Browse files Browse the repository at this point in the history
Add missing data prep scripts for MUSAN for callhome_diarization; Copy vad.scp and segments to *_cmn data folders after prepare_feats; Fix check before create_split_dir
  • Loading branch information
entn-at authored and danpovey committed May 18, 2018
1 parent 108832d commit 2ad8d78
Show file tree
Hide file tree
Showing 10 changed files with 168 additions and 8 deletions.
119 changes: 119 additions & 0 deletions egs/callhome_diarization/v1/local/make_musan.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
#!/usr/bin/env python3
# Copyright 2015 David Snyder
# Apache 2.0.
#
# This file is meant to be invoked by make_musan.sh.

import os, sys

def process_music_annotations(path):
    """Parse a MUSAN music ANNOTATIONS file.

    Every non-blank line is expected to start with four
    whitespace-separated fields: utterance ID, genres, a vocals flag and a
    musician ID.  Any further fields on the line are ignored.

    Args:
        path: Path of the ANNOTATIONS file to read.

    Returns:
        A tuple ``(utt2spk, utt2vocals)``.  ``utt2spk`` maps each utterance
        ID to itself; ``utt2vocals`` maps each utterance ID to ``True`` when
        its vocals flag is exactly "Y" and ``False`` otherwise.

    Raises:
        ValueError: If a non-blank line has fewer than four fields.
    """
    utt2spk = {}
    utt2vocals = {}
    # The context manager closes the file even if a malformed line raises.
    with open(path, 'r') as annotations:
        for line in annotations:
            fields = line.split()
            if not fields:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            utt, _genres, vocals, _musician = fields[:4]
            # For this application, the musician ID isn't important, so
            # every utterance is treated as its own speaker.
            utt2spk[utt] = utt
            utt2vocals[utt] = vocals == "Y"
    return utt2spk, utt2vocals

def prepare_music(root_dir, use_vocals):
    """Build the utt2spk and wav.scp entries for the music part of MUSAN.

    Walks ``<root_dir>/music``, collecting every ``*.wav`` file and the
    contents of every file named ``ANNOTATIONS``.  Only utterances listed in
    an ANNOTATIONS file are considered; an annotated utterance with no
    matching wav file is reported on stdout as missing.

    Args:
        root_dir: Root directory of the MUSAN corpus.
        use_vocals: When false, utterances whose annotation marks them as
            containing vocals are left out of the output.

    Returns:
        A tuple ``(utt2spk_str, utt2wav_str)`` of newline-terminated text.
        Each utt2spk line is ``"<utt> <spk>"``; each wav line is
        ``"<utt> sox -t wav <path> -r 8k -t wav - |"``.  Lines are sorted by
        utterance ID so the result does not depend on directory walk order.
    """
    wav_suffix = ".wav"
    utt2vocals = {}
    utt2spk = {}
    utt2wav = {}
    music_dir = os.path.join(root_dir, "music")
    for root, _dirs, files in os.walk(music_dir):
        for name in files:
            file_path = os.path.join(root, name)
            if name.endswith(wav_suffix):
                # Strip only the trailing extension; str.replace would also
                # remove any ".wav" occurring earlier in the file name.
                utt2wav[name[:-len(wav_suffix)]] = file_path
            elif name == "ANNOTATIONS":
                spk_part, vocals_part = process_music_annotations(file_path)
                utt2spk.update(spk_part)
                utt2vocals.update(vocals_part)

    # Collect lines in lists and join once instead of repeated string
    # concatenation, which is quadratic in the number of utterances.
    utt2spk_lines = []
    utt2wav_lines = []
    num_good_files = 0
    num_bad_files = 0
    for utt in sorted(utt2vocals):
        if utt not in utt2wav:
            print("Missing file", utt)
            num_bad_files += 1
            continue
        if use_vocals or not utt2vocals[utt]:
            utt2spk_lines.append(utt + " " + utt2spk[utt] + "\n")
            utt2wav_lines.append(
                utt + " sox -t wav " + utt2wav[utt] + " -r 8k -t wav - |\n")
        # Counted for every annotated utterance that has wav data, whether
        # or not the vocals filter kept it.
        num_good_files += 1
    print("In music directory, processed", num_good_files, "files;",
          num_bad_files, "had missing wav data")
    return "".join(utt2spk_lines), "".join(utt2wav_lines)

def prepare_speech(root_dir):
    """Build the utt2spk and wav.scp entries for the speech part of MUSAN.

    Walks ``<root_dir>/speech`` and emits one entry per ``*.wav`` file.  The
    utterance ID is the file name without its ``.wav`` extension, and each
    utterance is its own speaker.  If two files share a name, the one
    visited last by the directory walk wins.

    Args:
        root_dir: Root directory of the MUSAN corpus.

    Returns:
        A tuple ``(utt2spk_str, utt2wav_str)`` of newline-terminated text.
        Each utt2spk line is ``"<utt> <utt>"``; each wav line is
        ``"<utt> sox -t wav <path> -r 8k -t wav - |"``.  Lines are sorted by
        utterance ID so the result does not depend on directory walk order.
    """
    wav_suffix = ".wav"
    utt2wav = {}
    speech_dir = os.path.join(root_dir, "speech")
    for root, _dirs, files in os.walk(speech_dir):
        for name in files:
            if name.endswith(wav_suffix):
                # Strip only the trailing extension; str.replace would also
                # remove any ".wav" occurring earlier in the file name.
                utt2wav[name[:-len(wav_suffix)]] = os.path.join(root, name)

    utts = sorted(utt2wav)
    # Join once instead of growing a string inside the loop.
    utt2spk_str = "".join(utt + " " + utt + "\n" for utt in utts)
    utt2wav_str = "".join(
        utt + " sox -t wav " + utt2wav[utt] + " -r 8k -t wav - |\n"
        for utt in utts)
    # Every utterance comes from a wav file found on disk, so none can be
    # missing; the zero keeps the log line in the same format as for music.
    print("In speech directory, processed", len(utts), "files;", 0,
          "had missing wav data")
    return utt2spk_str, utt2wav_str

def prepare_noise(root_dir):
    """Build the utt2spk and wav.scp entries for the noise part of MUSAN.

    Walks ``<root_dir>/noise`` and emits one entry per ``*.wav`` file.  The
    utterance ID is the file name without its ``.wav`` extension, and each
    utterance is its own speaker.  If two files share a name, the one
    visited last by the directory walk wins.

    Args:
        root_dir: Root directory of the MUSAN corpus.

    Returns:
        A tuple ``(utt2spk_str, utt2wav_str)`` of newline-terminated text.
        Each utt2spk line is ``"<utt> <utt>"``; each wav line is
        ``"<utt> sox -t wav <path> -r 8k -t wav - |"``.  Lines are sorted by
        utterance ID so the result does not depend on directory walk order.
    """
    wav_suffix = ".wav"
    utt2wav = {}
    noise_dir = os.path.join(root_dir, "noise")
    for root, _dirs, files in os.walk(noise_dir):
        for name in files:
            if name.endswith(wav_suffix):
                # Strip only the trailing extension; str.replace would also
                # remove any ".wav" occurring earlier in the file name.
                utt2wav[name[:-len(wav_suffix)]] = os.path.join(root, name)

    utts = sorted(utt2wav)
    # Join once instead of growing a string inside the loop.
    utt2spk_str = "".join(utt + " " + utt + "\n" for utt in utts)
    utt2wav_str = "".join(
        utt + " sox -t wav " + utt2wav[utt] + " -r 8k -t wav - |\n"
        for utt in utts)
    # Every utterance comes from a wav file found on disk, so none can be
    # missing; the zero keeps the log line in the same format as for music.
    print("In noise directory, processed", len(utts), "files;", 0,
          "had missing wav data")
    return utt2spk_str, utt2wav_str

def main():
    """Write wav.scp and utt2spk for MUSAN into an output directory.

    Expects three command-line arguments: the MUSAN root directory, the
    output directory (which must already exist) and a vocals flag.  Music
    with vocals is included only when the flag is exactly "Y".

    The speech, music and noise entries are concatenated in that order and
    written to ``<out_dir>/wav.scp`` and ``<out_dir>/utt2spk``.
    """
    if len(sys.argv) < 4:
        # Fail with a readable message rather than a bare IndexError.
        sys.exit("Usage: " + sys.argv[0]
                 + " <musan-root-dir> <out-dir> <use-vocals: Y or N>")
    in_dir = sys.argv[1]
    out_dir = sys.argv[2]
    use_vocals = sys.argv[3] == "Y"
    utt2spk_music, utt2wav_music = prepare_music(in_dir, use_vocals)
    utt2spk_speech, utt2wav_speech = prepare_speech(in_dir)
    utt2spk_noise, utt2wav_noise = prepare_noise(in_dir)
    utt2spk = utt2spk_speech + utt2spk_music + utt2spk_noise
    utt2wav = utt2wav_speech + utt2wav_music + utt2wav_noise
    # Context managers make sure both files are flushed and closed.
    with open(os.path.join(out_dir, "wav.scp"), 'w') as wav_fi:
        wav_fi.write(utt2wav)
    with open(os.path.join(out_dir, "utt2spk"), 'w') as utt2spk_fi:
        utt2spk_fi.write(utt2spk)


# Run the data preparation only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
37 changes: 37 additions & 0 deletions egs/callhome_diarization/v1/local/make_musan.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
#!/bin/bash
# Copyright 2015 David Snyder
# Apache 2.0.
#
# This script, called by ../run.sh, creates the MUSAN
# data directory. The required dataset is freely available at
# http://www.openslr.org/17/

set -e

# Without this check an empty data_dir would make the script operate on
# "/musan" at the filesystem root.
if [ $# -lt 2 ]; then
  echo "Usage: $0 <musan-root-dir> <data-dir>" >&2
  exit 1
fi

in_dir=$1
data_dir=$2
use_vocals='Y'
# Scratch directory for the per-category utterance lists.
tmp_dir=local/musan.tmp

mkdir -p "${tmp_dir}"

echo "Preparing ${data_dir}/musan..."
mkdir -p "${data_dir}/musan"
# Writes wav.scp and utt2spk for the whole corpus into ${data_dir}/musan.
local/make_musan.py "${in_dir}" "${data_dir}/musan" "${use_vocals}"

utils/fix_data_dir.sh "${data_dir}/musan"

# Split the combined directory into music, speech and noise subsets by
# selecting the utt2spk lines that contain the category name.
for category in music speech noise; do
  grep "${category}" "${data_dir}/musan/utt2spk" > "${tmp_dir}/utt2spk_${category}"
done

for category in music speech noise; do
  utils/subset_data_dir.sh --utt-list "${tmp_dir}/utt2spk_${category}" \
    "${data_dir}/musan" "${data_dir}/musan_${category}"
done

for category in music speech noise; do
  utils/fix_data_dir.sh "${data_dir}/musan_${category}"
done

rm -rf "${tmp_dir}"

Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ mkdir -p $dir/log
mkdir -p $data_out
featdir=$(utils/make_absolute.sh $dir)

if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then
if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $featdir/storage ]; then
utils/create_split_dir.pl \
/export/b{14,15,16,17}/$USER/kaldi-data/egs/callhome_diarization/v2/xvector-$(date +'%m_%d_%H_%M')/xvector_cmvn_feats/storage $featdir/storage
fi
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ mkdir -p $dir/log
mkdir -p $data_out
featdir=$(utils/make_absolute.sh $dir)

if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then
if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $featdir/storage ]; then
utils/create_split_dir.pl \
/export/b{14,15,16,17}/$USER/kaldi-data/egs/callhome_diarization/v2/xvector-$(date +'%m_%d_%H_%M')/xvector_feats/storage $featdir/storage
fi
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ num_pdfs=$(awk '{print $2}' $data/utt2spk | sort | uniq -c | wc -l)
if [ $stage -le 4 ]; then
echo "$0: Getting neural network training egs";
# dump egs.
if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $egs_dir/storage ]; then
utils/create_split_dir.pl \
/export/b{03,04,05,06}/$USER/kaldi-data/egs/callhome_diarization/v2/xvector-$(date +'%m_%d_%H_%M')/$egs_dir/storage $egs_dir/storage
fi
Expand Down
6 changes: 5 additions & 1 deletion egs/callhome_diarization/v2/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,11 @@ if [ $stage -le 1 ]; then
for name in sre callhome1 callhome2; do
local/nnet3/xvector/prepare_feats.sh --nj 40 --cmd "$train_cmd" \
data/$name data/${name}_cmn exp/${name}_cmn
utils/fix_data_dir.sh data/${name}_cmn
cp data/$name/vad.scp data/${name}_cmn/
if [ -f data/$name/segments ]; then
cp data/$name/segments data/${name}_cmn/
fi
utils/fix_data_dir.sh data/${name}_cmn
done

echo "0.01" > data/sre_cmn/frame_shift
Expand Down
2 changes: 1 addition & 1 deletion egs/sre16/v1/local/nnet3/xvector/prepare_feats_for_egs.sh
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ mkdir -p $dir/log
mkdir -p $data_out
featdir=$(utils/make_absolute.sh $dir)

if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then
if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $featdir/storage ]; then
utils/create_split_dir.pl \
/export/b{14,15,16,17}/$USER/kaldi-data/egs/sre16/v2/xvector-$(date +'%m_%d_%H_%M')/xvector_feats/storage $featdir/storage
fi
Expand Down
2 changes: 1 addition & 1 deletion egs/sre16/v1/local/nnet3/xvector/tuning/run_xvector_1a.sh
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ num_pdfs=$(awk '{print $2}' $data/utt2spk | sort | uniq -c | wc -l)
if [ $stage -le 4 ]; then
echo "$0: Getting neural network training egs";
# dump egs.
if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $egs_dir/storage ]; then
utils/create_split_dir.pl \
/export/b{03,04,05,06}/$USER/kaldi-data/egs/sre16/v2/xvector-$(date +'%m_%d_%H_%M')/$egs_dir/storage $egs_dir/storage
fi
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ mkdir -p $dir/log
mkdir -p $data_out
featdir=$(utils/make_absolute.sh $dir)

if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then
if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $featdir/storage ]; then
utils/create_split_dir.pl \
/export/b{14,15,16,17}/$USER/kaldi-data/egs/voxceleb2/v2/xvector-$(date +'%m_%d_%H_%M')/xvector_feats/storage $featdir/storage
fi
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ num_pdfs=$(awk '{print $2}' $data/utt2spk | sort | uniq -c | wc -l)
if [ $stage -le 6 ]; then
echo "$0: Getting neural network training egs";
# dump egs.
if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $egs_dir/storage ]; then
utils/create_split_dir.pl \
/export/b{03,04,05,06}/$USER/kaldi-data/egs/voxceleb2/v2/xvector-$(date +'%m_%d_%H_%M')/$egs_dir/storage $egs_dir/storage
fi
Expand Down

0 comments on commit 2ad8d78

Please sign in to comment.