-
Notifications
You must be signed in to change notification settings - Fork 5.3k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[egs] Add recipes for Speakers in the Wild (SITW) (#2422)
- Loading branch information
1 parent
7051384
commit 447e964
Showing
29 changed files
with
1,277 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
|
||
This directory (sitw) contains example scripts for the Speakers in the | ||
Wild (SITW) Speaker Recognition Challenge. The SITW corpus is required, | ||
and can be obtained by following the directions at the url | ||
http://www.speech.sri.com/projects/sitw/ | ||
|
||
Additional data sources (e.g., VoxCeleb and MUSAN) are required to train | ||
the systems in the subdirectories. See the corresponding README.txt files | ||
in the subdirectories for more details. | ||
|
||
Note: This recipe requires ffmpeg to be installed and its location included | ||
in $PATH. | ||
|
||
The subdirectories "v1" and so on are different speaker recognition | ||
recipes. The recipe in v1 is a traditional i-vector system while the v2 | ||
recipe uses DNN embeddings called x-vectors. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
|
||
This is a traditional i-vector recipe for Speakers in the Wild. The | ||
following datasets are used: | ||
|
||
Evaluation | ||
|
||
Speakers in the Wild http://www.speech.sri.com/projects/sitw | ||
|
||
System Development | ||
|
||
VoxCeleb 1 http://www.robots.ox.ac.uk/~vgg/data/voxceleb | ||
VoxCeleb 2 http://www.robots.ox.ac.uk/~vgg/data/voxceleb2 | ||
MUSAN http://www.openslr.org/17 | ||
RIR_NOISES http://www.openslr.org/28 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
# you can change cmd.sh depending on what type of queue you are using. | ||
# If you have no queueing system and want to run on a local machine, you | ||
# can change all instances of 'queue.pl' to 'run.pl' (but be careful and run | |
# commands one by one: most recipes will exhaust the memory on your | ||
# machine). queue.pl works with GridEngine (qsub). slurm.pl works | ||
# with slurm. Different queues are configured differently, with different | ||
# queue names and different ways of specifying things like memory; | ||
# to account for these differences you can create and edit the file | ||
# conf/queue.conf to match your queue's configuration. Search for | ||
# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, | ||
# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. | ||
|
||
export train_cmd="queue.pl --mem 4G" | ||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
--sample-frequency=16000 | ||
--frame-length=25 # the default is 25 | ||
--low-freq=20 # the default. | ||
--high-freq=7600 # the default is zero meaning use the Nyquist (8k in this case). | ||
--num-mel-bins=30 | ||
--num-ceps=24 | ||
--snip-edges=false |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
--vad-energy-threshold=5.5 | ||
--vad-energy-mean-scale=0.5 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,123 @@ | ||
#!/usr/bin/env python3 | ||
# Copyright 2015 David Snyder | ||
# 2018 Ewald Enzinger | ||
# Apache 2.0. | ||
# | ||
# Modified version of egs/sre16/v1/local/make_musan.py (commit e3fb7c4a0da4167f8c94b80f4d3cc5ab4d0e22e8). | ||
# This version uses the raw MUSAN audio files (16 kHz) and does not use sox to resample at 8 kHz. | ||
# | ||
# This file is meant to be invoked by make_musan.sh. | ||
|
||
import os, sys | ||
|
||
def process_music_annotations(path):
    """Parse one MUSAN music ANNOTATIONS file.

    Every non-blank line must start with four whitespace-separated fields:
    utterance ID, genres, vocals flag and musician. Any further fields are
    ignored.

    Args:
        path: Path of the ANNOTATIONS file to read.

    Returns:
        A tuple (utt2spk, utt2vocals). utt2spk maps each utterance ID to
        itself; utt2vocals maps each utterance ID to True when its vocals
        field is exactly "Y" and to False otherwise.

    Raises:
        ValueError: if a non-blank line has fewer than four fields.
    """
    utt2spk = {}
    utt2vocals = {}
    # The context manager closes the file even when a line fails to parse.
    with open(path, 'r') as annotations:
        for line in annotations:
            fields = line.split()
            if not fields:
                # Tolerate blank lines (e.g. a trailing empty line).
                continue
            utt, _genres, vocals, _musician = fields[:4]
            # For this application, the musician ID isn't important, so
            # every utterance acts as its own "speaker".
            utt2spk[utt] = utt
            utt2vocals[utt] = vocals == "Y"
    return utt2spk, utt2vocals
|
||
def prepare_music(root_dir, use_vocals):
    """Collect utt2spk / wav.scp entries for the music part of MUSAN.

    Walks <root_dir>/music, recording every *.wav file and parsing every
    file named ANNOTATIONS with process_music_annotations(). Only annotated
    utterances are considered for output; an annotated utterance without a
    wav file is reported as missing.

    Args:
        root_dir: Directory that contains the "music" subdirectory.
        use_vocals: When false, utterances annotated as containing vocals
            are left out of the returned strings.

    Returns:
        A tuple (utt2spk_str, utt2wav_str) of newline-terminated
        "<utt> <value>" lines.
    """
    utt2vocals = {}
    utt2spk = {}
    utt2wav = {}
    music_dir = os.path.join(root_dir, "music")
    for root, dirs, files in os.walk(music_dir):
        for file in files:
            file_path = os.path.join(root, file)
            if file.endswith(".wav"):
                # Strip only the trailing extension; a ".wav" elsewhere in
                # the name is part of the utterance ID.
                utt2wav[file[:-len(".wav")]] = file_path
            elif file == "ANNOTATIONS":
                utt2spk_part, utt2vocals_part = process_music_annotations(file_path)
                utt2spk.update(utt2spk_part)
                utt2vocals.update(utt2vocals_part)

    utt2spk_lines = []
    utt2wav_lines = []
    num_good_files = 0
    num_bad_files = 0
    for utt in utt2vocals:
        if utt in utt2wav:
            if use_vocals or not utt2vocals[utt]:
                utt2spk_lines.append(utt + " " + utt2spk[utt] + "\n")
                utt2wav_lines.append(utt + " " + utt2wav[utt] + "\n")
            # Counted whenever the wav exists, including utterances that
            # were filtered out because of vocals.
            num_good_files += 1
        else:
            print("Missing file", utt)
            num_bad_files += 1
    print("In music directory, processed", num_good_files, "files;", num_bad_files, "had missing wav data")
    return "".join(utt2spk_lines), "".join(utt2wav_lines)
|
||
def prepare_speech(root_dir):
    """Collect utt2spk / wav.scp entries for the speech part of MUSAN.

    Walks <root_dir>/speech and registers every *.wav file. The utterance
    ID is the file name without its ".wav" extension, and each utterance is
    its own speaker. If the same file name occurs in several directories,
    the path seen last wins.

    Args:
        root_dir: Directory that contains the "speech" subdirectory.

    Returns:
        A tuple (utt2spk_str, utt2wav_str) of newline-terminated
        "<utt> <utt>" and "<utt> <wav path>" lines. Both are empty strings
        when no wav file is found.
    """
    utt2wav = {}
    speech_dir = os.path.join(root_dir, "speech")
    for root, dirs, files in os.walk(speech_dir):
        for file in files:
            if file.endswith(".wav"):
                # Strip only the trailing extension; a ".wav" elsewhere in
                # the name is part of the utterance ID.
                utt2wav[file[:-len(".wav")]] = os.path.join(root, file)

    utt2spk_str = "".join(utt + " " + utt + "\n" for utt in utt2wav)
    utt2wav_str = "".join(utt + " " + path + "\n" for utt, path in utt2wav.items())
    # Every utterance comes from a wav file that was just seen, so none can
    # be missing; the zero count is kept so the log line keeps its format.
    num_good_files = len(utt2wav)
    num_bad_files = 0
    print("In speech directory, processed", num_good_files, "files;", num_bad_files, "had missing wav data")
    return utt2spk_str, utt2wav_str
|
||
def prepare_noise(root_dir):
    """Collect utt2spk / wav.scp entries for the noise part of MUSAN.

    Walks <root_dir>/noise and registers every *.wav file. The utterance ID
    is the file name without its ".wav" extension, and each utterance is
    its own speaker. If the same file name occurs in several directories,
    the path seen last wins.

    Args:
        root_dir: Directory that contains the "noise" subdirectory.

    Returns:
        A tuple (utt2spk_str, utt2wav_str) of newline-terminated
        "<utt> <utt>" and "<utt> <wav path>" lines. Both are empty strings
        when no wav file is found.
    """
    utt2wav = {}
    noise_dir = os.path.join(root_dir, "noise")
    for root, dirs, files in os.walk(noise_dir):
        for file in files:
            if file.endswith(".wav"):
                # Strip only the trailing extension; a ".wav" elsewhere in
                # the name is part of the utterance ID.
                utt2wav[file[:-len(".wav")]] = os.path.join(root, file)

    utt2spk_str = "".join(utt + " " + utt + "\n" for utt in utt2wav)
    utt2wav_str = "".join(utt + " " + path + "\n" for utt, path in utt2wav.items())
    # Every utterance comes from a wav file that was just seen, so none can
    # be missing; the zero count is kept so the log line keeps its format.
    num_good_files = len(utt2wav)
    num_bad_files = 0
    print("In noise directory, processed", num_good_files, "files;", num_bad_files, "had missing wav data")
    return utt2spk_str, utt2wav_str
|
||
def main():
    """Write wav.scp and utt2spk for the MUSAN corpus.

    Command-line arguments:
        1. MUSAN root directory (containing music/, speech/ and noise/).
        2. Output directory; it must already exist.
        3. "Y" to keep music with vocals; any other value drops it.

    The output lists speech entries first, then music, then noise.
    """
    if len(sys.argv) < 4:
        sys.stderr.write("Usage: " + sys.argv[0] + " <musan-dir> <out-dir> <use-vocals: Y|N>\n")
        sys.exit(1)
    in_dir = sys.argv[1]
    out_dir = sys.argv[2]
    use_vocals = sys.argv[3] == "Y"
    utt2spk_music, utt2wav_music = prepare_music(in_dir, use_vocals)
    utt2spk_speech, utt2wav_speech = prepare_speech(in_dir)
    utt2spk_noise, utt2wav_noise = prepare_noise(in_dir)
    utt2spk = utt2spk_speech + utt2spk_music + utt2spk_noise
    utt2wav = utt2wav_speech + utt2wav_music + utt2wav_noise
    # Context managers flush and close both files before the script returns,
    # so later pipeline steps never see a partially written file.
    with open(os.path.join(out_dir, "wav.scp"), 'w') as wav_fi:
        wav_fi.write(utt2wav)
    with open(os.path.join(out_dir, "utt2spk"), 'w') as utt2spk_fi:
        utt2spk_fi.write(utt2spk)


if __name__=="__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
#!/bin/bash | ||
# Copyright 2015 David Snyder | ||
# Apache 2.0. | ||
# | ||
# Copy of egs/sre16/v1/local/make_musan.sh (commit e3fb7c4a0da4167f8c94b80f4d3cc5ab4d0e22e8). | ||
# | ||
# This script, called by ../run.sh, creates the MUSAN | ||
# data directory. The required dataset is freely available at | ||
# http://www.openslr.org/17/ | ||
|
||
set -e

if [ $# -lt 2 ]; then
  echo "Usage: $0 <musan-dir> <data-dir>" >&2
  exit 1
fi

in_dir=$1
data_dir=$2
use_vocals='Y'
tmp_dir=local/musan.tmp

mkdir -p "$tmp_dir"
# Remove the scratch directory on every exit path, including failures
# triggered by 'set -e'.
trap 'rm -rf "$tmp_dir"' EXIT

echo "Preparing ${data_dir}/musan..."
mkdir -p "${data_dir}/musan"
local/make_musan.py "${in_dir}" "${data_dir}/musan" "${use_vocals}"

utils/fix_data_dir.sh "${data_dir}/musan"

# Split the combined directory into one subset per MUSAN part. The part is
# selected by matching its name anywhere in the utt2spk line.
for part in music speech noise; do
  grep "$part" "${data_dir}/musan/utt2spk" > "$tmp_dir/utt2spk_$part"
done

for part in music speech noise; do
  utils/subset_data_dir.sh --utt-list "$tmp_dir/utt2spk_$part" \
    "${data_dir}/musan" "${data_dir}/musan_$part"
done

for part in music speech noise; do
  utils/fix_data_dir.sh "${data_dir}/musan_$part"
done
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,86 @@ | ||
#!/bin/bash | ||
# Copyright 2017 Ignacio Viñals | |
# 2017-2018 David Snyder | ||
# | ||
# This script prepares the SITW data. It creates separate directories | ||
# for dev enroll, eval enroll, dev test, and eval test. It also prepares | ||
# multiple trials files, in the test directories, but we usually only use the | ||
# core-core.lst list. | ||
|
||
if [ $# != 2 ]; then
  echo "Usage: make_sitw.sh <SITW_PATH> <this_out_dir>"
  echo "E.g.: make_sitw.sh /export/corpora/SRI/sitw data"
  exit 1
fi

in_dir=$1
out_dir=$2

# convert_trials <key-file> <out-file>
# Strips every "audio/" and ".flac" from the key file, then writes the first
# two columns followed by "target" when the third column is "tgt" and
# "nontarget" otherwise.
convert_trials() {
  sed -e 's@audio/@@g' -e 's@\.flac@@g' "$1" |\
    awk '{if ($3=="tgt")
           {print $1,$2,"target"}
         else
           {print $1,$2,"nontarget"}
         }' > "$2"
}

# Prepare the enrollment data
for mode in dev eval; do
  this_out_dir=${out_dir}/sitw_${mode}_enroll
  mkdir -p "$this_out_dir" 2>/dev/null
  WAVFILE=$this_out_dir/wav.scp
  SPKFILE=$this_out_dir/utt2spk
  MODFILE=$this_out_dir/utt2cond
  # The files below are appended to, so start from a clean slate.
  rm -f "$WAVFILE" "$SPKFILE" "$MODFILE" 2>/dev/null
  this_in_dir=${in_dir}/$mode

  for enroll in core assist; do
    # Each list line is read as "<speaker-id> <relative flac path>".
    while read -r line; do
      # The wav id is the path component just before the final extension.
      wav_id=$(echo "$line" | awk '{print $2}' |\
        awk 'BEGIN{FS="[./]"}{print $(NF-1)}')
      spkr_id=$(echo "$line" | awk '{print $1}')
      WAV=$(echo "$line" | awk -v this_in_dir="$this_in_dir" '{print this_in_dir"/"$2}')
      echo "${spkr_id}_${wav_id} sox -t flac $WAV -t wav -r 16k -b 16 - channels 1 |" >> "$WAVFILE"
      echo "${spkr_id}_${wav_id} ${spkr_id}" >> "$SPKFILE"
      echo "${spkr_id}_${wav_id} $enroll $mode" >> "$MODFILE"
    done < "$this_in_dir/lists/enroll-${enroll}.lst"
  done
  utils/fix_data_dir.sh "$this_out_dir"
done

# Prepare the test data
for mode in dev eval; do
  this_out_dir=${out_dir}/sitw_${mode}_test
  mkdir -p "$this_out_dir" 2>/dev/null
  WAVFILE=$this_out_dir/wav.scp
  SPKFILE=$this_out_dir/utt2spk
  MODFILE=$this_out_dir/utt2cond
  rm -f "$WAVFILE" "$SPKFILE" "$MODFILE" 2>/dev/null
  mkdir -p "$this_out_dir/trials" 2>/dev/null
  mkdir -p "$this_out_dir/trials/aux" 2>/dev/null
  this_in_dir=${in_dir}/$mode

  for trial in core multi; do
    awk '{print $1,$2}' "$this_in_dir/lists/test-${trial}.lst" |\
      while read -r line; do
        wav_id=$(echo "$line" | awk 'BEGIN{FS="[./]"} {print $(NF-1)}')
        WAV=$(echo "$line" | awk -v this_in_dir="$this_in_dir" '{print this_in_dir"/"$1}')
        # Test utterances have no speaker label here; each is its own speaker.
        echo "${wav_id} sox -t flac $WAV -t wav -r 16k -b 16 - channels 1 |" >> "$WAVFILE"
        echo "${wav_id} ${wav_id}" >> "$SPKFILE"
        echo "${wav_id} $trial $mode" >> "$MODFILE"
      done
  done

  for trial in core-core core-multi assist-core assist-multi; do
    convert_trials "$this_in_dir/keys/$trial.lst" "$this_out_dir/trials/${trial}.lst"
  done

  for trial in "$this_in_dir"/keys/aux/* ; do
    trial_name=$(basename "$trial")
    convert_trials "$trial" "$this_out_dir/trials/aux/${trial_name}"
  done
  utils/fix_data_dir.sh "$this_out_dir"
done
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,84 @@ | ||
#!/usr/bin/perl | ||
# | ||
# Copyright 2018 Ewald Enzinger | ||
# 2018 David Snyder | ||
# | ||
# Usage: make_voxceleb1.pl /export/voxceleb1 data/ | ||
# Note that this script also downloads a list of speakers that overlap | ||
# with our evaluation set, SITW. These speakers are removed from VoxCeleb1 | ||
# prior to preparing the dataset. | ||
|
||
use strict;
use warnings;

# Builds the Kaldi data directory <path-to-data-dir>/voxceleb1 (wav.scp,
# utt2spk, spk2utt) from the wav files under <path-to-voxceleb1>/voxceleb1_wav.
# Speakers named in the downloaded SITW overlap list are left out.

if (@ARGV != 2) {
  print STDERR "Usage: $0 <path-to-voxceleb1> <path-to-data-dir>\n";
  print STDERR "e.g. $0 /export/voxceleb1 data/\n";
  exit(1);
}

my ($data_base, $out_root) = @ARGV;
my $out_dir = "$out_root/voxceleb1";
my $wav_root = "$data_base/voxceleb1_wav";

# List-form system() bypasses the shell, so paths with spaces or shell
# metacharacters are passed through unchanged.
if (system("mkdir", "-p", $out_dir) != 0) {
  die "Error making directory $out_dir";
}

# This file provides the list of speakers that overlap between SITW and VoxCeleb1.
my $overlap_file = "$out_dir/voxceleb1_sitw_overlap.txt";
my $overlap_url = "http://www.openslr.org/resources/49/voxceleb1_sitw_overlap.txt";
if (! -e $overlap_file) {
  if (system("wget", "-O", $overlap_file, $overlap_url) != 0) {
    # Remove whatever wget left behind; otherwise the next run would find the
    # file, skip the download, and keep the overlapping speakers.
    unlink $overlap_file;
    die "Error downloading $overlap_url to $overlap_file";
  }
}

# sitw_overlap contains the list of speakers that also exist in our evaluation set, SITW.
my %sitw_overlap = ();
open(my $overlap_fh, "<", $overlap_file) or die "Could not open the overlap file $overlap_file";
while (my $spkr_id = <$overlap_fh>) {
  chomp $spkr_id;
  $sitw_overlap{$spkr_id} = 1;
}
close($overlap_fh);

# Every subdirectory of the wav root (except . and ..) is one speaker.
opendir(my $dh, $wav_root) or die "Cannot open directory: $!";
my @spkr_dirs = grep { -d "$wav_root/$_" && ! /^\.{1,2}$/ } readdir($dh);
closedir $dh;

open(my $spkr_fh, ">", "$out_dir/utt2spk") or die "Could not open the output file $out_dir/utt2spk";
open(my $wav_fh, ">", "$out_dir/wav.scp") or die "Could not open the output file $out_dir/wav.scp";

foreach my $spkr_id (@spkr_dirs) {
  # Only keep the speaker if it isn't in the overlap list.
  next if exists $sitw_overlap{$spkr_id};

  opendir(my $spkr_dh, "$wav_root/$spkr_id/") or die "Cannot open directory: $!";
  my @wav_files = grep { /\.wav$/ } readdir($spkr_dh);
  closedir $spkr_dh;

  foreach my $wav_file (@wav_files) {
    (my $filename = $wav_file) =~ s/\.wav$//;
    # The file name is sliced by position: characters 0-10 are the recording
    # id, character 11 is skipped, and characters 12-18 are the segment.
    my $rec_id = substr($filename, 0, 11);
    my $segment = substr($filename, 12, 7);
    my $utt_id = "$spkr_id-$rec_id-$segment";
    my $wav = "$wav_root/$spkr_id/$filename.wav";
    print $wav_fh "$utt_id $wav\n";
    print $spkr_fh "$utt_id $spkr_id\n";
  }
}

# Buffered write errors only surface at close, so check it.
close($spkr_fh) or die;
close($wav_fh) or die;

# Derive spk2utt, then fix and validate the directory. This sequence is run
# once; repeating it gives no further change as long as fix_data_dir.sh is
# idempotent (NOTE(review): confirm against utils/fix_data_dir.sh).
if (system(
  "utils/utt2spk_to_spk2utt.pl $out_dir/utt2spk >$out_dir/spk2utt") != 0) {
  die "Error creating spk2utt file in directory $out_dir";
}
# The result of fix_data_dir.sh is deliberately not checked here; the
# validation step below decides whether the directory is usable.
system("env", "LC_COLLATE=C", "utils/fix_data_dir.sh", $out_dir);
if (system("env", "LC_COLLATE=C", "utils/validate_data_dir.sh", "--no-text", "--no-feats", $out_dir) != 0) {
  die "Error validating directory $out_dir";
}
Oops, something went wrong.