[egs] Add recipes for Speakers in the Wild (SITW) (#2422)

kaldi-asr · May 24, 2018 · 447e964 · 447e964
1 parent 7051384
commit 447e964
Show file tree

Hide file tree

Showing 29 changed files with 1,277 additions and 1 deletion.
diff --git a/egs/sitw/README.txt b/egs/sitw/README.txt
@@ -0,0 +1,16 @@
+
+ This directory (sitw) contains example scripts for the Speakers in the
+ Wild (SITW) Speaker Recognition Challenge.  The SITW corpus is required,
+ and can be obtained by following the directions at the url
+ http://www.speech.sri.com/projects/sitw/
+
+ Additional data sources (e.g., VoxCeleb and MUSAN) are required to train
+ the systems in the subdirectories.  See the corresponding README.txt files
+ in the subdirectories for more details. 
+
+ Note: This recipe requires ffmpeg to be installed and its location included
+ in $PATH.
+
+ The subdirectories "v1" and so on are different speaker recognition
+ recipes. The recipe in v1 is a traditional i-vector system while the v2
+ recipe uses DNN embeddings called x-vectors.
diff --git a/egs/sitw/v1/README.txt b/egs/sitw/v1/README.txt
@@ -0,0 +1,14 @@
+
+ This is a traditional i-vector recipe for Speakers in the Wild.  The
+ following datasets are used:
+
+ Evaluation
+
+     Speakers in the Wild    http://www.speech.sri.com/projects/sitw
+
+ System Development
+
+     VoxCeleb 1              http://www.robots.ox.ac.uk/~vgg/data/voxceleb
+     VoxCeleb 2              http://www.robots.ox.ac.uk/~vgg/data/voxceleb2
+     MUSAN                   http://www.openslr.org/17
+     RIR_NOISES              http://www.openslr.org/28
diff --git a/egs/sitw/v1/cmd.sh b/egs/sitw/v1/cmd.sh
@@ -0,0 +1,15 @@
+# you can change cmd.sh depending on what type of queue you are using.
+# If you have no queueing system and want to run on a local machine, you
+# can change all instances of 'queue.pl' to 'run.pl' (but be careful and run
+# commands one by one: most recipes will exhaust the memory on your
+# machine).  queue.pl works with GridEngine (qsub).  slurm.pl works
+# with slurm.  Different queues are configured differently, with different
+# queue names and different ways of specifying things like memory;
+# to account for these differences you can create and edit the file
+# conf/queue.conf to match your queue's configuration.  Search for
+# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information,
+# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl.
+
+export train_cmd="queue.pl --mem 4G"
+
+
diff --git a/egs/sitw/v1/conf/mfcc.conf b/egs/sitw/v1/conf/mfcc.conf
@@ -0,0 +1,7 @@
+--sample-frequency=16000
+--frame-length=25 # the default is 25
+--low-freq=20 # the default.
+--high-freq=7600 # the default is zero meaning use the Nyquist (8k in this case).
+--num-mel-bins=30
+--num-ceps=24
+--snip-edges=false
diff --git a/egs/sitw/v1/conf/vad.conf b/egs/sitw/v1/conf/vad.conf
@@ -0,0 +1,2 @@
+--vad-energy-threshold=5.5
+--vad-energy-mean-scale=0.5
diff --git a/egs/sitw/v1/local/make_musan.py b/egs/sitw/v1/local/make_musan.py
@@ -0,0 +1,123 @@
+#!/usr/bin/env python3
+# Copyright 2015   David Snyder
+#           2018   Ewald Enzinger
+# Apache 2.0.
+#
+# Modified version of egs/sre16/v1/local/make_musan.py (commit e3fb7c4a0da4167f8c94b80f4d3cc5ab4d0e22e8).
+# This version uses the raw MUSAN audio files (16 kHz) and does not use sox to resample at 8 kHz.
+#
+# This file is meant to be invoked by make_musan.sh.
+
+import os, sys
+
+def process_music_annotations(path):
+  utt2spk = {}
+  utt2vocals = {}
+  lines = open(path, 'r').readlines()
+  for line in lines:
+    utt, genres, vocals, musician = line.rstrip().split()[:4]
+    # For this application, the musican ID isn't important
+    utt2spk[utt] = utt
+    utt2vocals[utt] = vocals == "Y"
+  return utt2spk, utt2vocals
+
+def prepare_music(root_dir, use_vocals):
+  utt2vocals = {}
+  utt2spk = {}
+  utt2wav = {}
+  num_good_files = 0
+  num_bad_files = 0
+  music_dir = os.path.join(root_dir, "music")
+  for root, dirs, files in os.walk(music_dir):
+    for file in files:
+      file_path = os.path.join(root, file)
+      if file.endswith(".wav"):
+        utt = str(file).replace(".wav", "")
+        utt2wav[utt] = file_path
+      elif str(file) == "ANNOTATIONS":
+        utt2spk_part, utt2vocals_part = process_music_annotations(file_path)
+        utt2spk.update(utt2spk_part)
+        utt2vocals.update(utt2vocals_part)
+  utt2spk_str = ""
+  utt2wav_str = ""
+  for utt in utt2vocals:
+    if utt in utt2wav:
+      if use_vocals or not utt2vocals[utt]:
+        utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n"
+        utt2wav_str = utt2wav_str + utt + " " + utt2wav[utt] + "\n"
+      num_good_files += 1
+    else:
+      print("Missing file", utt)
+      num_bad_files += 1
+  print("In music directory, processed", num_good_files, "files;", num_bad_files, "had missing wav data")
+  return utt2spk_str, utt2wav_str
+
+def prepare_speech(root_dir):
+  utt2spk = {}
+  utt2wav = {}
+  num_good_files = 0
+  num_bad_files = 0
+  speech_dir = os.path.join(root_dir, "speech")
+  for root, dirs, files in os.walk(speech_dir):
+    for file in files:
+      file_path = os.path.join(root, file)
+      if file.endswith(".wav"):
+        utt = str(file).replace(".wav", "")
+        utt2wav[utt] = file_path
+        utt2spk[utt] = utt
+  utt2spk_str = ""
+  utt2wav_str = ""
+  for utt in utt2spk:
+    if utt in utt2wav:
+      utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n"
+      utt2wav_str = utt2wav_str + utt + " " + utt2wav[utt] + "\n"
+      num_good_files += 1
+    else:
+      print("Missing file", utt)
+      num_bad_files += 1
+  print("In speech directory, processed", num_good_files, "files;", num_bad_files, "had missing wav data")
+  return utt2spk_str, utt2wav_str
+
+def prepare_noise(root_dir):
+  utt2spk = {}
+  utt2wav = {}
+  num_good_files = 0
+  num_bad_files = 0
+  noise_dir = os.path.join(root_dir, "noise")
+  for root, dirs, files in os.walk(noise_dir):
+    for file in files:
+      file_path = os.path.join(root, file)
+      if file.endswith(".wav"):
+        utt = str(file).replace(".wav", "")
+        utt2wav[utt] = file_path
+        utt2spk[utt] = utt
+  utt2spk_str = ""
+  utt2wav_str = ""
+  for utt in utt2spk:
+    if utt in utt2wav:
+      utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n"
+      utt2wav_str = utt2wav_str + utt + " " + utt2wav[utt] + "\n"
+      num_good_files += 1
+    else:
+      print("Missing file", utt)
+      num_bad_files += 1
+  print("In noise directory, processed", num_good_files, "files;", num_bad_files, "had missing wav data")
+  return utt2spk_str, utt2wav_str
+
+def main():
+  in_dir = sys.argv[1]
+  out_dir = sys.argv[2]
+  use_vocals = sys.argv[3] == "Y"
+  utt2spk_music, utt2wav_music = prepare_music(in_dir, use_vocals)
+  utt2spk_speech, utt2wav_speech = prepare_speech(in_dir)
+  utt2spk_noise, utt2wav_noise = prepare_noise(in_dir)
+  utt2spk = utt2spk_speech + utt2spk_music + utt2spk_noise
+  utt2wav = utt2wav_speech + utt2wav_music + utt2wav_noise
+  wav_fi = open(os.path.join(out_dir, "wav.scp"), 'w')
+  wav_fi.write(utt2wav)
+  utt2spk_fi = open(os.path.join(out_dir, "utt2spk"), 'w')
+  utt2spk_fi.write(utt2spk)
+
+
+if __name__=="__main__":
+  main()
diff --git a/egs/sitw/v1/local/make_musan.sh b/egs/sitw/v1/local/make_musan.sh
@@ -0,0 +1,39 @@
+#!/bin/bash
+# Copyright 2015   David Snyder
+# Apache 2.0.
+#
+# Copy of egs/sre16/v1/local/make_musan.sh (commit e3fb7c4a0da4167f8c94b80f4d3cc5ab4d0e22e8).
+#
+# This script, called by ../run.sh, creates the MUSAN
+# data directory. The required dataset is freely available at
+#   http://www.openslr.org/17/
+
+set -e
+in_dir=$1
+data_dir=$2
+use_vocals='Y'
+
+mkdir -p local/musan.tmp
+
+echo "Preparing ${data_dir}/musan..."
+mkdir -p ${data_dir}/musan
+local/make_musan.py ${in_dir} ${data_dir}/musan ${use_vocals}
+
+utils/fix_data_dir.sh ${data_dir}/musan
+
+grep "music" ${data_dir}/musan/utt2spk > local/musan.tmp/utt2spk_music
+grep "speech" ${data_dir}/musan/utt2spk > local/musan.tmp/utt2spk_speech
+grep "noise" ${data_dir}/musan/utt2spk > local/musan.tmp/utt2spk_noise
+utils/subset_data_dir.sh --utt-list local/musan.tmp/utt2spk_music \
+  ${data_dir}/musan ${data_dir}/musan_music
+utils/subset_data_dir.sh --utt-list local/musan.tmp/utt2spk_speech \
+  ${data_dir}/musan ${data_dir}/musan_speech
+utils/subset_data_dir.sh --utt-list local/musan.tmp/utt2spk_noise \
+  ${data_dir}/musan ${data_dir}/musan_noise
+
+utils/fix_data_dir.sh ${data_dir}/musan_music
+utils/fix_data_dir.sh ${data_dir}/musan_speech
+utils/fix_data_dir.sh ${data_dir}/musan_noise
+
+rm -rf local/musan.tmp
+
diff --git a/egs/sitw/v1/local/make_sitw.sh b/egs/sitw/v1/local/make_sitw.sh
@@ -0,0 +1,86 @@
+#!/bin/bash
+# Copyright      2017  Ignacio Viñals
+#           2017-2018  David Snyder
+#
+# This script prepares the SITW data.  It creates separate directories
+# for dev enroll, eval enroll, dev test, and eval test.  It also prepares
+# multiple trials files, in the test directories, but we usually only use the
+# core-core.lst list.
+
+if [  $# != 2 ]; then
+    echo "Usage: make_sitw.sh <SITW_PATH> <this_out_dir>"
+    echo "E.g.: make_sitw.sh /export/corpora/SRI/sitw data"
+    exit 1
+fi
+
+in_dir=$1
+out_dir=$2
+
+# Prepare the enrollment data
+for mode in dev eval; do
+  this_out_dir=${out_dir}/sitw_${mode}_enroll
+  mkdir -p $this_out_dir 2>/dev/null
+  WAVFILE=$this_out_dir/wav.scp
+  SPKFILE=$this_out_dir/utt2spk
+  MODFILE=$this_out_dir/utt2cond
+  # The loops below append, so start from empty files.
+  rm -f $WAVFILE $SPKFILE $MODFILE
+  this_in_dir=${in_dir}/$mode
+
+  for enroll in core assist; do
+    # Each list line is "<speaker-id> <relative-flac-path> ...".
+    cat $this_in_dir/lists/enroll-${enroll}.lst | \
+    while read -r line; do
+      # The wav ID is the path component just before the file extension.
+      wav_id=`echo $line| awk '{print $2}' |\
+        awk 'BEGIN{FS="[./]"}{print $(NF-1)}'`
+      spkr_id=`echo $line| awk '{print $1}'`
+      WAV=`echo $line | awk '{print this_in_dir"/"$2}' this_in_dir=$this_in_dir`
+      echo "${spkr_id}_${wav_id} sox -t flac $WAV -t wav -r 16k -b 16 - channels 1 |" >> $WAVFILE
+      echo "${spkr_id}_${wav_id} ${spkr_id}" >> $SPKFILE
+      echo "${spkr_id}_${wav_id} $enroll $mode" >> $MODFILE
+    done
+  done
+  utils/fix_data_dir.sh $this_out_dir
+done
+
+# Prepare the test data
+for mode in dev eval; do
+  this_out_dir=${out_dir}/sitw_${mode}_test
+  mkdir -p $this_out_dir 2>/dev/null
+  WAVFILE=$this_out_dir/wav.scp
+  SPKFILE=$this_out_dir/utt2spk
+  MODFILE=$this_out_dir/utt2cond
+  # The loops below append, so start from empty files.
+  rm -f $WAVFILE $SPKFILE $MODFILE
+  mkdir -p $this_out_dir/trials 2>/dev/null
+  mkdir -p $this_out_dir/trials/aux 2>/dev/null
+  this_in_dir=${in_dir}/$mode
+
+  for trial in core multi; do
+    cat $this_in_dir/lists/test-${trial}.lst | awk '{print $1,$2}' |\
+    while read -r line; do
+      # The wav ID is the path component just before the file extension.
+      wav_id=`echo $line | awk 'BEGIN{FS="[./]"} {print $(NF-1)}'`
+      WAV=`echo $line | awk '{print this_in_dir"/"$1}' this_in_dir=$this_in_dir`
+      echo "${wav_id} sox -t flac $WAV -t wav -r 16k -b 16 - channels 1 |" >> $WAVFILE
+      echo "${wav_id} ${wav_id}" >> $SPKFILE
+      echo "${wav_id} $trial $mode" >> $MODFILE
+    done
+  done
+
+  # Convert the key files to trials files: drop the "audio/" prefix and the
+  # ".flac" extension, and map "tgt" to "target" and anything else to "nontarget".
+  # The dot is escaped so only a literal ".flac" is removed; unescaped it
+  # would also eat IDs that merely end in "flac" preceded by any character.
+  for trial in core-core core-multi assist-core assist-multi; do
+    cat $this_in_dir/keys/$trial.lst | sed -e 's@audio/@@g' -e 's@\.flac@@g' |\
+    awk '{if ($3=="tgt")
+           {print $1,$2,"target"}
+         else
+           {print $1,$2,"nontarget"}
+         }'   > $this_out_dir/trials/${trial}.lst
+  done
+
+  for trial in $this_in_dir/keys/aux/* ; do
+    trial_name=`basename $trial`
+    cat $trial | sed -e 's@audio/@@g' -e 's@\.flac@@g' |\
+    awk '{if ($3=="tgt")
+           {print $1,$2,"target"}
+         else
+           {print $1,$2,"nontarget"}
+     }'   > $this_out_dir/trials/aux/${trial_name}
+  done
+  utils/fix_data_dir.sh $this_out_dir
+done
diff --git a/egs/sitw/v1/local/make_voxceleb1.pl b/egs/sitw/v1/local/make_voxceleb1.pl
@@ -0,0 +1,84 @@
+#!/usr/bin/perl
+#
+# Copyright 2018  Ewald Enzinger
+#           2018  David Snyder
+#
+# Usage: make_voxceleb1.pl /export/voxceleb1 data/
+# Note that this script also downloads a list of speakers that overlap
+# with our evaluation set, SITW.  These speakers are removed from VoxCeleb1
+# prior to preparing the dataset.
+
+use strict;
+use warnings;
+
+if (@ARGV != 2) {
+  print STDERR "Usage: $0 <path-to-voxceleb1> <path-to-data-dir>\n";
+  print STDERR "e.g. $0 /export/voxceleb1 data/\n";
+  exit(1);
+}
+
+my ($data_base, $data_dir) = @ARGV;
+my $out_dir = "$data_dir/voxceleb1";
+
+# List form of system() bypasses the shell, so paths are passed through verbatim.
+if (system("mkdir", "-p", $out_dir) != 0) {
+  die "Error making directory $out_dir";
+}
+
+# This file provides the list of speakers that overlap between SITW and VoxCeleb1.
+my $overlap_file = "$out_dir/voxceleb1_sitw_overlap.txt";
+my $overlap_url = "http://www.openslr.org/resources/49/voxceleb1_sitw_overlap.txt";
+# Test for a non-empty file (-s): an empty file left by an earlier failed
+# download must not be read as "no overlapping speakers".
+if (! -s $overlap_file) {
+  if (system("wget", "-O", $overlap_file, $overlap_url) != 0) {
+    # Remove whatever partial output wget left behind before giving up.
+    unlink($overlap_file);
+    die "Error downloading $overlap_url to $overlap_file";
+  }
+}
+
+# sitw_overlap contains the list of speakers that also exist in our evaluation set, SITW.
+my %sitw_overlap = ();
+open(my $overlap_fh, "<", $overlap_file) or die "Could not open the overlap file $overlap_file";
+while (my $spkr_id = <$overlap_fh>) {
+  chomp $spkr_id;
+  $sitw_overlap{$spkr_id} = 1;
+}
+close($overlap_fh);
+
+# Each subdirectory of voxceleb1_wav is taken to be one speaker.
+my $wav_root = "$data_base/voxceleb1_wav";
+opendir(my $root_dh, $wav_root) or die "Cannot open directory: $!";
+my @spkr_dirs = grep { -d "$wav_root/$_" && ! /^\.{1,2}$/ } readdir($root_dh);
+closedir($root_dh);
+
+open(my $spkr_fh, ">", "$out_dir/utt2spk") or die "Could not open the output file $out_dir/utt2spk";
+open(my $wav_fh, ">", "$out_dir/wav.scp") or die "Could not open the output file $out_dir/wav.scp";
+
+foreach my $spkr_id (@spkr_dirs) {
+  # Only keep the speaker if it isn't in the overlap list.
+  next if exists $sitw_overlap{$spkr_id};
+
+  my $spkr_dir = "$wav_root/$spkr_id";
+  opendir(my $spkr_dh, $spkr_dir) or die "Cannot open directory: $!";
+  my @wav_files = grep { /\.wav$/ } readdir($spkr_dh);
+  closedir($spkr_dh);
+
+  foreach my $wav_file (@wav_files) {
+    # Strip the extension on a copy so the directory listing is left untouched.
+    (my $filename = $wav_file) =~ s/\.[^.]+$//;
+    # The utterance ID is built from the first 11 characters of the file name
+    # and the 7 characters that start at offset 12.
+    my $rec_id = substr($filename, 0, 11);
+    my $segment = substr($filename, 12, 7);
+    my $utt_id = "$spkr_id-$rec_id-$segment";
+    my $wav = "$spkr_dir/$filename.wav";
+    print $wav_fh "$utt_id $wav\n";
+    print $spkr_fh "$utt_id $spkr_id\n";
+  }
+}
+
+# Buffered write errors only surface at close, so check both.
+close($spkr_fh) or die "Error closing $out_dir/utt2spk";
+close($wav_fh) or die "Error closing $out_dir/wav.scp";
+
+# The spk2utt / fix / validate sequence runs once; the original repeated the
+# same three commands a second time.
+# This command needs the shell for the output redirection.
+if (system(
+  "utils/utt2spk_to_spk2utt.pl $out_dir/utt2spk >$out_dir/spk2utt") != 0) {
+  die "Error creating spk2utt file in directory $out_dir";
+}
+# The exit status of fix_data_dir.sh is not checked; the validation that
+# follows decides whether the directory is usable.
+system("env", "LC_COLLATE=C", "utils/fix_data_dir.sh", $out_dir);
+if (system("env", "LC_COLLATE=C", "utils/validate_data_dir.sh", "--no-text", "--no-feats", $out_dir) != 0) {
+  die "Error validating directory $out_dir";
+}