
[egs] Add recipes for CN-Celeb #3758

Merged: 16 commits, Dec 14, 2019
9 changes: 9 additions & 0 deletions egs/cnceleb/README.txt
@@ -0,0 +1,9 @@

This directory contains example scripts for CN-Celeb speaker
verification. The CN-Celeb corpus is required, and can be
downloaded from OpenSLR (http://www.openslr.org/82/) or from
CSLT@Tsinghua (http://cslt.riit.tsinghua.edu.cn/~data/CN-Celeb/).

The subdirectories "v1" and so on are different speaker recognition
recipes. The recipe in v1 demonstrates a standard approach using a
full-covariance GMM-UBM, iVectors, and a PLDA backend.
4 changes: 4 additions & 0 deletions egs/cnceleb/v1/README.txt
@@ -0,0 +1,4 @@

This example demonstrates a traditional iVector system based on
the CN-Celeb dataset.

15 changes: 15 additions & 0 deletions egs/cnceleb/v1/cmd.sh
@@ -0,0 +1,15 @@
# You can change cmd.sh depending on what type of queue you are using.
# If you have no queueing system and want to run on a local machine, you
# can change all instances of 'queue.pl' to 'run.pl' (but be careful and run
# commands one by one: most recipes will exhaust the memory on your
# machine). queue.pl works with GridEngine (qsub). slurm.pl works
# with slurm. Different queues are configured differently, with different
# queue names and different ways of specifying things like memory;
# to account for these differences you can create and edit the file
# conf/queue.conf to match your queue's configuration. Search for
# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information,
# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl.

export train_cmd="queue.pl --mem 4G"


7 changes: 7 additions & 0 deletions egs/cnceleb/v1/conf/mfcc.conf
@@ -0,0 +1,7 @@
--sample-frequency=16000
--frame-length=25 # the default is 25
--low-freq=20 # the default.
--high-freq=7600 # the default is zero meaning use the Nyquist (8k in this case).
--num-mel-bins=30
--num-ceps=24
--snip-edges=false
2 changes: 2 additions & 0 deletions egs/cnceleb/v1/conf/vad.conf
@@ -0,0 +1,2 @@
--vad-energy-threshold=5.5
--vad-energy-mean-scale=0.5
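For intuition, here is a rough Python sketch of how these two parameters interact in an energy-based VAD decision. This is a simplification, not Kaldi's exact code: the real compute-vad also looks at a window of neighboring frames and a proportion threshold before declaring a frame voiced.

```python
def energy_vad(log_energies, energy_threshold=5.5, mean_scale=0.5):
    # A frame is marked voiced (1.0) when its log-energy exceeds
    # energy_threshold + mean_scale * (mean log-energy of the utterance).
    # Simplified sketch of the decision rule only.
    mean_e = sum(log_energies) / len(log_energies)
    cutoff = energy_threshold + mean_scale * mean_e
    return [1.0 if e > cutoff else 0.0 for e in log_energies]

print(energy_vad([12.0, 2.0, 10.0, 0.0]))  # [1.0, 0.0, 1.0, 0.0]
```

Raising --vad-energy-mean-scale makes the cutoff track the utterance's overall loudness more strongly, which helps with recordings at different gain levels.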
63 changes: 63 additions & 0 deletions egs/cnceleb/v1/local/make_cnceleb.sh
@@ -0,0 +1,63 @@
#!/bin/bash
# Copyright 2017 Ignacio Viñals
# 2017-2018 David Snyder
# 2019 Jiawen Kang
#
# This script prepares the CN-Celeb dataset. It creates separate directories
# for train, eval enroll, and eval test. It also prepares a trials file in the eval test directory.

if [ $# != 2 ]; then
echo "Usage: make_cnceleb.sh <CN-Celeb_PATH> <out_dir>"
echo "E.g.: make_cnceleb.sh /export/corpora/CN-Celeb data"
exit 1
fi

in_dir=$1
out_dir=$2

# Prepare the development data
this_out_dir=${out_dir}/train
mkdir -p $this_out_dir 2>/dev/null
WAVFILE=$this_out_dir/wav.scp
SPKFILE=$this_out_dir/utt2spk
rm $WAVFILE $SPKFILE 2>/dev/null
this_in_dir=${in_dir}/dev

for spkr_id in `cat $this_in_dir/dev.lst`; do
for f in $in_dir/data/$spkr_id/*.wav; do
wav_id=$(basename $f | sed s:.wav$::)
echo "${spkr_id}-${wav_id} $f" >> $WAVFILE
echo "${spkr_id}-${wav_id} ${spkr_id}" >> $SPKFILE
done
done
utils/fix_data_dir.sh $this_out_dir

# Prepare the evaluation data
for mode in enroll test; do
this_out_dir=${out_dir}/eval_${mode}
mkdir -p $this_out_dir 2>/dev/null
WAVFILE=$this_out_dir/wav.scp
SPKFILE=$this_out_dir/utt2spk
rm $WAVFILE $SPKFILE 2>/dev/null
this_in_dir=${in_dir}/eval/${mode}

for f in $this_in_dir/*.wav; do
wav_id=$(basename $f | sed s:.wav$::)
spkr_id=$(echo ${wav_id} | cut -d "-" -f1)
echo "${wav_id} $f" >> $WAVFILE
echo "${wav_id} ${spkr_id}" >> $SPKFILE
done
utils/fix_data_dir.sh $this_out_dir
done

# Prepare test trials
this_out_dir=$out_dir/eval_test/trials
mkdir -p $this_out_dir 2>/dev/null
this_in_dir=${in_dir}/eval/lists
cat $this_in_dir/trials.lst | sed 's@-enroll@@g' | sed 's@test/@@g' | sed 's@.wav@@g' | \
awk '{if ($3 == "1")
{print $1,$2,"target"}
else
{print $1,$2,"nontarget"}
}'> $this_out_dir/trials.lst
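To make the label conversion above concrete, here is a hypothetical Python equivalent of the awk mapping (the utterance IDs are invented for illustration): each trial line carrying label "1" becomes a target trial, anything else a nontarget.

```python
def convert_trial(line):
    # Mirrors the awk block above: "enroll test 1" -> "enroll test target",
    # any other label -> "enroll test nontarget".
    enroll, test, label = line.split()
    tag = "target" if label == "1" else "nontarget"
    return "{0} {1} {2}".format(enroll, test, tag)

print(convert_trial("id00800 id00800-singing-01 1"))
# id00800 id00800-singing-01 target
```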

5 changes: 5 additions & 0 deletions egs/cnceleb/v1/path.sh
@@ -0,0 +1,5 @@
export KALDI_ROOT=`pwd`/../../..
export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/tools/sph2pipe_v2.5:$PWD:$PATH
[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1
. $KALDI_ROOT/tools/config/common_path.sh
export LC_ALL=C
133 changes: 133 additions & 0 deletions egs/cnceleb/v1/run.sh
@@ -0,0 +1,133 @@
#!/bin/bash
# Copyright 2017 Johns Hopkins University (Author: Daniel Garcia-Romero)
# 2017 Johns Hopkins University (Author: Daniel Povey)
# 2017-2018 David Snyder
# 2018 Ewald Enzinger
# 2019 Tsinghua University (Author: Jiawen Kang and Lantian Li)
# Apache 2.0.
#
# This is an i-vector-based recipe for the CN-Celeb database.
# See ../README.txt for more info on the data required. The recipe uses
# CN-Celeb/dev for training the UBM, T matrix and PLDA, and CN-Celeb/eval
# for evaluation. The results are reported in terms of EER and minDCF,
# and are shown inline in the comments below.

. ./cmd.sh
. ./path.sh
set -e
mfccdir=`pwd`/mfcc
vaddir=`pwd`/mfcc

cnceleb_root=/export/corpora/CN-Celeb
eval_trails_core=data/eval_test/trials/trials.lst

stage=0

if [ $stage -le 0 ]; then
# Prepare the CN-Celeb dataset. The script is used to prepare the development
# dataset and evaluation dataset.
local/make_cnceleb.sh $cnceleb_root data
fi

if [ $stage -le 1 ]; then
# Make MFCCs and compute the energy-based VAD for each dataset
for name in train eval_enroll eval_test; do
steps/make_mfcc.sh --write-utt2num-frames true --mfcc-config conf/mfcc.conf --nj 20 --cmd "$train_cmd" \
data/${name} exp/make_mfcc $mfccdir
utils/fix_data_dir.sh data/${name}
sid/compute_vad_decision.sh --nj 20 --cmd "$train_cmd" \
data/${name} exp/make_vad $vaddir
utils/fix_data_dir.sh data/${name}
done
fi

if [ $stage -le 2 ]; then
# Train the UBM
sid/train_diag_ubm.sh --cmd "$train_cmd --mem 4G" \
--nj 20 --num-threads 8 \
data/train 2048 \
exp/diag_ubm

sid/train_full_ubm.sh --cmd "$train_cmd --mem 16G" \
--nj 20 --remove-low-count-gaussians false \
data/train \
exp/diag_ubm exp/full_ubm
fi

if [ $stage -le 3 ]; then
# Train the i-vector extractor.
sid/train_ivector_extractor.sh --nj 20 --cmd "$train_cmd --mem 16G" \
--ivector-dim 400 --num-iters 5 \
exp/full_ubm/final.ubm data/train \
exp/extractor
fi

if [ $stage -le 4 ]; then
  # Note that more than one-third of the utterances in our training set are
  # shorter than 2 seconds, and such short utterances are harmful for PLDA
  # training. Therefore, to improve the performance of PLDA modeling and
  # inference, we combine short utterances into segments longer than 5 seconds.
utils/data/combine_short_segments.sh --speaker-only true \
data/train 5 data/train_comb
# Compute the energy-based VAD for train_comb
sid/compute_vad_decision.sh --nj 20 --cmd "$train_cmd" \
data/train_comb exp/make_vad $vaddir
utils/fix_data_dir.sh data/train_comb
fi
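The combination performed in this stage can be pictured with a simplified Python sketch. This is a greedy, per-speaker toy version only; the real utils/data/internal/choose_utts_to_combine.py balances group durations much more carefully.

```python
def combine_short(utts, min_dur):
    # utts: list of (utt_id, duration_seconds) for one speaker.
    # Greedily accumulate utterances until each group reaches min_dur;
    # any leftover short tail is appended to the last group.
    groups, cur, cur_dur = [], [], 0.0
    for utt_id, dur in utts:
        cur.append(utt_id)
        cur_dur += dur
        if cur_dur >= min_dur:
            groups.append(cur)
            cur, cur_dur = [], 0.0
    if cur:
        if groups:
            groups[-1].extend(cur)
        else:
            groups.append(cur)
    return groups

print(combine_short([("a", 2.0), ("b", 1.5), ("c", 2.0), ("d", 6.0)], 5.0))
# [['a', 'b', 'c'], ['d']]
```

With --speaker-only true, grouping stops at speaker boundaries as above; otherwise remaining short groups would additionally be merged across speakers.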

if [ $stage -le 5 ]; then
# These i-vectors will be used for mean-subtraction, LDA, and PLDA training.
sid/extract_ivectors.sh --cmd "$train_cmd --mem 4G" --nj 20 \
exp/extractor data/train_comb \
exp/ivectors_train_comb

# Extract i-vector for eval sets.
for name in eval_enroll eval_test; do
sid/extract_ivectors.sh --cmd "$train_cmd --mem 4G" --nj 10 \
exp/extractor data/$name \
exp/ivectors_$name
done
fi

if [ $stage -le 6 ]; then
# Compute the mean vector for centering the evaluation i-vectors.
$train_cmd exp/ivectors_train_comb/log/compute_mean.log \
ivector-mean scp:exp/ivectors_train_comb/ivector.scp \
exp/ivectors_train_comb/mean.vec || exit 1;

# This script uses LDA to decrease the dimensionality prior to PLDA.
lda_dim=150
$train_cmd exp/ivectors_train_comb/log/lda.log \
ivector-compute-lda --total-covariance-factor=0.0 --dim=$lda_dim \
"ark:ivector-subtract-global-mean scp:exp/ivectors_train_comb/ivector.scp ark:- |" \
ark:data/train_comb/utt2spk exp/ivectors_train_comb/transform.mat || exit 1;

# Train the PLDA model.
$train_cmd exp/ivectors_train_comb/log/plda.log \
ivector-compute-plda ark:data/train_comb/spk2utt \
"ark:ivector-subtract-global-mean scp:exp/ivectors_train_comb/ivector.scp ark:- | transform-vec exp/ivectors_train_comb/transform.mat ark:- ark:- | ivector-normalize-length ark:- ark:- |" \
exp/ivectors_train_comb/plda || exit 1;

fi
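The backend preprocessing chain used above (global mean subtraction, LDA projection, length normalization) can be sketched in plain Python. The mean vector and "LDA" matrix below are toy stand-ins; note also that Kaldi's ivector-normalize-length by default scales vectors to norm sqrt(dim), whereas this sketch normalizes to unit length for simplicity.

```python
import math

def preprocess_ivector(ivec, global_mean, lda_rows):
    # 1) subtract the global mean (ivector-subtract-global-mean)
    x = [v - m for v, m in zip(ivec, global_mean)]
    # 2) apply the LDA transform row by row (transform-vec)
    y = [sum(row[j] * x[j] for j in range(len(x))) for row in lda_rows]
    # 3) length-normalize (unit length here for simplicity)
    norm = math.sqrt(sum(v * v for v in y))
    return [v / norm for v in y]

# Toy example: zero mean and an identity "LDA" matrix.
print(preprocess_ivector([3.0, 4.0], [0.0, 0.0], [[1.0, 0.0], [0.0, 1.0]]))
# [0.6, 0.8]
```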

if [ $stage -le 7 ]; then
# Compute PLDA scores for CN-Celeb eval core trials
$train_cmd exp/scores/log/cnceleb_eval_scoring.log \
ivector-plda-scoring --normalize-length=true \
--num-utts=ark:exp/ivectors_eval_enroll/num_utts.ark \
"ivector-copy-plda --smoothing=0.0 exp/ivectors_train_comb/plda - |" \
"ark:ivector-mean ark:data/eval_enroll/spk2utt scp:exp/ivectors_eval_enroll/ivector.scp ark:- | ivector-subtract-global-mean exp/ivectors_train_comb/mean.vec ark:- ark:- | transform-vec exp/ivectors_train_comb/transform.mat ark:- ark:- | ivector-normalize-length ark:- ark:- |" \
"ark:ivector-subtract-global-mean exp/ivectors_train_comb/mean.vec scp:exp/ivectors_eval_test/ivector.scp ark:- | transform-vec exp/ivectors_train_comb/transform.mat ark:- ark:- | ivector-normalize-length ark:- ark:- |" \
"cat '$eval_trails_core' | cut -d\ --fields=1,2 |" exp/scores/cnceleb_eval_scores || exit 1;

# CN-Celeb Eval Core:
# EER: 13.91%
# minDCF(p-target=0.01): 0.6530
# minDCF(p-target=0.001): 0.7521
echo -e "\nCN-Celeb Eval Core:";
eer=$(paste $eval_trails_core exp/scores/cnceleb_eval_scores | awk '{print $6, $3}' | compute-eer - 2>/dev/null)
mindcf1=`sid/compute_min_dcf.py --p-target 0.01 exp/scores/cnceleb_eval_scores $eval_trails_core 2> /dev/null`
mindcf2=`sid/compute_min_dcf.py --p-target 0.001 exp/scores/cnceleb_eval_scores $eval_trails_core 2> /dev/null`
echo "EER: $eer%"
echo "minDCF(p-target=0.01): $mindcf1"
echo "minDCF(p-target=0.001): $mindcf2"
fi
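For reference, a minimal Python sketch of the EER computation that compute-eer performs on the pasted score/label pairs. This brute-force threshold sweep is for illustration; the real tool uses a more efficient sorted sweep.

```python
def compute_eer(scores, labels):
    # labels: 1 = target trial, 0 = nontarget trial.
    # For each candidate threshold t: FRR = fraction of targets scored
    # below t, FAR = fraction of nontargets scored at or above t.
    # The EER is (approximately) where the two rates cross.
    n_tgt = sum(labels)
    n_non = len(labels) - n_tgt
    best_gap, best_eer = None, None
    for t in sorted(set(scores)):
        frr = sum(1 for s, l in zip(scores, labels) if l == 1 and s < t) / n_tgt
        far = sum(1 for s, l in zip(scores, labels) if l == 0 and s >= t) / n_non
        gap = abs(far - frr)
        if best_gap is None or gap < best_gap:
            best_gap, best_eer = gap, (far + frr) / 2
    return best_eer

print(compute_eer([0.9, 0.8, 0.3, 0.1], [1, 1, 0, 0]))  # 0.0
```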
1 change: 1 addition & 0 deletions egs/cnceleb/v1/sid
1 change: 1 addition & 0 deletions egs/cnceleb/v1/steps
1 change: 1 addition & 0 deletions egs/cnceleb/v1/utils
17 changes: 13 additions & 4 deletions egs/wsj/s5/utils/data/combine_short_segments.sh
@@ -16,16 +16,22 @@

# begin configuration section
cleanup=true
speaker_only=false # If true, utterances are only combined from the same speaker.
# It may be useful for the speaker recognition task.
# If false, utterances are preferentially combined from the same speaker,
# and then combined across different speakers.
# end configuration section


. utils/parse_options.sh

if [ $# != 3 ]; then
echo "Usage: "
echo " $0 [options] <srcdir> <min-segment-length-in-seconds> <dir>"
echo "e.g.:"
echo " $0 data/train 1.55 data/train_comb"
# options documentation here.
echo " Options:"
echo " --speaker-only <true|false> # options to internal/choose_utts_to_combine.py, default false."
exit 1;
fi

@@ -55,7 +61,7 @@ if ! mkdir -p $dir; then
exit 1;
fi

if ! utils/validate_data_dir.sh $srcdir; then
if ! utils/validate_data_dir.sh --no-text $srcdir; then
echo "$0: failed to validate input directory $srcdir. If needed, run utils/fix_data_dir.sh $srcdir"
exit 1
fi
@@ -72,6 +78,7 @@ set -o pipefail
utils/data/get_utt2dur.sh $srcdir

utils/data/internal/choose_utts_to_combine.py --min-duration=$min_seg_len \
--merge-within-speakers-only=$speaker_only \
$srcdir/spk2utt $srcdir/utt2dur $dir/utt2utts $dir/utt2spk $dir/utt2dur

utils/utt2spk_to_spk2utt.pl < $dir/utt2spk > $dir/spk2utt
@@ -87,7 +94,9 @@ utils/apply_map.pl -f 2- $srcdir/feats.scp <$dir/utt2utts | \

# create $dir/text by concatenating the source 'text' entries for the original
# utts.
utils/apply_map.pl -f 2- $srcdir/text <$dir/utt2utts > $dir/text
if [ -f $srcdir/text ]; then
utils/apply_map.pl -f 2- $srcdir/text <$dir/utt2utts > $dir/text
fi

if [ -f $srcdir/utt2uniq ]; then
# the utt2uniq file is such that if 2 utts were derived from the same original
@@ -171,7 +180,7 @@ fi
# note: the user will have to recompute the cmvn, as the speakers may have changed.
rm $dir/cmvn.scp 2>/dev/null || true

utils/validate_data_dir.sh --no-wav $dir
utils/validate_data_dir.sh --no-text --no-wav $dir

if $cleanup; then
rm $dir/utt2utts
46 changes: 28 additions & 18 deletions egs/wsj/s5/utils/data/internal/choose_utts_to_combine.py
@@ -37,6 +37,12 @@

parser.add_argument("--min-duration", type = float, default = 1.55,
help="Minimum utterance duration")
parser.add_argument("--merge-within-speakers-only", type = str, default = 'false',
choices = ['true', 'false'],
help="If true, utterances are only combined from the same speaker."
"It may be useful for the speaker recognition task."
"If false, utterances are preferentially combined from the same speaker,"
"and then combined across different speakers.")
Review comment (Contributor): were -> are in the usage message

Reply (Contributor Author): Thanks a lot. I have modified it from the PR.
parser.add_argument("spk2utt_in", type = str, metavar = "<spk2utt-in>",
help="Filename of [input] speaker to utterance map needed "
"because this script tries to merge utterances from the "
@@ -216,12 +222,14 @@ def SelfTest():
# This function figures out the grouping of utterances.
# The input is:
# 'min_duration' which is the minimum utterance length in seconds.
# 'merge_within_speakers_only' which is a ['true', 'false'] choice.
# If true, then utterances are only combined if they belong to the same speaker.
# 'spk2utt' which is a list of pairs (speaker-id, [list-of-utterances])
# 'utt2dur' which is a dict from utterance-id to duration (as a float)
# It returns a lists of lists of utterances; each list corresponds to
# a group, e.g.
# [ ['utt1'], ['utt2', 'utt3'] ]
def GetUtteranceGroups(min_duration, spk2utt, utt2dur):
def GetUtteranceGroups(min_duration, merge_within_speakers_only, spk2utt, utt2dur):
    # utt_groups will be a list of lists of utterance-ids formed from the
    # first pass of combination.
    utt_groups = []
@@ -256,22 +264,24 @@ def GetUtteranceGroups(min_duration, spk2utt, utt2dur):
    # Now we combine the groups obtained above, in case we had situations where
    # the combination of all the utterances of one speaker were still below
    # the minimum duration.
    new_utt_groups = []
    ranges = CombineList(min_duration, group_durations)
    for start, end in ranges:
        # the following code is destructive of 'utt_groups' but it doesn't
        # matter.
        this_group = utt_groups[start]
        for i in range(start + 1, end):
            this_group += utt_groups[i]
        new_utt_groups.append(this_group)
    print("choose_utts_to_combine.py: combined {0} utterances to {1} utterances "
          "while respecting speaker boundaries, and then to {2} utterances "
          "with merging across speaker boundaries.".format(
          len(utt2dur), len(utt_groups), len(new_utt_groups)),
          file = sys.stderr)
    return new_utt_groups

    if merge_within_speakers_only == 'true':
        return utt_groups
    else:
        new_utt_groups = []
        ranges = CombineList(min_duration, group_durations)
        for start, end in ranges:
            # the following code is destructive of 'utt_groups' but it doesn't
            # matter.
            this_group = utt_groups[start]
            for i in range(start + 1, end):
                this_group += utt_groups[i]
            new_utt_groups.append(this_group)
        print("choose_utts_to_combine.py: combined {0} utterances to {1} utterances "
              "while respecting speaker boundaries, and then to {2} utterances "
              "with merging across speaker boundaries.".format(
              len(utt2dur), len(utt_groups), len(new_utt_groups)),
              file = sys.stderr)
        return new_utt_groups


SelfTest()
@@ -324,7 +334,7 @@ def GetUtteranceGroups(min_duration, spk2utt, utt2dur):
args.utt2dur_in, line))


utt_groups = GetUtteranceGroups(args.min_duration, spk2utt, utt2dur)
utt_groups = GetUtteranceGroups(args.min_duration, args.merge_within_speakers_only, spk2utt, utt2dur)

# set utt_group names to an array like [ 'utt1', 'utt2-comb2', 'utt4', ... ]
utt_group_names = [ group[0] if len(group)==1 else "{0}-comb{1}".format(group[0], len(group))