add speaker verification into evaluation
MingjieChen committed Mar 29, 2023
1 parent e6074de commit f389c0f
Showing 10 changed files with 175 additions and 46 deletions.
6 changes: 5 additions & 1 deletion configs/vctk_vqwav2vec_uttdvec_ppgvcf0_diffwave_none.yaml
@@ -40,8 +40,12 @@ max_utt_duration: !!float 10.0 # max utterance duration (seconds)

 # decoder params
 decoder_params:
+  noise_steps: 100
+  noise_start: !!float 1e-5
+  noise_end: 0.05
+  infer_noise: [0.00001, 0.0001, 0.001, 0.01, 0.05, 0.1, 0.2, 0.5]
   segment_size: 14400
-  use_text_encoder: !!bool False
+  use_text_encoder: !!bool True
   input_dim: !!int 512
   spk_emb_dim: !!int 256
   prosodic_rep_type: continuous
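Note: the four new decoder_params keys fully determine DiffWave's training and fast-sampling schedules. A minimal sketch of that mapping, using only the values above (the snippet is illustrative, not repo code):

# Minimal sketch: how the new config keys become DiffWave noise schedules.
import numpy as np

noise_steps, noise_start, noise_end = 100, 1e-5, 0.05
infer_noise = [0.00001, 0.0001, 0.001, 0.01, 0.05, 0.1, 0.2, 0.5]

# Training walks the full linearly spaced beta schedule (100 steps) ...
training_schedule = np.linspace(noise_start, noise_end, noise_steps)
# ... while fast sampling at inference uses only the 8 hand-picked betas.
inference_schedule = np.array(infer_noise)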
@@ -39,7 +39,7 @@ max_utt_duration: !!float 10.0 # max utterance duration (seconds)

 # decoder params
 decoder_params:
-  use_text_encoder: !!bool False
+  use_text_encoder: !!bool True
   use_prior_loss: !!bool False
   n_feats: !!int 100
   input_dim: !!int 512
20 changes: 15 additions & 5 deletions decoder/diffwave/model.py
@@ -254,11 +254,18 @@ def __init__(self, config):

         self.use_text_encoder = config['use_text_encoder']
 
+        noise_steps = config['noise_steps']
+        noise_start = config['noise_start']
+        noise_end = config['noise_end']
+
+        self.infer_noise = config['infer_noise']
+
-        noise_schedule = np.linspace(1e-4, 0.05, 50).tolist()
+        noise_schedule = np.linspace(noise_start, noise_end, noise_steps).tolist()
         self.noise_schedule = noise_schedule
         self.diffusion_embedding = DiffusionEmbedding(len(noise_schedule))
 
+        self.fast_sampling = config['fast_sampling'] if 'fast_sampling' in config else True
         self.upsampler = Upsampler(inter_channels)
 
         if self.use_text_encoder:
@@ -331,11 +338,12 @@ def forward(self, audio, diffusion_step, ling, pros, spk, lengths):
         return y
 
     def inference(self, ling, pros, spk, lengths):
-        fast_sampling = True
+        fast_sampling = self.fast_sampling
         training_noise_schedule = np.array(self.noise_schedule)
-        inference_noise_schedule=np.array([0.0001, 0.001, 0.01, 0.05, 0.2, 0.5])
-        #inference_noise_schedule= np.array([0.0001, 0.001, 0.01, 0.1, 0.2, 0.5])
+        inference_noise_schedule = np.array(self.infer_noise)
         inference_noise_schedule = np.array(inference_noise_schedule) if fast_sampling else training_noise_schedule
 
+        print(f'inference noise schedule {inference_noise_schedule}')
         talpha = 1 - training_noise_schedule
         talpha_cum = np.cumprod(talpha)

@@ -351,6 +359,7 @@ def inference(self, ling, pros, spk, lengths):
                     T.append(t + twiddle)
                     break
         T = np.array(T, dtype=np.float32)
+        print(f'inference T {T}')
 
         # hard code hop_size = 240
         audio = torch.randn(ling.shape[0], 240 * ling.shape[-1], device=ling.device)
@@ -363,6 +372,7 @@ def inference(self, ling, pros, spk, lengths):
                 noise = torch.randn_like(audio)
                 sigma = ((1.0 - alpha_cum[n-1]) / (1.0 - alpha_cum[n]) * beta[n])**0.5
                 audio += sigma * noise
+            print(f'c1 {c1} c2 {c2} sigma {sigma}', flush = True)
         audio = torch.clamp(audio, -1.0, 1.0)
         return audio

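Note: inference() above maps the short schedule onto the training schedule by matching cumulative noise levels. A standalone sketch of that alignment under the config values shown earlier (numpy only; not repo code):

import numpy as np

training_noise_schedule = np.linspace(1e-5, 0.05, 100)  # betas from the config
inference_noise_schedule = np.array([0.00001, 0.0001, 0.001, 0.01, 0.05, 0.1, 0.2, 0.5])

talpha_cum = np.cumprod(1 - training_noise_schedule)
alpha_cum = np.cumprod(1 - inference_noise_schedule)

# For each of the 8 inference betas, find the fractional training step with the
# same cumulative signal level, so DiffusionEmbedding sees a familiar index.
T = []
for s in range(len(inference_noise_schedule)):
    for t in range(len(training_noise_schedule) - 1):
        if talpha_cum[t + 1] <= alpha_cum[s] <= talpha_cum[t]:
            twiddle = (talpha_cum[t] ** 0.5 - alpha_cum[s] ** 0.5) / (
                talpha_cum[t] ** 0.5 - talpha_cum[t + 1] ** 0.5
            )
            T.append(t + twiddle)
            break
T = np.array(T, dtype=np.float32)  # 8 fractional timesteps in [0, 100)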
1 change: 1 addition & 0 deletions decoder/interface.py
@@ -164,6 +164,7 @@ def infer_GradTTS(model, ling, pros, spk):
     return mel
 
 def infer_DiffWave(model, ling, pros, spk):
+    print(f'ling length {ling.size(1)}', flush = True)
     ling_lengths = torch.LongTensor([ling.size(1)]).to(ling.device)
     ling = ling.transpose(1,2)
     pros = pros.transpose(1,2)
104 changes: 101 additions & 3 deletions evaluation/speechbrain_asv.py
@@ -1,9 +1,107 @@
-from speechbrain.pretrained import SpeakerRecognition
+from speechbrain.pretrained import EncoderClassifier
 import os
 from speechbrain.utils.metric_stats import EER
 import sys
 from tqdm import tqdm
 import torch
 
+class SpeakerRecognition(EncoderClassifier):
+    """A ready-to-use model for speaker recognition. It can be used to
+    perform speaker verification with verify_batch().
+
+    Example
+    -------
+    >>> import torchaudio
+    >>> from speechbrain.pretrained import SpeakerRecognition
+    >>> # Model is downloaded from the speechbrain HuggingFace repo
+    >>> tmpdir = getfixture("tmpdir")
+    >>> verification = SpeakerRecognition.from_hparams(
+    ...     source="speechbrain/spkrec-ecapa-voxceleb",
+    ...     savedir=tmpdir,
+    ... )
+    >>> # Perform verification
+    >>> signal, fs = torchaudio.load("tests/samples/single-mic/example1.wav")
+    >>> signal2, fs = torchaudio.load("tests/samples/single-mic/example2.flac")
+    >>> score, prediction = verification.verify_batch(signal, signal2)
+    """
+
+    MODULES_NEEDED = [
+        "compute_features",
+        "mean_var_norm",
+        "embedding_model",
+        "mean_var_norm_emb",
+    ]
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.similarity = torch.nn.CosineSimilarity(dim=-1, eps=1e-6)
+
+    def verify_batch(
+        self, wavs1, wavs2, wav1_lens=None, wav2_lens=None, threshold=0.25
+    ):
+        """Performs speaker verification with cosine distance.
+        It returns the score and the decision (0 different speakers,
+        1 same speakers).
+
+        Arguments
+        ---------
+        wavs1 : Torch.Tensor
+            Tensor containing the speech waveform1 (batch, time).
+            Make sure the sample rate is fs=16000 Hz.
+        wavs2 : Torch.Tensor
+            Tensor containing the speech waveform2 (batch, time).
+            Make sure the sample rate is fs=16000 Hz.
+        wav1_lens : Torch.Tensor
+            Tensor containing the relative length for each sentence
+            in the batch (e.g., [0.8 0.6 1.0]).
+        wav2_lens : Torch.Tensor
+            Tensor containing the relative length for each sentence
+            in the batch (e.g., [0.8 0.6 1.0]).
+        threshold : Float
+            Threshold applied to the cosine distance to decide if the
+            speakers are different (0) or the same (1).
+
+        Returns
+        -------
+        score
+            The score associated to the binary verification output
+            (cosine distance).
+        prediction
+            The prediction is 1 if the two signals in input are from the same
+            speaker and 0 otherwise.
+        """
+        emb1 = self.encode_batch(wavs1, wav1_lens, normalize=True)
+        emb2 = self.encode_batch(wavs2, wav2_lens, normalize=True)
+        score = self.similarity(emb1, emb2)
+        return score, score > threshold
+
+    def verify_files(self, path_x, path_y):
+        """Speaker verification with cosine distance.
+        Returns the score and the decision (0 different speakers,
+        1 same speakers).
+
+        Returns
+        -------
+        score
+            The score associated to the binary verification output
+            (cosine distance).
+        prediction
+            The prediction is 1 if the two signals in input are from the same
+            speaker and 0 otherwise.
+        """
+        waveform_x = self.load_audio(path_x, savedir = "pretrained_models/spkrec-ecapa-voxceleb")
+        waveform_y = self.load_audio(path_y, savedir = "pretrained_models/spkrec-ecapa-voxceleb")
+        # Fake batches:
+        batch_x = waveform_x.unsqueeze(0)
+        batch_y = waveform_y.unsqueeze(0)
+        # Verify:
+        score, decision = self.verify_batch(batch_x, batch_y)
+        # Squeeze:
+        return score[0], decision[0]
+
 if __name__ == '__main__':
 
     converted_wav_dir = sys.argv[1]
@@ -50,5 +148,5 @@

     positive_scores = torch.tensor(positive_scores)
     negative_scores = torch.tensor(negative_scores)
-    eer_result = EER(positive_scores, negative_scores)
-    print(f'EER result {eer_result * 100}')
+    eer_result = EER(positive_scores, negative_scores)[0]
+    print(f'EER result {eer_result * 100 }')
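Note: speechbrain.utils.metric_stats.EER returns an (EER, threshold) pair, which is why the script now indexes [0]. A minimal usage sketch of the patched module, assuming it is run from the repo root; the wav paths are hypothetical placeholders:

import torch
from evaluation.speechbrain_asv import SpeakerRecognition
from speechbrain.utils.metric_stats import EER

verification = SpeakerRecognition.from_hparams(
    source="speechbrain/spkrec-ecapa-voxceleb",
    savedir="pretrained_models/spkrec-ecapa-voxceleb",
)
# Cosine score plus a same/different decision at the default 0.25 threshold.
score, decision = verification.verify_files("converted.wav", "target_ref.wav")

# EER over trial scores: positives are same-speaker pairs, negatives are not.
pos = torch.tensor([0.62, 0.71, 0.80])
neg = torch.tensor([0.10, 0.24, 0.33])
eer, threshold = EER(pos, neg)
print(f'EER {eer * 100:.2f}% at threshold {threshold:.2f}')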
1 change: 0 additions & 1 deletion feature_extraction.py
@@ -118,7 +118,6 @@ def process_speaker(spk_meta, spk, config, args):
         ID = row['ID']
         wav_path = row['wav_path'].strip()
         audio, fs = librosa.load(wav_path, sr = config['sampling_rate'])
-
         # trim silence
         start, end = float(row['start']), float(row['end'])
         audio = audio[ int(start * config['sampling_rate']):
63 changes: 40 additions & 23 deletions inference.py
@@ -33,8 +33,6 @@
 def denorm_mel(mean_tensor, std_tensor, mel):
 
     if mean_tensor is not None and std_tensor is not None:
-        mean_tensor = torch.FloatTensor(scaler.mean_)
-        std_tensor = torch.FloatTensor(scaler.scale_)
 
         mel = mel * std_tensor + mean_tensor
 
@@ -66,7 +64,7 @@ def load_wav(path, sample_rate = 16000):
 parser.add_argument('--task', type = str)
 parser.add_argument('--src_resyn', default = False, action = 'store_true')
 # vocoder
-parser.add_argument('--vocoder', type = str, default = 'ppg_vc_hifigan')
+#parser.add_argument('--vocoder', type = str, default = 'ppg_vc_hifigan')
 # sge task
 parser.add_argument('--sge_task_id', type = int, default = 1)
 parser.add_argument('--sge_n_tasks', type = int, default = 1)
@@ -108,10 +106,10 @@ def load_wav(path, sample_rate = 16000):

 # load ling_encoder
 ling_enc_load_func = f'load_{ling_encoder}'
-ling_enc_model = eval(ling_enc_load_func)(device = args.device)
+ling_enc_model = eval(ling_enc_load_func)(device = 'cpu')
 ling_encoder_func = f'{ling_encoder}'
 # load speaker encoder
-speaker_enc_model = load_speaker_encoder(speaker_encoder, device = args.device)
+speaker_enc_model = load_speaker_encoder(speaker_encoder, device = 'cpu')
 speaker_encoder_func = load_speaker_encoder_func(args.task, speaker_encoder)
 print(f'load ling_encoder {ling_encoder} done')
 print(f'load speaker_encoder {speaker_encoder} done')
@@ -127,7 +125,7 @@ def load_wav(path, sample_rate = 16000):
 if 'vocoder' in exp_config:
     vocoder = exp_config['vocoder']
     vocoder_load_func = f'load_{vocoder}'
-    vocoder_model = eval(vocoder_load_func)(device = args.device)
+    vocoder_model = eval(vocoder_load_func)(device = 'cpu')
     vocoder_func = f'{vocoder}'
     print(f'load vocoder {vocoder} done')
 else:
@@ -163,31 +161,45 @@ def load_wav(path, sample_rate = 16000):
         src_wav_path = meta['src_wav']
         trg_wav_path = meta['trg_wav']
 
-        if args.src_resyn and vocoder == 'ppgvc_hifigan':
-            from feature_extraction import ppgvc_hifigan_logmelspectrogram
-            src_audio = load_wav(src_wav_path, 24000)
-            ppgvc_mel_config = {'sampling_rate':24000,
-                    'fft_size': 1024,
-                    'hop_size': 240,
-                    'win_length': 1024,
-                    'window': 'hann',
-                    'num_mels': 80,
-                    'fmin': 0,
-                    'fmax': 8000,
-                    'mel_min': -12.0,
-                    'mel_max': 2.5
-                }
-            src_mel_resyn = ppgvc_hifigan_logmelspectrogram(src_audio,ppgvc_mel_config)
+        if args.src_resyn:
+            if vocoder == 'ppgvc_hifigan':
+                from feature_extraction import ppgvc_hifigan_logmelspectrogram
+                src_audio = load_wav(src_wav_path, 24000)
+                ppgvc_mel_config = {'sampling_rate':24000,
+                        'fft_size': 1024,
+                        'hop_size': 240,
+                        'win_length': 1024,
+                        'window': 'hann',
+                        'num_mels': 80,
+                        'fmin': 0,
+                        'fmax': 8000,
+                        'mel_min': -12.0,
+                        'mel_max': 2.5
+                    }
+                src_mel_resyn = ppgvc_hifigan_logmelspectrogram(src_audio,ppgvc_mel_config)
+            elif vocoder == 'bigvgan':
+                from feature_extraction import bigvgan_logmelspectrogram
+                src_audio = load_wav(src_wav_path, 24000)
+                bigvgan_mel_config = {'sampling_rate':24000,
+                        'n_fft': 1024,
+                        'hop_size': 240,
+                        'win_size': 1024,
+                        'num_mels': 100,
+                        'fmin': 0,
+                        'fmax': 12000,
+                    }
+                src_mel_resyn = bigvgan_logmelspectrogram(src_audio,bigvgan_mel_config)
+
 
         # load src wav & trg wav
         src_wav = load_wav(src_wav_path, 16000)
         mel_duration = len(src_wav) // 160 # estimate a mel duration for pad ling and pros reps
 
         # to tensor
-        src_wav_tensor = torch.FloatTensor(src_wav).unsqueeze(0).to(args.device)
+        src_wav_tensor = torch.FloatTensor(src_wav).unsqueeze(0)#.to(args.device)
         start_time = time.time()
         # extract ling representations
-        ling_rep = eval(ling_encoder_func)(ling_enc_model, src_wav_tensor)
+        ling_rep = eval(ling_encoder_func)(ling_enc_model, src_wav_tensor).to(args.device)
         ling_duration = ling_rep.size(1)
         # check if need upsample ling rep
         factor = int(round(mel_duration / ling_duration))
@@ -210,6 +222,7 @@ def load_wav(path, sample_rate = 16000):
             elif mel_duration > pros_duration:
                 pad_vec = pros_rep[:, -1, :]
                 pros_rep = torch.cat([pros_rep, pad_vec.unsqueeze(1).expand(1, mel_duration - pros_duration, pros_rep.size(2))], dim = 1)
+            pros_rep = pros_rep.to(args.device)
         else:
             pros_rep = None
         # trg spk emb
@@ -222,12 +235,15 @@ def load_wav(path, sample_rate = 16000):

         if vocoder is not None:
             # vocoder
+            decoder_out = decoder_out.cpu()
             wav = eval(vocoder_func)(vocoder_model, decoder_out)
             if args.src_resyn:
                 src_mel_tensor = torch.FloatTensor([src_mel_resyn])
                 src_resyn_wav = eval(vocoder_func)(vocoder_model, src_mel_tensor)
         else:
             wav = decoder_out.view(-1)
+        if args.device == 'cuda':
+            torch.cuda.empty_cache()
         end_time = time.time()
         rtf = (end_time - start_time) / (0.01 * ling_rep.size(1))
         total_rtf += rtf
@@ -237,6 +253,7 @@ def load_wav(path, sample_rate = 16000):
         if args.src_resyn:
             resyn_wav_basename = f'{ID}_resyn.wav'
             sf.write(os.path.join(out_wav_dir, resyn_wav_basename), src_resyn_wav.data.cpu().numpy(), 24000, "PCM_16")


print(f"RTF: {total_rtf/cnt :.2f}")

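Note: bigvgan_logmelspectrogram itself is not shown in this commit. A plausible sketch of what a BigVGAN-style log-mel front end computes for the bigvgan_mel_config above, assuming a librosa magnitude STFT and natural-log compression (the repo's implementation may differ):

import librosa
import numpy as np

def bigvgan_logmel_sketch(audio, cfg):
    # Magnitude STFT -> 100-bin mel filterbank -> log compression.
    mel = librosa.feature.melspectrogram(
        y=audio, sr=cfg['sampling_rate'], n_fft=cfg['n_fft'],
        hop_length=cfg['hop_size'], win_length=cfg['win_size'],
        n_mels=cfg['num_mels'], fmin=cfg['fmin'], fmax=cfg['fmax'],
        power=1.0)
    return np.log(np.clip(mel, a_min=1e-5, a_max=None)).T  # (frames, num_mels)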
10 changes: 5 additions & 5 deletions submit_inference.sh
@@ -6,11 +6,11 @@ split=eval_all
 ling_enc=vqwav2vec
 spk_enc=uttdvec
 pros_enc=ppgvcf0
-dec=gradtts
-vocoder=bigvgan
+dec=diffwave
+vocoder=none
 
 # exp setup
-exp_name=vctk_train_0
+exp_name=vctk_train_1
 exp_dir=exp/${dataset}_${ling_enc}_${spk_enc}_${pros_enc}_${dec}_${vocoder}/${exp_name}
 if [ ! -e $exp_dir ]; then
     echo "$exp_dir does not exist"
@@ -26,7 +26,7 @@ epochs=$( ls -t $exp_dir/ckpt | head -n 1 | sed 's/[^0-9]*//g')
 eval_list=eval_list_m2m_vc_small_oneshot.json
 eval_list_path=data/$dataset/$split/$eval_list
 # sge submitjob setup
-n_parallel_jobs=50
+n_parallel_jobs=360
 device=cpu
 job=$exp_dir/scripts/inference_${task}_${epochs}.sh
 log=$exp_dir/logs/inference_${task}_${epochs}.log
@@ -53,5 +53,5 @@ python inference.py \
 EOF
 
 #submit to sge
-submitjob -m 30000 -n $n_parallel_jobs $log $job
+submitjob -m 40000 -n $n_parallel_jobs $log $job
 echo "job submitted, see log in ${log}"