add speaker verification into evaluation
MingjieChen committed Mar 29, 2023
1 parent e6074de commit f389c0f
Showing 10 changed files with 175 additions and 46 deletions.
6 changes: 5 additions & 1 deletion configs/vctk_vqwav2vec_uttdvec_ppgvcf0_diffwave_none.yaml
@@ -40,8 +40,12 @@ max_utt_duration: !!float 10.0 # max utterance duration (seconds)

 # decoder params
 decoder_params:
+  noise_steps: 100
+  noise_start: !!float 1e-5
+  noise_end: 0.05
+  infer_noise: [0.00001, 0.0001, 0.001, 0.01, 0.05, 0.1, 0.2, 0.5]
   segment_size: 14400
-  use_text_encoder: !!bool False
+  use_text_encoder: !!bool True
   input_dim: !!int 512
   spk_emb_dim: !!int 256
   prosodic_rep_type: continuous
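Note: the four new decoder_params keys fully determine DiffWave's training and fast-sampling schedules. A minimal sketch of that mapping, using only the values above (the snippet is illustrative, not repo code):

# Minimal sketch: how the new config keys become DiffWave noise schedules.
import numpy as np

noise_steps, noise_start, noise_end = 100, 1e-5, 0.05
infer_noise = [0.00001, 0.0001, 0.001, 0.01, 0.05, 0.1, 0.2, 0.5]

# Training walks the full linearly spaced beta schedule (100 steps) ...
training_schedule = np.linspace(noise_start, noise_end, noise_steps)
# ... while fast sampling at inference uses only the 8 hand-picked betas.
inference_schedule = np.array(infer_noise)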
@@ -39,7 +39,7 @@ max_utt_duration: !!float 10.0 # max utterance duration (seconds)

 # decoder params
 decoder_params:
-  use_text_encoder: !!bool False
+  use_text_encoder: !!bool True
   use_prior_loss: !!bool False
   n_feats: !!int 100
   input_dim: !!int 512
20 changes: 15 additions & 5 deletions decoder/diffwave/model.py
@@ -254,11 +254,18 @@ def __init__(self, config):

         self.use_text_encoder = config['use_text_encoder']
 
+        noise_steps = config['noise_steps']
+        noise_start = config['noise_start']
+        noise_end = config['noise_end']
+
+        self.infer_noise = config['infer_noise']
+
-        noise_schedule = np.linspace(1e-4, 0.05, 50).tolist()
+        noise_schedule = np.linspace(noise_start, noise_end, noise_steps).tolist()
         self.noise_schedule = noise_schedule
         self.diffusion_embedding = DiffusionEmbedding(len(noise_schedule))
 
+        self.fast_sampling = config['fast_sampling'] if 'fast_sampling' in config else True
         self.upsampler = Upsampler(inter_channels)
 
         if self.use_text_encoder:
@@ -331,11 +338,12 @@ def forward(self, audio, diffusion_step, ling, pros, spk, lengths):
         return y
 
     def inference(self, ling, pros, spk, lengths):
-        fast_sampling = True
+        fast_sampling = self.fast_sampling
         training_noise_schedule = np.array(self.noise_schedule)
-        inference_noise_schedule=np.array([0.0001, 0.001, 0.01, 0.05, 0.2, 0.5])
-        #inference_noise_schedule= np.array([0.0001, 0.001, 0.01, 0.1, 0.2, 0.5])
+        inference_noise_schedule = np.array(self.infer_noise)
         inference_noise_schedule = np.array(inference_noise_schedule) if fast_sampling else training_noise_schedule
 
+        print(f'inference noise schedule {inference_noise_schedule}')
         talpha = 1 - training_noise_schedule
         talpha_cum = np.cumprod(talpha)

@@ -351,6 +359,7 @@ def inference(self, ling, pros, spk, lengths):
                     T.append(t + twiddle)
                     break
         T = np.array(T, dtype=np.float32)
+        print(f'inference T {T}')
 
         # hard code hop_size = 240
         audio = torch.randn(ling.shape[0], 240 * ling.shape[-1], device=ling.device)
@@ -363,6 +372,7 @@ def inference(self, ling, pros, spk, lengths):
                 noise = torch.randn_like(audio)
                 sigma = ((1.0 - alpha_cum[n-1]) / (1.0 - alpha_cum[n]) * beta[n])**0.5
                 audio += sigma * noise
+            print(f'c1 {c1} c2 {c2} sigma {sigma}', flush = True)
         audio = torch.clamp(audio, -1.0, 1.0)
         return audio

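Note: inference() above maps the short schedule onto the training schedule by matching cumulative noise levels. A standalone sketch of that alignment under the config values shown earlier (numpy only; not repo code):

import numpy as np

training_noise_schedule = np.linspace(1e-5, 0.05, 100)  # betas from the config
inference_noise_schedule = np.array([0.00001, 0.0001, 0.001, 0.01, 0.05, 0.1, 0.2, 0.5])

talpha_cum = np.cumprod(1 - training_noise_schedule)
alpha_cum = np.cumprod(1 - inference_noise_schedule)

# For each of the 8 inference betas, find the fractional training step with the
# same cumulative signal level, so DiffusionEmbedding sees a familiar index.
T = []
for s in range(len(inference_noise_schedule)):
    for t in range(len(training_noise_schedule) - 1):
        if talpha_cum[t + 1] <= alpha_cum[s] <= talpha_cum[t]:
            twiddle = (talpha_cum[t] ** 0.5 - alpha_cum[s] ** 0.5) / (
                talpha_cum[t] ** 0.5 - talpha_cum[t + 1] ** 0.5
            )
            T.append(t + twiddle)
            break
T = np.array(T, dtype=np.float32)  # 8 fractional timesteps in [0, 100)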
1 change: 1 addition & 0 deletions decoder/interface.py
@@ -164,6 +164,7 @@ def infer_GradTTS(model, ling, pros, spk):
     return mel
 
 def infer_DiffWave(model, ling, pros, spk):
+    print(f'ling length {ling.size(1)}', flush = True)
     ling_lengths = torch.LongTensor([ling.size(1)]).to(ling.device)
     ling = ling.transpose(1,2)
     pros = pros.transpose(1,2)
104 changes: 101 additions & 3 deletions evaluation/speechbrain_asv.py
@@ -1,9 +1,107 @@
-from speechbrain.pretrained import SpeakerRecognition
+from speechbrain.pretrained import EncoderClassifier
 import os
 from speechbrain.utils.metric_stats import EER
 import sys
 from tqdm import tqdm
 import torch
 
+class SpeakerRecognition(EncoderClassifier):
+    """A ready-to-use model for speaker recognition. It can be used to
+    perform speaker verification with verify_batch().
+
+    Example
+    -------
+    >>> import torchaudio
+    >>> from speechbrain.pretrained import SpeakerRecognition
+    >>> # Model is downloaded from the speechbrain HuggingFace repo
+    >>> tmpdir = getfixture("tmpdir")
+    >>> verification = SpeakerRecognition.from_hparams(
+    ...     source="speechbrain/spkrec-ecapa-voxceleb",
+    ...     savedir=tmpdir,
+    ... )
+    >>> # Perform verification
+    >>> signal, fs = torchaudio.load("tests/samples/single-mic/example1.wav")
+    >>> signal2, fs = torchaudio.load("tests/samples/single-mic/example2.flac")
+    >>> score, prediction = verification.verify_batch(signal, signal2)
+    """
+
+    MODULES_NEEDED = [
+        "compute_features",
+        "mean_var_norm",
+        "embedding_model",
+        "mean_var_norm_emb",
+    ]
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.similarity = torch.nn.CosineSimilarity(dim=-1, eps=1e-6)
+
+    def verify_batch(
+        self, wavs1, wavs2, wav1_lens=None, wav2_lens=None, threshold=0.25
+    ):
+        """Performs speaker verification with cosine distance.
+        It returns the score and the decision (0 different speakers,
+        1 same speakers).
+
+        Arguments
+        ---------
+        wavs1 : Torch.Tensor
+            Tensor containing the speech waveform1 (batch, time).
+            Make sure the sample rate is fs=16000 Hz.
+        wavs2 : Torch.Tensor
+            Tensor containing the speech waveform2 (batch, time).
+            Make sure the sample rate is fs=16000 Hz.
+        wav1_lens : Torch.Tensor
+            Tensor containing the relative length for each sentence
+            in the batch (e.g., [0.8 0.6 1.0]).
+        wav2_lens : Torch.Tensor
+            Tensor containing the relative length for each sentence
+            in the batch (e.g., [0.8 0.6 1.0]).
+        threshold : Float
+            Threshold applied to the cosine distance to decide if the
+            speakers are different (0) or the same (1).
+
+        Returns
+        -------
+        score
+            The score associated to the binary verification output
+            (cosine distance).
+        prediction
+            The prediction is 1 if the two signals in input are from the same
+            speaker and 0 otherwise.
+        """
+        emb1 = self.encode_batch(wavs1, wav1_lens, normalize=True)
+        emb2 = self.encode_batch(wavs2, wav2_lens, normalize=True)
+        score = self.similarity(emb1, emb2)
+        return score, score > threshold
+
+    def verify_files(self, path_x, path_y):
+        """Speaker verification with cosine distance.
+        Returns the score and the decision (0 different speakers,
+        1 same speakers).
+
+        Returns
+        -------
+        score
+            The score associated to the binary verification output
+            (cosine distance).
+        prediction
+            The prediction is 1 if the two signals in input are from the same
+            speaker and 0 otherwise.
+        """
+        waveform_x = self.load_audio(path_x, savedir = "pretrained_models/spkrec-ecapa-voxceleb")
+        waveform_y = self.load_audio(path_y, savedir = "pretrained_models/spkrec-ecapa-voxceleb")
+        # Fake batches:
+        batch_x = waveform_x.unsqueeze(0)
+        batch_y = waveform_y.unsqueeze(0)
+        # Verify:
+        score, decision = self.verify_batch(batch_x, batch_y)
+        # Squeeze:
+        return score[0], decision[0]
+
 if __name__ == '__main__':
 
     converted_wav_dir = sys.argv[1]
@@ -50,5 +148,5 @@

     positive_scores = torch.tensor(positive_scores)
     negative_scores = torch.tensor(negative_scores)
-    eer_result = EER(positive_scores, negative_scores)
-    print(f'EER result {eer_result * 100}')
+    eer_result = EER(positive_scores, negative_scores)[0]
+    print(f'EER result {eer_result * 100 }')
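Note: speechbrain.utils.metric_stats.EER returns an (EER, threshold) pair, which is why the script now indexes [0]. A minimal usage sketch of the patched module, assuming it is run from the repo root; the wav paths are hypothetical placeholders:

import torch
from evaluation.speechbrain_asv import SpeakerRecognition
from speechbrain.utils.metric_stats import EER

verification = SpeakerRecognition.from_hparams(
    source="speechbrain/spkrec-ecapa-voxceleb",
    savedir="pretrained_models/spkrec-ecapa-voxceleb",
)
# Cosine score plus a same/different decision at the default 0.25 threshold.
score, decision = verification.verify_files("converted.wav", "target_ref.wav")

# EER over trial scores: positives are same-speaker pairs, negatives are not.
pos = torch.tensor([0.62, 0.71, 0.80])
neg = torch.tensor([0.10, 0.24, 0.33])
eer, threshold = EER(pos, neg)
print(f'EER {eer * 100:.2f}% at threshold {threshold:.2f}')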
1 change: 0 additions & 1 deletion feature_extraction.py
@@ -118,7 +118,6 @@ def process_speaker(spk_meta, spk, config, args):
         ID = row['ID']
         wav_path = row['wav_path'].strip()
         audio, fs = librosa.load(wav_path, sr = config['sampling_rate'])
-
         # trim silence
         start, end = float(row['start']), float(row['end'])
         audio = audio[ int(start * config['sampling_rate']):
63 changes: 40 additions & 23 deletions inference.py
@@ -33,8 +33,6 @@
 def denorm_mel(mean_tensor, std_tensor, mel):
 
     if mean_tensor is not None and std_tensor is not None:
-        mean_tensor = torch.FloatTensor(scaler.mean_)
-        std_tensor = torch.FloatTensor(scaler.scale_)
 
         mel = mel * std_tensor + mean_tensor
 
@@ -66,7 +64,7 @@ def load_wav(path, sample_rate = 16000):
 parser.add_argument('--task', type = str)
 parser.add_argument('--src_resyn', default = False, action = 'store_true')
 # vocoder
-parser.add_argument('--vocoder', type = str, default = 'ppg_vc_hifigan')
+#parser.add_argument('--vocoder', type = str, default = 'ppg_vc_hifigan')
 # sge task
 parser.add_argument('--sge_task_id', type = int, default = 1)
 parser.add_argument('--sge_n_tasks', type = int, default = 1)
@@ -108,10 +106,10 @@ def load_wav(path, sample_rate = 16000):

 # load ling_encoder
 ling_enc_load_func = f'load_{ling_encoder}'
-ling_enc_model = eval(ling_enc_load_func)(device = args.device)
+ling_enc_model = eval(ling_enc_load_func)(device = 'cpu')
 ling_encoder_func = f'{ling_encoder}'
 # load speaker encoder
-speaker_enc_model = load_speaker_encoder(speaker_encoder, device = args.device)
+speaker_enc_model = load_speaker_encoder(speaker_encoder, device = 'cpu')
 speaker_encoder_func = load_speaker_encoder_func(args.task, speaker_encoder)
 print(f'load ling_encoder {ling_encoder} done')
 print(f'load speaker_encoder {speaker_encoder} done')
@@ -127,7 +125,7 @@ def load_wav(path, sample_rate = 16000):
 if 'vocoder' in exp_config:
     vocoder = exp_config['vocoder']
     vocoder_load_func = f'load_{vocoder}'
-    vocoder_model = eval(vocoder_load_func)(device = args.device)
+    vocoder_model = eval(vocoder_load_func)(device = 'cpu')
     vocoder_func = f'{vocoder}'
     print(f'load vocoder {vocoder} done')
 else:
@@ -163,31 +161,45 @@ def load_wav(path, sample_rate = 16000):
         src_wav_path = meta['src_wav']
         trg_wav_path = meta['trg_wav']
 
-        if args.src_resyn and vocoder == 'ppgvc_hifigan':
-            from feature_extraction import ppgvc_hifigan_logmelspectrogram
-            src_audio = load_wav(src_wav_path, 24000)
-            ppgvc_mel_config = {'sampling_rate':24000,
-                    'fft_size': 1024,
-                    'hop_size': 240,
-                    'win_length': 1024,
-                    'window': 'hann',
-                    'num_mels': 80,
-                    'fmin': 0,
-                    'fmax': 8000,
-                    'mel_min': -12.0,
-                    'mel_max': 2.5
-                }
-            src_mel_resyn = ppgvc_hifigan_logmelspectrogram(src_audio,ppgvc_mel_config)
+        if args.src_resyn:
+            if vocoder == 'ppgvc_hifigan':
+                from feature_extraction import ppgvc_hifigan_logmelspectrogram
+                src_audio = load_wav(src_wav_path, 24000)
+                ppgvc_mel_config = {'sampling_rate':24000,
+                        'fft_size': 1024,
+                        'hop_size': 240,
+                        'win_length': 1024,
+                        'window': 'hann',
+                        'num_mels': 80,
+                        'fmin': 0,
+                        'fmax': 8000,
+                        'mel_min': -12.0,
+                        'mel_max': 2.5
+                    }
+                src_mel_resyn = ppgvc_hifigan_logmelspectrogram(src_audio,ppgvc_mel_config)
+            elif vocoder == 'bigvgan':
+                from feature_extraction import bigvgan_logmelspectrogram
+                src_audio = load_wav(src_wav_path, 24000)
+                bigvgan_mel_config = {'sampling_rate':24000,
+                        'n_fft': 1024,
+                        'hop_size': 240,
+                        'win_size': 1024,
+                        'num_mels': 100,
+                        'fmin': 0,
+                        'fmax': 12000,
+                    }
+                src_mel_resyn = bigvgan_logmelspectrogram(src_audio,bigvgan_mel_config)
+
 
         # load src wav & trg wav
         src_wav = load_wav(src_wav_path, 16000)
         mel_duration = len(src_wav) // 160 # estimate a mel duration for pad ling and pros reps
 
         # to tensor
-        src_wav_tensor = torch.FloatTensor(src_wav).unsqueeze(0).to(args.device)
+        src_wav_tensor = torch.FloatTensor(src_wav).unsqueeze(0)#.to(args.device)
         start_time = time.time()
         # extract ling representations
-        ling_rep = eval(ling_encoder_func)(ling_enc_model, src_wav_tensor)
+        ling_rep = eval(ling_encoder_func)(ling_enc_model, src_wav_tensor).to(args.device)
         ling_duration = ling_rep.size(1)
         # check if need upsample ling rep
         factor = int(round(mel_duration / ling_duration))
@@ -210,6 +222,7 @@ def load_wav(path, sample_rate = 16000):
             elif mel_duration > pros_duration:
                 pad_vec = pros_rep[:, -1, :]
                 pros_rep = torch.cat([pros_rep, pad_vec.unsqueeze(1).expand(1, mel_duration - pros_duration, pros_rep.size(2))], dim = 1)
+            pros_rep = pros_rep.to(args.device)
         else:
             pros_rep = None
         # trg spk emb
@@ -222,12 +235,15 @@ def load_wav(path, sample_rate = 16000):

         if vocoder is not None:
             # vocoder
+            decoder_out = decoder_out.cpu()
             wav = eval(vocoder_func)(vocoder_model, decoder_out)
             if args.src_resyn:
                 src_mel_tensor = torch.FloatTensor([src_mel_resyn])
                 src_resyn_wav = eval(vocoder_func)(vocoder_model, src_mel_tensor)
         else:
             wav = decoder_out.view(-1)
+        if args.device == 'cuda':
+            torch.cuda.empty_cache()
         end_time = time.time()
         rtf = (end_time - start_time) / (0.01 * ling_rep.size(1))
         total_rtf += rtf
@@ -237,6 +253,7 @@ def load_wav(path, sample_rate = 16000):
         if args.src_resyn:
             resyn_wav_basename = f'{ID}_resyn.wav'
             sf.write(os.path.join(out_wav_dir, resyn_wav_basename), src_resyn_wav.data.cpu().numpy(), 24000, "PCM_16")


print(f"RTF: {total_rtf/cnt :.2f}")

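Note: bigvgan_logmelspectrogram itself is not shown in this commit. A plausible sketch of what a BigVGAN-style log-mel front end computes for the bigvgan_mel_config above, assuming a librosa magnitude STFT and natural-log compression (the repo's implementation may differ):

import librosa
import numpy as np

def bigvgan_logmel_sketch(audio, cfg):
    # Magnitude STFT -> 100-bin mel filterbank -> log compression.
    mel = librosa.feature.melspectrogram(
        y=audio, sr=cfg['sampling_rate'], n_fft=cfg['n_fft'],
        hop_length=cfg['hop_size'], win_length=cfg['win_size'],
        n_mels=cfg['num_mels'], fmin=cfg['fmin'], fmax=cfg['fmax'],
        power=1.0)
    return np.log(np.clip(mel, a_min=1e-5, a_max=None)).T  # (frames, num_mels)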
10 changes: 5 additions & 5 deletions submit_inference.sh
@@ -6,11 +6,11 @@ split=eval_all
 ling_enc=vqwav2vec
 spk_enc=uttdvec
 pros_enc=ppgvcf0
-dec=gradtts
-vocoder=bigvgan
+dec=diffwave
+vocoder=none
 
 # exp setup
-exp_name=vctk_train_0
+exp_name=vctk_train_1
 exp_dir=exp/${dataset}_${ling_enc}_${spk_enc}_${pros_enc}_${dec}_${vocoder}/${exp_name}
 if [ ! -e $exp_dir ]; then
     echo "$exp_dir does not exist"
@@ -26,7 +26,7 @@ epochs=$( ls -t $exp_dir/ckpt | head -n 1 | sed 's/[^0-9]*//g')
 eval_list=eval_list_m2m_vc_small_oneshot.json
 eval_list_path=data/$dataset/$split/$eval_list
 # sge submitjob setup
-n_parallel_jobs=50
+n_parallel_jobs=360
 device=cpu
 job=$exp_dir/scripts/inference_${task}_${epochs}.sh
 log=$exp_dir/logs/inference_${task}_${epochs}.log
@@ -53,5 +53,5 @@ python inference.py \
 EOF
 
 #submit to sge
-submitjob -m 30000 -n $n_parallel_jobs $log $job
+submitjob -m 40000 -n $n_parallel_jobs $log $job
 echo "job submitted, see log in ${log}"