Commit

1. ecapa_tdnn inference; 2. fastspeech2 pitch energy inference

MingjieChen committed Mar 11, 2023
1 parent cc81663 commit bfd34d3
Showing 8 changed files with 210 additions and 19 deletions.
6 changes: 3 additions & 3 deletions configs/vctk_vqwav2vec_uttdvec_ppgvcf0_vits_none.yaml
@@ -16,7 +16,7 @@ mel_type: vits_spec

# training
fp16_run: !!bool False
-epochs: 200
+epochs: 1000
save_freq: 1 # save ckpt frequency
show_freq: 100 # show training information frequency
load_only_params: !!bool False
@@ -28,11 +28,11 @@ ngpu: 2
dataset_class: VITSDataset
sampling_rate: !!int 24000
vits_hop_size: !!int 240
-spec_max_len: !!int 240
+spec_max_len: !!int 480
sort: !!bool True
dump_dir: dump
num_workers: !!int 4
-batch_size: !!int 12
+batch_size: !!int 32
drop_last: !!bool True
rm_long_utt: !!bool False # remove too long utterances from metadata
max_utt_duration: !!float 10.0 # max utterance duration (seconds)
103 changes: 103 additions & 0 deletions (new file)
@@ -0,0 +1,103 @@
# experiment
dataset: vctk
train_meta: data/vctk/train_nodev_all/metadata.csv
dev_meta: data/vctk/dev_all/metadata.csv
train_set: train_nodev_all
dev_set: dev_all


# encoder-decoder
ling_enc: whisper_ppg_small
spk_enc: utt_ecapa_tdnn
pros_enc: norm_fastspeech2_pitch_energy
decoder: VITS
mel_type: vits_spec
pros_stats: dump/vctk/train_nodev_all/fastspeech2_pitch_energy/train_nodev_all.npy


# training
fp16_run: !!bool False
epochs: 1000
save_freq: 1 # save ckpt frequency
show_freq: 100 # show training information frequency
load_only_params: !!bool False
seed: !!int 1234
trainer: VITSTrainer
ngpu: 2

#dataloader
dataset_class: VITSDataset
sampling_rate: !!int 24000
vits_hop_size: !!int 240
spec_max_len: !!int 480
sort: !!bool True
dump_dir: dump
num_workers: !!int 4
batch_size: !!int 32
drop_last: !!bool True
rm_long_utt: !!bool False # remove too long utterances from metadata
max_utt_duration: !!float 10.0 # max utterance duration (seconds)


# decoder params
decoder_params:
  spk_emb_dim: 192
  prosodic_rep_type: discrete
  prosodic_net:
    hidden_dim: 192
    prosodic_bins: !!int 256
    prosodic_stats_path: dump/vctk/train_nodev_all/fastspeech2_pitch_energy/pitch_energy_min_max.npy
  input_dim: !!int 768
  spec_channels: !!int 513
  inter_channels: !!int 192
  hidden_channels: !!int 192
  filter_channels: !!int 768
  n_heads: !!int 2
  n_layers: !!int 6
  kernel_size: !!int 3
  p_dropout: !!float 0.1
  resblock: 1
  resblock_kernel_sizes: [3,7,11]
  resblock_dilation_sizes: [[1,3,5], [1,3,5], [1,3,5]]
  upsample_rates: [10,6,2,2]
  upsample_initial_channel: !!int 512
  upsample_kernel_sizes: [20, 12, 4, 4]
  n_layers_q: !!int 3
  use_spectral_norm: !!bool False
  filter_length: !!int 1024
  n_mels_channels: !!int 80
  win_length: !!int 1024
  hop_length: !!int 240
  sampling_rate: !!int 24000
  segment_size: !!int 9600


#optimizer & scheduler
optimizer:
  generator:
    lr: !!float 1e-4
    betas: [0.8,0.99]
    eps: !!float 1e-9
  discriminator:
    lr: !!float 1e-4
    betas: [0.8,0.99]
    eps: !!float 1e-9
scheduler:
  generator:
    lr_decay: !!float 0.999875
  discriminator:
    lr_decay: !!float 0.999875

# loss hyper-parameters
losses:
  mel: !!int 45
  kl: !!int 1
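
The decoder consumes prosody as a discrete representation here (prosodic_rep_type: discrete with prosodic_bins: 256), driven by the min/max stats file named in prosodic_stats_path. Below is a minimal sketch of how bin indices could be derived from such a file; the [pitch_min, pitch_max, energy_min, energy_max] layout and the helper name are assumptions, not shown in this commit.

import numpy as np

# Hypothetical sketch: quantize pitch/energy contours into 256 bins using
# the min/max stats file; the 4-value stats layout is an assumption.
def quantize_prosody(pitch, energy, stats_path, n_bins=256):
    pitch_min, pitch_max, energy_min, energy_max = np.load(stats_path)[:4]
    # n_bins - 1 boundaries yield bin ids in [0, n_bins - 1]
    pitch_ids = np.digitize(pitch, np.linspace(pitch_min, pitch_max, n_bins - 1))
    energy_ids = np.digitize(energy, np.linspace(energy_min, energy_max, n_bins - 1))
    return pitch_ids, energy_ids
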
48 changes: 45 additions & 3 deletions ling_encoder/interface.py
@@ -1,6 +1,31 @@
from .conformer_ppg.conformer_ppg_model.build_ppg_model import load_ppg_model
from .whisper_ppg.whisper_ppg_model.audio import pad_or_trim as whisper_ppg_pad_or_trim, log_mel_spectrogram as whisper_ppg_log_mel_spectrogram
from .whisper_ppg.whisper_ppg_model.model import Whisper, ModelDimensions
import torch


def load_whisper_ppg_small(ckpt = 'ling_encoder/whisper_ppg/ckpt/small.pt', config = None, device = 'cpu'):
    checkpoint = torch.load(ckpt, map_location=device)
    dims = ModelDimensions(**checkpoint["dims"])
    model = Whisper(dims)
    model.load_state_dict(checkpoint["model_state_dict"])
    model.eval()
    return model.to(device)


def load_contentvec_500(ckpt = 'ling_encoder/contentvec_500/contentvec_500_model.pt', config = None, device = 'cpu'):
    import fairseq
    model, cfg, task = fairseq.checkpoint_utils.load_model_ensemble_and_task([ckpt])
    model = model[0]
    model.eval()
    return model


def load_contentvec_100(ckpt = 'ling_encoder/contentvec_100/contentvec_100_model.pt', config = None, device = 'cpu'):
    import fairseq
    model, cfg, task = fairseq.checkpoint_utils.load_model_ensemble_and_task([ckpt])
    model = model[0]
    model.eval()
    return model


def load_vqwav2vec(ckpt = 'ling_encoder/vqwav2vec/vq-wav2vec_kmeans.pt', config = None, device = 'cpu'):
    import fairseq
    model, cfg, task = fairseq.checkpoint_utils.load_model_ensemble_and_task([ckpt])
@@ -61,13 +86,30 @@ def hubert_soft(model, wav_tensor):
    with torch.inference_mode():
        dense = model.units(wav_tensor)

    return dense


def contentvec_500(model, wav_tensor):
    dense = model.feature_extractor(wav_tensor)
    return dense.transpose(1,2)


def contentvec_100(model, wav_tensor):
    dense = model.feature_extractor(wav_tensor)
    return dense.transpose(1,2)


def whisper_ppg_small(model, wav_tensor):
    # flatten to a mono 1-D signal and remember the frame count implied by
    # the 320x downsampling before padding/trimming to Whisper's fixed 30 s input
    wav_tensor = wav_tensor.view(-1)
    wav_len = wav_tensor.size(0)
    ppg_len = wav_len // 320
    wav_tensor = whisper_ppg_pad_or_trim(wav_tensor)
    mel = whisper_ppg_log_mel_spectrogram(wav_tensor).to(model.device)
    ppg = model.encoder(mel.unsqueeze(0))
    # drop the padded frames so the PPG matches the original utterance length
    ppg = ppg[:,:ppg_len,:]
    return ppg




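For context, a hypothetical caller of the two whisper additions above; the wav path is a placeholder, and resampling to Whisper's expected 16 kHz mono input is an assumption, since the diff does not show the calling code.

import torchaudio

model = load_whisper_ppg_small(device='cpu')
wav, sr = torchaudio.load('p225_001.wav')  # placeholder path
if sr != 16000:
    wav = torchaudio.functional.resample(wav, sr, 16000)
# PPG of shape (1, n_frames, 768): one frame per 320 input samples
ppg = whisper_ppg_small(model, wav)
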
4 changes: 2 additions & 2 deletions prosodic_encoder/interface.py
@@ -12,14 +12,14 @@ def infer_norm_fastspeech2_pitch_energy(source_wav, target_wav = None, config_pa
        config = yaml.safe_load(f)
    f.close()
    # extract pitch energy
-    src_wav, _ = librosa.load(source_wav, sampling_rate = config['sampling_rate'])
+    src_wav, _ = librosa.load(source_wav, sr = config['sampling_rate'])
    pitch_energy = extract_pitch_energy(src_wav, config)
    pitch = pitch_energy[0, :]
    energy = pitch_energy[1, :]
    # load pitch energy mean std
    scaler_pitch = StandardScaler()
    scaler_energy = StandardScaler()
-    pitch_energy_stats = np.load(stats_path)
+    pitch_energy_stats = np.load(stats)
    scaler_pitch.mean_ = pitch_energy_stats[0]
    scaler_pitch.scale_ = pitch_energy_stats[1]
    scaler_energy.mean_ = pitch_energy_stats[2]
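In plain numpy, the scaling this function sets up reduces to the sketch below (StandardScaler applies (x - mean_) / scale_). The stats layout follows the assignments shown above; stats[3] as the energy scale is inferred from the pattern, since the hunk is truncated before that line.

import numpy as np

def normalize_pitch_energy(pitch, energy, stats):
    # stats: [pitch_mean, pitch_std, energy_mean, energy_std] (last slot inferred)
    norm_pitch = (pitch - stats[0]) / stats[1]
    norm_energy = (energy - stats[2]) / stats[3]
    return np.stack([norm_pitch, norm_energy], axis=0)
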
4 changes: 4 additions & 0 deletions speaker_encoder/ecapa_tdnn/extract_utter_embed.py
@@ -11,6 +11,7 @@
import subprocess


+sampling_rate = 16000

def process_speaker(spk_meta, spk, args):
    classifier = EncoderClassifier.from_hparams(source="speechbrain/spkrec-ecapa-voxceleb")

@@ -19,6 +20,9 @@ def process_speaker(spk_meta, spk, args):
        wav_path = row['wav_path']

        signal, fs = torchaudio.load(wav_path)
+        if fs != sampling_rate:
+            signal = torchaudio.functional.resample(signal, fs, sampling_rate)
+
        embeddings = classifier.encode_batch(signal)
        spk_emb = embeddings[0][0].data.numpy()

44 changes: 42 additions & 2 deletions speaker_encoder/interface.py
@@ -1,22 +1,40 @@
from .d_vector.d_vector_model.audio import preprocess_wav
from .d_vector.d_vector_model.voice_encoder import SpeakerEncoder
from speechbrain.pretrained import EncoderClassifier
import torch
import torchaudio


def load_speaker_encoder(spk_enc_type, device = 'cpu'):

    if spk_enc_type == 'utt_dvec':
        return load_d_vector(device = device)
    elif spk_enc_type == 'utt_ecapa_tdnn':
        return load_ecapa_tdnn(device = device)
    else:
        return None


def load_speaker_encoder_func(task, spk_enc_type):

    if spk_enc_type == 'utt_dvec':
-        if task == 'a2a_vc':
+        if task == 'a2a_vc' or task == 'm2m_vc':
            return d_vector_spk_mean_emb
        elif task == 'oneshot_vc' or task == 'oneshot_resyn':
            return d_vector_emb
+        raise Exception
+    if spk_enc_type == 'utt_ecapa_tdnn':
+        if task == 'a2a_vc' or task == 'm2m_vc':
+            return ecapa_tdnn_spk_mean_emb
+        elif task == 'oneshot_vc' or task == 'oneshot_resyn':
+            return ecapa_tdnn_emb
+        raise Exception



def load_ecapa_tdnn(ckpt = '', config = None, device = 'cpu'):
    classifier = EncoderClassifier.from_hparams(source="speechbrain/spkrec-ecapa-voxceleb")
    classifier.eval()
    classifier.to(device)
    return classifier

def load_d_vector(ckpt = 'speaker_encoder/d_vector/d_vector_model/ckpt/pretrained_bak_5805000.pt', config = None, device = 'cpu'):

@@ -25,6 +43,28 @@ def load_d_vector(ckpt = 'speaker_encoder/d_vector/d_vector_model/ckpt/pretraine

    return encoder

def ecapa_tdnn_emb(model, wav_path):
    wav_path = wav_path[0]
    signal, fs = torchaudio.load(wav_path)
    if fs != 16000:
        signal = torchaudio.functional.resample(signal, fs, 16000)

    # (1, 1, 192) -> (192,)
    embedding = model.encode_batch(signal).squeeze(0).squeeze(0)
    return embedding


def ecapa_tdnn_spk_mean_emb(model, wav_paths):
    batch = []
    for wav_path in wav_paths:
        signal, fs = torchaudio.load(wav_path)
        if fs != 16000:
            signal = torchaudio.functional.resample(signal, fs, 16000)
        batch.append(signal)
    batch = torch.cat(batch, 0)

    # embed the whole batch at once, then average over utterances
    embeddings = model.encode_batch(batch)
    embedding = torch.mean(embeddings, dim = 0).view(-1)
    return embedding

def d_vector_emb(model, wav_path):
    wav_path = wav_path[0]
    audio = preprocess_wav(wav_path)
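
Taken together, a hypothetical call sequence for the new ECAPA-TDNN path (wav paths are placeholders):

enc = load_speaker_encoder('utt_ecapa_tdnn', device='cpu')
emb_func = load_speaker_encoder_func('a2a_vc', 'utt_ecapa_tdnn')
# mean ECAPA-TDNN embedding over a speaker's utterances, shape (192,)
spk_emb = emb_func(enc, ['p225_001.wav', 'p225_002.wav'])
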
6 changes: 3 additions & 3 deletions submit_inference.sh
@@ -3,9 +3,9 @@
dataset=vctk
split=eval_all
# model setup
-ling_enc=conformerppg
-spk_enc=uttdvec
-pros_enc=ppgvcf0
+ling_enc=whisperppgsmall
+spk_enc=uttecapatdnn
+pros_enc=fs2pitchenergy
dec=vits
vocoder=none

14 changes: 8 additions & 6 deletions submit_train.sh
@@ -8,9 +8,11 @@ conda_env=torch_1.9
dataset=vctk
#ling=vqwav2vec
#ling=conformerppg
-ling=contentvec100
-spk=uttdvec
-pros=ppgvcf0
+#ling=contentvec100
+ling=whisperppgsmall
+
+spk=uttecapatdnn
+pros=fs2pitchenergy
#dec=fs2
dec=vits
#dec=gradtts
@@ -28,13 +30,13 @@ fi
exp_dir=exp
model_name=${dataset}_${ling}_${spk}_${pros}_${dec}_${vocoder}
exp=$exp_dir/$model_name/$exp_name
-njobs=12
+njobs=48
ngpus=2
slots=4
#gputypes="GeForceRTX3060|GeForceRTX3090"
#gputypes="GeForceRTX3090"
gputypes="GeForceRTX3090"
#gputypes="GeForceGTXTITANX|GeForceGTX1080Ti|GeForceRTX3060"
gputypes="GeForceGTX1080Ti|GeForceRTX3090"
#gputypes="GeForceGTX1080Ti|GeForceRTX3090"

# create exp dir
[ ! -e $exp ] && mkdir -p $exp
