Commit
1. prosodic_encoder inference; 2. prosodic_nets for taco_ar and taco_mol
MingjieChen committed Feb 21, 2023
1 parent 72177cc commit dfef66e
Showing 17 changed files with 438 additions and 31 deletions.
File renamed without changes.
10 changes: 10 additions & 0 deletions configs/preprocess_fastspeech2_pitch_energy.yaml
@@ -0,0 +1,10 @@
sampling_rate: !!int 24000
fft_size: !!int 2048
hop_size: !!int 240
win_length: !!int 1200
window: hann
num_mels: !!int 80
fmin: !!int 80
fmax: !!int 7600
f0_floor: !!float 20.0
f0_ceil: !!float 600.0
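
The f0_floor and f0_ceil fields are the search-range parameters used by WORLD-style F0 extractors, and hop_size / sampling_rate implies a 10 ms frame period. A minimal sketch of pitch extraction under these settings, assuming pyworld as the extractor (the helper name and the DIO + StoneMask choice are illustrative, not taken from this commit):

import pyworld as pw
import soundfile as sf

SR, HOP = 24000, 240               # sampling_rate, hop_size from the config
FRAME_PERIOD_MS = HOP / SR * 1000  # 10 ms per frame

def extract_f0(wav_path):
    # Hypothetical helper: coarse DIO estimate refined by StoneMask,
    # with the search range bounded by f0_floor / f0_ceil.
    x, sr = sf.read(wav_path)
    f0, t = pw.dio(x, sr, f0_floor=20.0, f0_ceil=600.0,
                   frame_period=FRAME_PERIOD_MS)
    return pw.stonemask(x, f0, t, sr)  # per-frame F0 in Hz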
10 changes: 10 additions & 0 deletions configs/preprocess_mel.yaml
@@ -0,0 +1,10 @@
sampling_rate: !!int 24000
fft_size: !!int 2048
hop_size: !!int 240
win_length: !!int 1200
window: hann
num_mels: !!int 80
fmin: !!int 80
fmax: !!int 7600
f0_floor: !!float 20.0
f0_ceil: !!float 600.0
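
These STFT settings (2048-point FFT, 240-sample hop, 1200-sample Hann window, 80 mel bins spanning 80-7600 Hz at 24 kHz) describe a standard log-mel front end. A sketch of an equivalent extraction with librosa, as one assumption about how this config is consumed:

import librosa
import numpy as np

def extract_mel(wav_path):
    # Parameters mirror configs/preprocess_mel.yaml; log compression is assumed.
    y, sr = librosa.load(wav_path, sr=24000)
    mel = librosa.feature.melspectrogram(
        y=y, sr=sr, n_fft=2048, hop_length=240, win_length=1200,
        window='hann', n_mels=80, fmin=80, fmax=7600)
    return np.log(np.maximum(mel, 1e-10)).T  # (frames, 80)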
@@ -13,6 +13,7 @@ pros_enc: norm_fastspeech2_pitch_energy
decoder: FastSpeech2
vocoder: ppgvc_hifigan
mel_type: ppgvc_mel
pros_stats: dump/vctk/train_nodev_all/fastspeech2_pitch_energy/train_nodev_all.npy

# training
fp16_run: !!bool True
@@ -14,6 +14,7 @@ decoder: FastSpeech2
vocoder: vctk_hifigan
mel_type: norm_mel
mel_stats: dump/vctk/train_nodev_all/mel/train_nodev_all.npy # used for vctkhifigan vocoder
pros_stats: dump/vctk/train_nodev_all/fastspeech2_pitch_energy/train_nodev_all.npy

# training
fp16_run: !!bool True
@@ -12,6 +12,7 @@ spk_enc: utt_dvec
pros_enc: norm_fastspeech2_pitch_energy
decoder: VITS
mel_type: vits_spec
pros_stats: dump/vctk/train_nodev_all/fastspeech2_pitch_energy/train_nodev_all.npy

# training
fp16_run: !!bool True
87 changes: 87 additions & 0 deletions configs/vctk_vqwav2vec_uttdvec_ppgvcf0_fs2_ppgvchifigan.yaml
@@ -0,0 +1,87 @@
# experiment
dataset: vctk
train_meta: data/vctk/train_nodev_all/metadata.csv
dev_meta: data/vctk/dev_all/metadata.csv
train_set: train_nodev_all
dev_set: dev_all


# encoder-decoder
ling_enc: vqwav2vec
spk_enc: utt_dvec
pros_enc: ppgvc_f0
decoder: FastSpeech2
mel_type: ppgvc_mel
vocoder: ppgvc_hifigan

# training
fp16_run: !!bool True
epochs: 200
save_freq: 2 # save ckpt frequency
show_freq: 100 # show training information frequency
load_only_params: !!bool False
seed: !!int 1234
trainer: FS2Trainer
ngpu: 2

#dataloader
sort: !!bool False
dump_dir: dump
num_workers: !!int 8
batch_size: 32
drop_last: !!bool True
rm_long_utt: !!bool True # remove too long utterances from metadata
max_utt_duration: !!float 10.0 # max utterance duration (seconds)


# decoder params
decoder_params:
  max_len: 1000
  max_seq_len: 1000
  spk_emb_dim: 256
  prosodic_rep_type: continuous
  prosodic_net:
    hidden_dim: 256
  prenet:
    conv_kernel_size: 3
    input_dim: 512
    dropout: 0.1
  postnet:
    idim: 80
    odim: 80
    n_layers: 5
    n_filts: 5
    n_chans: 256
    dropout_rate: 0.5
  transformer:
    encoder_layer: 4
    encoder_head: 2
    encoder_hidden: 256
    decoder_layer: 4
    decoder_head: 2
    decoder_hidden: 256
    conv_filter_size: 1024
    conv_kernel_size: [3, 1]
    encoder_dropout: 0.1
    decoder_dropout: 0.1

#optimizer & scheduler
optimizer:
  init_lr: !!float 1e-2
  betas: [0.9,0.99]
  weight_decay: 0.0
scheduler:
  warm_up_step: 4000
  anneal_steps: [800000, 900000, 1000000]
  anneal_rate: 0.3
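
The scheduler fields above match the Noam-style warmup-then-anneal schedule found in common FastSpeech 2 implementations; the formula below is an assumption reconstructed from those fields, not code from this commit:

def lr_at(step, init_lr=1e-2, warm_up_step=4000,
          anneal_steps=(800000, 900000, 1000000), anneal_rate=0.3):
    # Warm up to init_lr over warm_up_step steps, decay as 1/sqrt(step),
    # then shrink by anneal_rate after each anneal step is passed.
    step = max(step, 1)
    lr = init_lr * min(step ** -0.5, step * warm_up_step ** -1.5) * warm_up_step ** 0.5
    for s in anneal_steps:
        if step > s:
            lr *= anneal_rate
    return lr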

# loss hyper-parameters
loss:
  alpha: 1.

2 changes: 2 additions & 0 deletions configs/vctk_vqwav2vec_uttdvec_ppgvcf0_fs2_vctkhifigan.yaml
@@ -12,6 +12,8 @@ spk_enc: utt_dvec
pros_enc: ppgvc_f0
decoder: FastSpeech2
mel_type: norm_mel
vocoder: vctk_hifigan
mel_stats: dump/vctk/train_nodev_all/mel/train_nodev_all.npy

# training
fp16_run: !!bool True
77 changes: 77 additions & 0 deletions configs/vctk_vqwav2vec_uttdvec_ppgvcf0_tacoar_ppgvchifigan.yaml
@@ -0,0 +1,77 @@
# experiment
dataset: vctk
train_meta: data/vctk/train_nodev_all/metadata.csv
dev_meta: data/vctk/dev_all/metadata.csv
train_set: train_nodev_all
dev_set: dev_all


# encoder-decoder
ling_enc: vqwav2vec
spk_enc: utt_dvec
pros_enc: ppgvc_f0
decoder: TacoAR
mel_type: ppgvc_mel
vocoder: ppgvc_hifigan


# training
fp16_run: !!bool False
epochs: 200
save_freq: 2 # save ckpt frequency
show_freq: 10
load_only_params: !!bool False
seed: !!int 1234
trainer: TacoARTrainer
ngpu: 2

#dataloader
sort: !!bool True
dump_dir: dump
num_workers: !!int 8
batch_size: 64
drop_last: !!bool True
rm_long_utt: !!bool True # remove too long utterances from metadata
max_utt_duration: !!float 10.0 # max utterance duration (seconds)


# decoder params
decoder_params:
  prosodic_rep_type: continuous
  prosodic_net:
    hidden_dim: 256
  input_dim: 512
  output_dim: 80
  resample_ratio: 1
  spk_emb_integration_type: concat # add or concat
  spk_emb_dim: 256
  ar: True
  encoder_type: "taco2"
  hidden_dim: 1024
  prenet_layers: 2 # if set 0, no prenet is used
  prenet_dim: 256
  prenet_dropout_rate: 0.5
  lstmp_layers: 2
  lstmp_dropout_rate: 0.2
  lstmp_proj_dim: 256
  lstmp_layernorm: False

#optimizer & scheduler
optimizer:
  weight_decay: 0.0
  betas: [0.9,0.99]
  lr: !!float 1e-4
scheduler:
  num_training_steps: 500000
  num_warmup_steps: 4000
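
num_training_steps and num_warmup_steps read like the arguments of a linear warmup/decay schedule, e.g. HuggingFace transformers' get_linear_schedule_with_warmup; wiring them that way is an assumption, and the optimizer and module below are illustrative only:

import torch
from transformers import get_linear_schedule_with_warmup

model = torch.nn.Linear(512, 80)  # stand-in module, not the real TacoAR decoder
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4,
                              betas=(0.9, 0.99), weight_decay=0.0)
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=4000, num_training_steps=500000)
# call scheduler.step() once per optimizer step during training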

# loss hyper-parameters
loss:
  alpha: 1.

2 changes: 1 addition & 1 deletion decoder/interface.py
@@ -53,7 +53,7 @@ def load_TacoAR(ckpt = None, config = None, device = 'cpu'):

def infer_TacoAR(model, ling, pros, spk):

-    mel, _ = model(ling, torch.LongTensor([ling.size(1)]).to(ling.device), spk)
+    mel, _ = model(ling, torch.LongTensor([ling.size(1)]).to(ling.device), spk, pros_rep = pros)
    return mel

def load_TacoMOL(ckpt = None, config = None, device = 'cpu'):
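
With pros now threaded through to the decoder, a call to the updated infer_TacoAR could look like the sketch below. The checkpoint path is hypothetical, and the tensor shapes are assumptions drawn from the TacoAR config above (512-dim linguistic features, 256-dim utterance d-vector, and two prosodic channels to match ContinuousProsodicNet's 2-channel input):

import torch
from decoder.interface import load_TacoAR, infer_TacoAR

model = load_TacoAR(
    ckpt='exp/tacoar/best.pth',  # hypothetical checkpoint path
    config='configs/vctk_vqwav2vec_uttdvec_ppgvcf0_tacoar_ppgvchifigan.yaml')
ling = torch.randn(1, 120, 512)  # (B, T, idim) linguistic features
pros = torch.randn(1, 120, 2)    # (B, T, 2) frame-level prosodic features
spk = torch.randn(1, 256)        # utterance-level speaker embedding
mel = infer_TacoAR(model, ling, pros, spk)  # output mel, (B, T', 80)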
19 changes: 16 additions & 3 deletions decoder/taco_ar/model.py
@@ -13,7 +13,7 @@
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence

from .prosodic_nets import DiscreteProsodicNet, ContinuousProsodicNet
################################################################################

# The following section is related to Tacotron2
@@ -299,6 +299,17 @@ def __init__(self, config,
        output_dim = config['output_dim']
        spk_emb_integration_type = config['spk_emb_integration_type']
        spk_emb_dim = config['spk_emb_dim']

        if 'prosodic_rep_type' not in config:
            self.prosodic_net = None
        elif config['prosodic_rep_type'] == 'discrete':
            self.prosodic_net = DiscreteProsodicNet(config['prosodic_net'])
        elif config['prosodic_rep_type'] == 'continuous':
            self.prosodic_net = ContinuousProsodicNet(config['prosodic_net'])
        else:
            raise Exception

        lstmp_layers = config['lstmp_layers']
        lstmp_dropout_rate = config['lstmp_dropout_rate']
@@ -403,7 +414,7 @@ def _integrate_with_spk_emb(self, hs, spembs):

        return hs

-    def forward(self, features, lens, ref_spk_embs, targets = None):
+    def forward(self, features, lens, ref_spk_embs, targets = None, pros_rep = None):
        """Calculate forward propagation.
        Args:
            features: Batch of the sequences of input features (B, Lmax, idim).
@@ -427,7 +438,9 @@ def forward(self, features, lens, ref_spk_embs, targets = None):

        # inject speaker embeddings
        encoder_states = self._integrate_with_spk_emb(encoder_states, ref_spk_embs)

        # inject prosodic representations
        if self.prosodic_net is not None and pros_rep is not None:
            encoder_states = encoder_states + self.prosodic_net(pros_rep)
        # decoder: LSTMP layers & projection
        if self.ar:
            if targets is not None:
74 changes: 74 additions & 0 deletions decoder/taco_ar/prosodic_nets.py
@@ -0,0 +1,74 @@
import torch
import torch.nn as nn
from torch.nn.parameter import Parameter
import numpy as np
import torch.nn.functional as F


class DiscreteProsodicNet(nn.Module):
    def __init__(self, config):
        super().__init__()

        n_bins = config['prosodic_bins']
        prosodic_stats_path = config['prosodic_stats_path']
        # load pitch energy min max
        stats = np.load(prosodic_stats_path)
        pitch_max = stats[0][0]
        pitch_min = stats[1][0]
        energy_max = stats[2][0]
        energy_min = stats[3][0]
        self.pitch_bins = nn.Parameter(
            torch.linspace(pitch_min, pitch_max, n_bins - 1),
            requires_grad=False,
        )
        self.energy_bins = nn.Parameter(
            torch.linspace(energy_min, energy_max, n_bins - 1),
            requires_grad=False,
        )
        self.pitch_embedding = nn.Embedding(
            n_bins, config["hidden_dim"]
        )
        self.energy_embedding = nn.Embedding(
            n_bins, config["hidden_dim"]
        )

    def forward(self, x):
        pitch = x[:, :, 0]
        energy = x[:, :, 1]
        pitch_reps = self.pitch_embedding(torch.bucketize(pitch, self.pitch_bins))
        energy_reps = self.energy_embedding(torch.bucketize(energy, self.energy_bins))
        prosodic_reps = pitch_reps + energy_reps
        return prosodic_reps


class ContinuousProsodicNet(nn.Module):
    def __init__(self, config):
        super().__init__()

        hidden_dim = config['hidden_dim']
        self.pitch_convs = torch.nn.Sequential(
            torch.nn.Conv1d(2, hidden_dim, kernel_size=1, bias=False),
            torch.nn.LeakyReLU(0.1),

            torch.nn.InstanceNorm1d(hidden_dim, affine=False),
            torch.nn.Conv1d(
                hidden_dim, hidden_dim,
                kernel_size=3,
                stride=1,
                padding=1,
            ),
            torch.nn.LeakyReLU(0.1),

            torch.nn.InstanceNorm1d(hidden_dim, affine=False),
            torch.nn.Conv1d(
                hidden_dim, hidden_dim,
                kernel_size=3,
                stride=1,
                padding=1,
            ),
            torch.nn.LeakyReLU(0.1),

            torch.nn.InstanceNorm1d(hidden_dim, affine=False),
        )

    def forward(self, x):
        out = x.transpose(1, 2)
        out = self.pitch_convs(out)
        out = out.transpose(1, 2)
        return out
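
A quick shape check for the two new modules; the stats layout (rows of pitch max/min and energy max/min) is inferred from the indexing in DiscreteProsodicNet.__init__, and the values and path below are illustrative only:

import numpy as np
import torch
from decoder.taco_ar.prosodic_nets import ContinuousProsodicNet, DiscreteProsodicNet

# Continuous path: (B, T, 2) pitch/energy tracks -> (B, T, hidden_dim)
cont = ContinuousProsodicNet({'hidden_dim': 256})
assert cont(torch.randn(4, 120, 2)).shape == (4, 120, 256)

# Discrete path: stats rows assumed to be [pitch_max, pitch_min, energy_max, energy_min]
np.save('/tmp/pros_stats.npy', np.array([[600.0], [20.0], [8.0], [0.0]]))
disc = DiscreteProsodicNet({'prosodic_bins': 256,
                            'prosodic_stats_path': '/tmp/pros_stats.npy',
                            'hidden_dim': 256})
assert disc(torch.randn(4, 120, 2)).shape == (4, 120, 256)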
