-
Notifications
You must be signed in to change notification settings - Fork 9
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
1. prosodic_encoder inference; 2. prosodic_nets for taco_ar and taco_mol
- Loading branch information
1 parent
72177cc
commit dfef66e
Showing
17 changed files
with
438 additions
and
31 deletions.
There are no files selected for viewing
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
sampling_rate: !!int 24000 | ||
fft_size: !!int 2048 | ||
hop_size: !!int 240 | ||
win_length: !!int 1200 | ||
window: hann | ||
num_mels: !!int 80 | ||
fmin: !!int 80 | ||
fmax: !!int 7600 | ||
f0_floor: !!float 20.0 | ||
f0_ceil: !!float 600.0 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
sampling_rate: !!int 24000 | ||
fft_size: !!int 2048 | ||
hop_size: !!int 240 | ||
win_length: !!int 1200 | ||
window: hann | ||
num_mels: !!int 80 | ||
fmin: !!int 80 | ||
fmax: !!int 7600 | ||
f0_floor: !!float 20.0 | ||
f0_ceil: !!float 600.0 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
87 changes: 87 additions & 0 deletions
87
configs/vctk_vqwav2vec_uttdvec_ppgvcf0_fs2_ppgvchifigan.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,87 @@ | ||
# experiment | ||
dataset: vctk | ||
train_meta: data/vctk/train_nodev_all/metadata.csv | ||
dev_meta: data/vctk/dev_all/metadata.csv | ||
train_set: train_nodev_all | ||
dev_set: dev_all | ||
|
||
|
||
# encoder-decoder | ||
ling_enc: vqwav2vec | ||
spk_enc: utt_dvec | ||
pros_enc: ppgvc_f0 | ||
decoder: FastSpeech2 | ||
mel_type: ppgvc_mel | ||
vocoder: ppgvc_hifigan | ||
|
||
# training | ||
fp16_run: !!bool True | ||
epochs: 200 | ||
save_freq: 2 # save ckpt frequency | ||
show_freq: 100 # show training information frequency | ||
load_only_params: !!bool False | ||
seed: !!int 1234 | ||
trainer: FS2Trainer | ||
ngpu: 2 | ||
|
||
#dataloader | ||
sort: !!bool False | ||
dump_dir: dump | ||
num_workers: !!int 8 | ||
batch_size: 32 | ||
drop_last: !!bool True | ||
rm_long_utt: !!bool True # remove too long utterances from metadata | ||
max_utt_duration: !!float 10.0 # max utterance duration (seconds) | ||
|
||
|
||
# decoder params | ||
decoder_params: | ||
max_len: 1000 | ||
max_seq_len: 1000 | ||
spk_emb_dim: 256 | ||
prosodic_rep_type: continuous | ||
prosodic_net: | ||
hidden_dim: 256 | ||
prenet: | ||
conv_kernel_size: 3 | ||
input_dim: 512 | ||
dropout: 0.1 | ||
postnet: | ||
idim: 80 | ||
odim: 80 | ||
n_layers: 5 | ||
n_filts: 5 | ||
n_chans: 256 | ||
dropout_rate: 0.5 | ||
transformer: | ||
encoder_layer: 4 | ||
encoder_head: 2 | ||
encoder_hidden: 256 | ||
decoder_layer: 4 | ||
decoder_head: 2 | ||
decoder_hidden: 256 | ||
conv_filter_size: 1024 | ||
conv_kernel_size: [3, 1] | ||
encoder_dropout: 0.1 | ||
decoder_dropout: 0.1 | ||
|
||
#optimizer & scheduler | ||
optimizer: | ||
init_lr: !!float 1e-2 | ||
betas: [0.9,0.99] | ||
weight_decay: 0.0 | ||
scheduler: | ||
warm_up_step: 4000 | ||
anneal_steps: [800000, 900000, 1000000] | ||
anneal_rate: 0.3 | ||
|
||
# loss hyper-parameters | ||
loss: | ||
alpha: 1. | ||
|
||
|
||
|
||
|
||
|
||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
77 changes: 77 additions & 0 deletions
77
configs/vctk_vqwav2vec_uttdvec_ppgvcf0_tacoar_ppgvchifigan.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,77 @@ | ||
# experiment | ||
dataset: vctk | ||
train_meta: data/vctk/train_nodev_all/metadata.csv | ||
dev_meta: data/vctk/dev_all/metadata.csv | ||
train_set: train_nodev_all | ||
dev_set: dev_all | ||
|
||
|
||
# encoder-decoder | ||
ling_enc: vqwav2vec | ||
spk_enc: utt_dvec | ||
pros_enc: ppgvc_f0 | ||
decoder: TacoAR | ||
mel_type: ppgvc_mel | ||
vocoder: ppgvc_hifigan | ||
|
||
|
||
# training | ||
fp16_run: !!bool False | ||
epochs: 200 | ||
save_freq: 2 # save ckpt frequency | ||
show_freq: 10 | ||
load_only_params: !!bool False | ||
seed: !!int 1234 | ||
trainer: TacoARTrainer | ||
ngpu: 2 | ||
|
||
#dataloader | ||
sort: !!bool True | ||
dump_dir: dump | ||
num_workers: !!int 8 | ||
batch_size: 64 | ||
drop_last: !!bool True | ||
rm_long_utt: !!bool True # remove too long utterances from metadata | ||
max_utt_duration: !!float 10.0 # max utterance duration (seconds) | ||
|
||
|
||
# decoder params | ||
decoder_params: | ||
prosodic_rep_type: continuous | ||
prosodic_net: | ||
hidden_dim: 256 | ||
input_dim: 512 | ||
output_dim: 80 | ||
resample_ratio: 1 | ||
spk_emb_integration_type: concat # add or concat | ||
spk_emb_dim: 256 | ||
ar: True | ||
encoder_type: "taco2" | ||
hidden_dim: 1024 | ||
prenet_layers: 2 # if set 0, no prenet is used | ||
prenet_dim: 256 | ||
prenet_dropout_rate: 0.5 | ||
lstmp_layers: 2 | ||
lstmp_dropout_rate: 0.2 | ||
lstmp_proj_dim: 256 | ||
lstmp_layernorm: False | ||
|
||
#optimizer & scheduler | ||
optimizer: | ||
weight_decay: 0.0 | ||
betas: [0.9,0.99] | ||
lr: !!float 1e-4 | ||
scheduler: | ||
num_training_steps: 500000 | ||
num_warmup_steps: 4000 | ||
|
||
# loss hyper-parameters | ||
loss: | ||
alpha: 1. | ||
|
||
|
||
|
||
|
||
|
||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,74 @@ | ||
import torch | ||
import torch.nn as nn | ||
from torch.nn.parameter import Parameter | ||
import numpy as np | ||
import torch.nn.functional as F | ||
class DiscreteProsodicNet(nn.Module):
    """Quantize continuous (pitch, energy) frames into bins and embed them.

    Expects input of shape (batch, time, 2) with pitch in channel 0 and
    energy in channel 1; returns the element-wise sum of the two bin
    embeddings, shape (batch, time, hidden_dim).
    """

    def __init__(self, config):
        super().__init__()
        n_bins = config['prosodic_bins']
        prosodic_stats_path = config['prosodic_stats_path']
        # Stats file rows: pitch_max, pitch_min, energy_max, energy_min
        # (each a 1-element row; see the [i][0] indexing below).
        stats = np.load(prosodic_stats_path)
        pitch_max = stats[0][0]
        pitch_min = stats[1][0]
        energy_max = stats[2][0]
        energy_min = stats[3][0]
        # The bin boundaries are fixed lookup tables, not trainable weights.
        # register_buffer is the idiomatic way to store them: they follow
        # .to(device) and appear in state_dict (same keys as before, so old
        # checkpoints still load), but are never handed to the optimizer.
        self.register_buffer(
            "pitch_bins", torch.linspace(pitch_min, pitch_max, n_bins - 1)
        )
        self.register_buffer(
            "energy_bins", torch.linspace(energy_min, energy_max, n_bins - 1)
        )
        self.pitch_embedding = nn.Embedding(n_bins, config["hidden_dim"])
        self.energy_embedding = nn.Embedding(n_bins, config["hidden_dim"])

    def forward(self, x):
        """Bucketize pitch/energy per frame and sum their embeddings.

        Args:
            x: float tensor (batch, time, 2).

        Returns:
            Tensor of shape (batch, time, hidden_dim).
        """
        pitch = x[:, :, 0]
        energy = x[:, :, 1]
        pitch_reps = self.pitch_embedding(torch.bucketize(pitch, self.pitch_bins))
        energy_reps = self.energy_embedding(torch.bucketize(energy, self.energy_bins))
        prosodic_reps = pitch_reps + energy_reps
        return prosodic_reps
class ContinuousProsodicNet(nn.Module):
    """Encode continuous (pitch, energy) frames with a small 1-D conv stack.

    Input is (batch, time, 2); output is (batch, time, hidden_dim).
    """

    def __init__(self, config):
        super().__init__()
        channels = config['hidden_dim']

        def _same_conv3(in_ch, out_ch):
            # 3-wide, stride-1, same-padding conv used twice in the stack.
            return torch.nn.Conv1d(
                in_ch, out_ch, kernel_size=3, stride=1, padding=1
            )

        self.pitch_convs = torch.nn.Sequential(
            torch.nn.Conv1d(2, channels, kernel_size=1, bias=False),
            torch.nn.LeakyReLU(0.1),
            torch.nn.InstanceNorm1d(channels, affine=False),
            _same_conv3(channels, channels),
            torch.nn.LeakyReLU(0.1),
            torch.nn.InstanceNorm1d(channels, affine=False),
            _same_conv3(channels, channels),
            torch.nn.LeakyReLU(0.1),
            torch.nn.InstanceNorm1d(channels, affine=False),
        )

    def forward(self, x):
        """Run the conv stack over time; keep the (batch, time, dim) layout.

        Args:
            x: float tensor (batch, time, 2).

        Returns:
            Tensor of shape (batch, time, hidden_dim).
        """
        features = x.transpose(1, 2)          # (B, T, 2) -> (B, 2, T) for Conv1d
        features = self.pitch_convs(features)
        return features.transpose(1, 2)       # back to (B, T, hidden_dim)
|
Oops, something went wrong.