Skip to content

Commit

Permalink
1. add use_text_encoder into config; 2. remove text_encoder in grad_t…
Browse files Browse the repository at this point in the history
…ts and diffwave
  • Loading branch information
MingjieChen committed Mar 19, 2023
1 parent 6a131e7 commit dceaf0f
Show file tree
Hide file tree
Showing 15 changed files with 324 additions and 59 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ show_freq: 100 # show training information frequency
load_only_params: !!bool False
seed: !!int 1234
trainer: FS2Trainer
ngpu: 2
ngpu: 1

#dataloader
dataset_class: Dataset
Expand All @@ -38,9 +38,11 @@ max_utt_duration: !!float 10.0 # max utterance duration (seconds)

# decoder params
decoder_params:
use_text_encoder: !!bool False
max_len: 1000
max_seq_len: 1000
spk_emb_dim: 256
out_dim: 80
prosodic_rep_type: discrete
prosodic_net:
hidden_dim: !!int 256
Expand Down
76 changes: 76 additions & 0 deletions configs/vctk_vqwav2vec_uttdvec_ppgvcf0_diffwave_none.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
# Training configuration: VCTK data with the vqwav2vec / utt_dvec / ppgvc_f0
# encoders (see ling_enc, spk_enc, pros_enc below) and the DiffWave decoder.
# NOTE(review): nesting indentation is not visible in this view; keys listed
# after a mapping header (decoder_params, prosodic_net, optimizer, losses)
# presumably belong to that mapping - confirm against the repository file.

# experiment
dataset: vctk
train_meta: data/vctk/train_nodev_all/metadata.csv
dev_meta: data/vctk/dev_all/metadata.csv
train_set: train_nodev_all
dev_set: dev_all


# encoder-decoder
# presumably ling/spk/pros = linguistic / speaker / prosody encoders - confirm
ling_enc: vqwav2vec
spk_enc: utt_dvec
pros_enc: ppgvc_f0
decoder: DiffWave
mel_type: vits_spec # not used
pros_stats: dump/vctk/train_nodev_all/fastspeech2_pitch_energy/train_nodev_all.npy

# training
fp16_run: !!bool False
epochs: 2000
save_freq: 1 # checkpoint saving frequency
show_freq: 100 # training information display frequency
load_only_params: !!bool False
seed: !!int 1234
trainer: DiffWaveTrainer
ngpu: 1

#dataloader
dataset_class: VITSDataset
sampling_rate: !!int 24000
# NOTE(review): decoder/diffwave/model.py expands speaker embeddings by a
# hard-coded factor of 240, which equals this value - confirm before changing.
vits_hop_size: !!int 240
spec_max_len: !!int 60
sort: !!bool False
dump_dir: dump
num_workers: !!int 8
batch_size: 16
drop_last: !!bool True
rm_long_utt: !!bool False # remove too long utterances from metadata
max_utt_duration: !!float 10.0 # max utterance duration (seconds)


# decoder params
decoder_params:
# 14400 = spec_max_len (60) * vits_hop_size (240); presumably the waveform
# segment length in samples - confirm
segment_size: 14400
# when False, the DiffWave model builds nn.Conv1d(input_dim, inter_channels, 3, 1, 1)
# in place of TextEncoder
use_text_encoder: !!bool False
input_dim: !!int 512
spk_emb_dim: !!int 256
prosodic_rep_type: continuous
prosodic_net:
hidden_dim: 64
inter_channels: !!int 64
hidden_channels: !!int 64
filter_channels: !!int 128
n_heads: !!int 2
n_layers: !!int 6
kernel_size: !!int 3
p_dropout: !!float 0.1
# residual_layers and dilation_cycle_length are read by DiffWave.__init__
residual_channels: 64
residual_layers: 30
dilation_cycle_length: 10



#optimizer & scheduler
optimizer:
# explicit !!float tag: presumably needed because some YAML 1.1 loaders
# (e.g. PyYAML) would read a bare 2e-4 as a string - confirm
lr: !!float 2e-4

# loss hyper-parameters
losses:
alpha: 1.






76 changes: 76 additions & 0 deletions configs/vctk_vqwav2vec_uttdvec_ppgvcf0_gradtts_bigvgan.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
# Training configuration: VCTK data with the vqwav2vec / utt_dvec / ppgvc_f0
# encoders (see ling_enc, spk_enc, pros_enc below), the GradTTS decoder and
# the bigvgan vocoder.
# NOTE(review): nesting indentation is not visible in this view; keys listed
# after a mapping header (decoder_params, prosodic_net, optimizer, losses)
# presumably belong to that mapping - confirm against the repository file.

# experiment
dataset: vctk
train_meta: data/vctk/train_nodev_all/metadata.csv
dev_meta: data/vctk/dev_all/metadata.csv
train_set: train_nodev_all
dev_set: dev_all


# encoder-decoder
# presumably ling/spk/pros = linguistic / speaker / prosody encoders - confirm
ling_enc: vqwav2vec
spk_enc: utt_dvec
pros_enc: ppgvc_f0
decoder: GradTTS
mel_type: norm_bigvgan_mel
vocoder: bigvgan
mel_stats: dump/vctk/train_nodev_all/bigvgan_mel/train_nodev_all.npy

# training
fp16_run: !!bool False
epochs: 2000
save_freq: 1 # checkpoint saving frequency
show_freq: 100 # training information display frequency
load_only_params: !!bool False
seed: !!int 1234
trainer: GradTTSTrainer
ngpu: 1

#dataloader
dataset_class: Dataset
mel_segment_length: !!int 128
sort: !!bool False
dump_dir: dump
num_workers: !!int 8
batch_size: 16
drop_last: !!bool True
rm_long_utt: !!bool True # remove too long utterances from metadata
max_utt_duration: !!float 10.0 # max utterance duration (seconds)


# decoder params
decoder_params:
use_text_encoder: !!bool False
use_prior_loss: !!bool False
n_feats: !!int 100
input_dim: !!int 512
spk_emb_dim: !!int 256
prosodic_rep_type: continuous
prosodic_net:
# same value as n_feats (100) above; presumably intentional - confirm
hidden_dim: !!int 100
n_enc_channels: !!int 192
filter_channels: !!int 768
filter_channels_dp: !!int 256
n_enc_layers: !!int 6
enc_kernel: !!int 3
enc_dropout: !!float 0.1
n_heads: !!int 2
window_size: !!int 4
dec_dim: !!int 64
beta_min: !!float 0.05
beta_max: !!float 20.0
pe_scale: !!int 1000 # use 1 for the `grad-tts-old.pt` checkpoint

#optimizer & scheduler
optimizer:
# explicit !!float tag: presumably needed because some YAML 1.1 loaders
# (e.g. PyYAML) would read a bare 1e-4 as a string - confirm
lr: !!float 1e-4

# loss hyper-parameters
losses:
alpha: 1.







Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ decoder_params:
spk_emb_dim: !!int 256
prosodic_rep_type: continuous
prosodic_net:
hidden_dim: !!int 512
hidden_dim: !!int 80
n_enc_channels: !!int 192
filter_channels: !!int 768
filter_channels_dp: !!int 256
Expand Down
79 changes: 79 additions & 0 deletions configs/vctk_vqwav2vec_uttdvec_ppgvcf0_tacoar_bigvgan.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
# Training configuration: VCTK data with the vqwav2vec / utt_dvec / ppgvc_f0
# encoders (see ling_enc, spk_enc, pros_enc below), the TacoAR decoder and
# the bigvgan vocoder.
# NOTE(review): nesting indentation is not visible in this view; keys listed
# after a mapping header (decoder_params, prosodic_net, optimizer, scheduler,
# losses) presumably belong to that mapping - confirm against the repository file.

# experiment
dataset: vctk
train_meta: data/vctk/train_nodev_all/metadata.csv
dev_meta: data/vctk/dev_all/metadata.csv
train_set: train_nodev_all
dev_set: dev_all


# encoder-decoder
# presumably ling/spk/pros = linguistic / speaker / prosody encoders - confirm
ling_enc: vqwav2vec
spk_enc: utt_dvec
pros_enc: ppgvc_f0
decoder: TacoAR
mel_type: norm_bigvgan_mel
vocoder: bigvgan
mel_stats: dump/vctk/train_nodev_all/bigvgan_mel/train_nodev_all.npy


# training
fp16_run: !!bool True
epochs: 200
save_freq: 2 # checkpoint saving frequency
show_freq: 10 # training information display frequency
load_only_params: !!bool False
seed: !!int 1234
trainer: TacoARTrainer
ngpu: 1

#dataloader
dataset_class: Dataset
sort: !!bool True
dump_dir: dump
num_workers: !!int 8
batch_size: 32
drop_last: !!bool True
rm_long_utt: !!bool True # remove too long utterances from metadata
max_utt_duration: !!float 10.0 # max utterance duration (seconds)


# decoder params
decoder_params:
prosodic_rep_type: continuous
prosodic_net:
# NOTE(review): hidden_dim appears twice in this section (here and further
# down); presumably this one is nested under prosodic_net and the later one
# under decoder_params - confirm, since duplicate keys at one level are invalid
hidden_dim: 1024
input_dim: 512
output_dim: 100
resample_ratio: 1
spk_emb_integration_type: concat # one of: add, concat
spk_emb_dim: 256
ar: True
encoder_type: "taco2"
hidden_dim: 1024
prenet_layers: 2 # set to 0 to disable the prenet
prenet_dim: 256
prenet_dropout_rate: 0.5
lstmp_layers: 2
lstmp_dropout_rate: 0.2
lstmp_proj_dim: 256
lstmp_layernorm: False

#optimizer & scheduler
optimizer:
weight_decay: 0.0
betas: [0.9,0.99]
# explicit !!float tag: presumably needed because some YAML 1.1 loaders
# (e.g. PyYAML) would read a bare 1e-4 as a string - confirm
lr: !!float 1e-4
scheduler:
num_training_steps: 500000
num_warmup_steps: 4000

# loss hyper-parameters
losses:
alpha: 1.







2 changes: 1 addition & 1 deletion configs/vctk_vqwav2vec_uttdvec_ppgvcf0_vits_none.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ vits_hop_size: !!int 240
spec_max_len: !!int 480
sort: !!bool True
dump_dir: dump
num_workers: !!int 4
num_workers: !!int 8
batch_size: !!int 24
drop_last: !!bool True
rm_long_utt: !!bool False # remove too long utterances from metadata
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,13 +40,13 @@ max_utt_duration: !!float 10.0 # max utterance duration (seconds)

# decoder params
decoder_params:
use_prior_loss: !!bool True
use_prior_loss: !!bool False
n_feats: !!int 100
input_dim: !!int 768
spk_emb_dim: !!int 192
prosodic_rep_type: discrete
prosodic_net:
hidden_dim: 256
hidden_dim: 100
prosodic_bins: !!int 256
prosodic_stats_path: dump/vctk/train_nodev_all/fastspeech2_pitch_energy/pitch_energy_min_max.npy
n_enc_channels: !!int 192
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ vits_hop_size: !!int 240
spec_max_len: !!int 480
sort: !!bool True
dump_dir: dump
num_workers: !!int 4
num_workers: !!int 8
batch_size: !!int 24
drop_last: !!bool True
rm_long_utt: !!bool False # remove too long utterances from metadata
Expand Down
27 changes: 17 additions & 10 deletions decoder/diffwave/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -212,8 +212,8 @@ def __init__(self, n_mels, residual_channels, dilation, spk_emb_dim):
super().__init__()
self.dilated_conv = Conv1d(residual_channels, 2 * residual_channels, 3, padding=dilation, dilation=dilation)
self.diffusion_projection = Linear(512, residual_channels)
self.local_conditioner_projection = Conv1d(n_mels, 2 * residual_channels, 1)
self.global_conditioner_projection = Conv1d(spk_emb_dim, 2 * residual_channels, 1)
self.local_conditioner_projection = Conv1d(n_mels, residual_channels, 1)
self.global_conditioner_projection = Conv1d(spk_emb_dim, residual_channels, 1)

self.output_projection = Conv1d(residual_channels, 2 * residual_channels, 1)

Expand All @@ -223,7 +223,8 @@ def forward(self, x, diffusion_step, c, g):
y = x + diffusion_step
local_condition = self.local_conditioner_projection(c)
global_condition = self.global_conditioner_projection(g)
y = self.dilated_conv(y) + local_condition + global_condition
y = y + global_condition + local_condition
y = self.dilated_conv(y)

gate, filter = torch.chunk(y, 2, dim=1)
y = torch.sigmoid(gate) * torch.tanh(filter)
Expand All @@ -250,16 +251,18 @@ def __init__(self, config):
residual_layers = config['residual_layers']
dilation_cycle_length = config['dilation_cycle_length']
self.spk_emb_dim = config['spk_emb_dim']

self.use_text_encoder = config['use_text_encoder']


noise_schedule = np.linspace(1e-4, 0.05, 50).tolist()
self.noise_schedule = noise_schedule
self.diffusion_embedding = DiffusionEmbedding(len(noise_schedule))


self.upsampler = Upsampler(inter_channels)

self.text_encoder = TextEncoder(
if self.use_text_encoder:
self.text_encoder = TextEncoder(
input_dim,
inter_channels,
hidden_channels,
Expand All @@ -270,6 +273,8 @@ def __init__(self, config):
filter_channels,
n_heads,
p_dropout)
else:
self.text_encoder = nn.Conv1d(input_dim, inter_channels, 3,1,1)

if 'prosodic_rep_type' not in config:
self.prosodic_net = None
Expand All @@ -294,15 +299,17 @@ def __init__(self, config):
def forward(self, audio, diffusion_step, ling, pros, spk, lengths):


x, x_masks = self.text_encoder(ling, lengths)
if self.use_text_encoder:
x, x_masks = self.text_encoder(ling, lengths)
else:
x = self.text_encoder(ling)

if self.prosodic_net is not None and pros is not None:
pros = self.prosodic_net(pros)
x += pros

spk_embeds = F.normalize(
up_spk_embeds = F.normalize(
spk.squeeze(2)).unsqueeze(2).expand(ling.size(0), self.spk_emb_dim, ling.size(2) * 240)
#x = torch.cat([x, spk_embeds], dim=1)
#x = self.reduce_proj(x)

x = self.upsampler(x)

Expand All @@ -314,7 +321,7 @@ def forward(self, audio, diffusion_step, ling, pros, spk, lengths):

skip = None
for layer in self.residual_layers:
y, skip_connection = layer(y, diffusion_step, x, spk_embeds)
y, skip_connection = layer(y, diffusion_step, x, up_spk_embeds)
skip = skip_connection if skip is None else skip_connection + skip

y = skip / sqrt(len(self.residual_layers))
Expand Down
Loading

0 comments on commit dceaf0f

Please sign in to comment.