add use_text_encoder into fastspeech2
MingjieChen committed Mar 19, 2023
1 parent dceaf0f · commit e6074de
Showing 3 changed files with 100 additions and 8 deletions.
92 changes: 92 additions & 0 deletions configs/vctk_vqwav2vec_uttdvec_ppgvcf0_fs2_bigvgan.yaml
@@ -0,0 +1,92 @@
# experiment
dataset: vctk
train_meta: data/vctk/train_nodev_all/metadata.csv
dev_meta: data/vctk/dev_all/metadata.csv
train_set: train_nodev_all
dev_set: dev_all


# encoder-decoder
ling_enc: vqwav2vec
spk_enc: utt_dvec
pros_enc: ppgvc_f0
decoder: FastSpeech2
mel_type: norm_bigvgan_mel
mel_stats: dump/vctk/train_nodev_all/bigvgan_mel/train_nodev_all.npy
vocoder: bigvgan

# training
fp16_run: !!bool True
epochs: 200
save_freq: 2 # save ckpt frequency
show_freq: 100 # show training information frequency
load_only_params: !!bool False
seed: !!int 1234
trainer: FS2Trainer
ngpu: 1

#dataloader
dataset_class: Dataset
sort: !!bool False
dump_dir: dump
num_workers: !!int 8
batch_size: 32
drop_last: !!bool True
rm_long_utt: !!bool True # remove too long utterances from metadata
max_utt_duration: !!float 10.0 # max utterance duration (seconds)


# decoder params
decoder_params:
  use_text_encoder: !!bool False
  input_dim: 512
  out_dim: 100
  max_len: 1000
  max_seq_len: 1000
  spk_emb_dim: 256
  prosodic_rep_type: continuous
  prosodic_net:
    hidden_dim: 256
  prenet:
    conv_kernel_size: 3
    input_dim: 512
    dropout: 0.1
  postnet:
    idim: 80
    odim: 80
    n_layers: 0
    n_filts: 5
    n_chans: 256
    dropout_rate: 0.5
  transformer:
    encoder_layer: 4
    encoder_head: 2
    encoder_hidden: 256
    decoder_layer: 4
    decoder_head: 2
    decoder_hidden: 256
    conv_filter_size: 1024
    conv_kernel_size: [3, 1]
    encoder_dropout: 0.1
    decoder_dropout: 0.1

#optimizer & scheduler
optimizer:
  init_lr: !!float 1e-2
  betas: [0.9,0.99]
  weight_decay: 0.0
scheduler:
  warm_up_step: 4000
  anneal_steps: [800000, 900000, 1000000]
  anneal_rate: 0.3

# loss hyper-parameters
losses:
  alpha: 1.

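The dataloader section above drops over-long utterances via `rm_long_utt` and `max_utt_duration` before batching. A minimal sketch of such a filter, assuming metadata.csv carries a per-utterance duration column (the column name here is a guess, not necessarily this repo's schema):

import pandas as pd

def filter_long_utterances(meta_path: str, max_duration: float = 10.0) -> pd.DataFrame:
    """Drop utterances longer than max_duration seconds from the metadata.

    Assumes a per-utterance 'duration' column; the real column name
    in this repo may differ.
    """
    meta = pd.read_csv(meta_path)
    kept = meta[meta["duration"] <= max_duration].reset_index(drop=True)
    print(f"kept {len(kept)}/{len(meta)} utterances (max {max_duration}s)")
    return kept

# e.g. filter_long_utterances("data/vctk/train_nodev_all/metadata.csv", 10.0)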
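The headline change is the new `decoder_params.use_text_encoder` flag, set to False in this config. A hedged sketch of how such a flag is typically consumed when the decoder is built; the class name and constructor below are illustrative, not this repo's exact API:

import yaml
import torch
import torch.nn as nn

class FastSpeech2Sketch(nn.Module):
    """Illustrative only: gates an optional text encoder on the new flag."""

    def __init__(self, params: dict):
        super().__init__()
        d_model = params["transformer"]["encoder_hidden"]
        self.in_proj = nn.Linear(params["input_dim"], d_model)
        if params.get("use_text_encoder", False):
            # Re-encode the projected linguistic features with a transformer stack.
            layer = nn.TransformerEncoderLayer(
                d_model=d_model,
                nhead=params["transformer"]["encoder_head"],
                batch_first=True,
            )
            self.text_encoder = nn.TransformerEncoder(
                layer, num_layers=params["transformer"]["encoder_layer"])
        else:
            # use_text_encoder: False (this config): the vq-wav2vec features
            # are projected directly, with no extra text-encoder pass.
            self.text_encoder = None

    def encode(self, x):
        x = self.in_proj(x)
        return self.text_encoder(x) if self.text_encoder is not None else x

# Assuming the config above is on disk at its repo path:
with open("configs/vctk_vqwav2vec_uttdvec_ppgvcf0_fs2_bigvgan.yaml") as f:
    cfg = yaml.safe_load(f)
enc = FastSpeech2Sketch(cfg["decoder_params"])
h = enc.encode(torch.randn(2, 50, cfg["decoder_params"]["input_dim"]))  # (2, 50, 256)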
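The optimizer and scheduler blocks suggest the Noam-style warm-up followed by stepwise annealing used in common FastSpeech2 implementations. The formula below is an assumption modeled on that convention, not code lifted from this repo:

def fs2_lr(step: int,
           init_lr: float = 1e-2,
           warm_up_step: int = 4000,
           anneal_steps=(800_000, 900_000, 1_000_000),
           anneal_rate: float = 0.3) -> float:
    """Assumed FastSpeech2-style schedule matching the config above."""
    step = max(step, 1)
    # Warm up for warm_up_step steps, then decay as step^-0.5.
    lr = init_lr * min(step ** -0.5, step * warm_up_step ** -1.5)
    # Multiply by anneal_rate once per milestone already passed.
    for milestone in anneal_steps:
        if step > milestone:
            lr *= anneal_rate
    return lr

# Peak near step 4000 (about 1.6e-4 here); one 0.3x anneal applies by step 850k.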
6 changes: 3 additions & 3 deletions decoder/fastspeech2/module.py
@@ -99,13 +99,13 @@ def __init__(self, model_config):
     def forward(self, x, spk_emb, pros_rep, mask, max_len):
         batch_size = x.size(0)
         # integrate speaker embedding
-        spk_emb = F.normalize(spk_emb.squeeze(1)).unsqueeze(1)
-        x = torch.cat([x,spk_emb.expand(batch_size, max_len, self.d_model )], dim = -1)
-        x = self.reduce_projection(x)
         if self.pros_net is not None:
             # integrate prosodic_rep
             processed_pros_rep = self.pros_net(pros_rep)
             x = x + processed_pros_rep
+        spk_emb = F.normalize(spk_emb.squeeze(1)).unsqueeze(1)
+        x = torch.cat([x,spk_emb.expand(batch_size, max_len, self.d_model )], dim = -1)
+        x = self.reduce_projection(x)
 
         if mask is not None:
             x = x.masked_fill(mask.unsqueeze(-1), 0)
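Net effect of the hunk above: the prosodic representation is now added before the speaker embedding is concatenated and projected back down, rather than after. A self-contained paraphrase with stand-in submodules (the Linear layers and dimensions below are illustrative; only the forward() ordering comes from the diff):

import torch
import torch.nn as nn
import torch.nn.functional as F

class MelDecoderSketch(nn.Module):
    """Paraphrase of the reordered forward() in decoder/fastspeech2/module.py."""

    def __init__(self, d_model: int = 256, pros_dim: int = 1):
        super().__init__()
        self.d_model = d_model
        self.pros_net = nn.Linear(pros_dim, d_model)             # stand-in prosodic net
        self.reduce_projection = nn.Linear(2 * d_model, d_model)

    def forward(self, x, spk_emb, pros_rep, mask, max_len):
        batch_size = x.size(0)
        # 1. Prosody is fused first now, directly into the content sequence.
        if self.pros_net is not None:
            x = x + self.pros_net(pros_rep)
        # 2. Speaker embedding: L2-normalize, broadcast over time, concatenate,
        #    then project back to d_model -- applied after prosody post-commit.
        spk_emb = F.normalize(spk_emb.squeeze(1)).unsqueeze(1)
        x = torch.cat([x, spk_emb.expand(batch_size, max_len, self.d_model)], dim=-1)
        x = self.reduce_projection(x)
        if mask is not None:
            x = x.masked_fill(mask.unsqueeze(-1), 0)
        return x

dec = MelDecoderSketch()
out = dec(torch.randn(2, 50, 256), torch.randn(2, 1, 256),
          torch.randn(2, 50, 1), mask=None, max_len=50)  # -> (2, 50, 256)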
10 changes: 5 additions & 5 deletions submit_train.sh
@@ -17,16 +17,16 @@ spk=uttdvec
 pros=ppgvcf0
 #pros=fs2pitchenergy
 
-#dec=fs2
+dec=fs2
 #dec=vits
 #dec=gradtts
-dec=diffwave
+#dec=diffwave
 #dec=tacoar
 #dec=tacomol
 
 #vocoder=ppgvchifigan
-vocoder=none
-#vocoder=bigvgan
+#vocoder=none
+vocoder=bigvgan
 
 exp_name=vctk_train_0
 #exp_name=vctk_no16fp_split
@@ -40,7 +40,7 @@ fi
 exp_dir=exp
 model_name=${dataset}_${ling}_${spk}_${pros}_${dec}_${vocoder}
 exp=$exp_dir/$model_name/$exp_name
-njobs=60
+njobs=1
 ngpus=1
 slots=4
 #gputypes="GeForceRTX3060|GeForceRTX3090"
