Skip to content

Commit

Permalink
small updates
Browse files Browse the repository at this point in the history
  • Loading branch information
MingjieChen committed Jun 13, 2023
1 parent 030a126 commit 48cead2
Show file tree
Hide file tree
Showing 12 changed files with 396 additions and 45 deletions.
4 changes: 0 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,6 @@ conda create --name torch_1.9 --file requirements.txt
- **Dataset**
- [x] VCTK
- [x] LibriTTS
- [ ] M4Singer
- [ ] NUS-48E
- [ ] NHSS

- **Linguistic Encoder**
- [x] conformer_ppg from [ppg-vc](https://github.com/liusongxiang/ppg-vc)
Expand Down Expand Up @@ -67,7 +64,6 @@ conda create --name torch_1.9 --file requirements.txt

- **Vocoder**
- [x] hifigan (vctk) from [ppg-vc](https://github.com/liusongxiang/ppg-vc)
- [ ] BigVGAN from [bigvgan](https://github.com/NVIDIA/BigVGAN)

- **Evaluation**
- [x] UTMOS22 mos prediction from [UTMOS22](https://github.com/sarulab-speech/UTMOS22)
Expand Down
9 changes: 9 additions & 0 deletions bin/contentvec_100_feature_extraction.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,15 @@

splits=$1
dataset=$2

# Fetch the ContentVec-100 checkpoint once, then expose it under the filename
# the feature-extraction code expects (contentvec_100_model.pt).
if [ ! -e ling_encoder/contentvec_100/contentvec_100_model.pt ]; then
    echo "downloading contentvec_100 model checkpoint"
    # Ensure the target directory exists before cd'ing into it.
    mkdir -p ling_encoder/contentvec_100
    cd ling_encoder/contentvec_100 || exit 1
    # -O is required: the URL ends in "/download", so plain wget would save a
    # file literally named "download" and the symlink below would dangle.
    # The URL is single-quoted: inside double quotes bash keeps the backslash
    # of the original "b1\!...", sending a corrupted token to the server.
    # (History expansion on "!" is off in non-interactive scripts.)
    wget -O checkpoint_best_legacy_100.pt 'https://public.boxcloud.com/d/1/b1!aZwF89Eh4jXgxr7yGU4SQ7YuIF7zrYJ_QyySriX1CBnI5Cg0oI_5mKHTcakLamdA-XyLSzH5Natnk-mDc-ea7crePAVToem5nSlaGtTBWaT4H8QZ5FsJv82YS7jyBjzeZ-20V34qmZKLgAWckoeA8t5utnBTrFP7URXoPcGST5Kn3pnMGhD2sgClyELFGB0PedSRegfxq9RBNqmx7KAgdxerreUI_EafNH29SxqVC2H2c0aioUu8Qmp_nObVKRdysGBF-rvcfx08lAoLoGaF_dw_KxDbvDEi6GLmIDBBAWE55_5IOqFvE9PZxw8tXHT4fXWiyBpoTgMkN7jOiAd-2BeYGrRome9gaPnwG-GegeDcVAqseMqlOqMXaFTQsEol2fNeFt427l9-9PBtfyiZlW9ru-APHtDHIErIRSgIaMViJA368lssKN7vlXJaUJ4qUNAik37hP0HeTfuQbppxDLhRfPVRTT8tZtR40PkX2UGz6Ev8_b_R7jfk6CkWsQ-U3znUJmudMM_OLcZPS7ZsZSU1M4_rzo2sCg4p80KYbpNaAkKNblODEH6WQnYNC-FDPwHH-6FZMrs8DnA-TcHuCNWBLAl9Oe4vReHYHSSLoxHUyVcqe_ox5klCinuxgG1cY7AOzXfq8imFvvsyJrwa3bsXs0Y_Ql4pUcvekGQKslbDqbpb7jkXOnqvFEvUzt71l6UWA64dCFYFNYo0HQWtaj6VfLxCpj9dL60d5fDmjTyRii96qdZYUU04kPmNgqrQv7hufMod0QkuKkpjIewaaGj3cyAJxXiY3Tv5QhXOvNizxm5yJOzCDUJgiWWWMSISSS8578azEE_Fk2N43IOIfSqG7rQBXyZzxlKlDLuFtA7OBqAiLlyT55ZSUK8GrnCrn_BDu2UBUC7laWDK_SpdjsJcxnWSWzVIPI8NVysLHd1ejtAdGRkohATdf6tzlJaQDlWkj1D3p3wyq1q4YHd-KK_NVbTwLOj0Tz41O_iT5WMEIhbw-aJXhkFvHMdVzEVBFfVCoMWcrbEW7M7ix0bOt5NWB7A5dPFNqX_PrEG3VRIuWhbfSMwJPlv1f1Fck9ict8Fs6o_c0o8lNe7LROf1P-7pBPMWpgx-OgEhnRKYUkxkraXneunNr7V-yjjI1Ham99JpK02yAUATf3pEKjCJaUIMqmawYbv_nOl6rnjwwyTB3_t5O0h7PoNqJmVqeyzFpGp-gDx4ZZPkFsOXHQ6s8A8nRmTrTL3B_tMFMdvaqt0lv2KECstUO0pbk2yeeejIIhtjWVBq-L5JnTocz4HmhCnfwqguqbvGC8bOsdu5zWC5wvqgQGN7/download' \
        || { rm -f checkpoint_best_legacy_100.pt; echo "contentvec_100 download failed" >&2; exit 1; }
    # -f: make the guard idempotent if the link exists but the model file was missing.
    ln -sf checkpoint_best_legacy_100.pt contentvec_100_model.pt
    cd ../.. || exit 1
    echo "done!"
fi

for split in $splits ; do

echo "[vqwav2vec feature extraction]: $split for libritts"
Expand Down
4 changes: 2 additions & 2 deletions bin/feature_extraction_sge_multi_tasks.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@ conda_env=torch_1.9
# setup

dataset=libritts
config=configs/preprocess_ppgvc_mel.yaml
feature_type=ppgvc_mel
config=configs/preprocess_vits_spec.yaml
feature_type=vits_spec
splits="train_nodev_clean dev_clean"

script_dir=scripts/$dataset/preprocess
Expand Down
9 changes: 9 additions & 0 deletions bin/vqwav2vec_feature_extraction.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,15 @@

splits=$1
dataset=$2

# Fetch the vq-wav2vec (k-means) checkpoint from the fairseq release on first use.
if [ ! -e ling_encoder/vqwav2vec/vq-wav2vec_kmeans.pt ]; then
    echo "downloading vqwav2vec model checkpoint"
    mkdir -p ling_encoder/vqwav2vec
    cd ling_encoder/vqwav2vec || exit 1
    # On failure, remove any truncated file so the -e guard above does not
    # mistake a partial download for a usable checkpoint on the next run.
    wget https://dl.fbaipublicfiles.com/fairseq/wav2vec/vq-wav2vec_kmeans.pt \
        || { rm -f vq-wav2vec_kmeans.pt; echo "vqwav2vec download failed" >&2; exit 1; }
    cd ../.. || exit 1
    echo "done!"
fi
for split in $splits ; do

echo "[vqwav2vec feature extraction]: $split for $dataset"
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
# Training config: LibriTTS data, conformer-PPG linguistic encoder,
# utterance-level d-vector speaker embedding, ppg-vc F0 prosody,
# GradTTS decoder over ppg-vc mel features, ppg-vc HiFi-GAN vocoder.
# NOTE(review): nesting indentation under decoder_params/optimizer/losses
# appears stripped in this view — confirm against the committed file.
# experiment
dataset: libritts
train_meta: data/libritts/train_nodev_clean/metadata.csv
dev_meta: data/libritts/dev_clean/metadata.csv
train_set: train_nodev_clean
dev_set: dev_clean


# encoder-decoder
# NOTE(review): the sibling VITS config spells this key's value
# "conformer_ppg" (with underscore) — confirm which spelling the
# encoder registry expects.
ling_enc: conformerppg
spk_enc: utt_dvec
pros_enc: ppgvc_f0
decoder: GradTTS
mel_type: ppgvc_mel
vocoder: ppgvc_hifigan

# training
fp16_run: !!bool False
epochs: 2000
save_freq: 1 # save ckpt frequency
show_freq: 100 # show training information frequency
load_only_params: !!bool False
seed: !!int 1234
trainer: GradTTSTrainer
ngpu: 2

#dataloader
dataset_class: Dataset
mel_segment_length: !!int 128
sort: !!bool False
dump_dir: dump
num_workers: !!int 8
batch_size: 32
drop_last: !!bool True
rm_long_utt: !!bool True # remove too long utterances from metadata
max_utt_duration: !!float 10.0 # max utterance duration (seconds)


# decoder params
decoder_params:
use_prior_loss: !!bool False
n_feats: !!int 80
input_dim: !!int 144 # presumably the conformer-PPG feature dim — confirm against the encoder output
spk_emb_dim: !!int 256
prosodic_rep_type: continuous
prosodic_net:
hidden_dim: !!int 80
n_enc_channels: !!int 192
filter_channels: !!int 768
filter_channels_dp: !!int 256
n_enc_layers: !!int 6
enc_kernel: !!int 3
enc_dropout: !!float 0.1
n_heads: !!int 2
window_size: !!int 4
dec_dim: !!int 64
beta_min: !!float 0.05
beta_max: !!float 20.0
pe_scale: !!int 1000 # 1 for `grad-tts-old.pt` checkpoint

#optimizer & scheduler
optimizer:
lr: !!float 1e-4

# loss hyper-parameters
losses:
alpha: 1.







100 changes: 100 additions & 0 deletions configs/libritts_conformerppg_uttdvec_ppgvcf0_vits_none.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
# Training config: LibriTTS data, conformer-PPG linguistic encoder,
# utterance-level d-vector speaker embedding, ppg-vc F0 prosody,
# end-to-end VITS decoder (no separate vocoder; mel_type is the VITS
# linear spectrogram).
# NOTE(review): nesting indentation under decoder_params/optimizer/scheduler
# appears stripped in this view — confirm against the committed file.
# experiment
dataset: libritts
train_meta: data/libritts/train_nodev_clean/metadata.csv
dev_meta: data/libritts/dev_clean/metadata.csv
train_set: train_nodev_clean
dev_set: dev_clean


# encoder-decoder
ling_enc: conformer_ppg # NOTE(review): GradTTS config spells this "conformerppg" — confirm which spelling the encoder registry expects
spk_enc: utt_dvec
pros_enc: ppgvc_f0
decoder: VITS
mel_type: vits_spec


# training
fp16_run: !!bool False
epochs: 1000
save_freq: 1 # save ckpt frequency
show_freq: 100 # show training information frequency
load_only_params: !!bool False
seed: !!int 1234
trainer: VITSTrainer
ngpu: 2

#dataloader
dataset_class: VITSDataset
sampling_rate: !!int 24000
vits_hop_size: !!int 240
spec_max_len: !!int 480
sort: !!bool True
dump_dir: dump
num_workers: !!int 8
batch_size: !!int 48
drop_last: !!bool True
rm_long_utt: !!bool False # remove too long utterances from metadata
max_utt_duration: !!float 10.0 # max utterance duration (seconds)


# decoder params
decoder_params:
spk_emb_dim: 256
prosodic_rep_type: continuous
prosodic_net:
hidden_dim: 192
input_dim: !!int 144
spec_channels: !!int 513 # filter_length/2 + 1 = 1024/2 + 1
inter_channels: !!int 192
hidden_channels: !!int 192
filter_channels: !!int 768
n_heads: !!int 2
n_layers: !!int 6
kernel_size: !!int 3
p_dropout: !!float 0.1
# NOTE(review): stray space before the colon; upstream HiFi-GAN/VITS configs
# often use the string "1" here — confirm the loader handles an int.
resblock : 1
resblock_kernel_sizes: [3,7,11]
resblock_dilation_sizes: [[1,3,5], [1,3,5], [1,3,5]]
upsample_rates: [8,5,3,2] # product = 240, matches vits_hop_size/hop_length
upsample_initial_channel: !!int 512
upsample_kernel_sizes: [15, 15, 5, 5]
n_layers_q: !!int 3
use_spectral_norm: !!bool False
filter_length: !!int 1024
n_mels_channels: !!int 80
win_length: !!int 1024
hop_length: !!int 240
sampling_rate: !!int 24000
segment_size: !!int 9600 # 40 frames at hop_length 240




#optimizer & scheduler
optimizer:
generator:
lr: !!float 1e-4
betas: [0.8,0.99]
eps: !!float 1e-9
discriminator:
lr: !!float 1e-4
betas: [0.8,0.99]
eps: !!float 1e-9
scheduler:
generator:
lr_decay: !!float 0.999875
discriminator:
lr_decay: !!float 0.999875

# loss hyper-parameters
losses:
mel: !!int 45
kl: !!int 1







Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
# Training config: LibriTTS data, vq-wav2vec linguistic encoder,
# utterance-level d-vector speaker embedding, ppg-vc F0 prosody,
# GradTTS decoder over ppg-vc mel features, ppg-vc HiFi-GAN vocoder.
# NOTE(review): nesting indentation under decoder_params/optimizer/losses
# appears stripped in this view — confirm against the committed file.
# experiment
dataset: libritts
train_meta: data/libritts/train_nodev_clean/metadata.csv
dev_meta: data/libritts/dev_clean/metadata.csv
train_set: train_nodev_clean
dev_set: dev_clean


# encoder-decoder
ling_enc: vqwav2vec
spk_enc: utt_dvec
pros_enc: ppgvc_f0
decoder: GradTTS
mel_type: ppgvc_mel
vocoder: ppgvc_hifigan

# training
fp16_run: !!bool False
epochs: 2000
save_freq: 1 # save ckpt frequency
show_freq: 100 # show training information frequency
load_only_params: !!bool False
seed: !!int 1234
trainer: GradTTSTrainer
ngpu: 2

#dataloader
dataset_class: Dataset
mel_segment_length: !!int 128
sort: !!bool False
dump_dir: dump
num_workers: !!int 8
batch_size: 32
drop_last: !!bool True
rm_long_utt: !!bool True # remove too long utterances from metadata
max_utt_duration: !!float 10.0 # max utterance duration (seconds)


# decoder params
decoder_params:
use_prior_loss: !!bool False
n_feats: !!int 80
input_dim: !!int 512 # vq-wav2vec feature dim (the conformer-PPG config uses 144) — confirm against the encoder output
spk_emb_dim: !!int 256
prosodic_rep_type: continuous
prosodic_net:
hidden_dim: !!int 80
n_enc_channels: !!int 192
filter_channels: !!int 768
filter_channels_dp: !!int 256
n_enc_layers: !!int 6
enc_kernel: !!int 3
enc_dropout: !!float 0.1
n_heads: !!int 2
window_size: !!int 4
dec_dim: !!int 64
beta_min: !!float 0.05
beta_max: !!float 20.0
pe_scale: !!int 1000 # 1 for `grad-tts-old.pt` checkpoint

#optimizer & scheduler
optimizer:
lr: !!float 1e-4

# loss hyper-parameters
losses:
alpha: 1.







Loading

0 comments on commit 48cead2

Please sign in to comment.