add vits configs
MingjieChen committed Mar 5, 2023
1 parent 8fa0695 commit c9ac636
Showing 5 changed files with 217 additions and 13 deletions.
2 changes: 1 addition & 1 deletion bin/conformer_ppg_feature_extract_multi_jobs.sh
@@ -14,7 +14,7 @@ for split in $splits ; do
speakers=$(cat data/$dataset/$split/speakers.txt)
for spk in $speakers ; do
b=$script_dir/conformer_ppg_feature_extraction_${split}_${spk}.sh
-l=logs/enc_dec_conformer_ppg_feature_extraction${split}_${spk}.log
+l=logs/enc_dec_conformer_ppg_feature_extraction_${split}_${spk}.log
cat <<EOF > $b
#!/bin/bash
source $conda/bin/activate $conda_env
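Note: the only change in this file is the underscore added between the fixed prefix and ${split}, so the per-job log names now separate prefix, split, and speaker. For a hypothetical split/speaker pair (values not taken from this commit) the name would expand roughly as:
# before: logs/enc_dec_conformer_ppg_feature_extractiontrain_nodev_all_p225.log
# after:  logs/enc_dec_conformer_ppg_feature_extraction_train_nodev_all_p225.log
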
22 changes: 13 additions & 9 deletions bin/hubertsoft_feature_extraction_multi_jobs.sh
@@ -2,30 +2,34 @@

conda=/share/mini1/sw/std/python/anaconda3-2019.07/v3.7
conda_env=torch_1.9
splits="train_nodev_clean dev_clean eval_clean"
splits="train_nodev_all dev_all eval_all"
dataset=vctk

script_dir=scripts/libritts/hubert_soft

script_dir=scripts/$dataset/hubert_soft
[ ! -e $script_dir ] && mkdir -p $script_dir
[ ! -e logs ] && mkdir logs


for split in $splits ; do

echo "[hubert_soft feature extraction]: $split for libritts"
speakers=$(cat data/libritts/$split/speakers.txt)
echo "[hubert_soft feature extraction]: $split for ${dataset}"
speakers=$(cat data/$dataset/$split/speakers.txt)
for spk in $speakers ; do
-b=$script_dir/hubertsoft_feature_extraction_${spk}.sh
-l=logs/enc_dec_hubertsoft_feature_extraction.${spk}.log
+b=$script_dir/hubertsoft_feature_extraction_${split}_${spk}.sh
+l=logs/hubertsoft_feature_extraction_${split}_${spk}.log
cat <<EOF > $b
#!/bin/bash
source $conda/bin/activate $conda_env
python3 ling_encoder/hubert_soft/extract_features.py \
---metadata data/libritts/$split/metadata.csv \
---dump_dir dump/libritts \
+--metadata data/$dataset/$split/metadata.csv \
+--dump_dir dump/$dataset \
--split $split \
--max_workers 20 \
--speaker $spk
EOF
chmod +x $b
submitjob -m 10000 $l $b
echo "submitjob for $spk"
echo "submitjob for [$dataset $split] [$spk], see log in $l"
done
done
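For reference, each loop iteration writes a small job script through the unquoted heredoc (so $conda, $dataset, $split and $spk expand at generation time) and submits it. A generated script for one (split, speaker) pair would look roughly like the sketch below; the concrete split/speaker values (dev_all, p225) are hypothetical and not taken from this commit:

#!/bin/bash
# e.g. scripts/vctk/hubert_soft/hubertsoft_feature_extraction_dev_all_p225.sh
source /share/mini1/sw/std/python/anaconda3-2019.07/v3.7/bin/activate torch_1.9
python3 ling_encoder/hubert_soft/extract_features.py \
    --metadata data/vctk/dev_all/metadata.csv \
    --dump_dir dump/vctk \
    --split dev_all \
    --max_workers 20 \
    --speaker p225
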
100 changes: 100 additions & 0 deletions configs/vctk_conformerppg_uttdvec_ppgvcf0_vits_none.yaml
@@ -0,0 +1,100 @@
# experiment
dataset: vctk
train_meta: data/vctk/train_nodev_all/metadata.csv
dev_meta: data/vctk/dev_all/metadata.csv
train_set: train_nodev_all
dev_set: dev_all


# encoder-decoder
ling_enc: conformer_ppg
spk_enc: utt_dvec
pros_enc: ppgvc_f0
decoder: VITS
mel_type: vits_spec


# training
fp16_run: !!bool False
epochs: 200
save_freq: 1 # save ckpt frequency
show_freq: 100 # show training information frequency
load_only_params: !!bool False
seed: !!int 1234
trainer: VITSTrainer
ngpu: 2

#dataloader
dataset_class: VITSDataset
sampling_rate: !!int 24000
vits_hop_size: !!int 240
spec_max_len: !!int 240
sort: !!bool True
dump_dir: dump
num_workers: !!int 4
batch_size: !!int 12
drop_last: !!bool True
rm_long_utt: !!bool False # remove too long utterances from metadata
max_utt_duration: !!float 10.0 # max utterance duration (seconds)


# decoder params
decoder_params:
  spk_emb_dim: 256
  prosodic_rep_type: continuous
  prosodic_net:
    hidden_dim: 192
  input_dim: !!int 144
  spec_channels: !!int 513
  inter_channels: !!int 192
  hidden_channels: !!int 192
  filter_channels: !!int 768
  n_heads: !!int 2
  n_layers: !!int 6
  kernel_size: !!int 3
  p_dropout: !!float 0.1
  resblock : 1
  resblock_kernel_sizes: [3,7,11]
  resblock_dilation_sizes: [[1,3,5], [1,3,5], [1,3,5]]
  upsample_rates: [10,6,2,2]
  upsample_initial_channel: !!int 512
  upsample_kernel_sizes: [20, 12, 4, 4]
  n_layers_q: !!int 3
  use_spectral_norm: !!bool False
  filter_length: !!int 1024
  n_mels_channels: !!int 80
  win_length: !!int 1024
  hop_length: !!int 240
  sampling_rate: !!int 24000
  segment_size: !!int 9600




#optimizer & scheduler
optimizer:
generator:
lr: !!float 1e-4
betas: [0.8,0.99]
eps: !!float 1e-9
discriminator:
lr: !!float 1e-4
betas: [0.8,0.99]
eps: !!float 1e-9
scheduler:
generator:
lr_decay: !!float 0.999875
discriminator:
lr_decay: !!float 0.999875

# loss hyper-parameters
losses:
mel: !!int 45
kl: !!int 1
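
A quick consistency note on the waveform settings above (my reading of the values, not something stated in the commit): the product of upsample_rates should equal the hop size, and segment_size should be a whole number of hops, which holds here:

# sanity check with shell arithmetic
echo $(( 10 * 6 * 2 * 2 ))   # 240, matches hop_length and vits_hop_size
echo $(( 9600 / 240 ))       # 40 spectrogram frames per 9600-sample training segment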







100 changes: 100 additions & 0 deletions configs/vctk_hubertsoft_uttdvec_ppgvcf0_vits_none.yaml
@@ -0,0 +1,100 @@
# experiment
dataset: vctk
train_meta: data/vctk/train_nodev_all/metadata.csv
dev_meta: data/vctk/dev_all/metadata.csv
train_set: train_nodev_all
dev_set: dev_all


# encoder-decoder
ling_enc: hubert_soft
spk_enc: utt_dvec
pros_enc: ppgvc_f0
decoder: VITS
mel_type: vits_spec


# training
fp16_run: !!bool False
epochs: 200
save_freq: 1 # save ckpt frequency
show_freq: 100 # show training information frequency
load_only_params: !!bool False
seed: !!int 1234
trainer: VITSTrainer
ngpu: 2

#dataloader
dataset_class: VITSDataset
sampling_rate: !!int 24000
vits_hop_size: !!int 240
spec_max_len: !!int 240
sort: !!bool True
dump_dir: dump
num_workers: !!int 4
batch_size: !!int 12
drop_last: !!bool True
rm_long_utt: !!bool False # remove too long utterances from metadata
max_utt_duration: !!float 10.0 # max utterance duration (seconds)


# decoder params
decoder_params:
  spk_emb_dim: 256
  prosodic_rep_type: continuous
  prosodic_net:
    hidden_dim: 192
  input_dim: !!int 256
  spec_channels: !!int 513
  inter_channels: !!int 192
  hidden_channels: !!int 192
  filter_channels: !!int 768
  n_heads: !!int 2
  n_layers: !!int 6
  kernel_size: !!int 3
  p_dropout: !!float 0.1
  resblock : 1
  resblock_kernel_sizes: [3,7,11]
  resblock_dilation_sizes: [[1,3,5], [1,3,5], [1,3,5]]
  upsample_rates: [10,6,2,2]
  upsample_initial_channel: !!int 512
  upsample_kernel_sizes: [20, 12, 4, 4]
  n_layers_q: !!int 3
  use_spectral_norm: !!bool False
  filter_length: !!int 1024
  n_mels_channels: !!int 80
  win_length: !!int 1024
  hop_length: !!int 240
  sampling_rate: !!int 24000
  segment_size: !!int 9600




#optimizer & scheduler
optimizer:
generator:
lr: !!float 1e-4
betas: [0.8,0.99]
eps: !!float 1e-9
discriminator:
lr: !!float 1e-4
betas: [0.8,0.99]
eps: !!float 1e-9
scheduler:
generator:
lr_decay: !!float 0.999875
discriminator:
lr_decay: !!float 0.999875

# loss hyper-parameters
losses:
mel: !!int 45
kl: !!int 1
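
As shown above, the two new configs differ only in the linguistic encoder and its input dimension; a quick way to confirm that from the repository root (using the paths added in this commit) is:

diff configs/vctk_conformerppg_uttdvec_ppgvcf0_vits_none.yaml \
     configs/vctk_hubertsoft_uttdvec_ppgvcf0_vits_none.yaml
# expected differences: ling_enc (conformer_ppg vs hubert_soft) and
# decoder_params input_dim (144 vs 256, presumably the two encoders' feature dimensions)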







6 changes: 3 additions & 3 deletions submit_train.sh
@@ -6,9 +6,9 @@ conda_env=torch_1.9

#choose config
dataset=vctk
-ling=vqwav2vec
+#ling=vqwav2vec
#ling=conformerppg
-#ling=hubertsoft
+ling=hubertsoft
spk=uttdvec
pros=ppgvcf0
#dec=fs2
@@ -19,7 +19,7 @@ dec=vits
#vocoder=ppgvchifigan
vocoder=none

-exp_name=vctk_no16fp_split
+exp_name=vctk_first_train
config=configs/${dataset}_${ling}_${spk}_${pros}_${dec}_${vocoder}.yaml
if [ ! -e $config ] ; then
echo "can't find config file $config"
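With the selections in this hunk (dataset=vctk, ling=hubertsoft, spk=uttdvec, pros=ppgvcf0, dec=vits, vocoder=none), the templated config path should expand to one of the files added in this commit (a sketch of the expansion, not output from the script):

# config=configs/${dataset}_${ling}_${spk}_${pros}_${dec}_${vocoder}.yaml
# -> configs/vctk_hubertsoft_uttdvec_ppgvcf0_vits_none.yaml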
