Commit 51f9013
1. fix ling_rep upsample bug; 2. finish utmos scripts; 3. finish asr eval scripts

MingjieChen committed Mar 7, 2023
1 parent c9ac636 commit 51f9013
Showing 21 changed files with 851 additions and 217 deletions.
17 changes: 10 additions & 7 deletions bin/generate_eval_list_libritts.sh → bin/generate_eval_list.sh
@@ -3,12 +3,15 @@
 task=vc
 dataset=vctk
 split=eval_all
-eval_list=eval_list_m2m_vc_small.json
-echo "update eval metadata with wrd"
-# update eval metadata with text transcriptions
-python3 evaluation/update_metadata_${dataset}.py \
-    --metadata_path data/${dataset}/${split}/metadata.csv \
-    --out_path data/${dataset}/${split}/metadata_with_wrd.csv
+eval_list=eval_list_m2m_vc_small_oneshot.json
+
+if [ ! -e data/$dataset/$split/metadata_with_wrd.csv ]; then
+    echo "update eval metadata with wrd"
+    # update eval metadata with text transcriptions
+    python3 evaluation/update_metadata_${dataset}.py \
+        --metadata_path data/${dataset}/${split}/metadata.csv \
+        --out_path data/${dataset}/${split}/metadata_with_wrd.csv
+fi
 
 
 echo "done!"
@@ -22,7 +25,7 @@ python3 evaluation/generate_eval_list.py \
     --speakers_path data/$dataset/$split/speakers.txt \
     --eval_metadata_path data/$dataset/$split/metadata_with_wrd.csv \
     --eval_list_out_path data/$dataset/$split/${eval_list} \
-    --n_samples_per_trg_speaker 10 \
+    --n_samples_per_trg_speaker 1 \
     --n_eval_speakers 10 \
     --n_samples_per_src_speaker 4
 echo "done!"
19 changes: 19 additions & 0 deletions bin/whisper_ppg_largev2_feature_extraction.sh
@@ -0,0 +1,19 @@
#!/bin/bash



splits=$1
dataset=$2
for split in $splits ; do

echo "[whisper_ppg_largev2 feature extraction]: $split for libritts"
python3 ling_encoder/whisper_ppg/whisper_ppg_feature_extract.py \
--ckpt ling_encoder/whisper_ppg/ckpt/large-v2.pt \
--metadata data/$dataset/metadata.csv \
--dump_dir dump/$dataset \
--split $split \
--max_workers 20
--ext largev2
done
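The new single-job scripts take the splits and the dataset as positional arguments; a sketch of a typical invocation, assuming VCTK metadata has been prepared under data/vctk and the large-v2 checkpoint is in place:

bash bin/whisper_ppg_largev2_feature_extraction.sh "train_nodev_all dev_all eval_all" vctk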


@@ -14,8 +14,8 @@ for split in $splits ; do
 echo "[whisper_ppg feature extraction]: $split for $dataset"
 speakers=$(cat data/$dataset/$split/speakers.txt)
 for spk in $speakers ; do
-b=$script_dir/whisper_ppg_feature_extraction_${split}_${spk}.sh
-l=logs/whisper_ppg_feature_extraction_${split}_${spk}.log
+b=$script_dir/whisper_ppg_largev2_feature_extraction_${split}_${spk}.sh
+l=logs/whisper_ppg_largev2_feature_extraction_${split}_${spk}.log
 cat <<EOF > $b
 #!/bin/bash
 source $conda/bin/activate $conda_env
@@ -25,10 +25,11 @@ python3 ling_encoder/whisper_ppg/whisper_ppg_feature_extract.py \
     --dump_dir dump/$dataset \
     --split $split \
     --max_workers 20 \
-    --speaker $spk
+    --speaker $spk \
+    --ext largev2
 EOF
 chmod +x $b
-submitjob -m 30000 $l $b
+submitjob -q LONG -m 30000 $l $b
 echo "submitjob for $spk see log $l"
 done
 done
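For illustration, the heredoc above expands to a per-speaker job script roughly like the following. Speaker p225 and split eval_all are hypothetical example values, and the --ckpt/--metadata lines sit above the shown hunk, so they are assumed here to follow the medium/small multi-job variants:

#!/bin/bash
source /share/mini1/sw/std/python/anaconda3-2019.07/v3.7/bin/activate torch_1.9
python3 ling_encoder/whisper_ppg/whisper_ppg_feature_extract.py \
    --ckpt ling_encoder/whisper_ppg/ckpt/large-v2.pt \
    --metadata data/vctk/eval_all/metadata.csv \
    --dump_dir dump/vctk \
    --split eval_all \
    --max_workers 20 \
    --speaker p225 \
    --ext largev2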
@@ -6,13 +6,14 @@ splits=$1
 dataset=$2
 for split in $splits ; do
 
-echo "[vqwav2vec feature extraction]: $split for libritts"
+echo "[whisper_ppg_medium feature extraction]: $split for $dataset"
 python3 ling_encoder/whisper_ppg/whisper_ppg_feature_extract.py \
-    --vqwav2vec_ckpt ling_encoder/whisper_ppg/ckpt/large-v2.pt \
+    --ckpt ling_encoder/whisper_ppg/ckpt/medium.pt \
     --metadata data/$dataset/metadata.csv \
     --dump_dir dump/$dataset \
     --split $split \
-    --max_workers 20
+    --max_workers 20 \
+    --ext medium
 done


35 changes: 35 additions & 0 deletions bin/whisper_ppg_medium_feature_extraction_multi_jobs.sh
@@ -0,0 +1,35 @@
#!/bin/bash

conda=/share/mini1/sw/std/python/anaconda3-2019.07/v3.7
conda_env=torch_1.9

dataset=vctk
splits="train_nodev_all dev_all eval_all"

script_dir=scripts/$dataset/whisper_ppg
[ ! -e $script_dir ] && mkdir -p $script_dir
[ ! -e logs ] && mkdir logs
for split in $splits ; do

echo "[whisper_ppgfeature extraction]: $split for $dataset"
speakers=$(cat data/$dataset/$split/speakers.txt)
for spk in $speakers ; do
b=$script_dir/whisper_ppg_medium_feature_extraction_${split}_${spk}.sh
l=logs/whisper_ppg_medium_feature_extraction_${split}_${spk}.log
cat <<EOF > $b
#!/bin/bash
source $conda/bin/activate $conda_env
python3 ling_encoder/whisper_ppg/whisper_ppg_feature_extract.py \
--ckpt ling_encoder/whisper_ppg/ckpt/medium.pt \
--metadata data/$dataset/$split/metadata.csv \
--dump_dir dump/$dataset \
--split $split \
--max_workers 20 \
--speaker $spk \
--ext medium
EOF
chmod +x $b
submitjob -m 10000 $l $b
echo "submitjob for $spk see log $l"
done
done
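These multi-job scripts take no arguments (dataset and splits are hardcoded at the top), so the whole grid submission is a single command:

bash bin/whisper_ppg_medium_feature_extraction_multi_jobs.sh

submitjob appears to be a site-specific cluster submission wrapper (note the /share/mini1 paths); presumably -m sets the memory request in MB and -q the queue, but check the local cluster documentation before relying on that reading.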
19 changes: 19 additions & 0 deletions bin/whisper_ppg_small_feature_extraction.sh
@@ -0,0 +1,19 @@
#!/bin/bash



splits=$1
dataset=$2
for split in $splits ; do

echo "[whisper_ppg_small feature extraction]: $split for $dataset"
python3 ling_encoder/whisper_ppg/whisper_ppg_feature_extract.py \
    --ckpt ling_encoder/whisper_ppg/ckpt/small.pt \
    --metadata data/$dataset/metadata.csv \
    --dump_dir dump/$dataset \
    --split $split \
    --max_workers 20 \
    --ext small
done


35 changes: 35 additions & 0 deletions bin/whisper_ppg_small_feature_extraction_multi_jobs.sh
@@ -0,0 +1,35 @@
#!/bin/bash

conda=/share/mini1/sw/std/python/anaconda3-2019.07/v3.7
conda_env=torch_1.9

dataset=vctk
splits="train_nodev_all dev_all eval_all"

script_dir=scripts/$dataset/whisper_ppg
[ ! -e $script_dir ] && mkdir -p $script_dir
[ ! -e logs ] && mkdir logs
for split in $splits ; do

echo "[whisper_ppgfeature extraction]: $split for $dataset"
speakers=$(cat data/$dataset/$split/speakers.txt)
for spk in $speakers ; do
b=$script_dir/whisper_ppg_small_feature_extraction_${split}_${spk}.sh
l=logs/whisper_ppg_small_feature_extraction_${split}_${spk}.log
cat <<EOF > $b
#!/bin/bash
source $conda/bin/activate $conda_env
python3 ling_encoder/whisper_ppg/whisper_ppg_feature_extract.py \
--ckpt ling_encoder/whisper_ppg/ckpt/small.pt \
--metadata data/$dataset/$split/metadata.csv \
--dump_dir dump/$dataset \
--split $split \
--max_workers 20 \
--speaker $spk \
--ext small
EOF
chmod +x $b
submitjob -m 10000 $l $b
echo "submitjob for $spk see log $l"
done
done
100 changes: 100 additions & 0 deletions configs/vctk_contentvec100_uttdvec_ppgvcf0_vits_none.yaml
@@ -0,0 +1,100 @@
# experiment
dataset: vctk
train_meta: data/vctk/train_nodev_all/metadata.csv
dev_meta: data/vctk/dev_all/metadata.csv
train_set: train_nodev_all
dev_set: dev_all


# encoder-decoder
ling_enc: contentvec_100
spk_enc: utt_dvec
pros_enc: ppgvc_f0
decoder: VITS
mel_type: vits_spec


# training
fp16_run: !!bool False
epochs: 200
save_freq: 1 # save ckpt frequency
show_freq: 100 # show training information frequency
load_only_params: !!bool False
seed: !!int 1234
trainer: VITSTrainer
ngpu: 2

#dataloader
dataset_class: VITSDataset
sampling_rate: !!int 24000
vits_hop_size: !!int 240
spec_max_len: !!int 240
sort: !!bool True
dump_dir: dump
num_workers: !!int 4
batch_size: !!int 12
drop_last: !!bool True
rm_long_utt: !!bool False # remove too long utterances from metadata
max_utt_duration: !!float 10.0 # max utterance duration (seconds)


# decoder params
decoder_params:
  spk_emb_dim: 256
  prosodic_rep_type: continuous
  prosodic_net:
    hidden_dim: 192
  input_dim: !!int 512
  spec_channels: !!int 513
  inter_channels: !!int 192
  hidden_channels: !!int 192
  filter_channels: !!int 768
  n_heads: !!int 2
  n_layers: !!int 6
  kernel_size: !!int 3
  p_dropout: !!float 0.1
  resblock: 1
  resblock_kernel_sizes: [3,7,11]
  resblock_dilation_sizes: [[1,3,5], [1,3,5], [1,3,5]]
  upsample_rates: [10,6,2,2]
  upsample_initial_channel: !!int 512
  upsample_kernel_sizes: [20, 12, 4, 4]
  n_layers_q: !!int 3
  use_spectral_norm: !!bool False
  filter_length: !!int 1024
  n_mels_channels: !!int 80
  win_length: !!int 1024
  hop_length: !!int 240
  sampling_rate: !!int 24000
  segment_size: !!int 9600




#optimizer & scheduler
optimizer:
  generator:
    lr: !!float 1e-4
    betas: [0.8,0.99]
    eps: !!float 1e-9
  discriminator:
    lr: !!float 1e-4
    betas: [0.8,0.99]
    eps: !!float 1e-9
scheduler:
  generator:
    lr_decay: !!float 0.999875
  discriminator:
    lr_decay: !!float 0.999875

# loss hyper-parameters
losses:
  mel: !!int 45
  kl: !!int 1
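A quick consistency check on the vocoder geometry in this config: the upsample_rates 10 * 6 * 2 * 2 multiply to 240, matching hop_length and vits_hop_size, so each spectrogram frame maps back to 240 waveform samples (10 ms at the 24000 Hz sampling rate), and segment_size 9600 corresponds to 40 frames per training segment.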







100 changes: 100 additions & 0 deletions configs/vctk_contentvec500_uttdvec_ppgvcf0_vits_none.yaml
@@ -0,0 +1,100 @@
# experiment
dataset: vctk
train_meta: data/vctk/train_nodev_all/metadata.csv
dev_meta: data/vctk/dev_all/metadata.csv
train_set: train_nodev_all
dev_set: dev_all


# encoder-decoder
ling_enc: contentvec_500
spk_enc: utt_dvec
pros_enc: ppgvc_f0
decoder: VITS
mel_type: vits_spec


# training
fp16_run: !!bool False
epochs: 200
save_freq: 1 # save ckpt frequency
show_freq: 100 # show training information frequency
load_only_params: !!bool False
seed: !!int 1234
trainer: VITSTrainer
ngpu: 2

#dataloader
dataset_class: VITSDataset
sampling_rate: !!int 24000
vits_hop_size: !!int 240
spec_max_len: !!int 240
sort: !!bool True
dump_dir: dump
num_workers: !!int 4
batch_size: !!int 12
drop_last: !!bool True
rm_long_utt: !!bool False # remove too long utterances from metadata
max_utt_duration: !!float 10.0 # max utterance duration (seconds)


# decoder params
decoder_params:
  spk_emb_dim: 256
  prosodic_rep_type: continuous
  prosodic_net:
    hidden_dim: 192
  input_dim: !!int 512
  spec_channels: !!int 513
  inter_channels: !!int 192
  hidden_channels: !!int 192
  filter_channels: !!int 768
  n_heads: !!int 2
  n_layers: !!int 6
  kernel_size: !!int 3
  p_dropout: !!float 0.1
  resblock: 1
  resblock_kernel_sizes: [3,7,11]
  resblock_dilation_sizes: [[1,3,5], [1,3,5], [1,3,5]]
  upsample_rates: [10,6,2,2]
  upsample_initial_channel: !!int 512
  upsample_kernel_sizes: [20, 12, 4, 4]
  n_layers_q: !!int 3
  use_spectral_norm: !!bool False
  filter_length: !!int 1024
  n_mels_channels: !!int 80
  win_length: !!int 1024
  hop_length: !!int 240
  sampling_rate: !!int 24000
  segment_size: !!int 9600




#optimizer & scheduler
optimizer:
  generator:
    lr: !!float 1e-4
    betas: [0.8,0.99]
    eps: !!float 1e-9
  discriminator:
    lr: !!float 1e-4
    betas: [0.8,0.99]
    eps: !!float 1e-9
scheduler:
  generator:
    lr_decay: !!float 0.999875
  discriminator:
    lr_decay: !!float 0.999875

# loss hyper-parameters
losses:
  mel: !!int 45
  kl: !!int 1
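This config is identical to vctk_contentvec100_uttdvec_ppgvcf0_vits_none.yaml above except for the ling_enc field (contentvec_500 instead of contentvec_100); everything downstream of the linguistic encoder is shared.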






