submit_inference, hubert_soft training, scaler for taco_ar and taco_mol, ling_rep upsample in dataset.py, libritts_hifigan vocoder

MingjieChen committed Feb 10, 2023
1 parent 2e15e92 commit 5eb43f5
Showing 17 changed files with 538 additions and 24 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -10,6 +10,7 @@ submit_gpu.sh
downloads/
ling_encoder/vqwav2vec/vq-wav2vec_kmeans.pt
exp/
+ vocoder/libritts_hifigan/checkpoint-600000steps.pkl

# Byte-compiled / optimized / DLL files
__pycache__/
6 changes: 3 additions & 3 deletions bin/inference.sh
@@ -5,11 +5,11 @@ source $conda/bin/activate $conda_env

echo "sge_task_id $SGE_TASK_ID"
python inference.py \
- --exp_dir exp/vqw2v_uttdvec_none_fastspeech2/first_train \
+ --exp_dir exp/vqw2v_uttdvec_none_tacoar/first_train \
--eval_list data/libritts/eval_clean/eval_list_oneshot_vc_small.json \
- --epochs 95 \
+ --epochs 72 \
--task oneshot_vc \
- --vocoder ppg_vc_hifigan \
+ --vocoder libritts_hifigan \
--device cpu \
--sge_task_id $SGE_TASK_ID \
--sge_n_tasks 50
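The switch to --vocoder libritts_hifigan pairs with the checkpoint path added to .gitignore above (vocoder/libritts_hifigan/checkpoint-600000steps.pkl). The checkpoint-*steps.pkl naming follows the kan-bayashi/ParallelWaveGAN convention, so loading and running such a vocoder could look roughly like the sketch below; whether inference.py actually uses parallel_wavegan, and the mel shape, are assumptions here rather than facts from this diff.

import torch
from parallel_wavegan.utils import load_model  # assumed dependency, not confirmed by this diff

# Load the generator; load_model() picks up the config.yml stored next to the checkpoint.
vocoder = load_model("vocoder/libritts_hifigan/checkpoint-600000steps.pkl")
vocoder.remove_weight_norm()
vocoder.eval()

with torch.no_grad():
    mel = torch.randn(200, 80)      # placeholder mel, (frames, n_mels)
    wav = vocoder.inference(mel)    # synthesized waveform tensor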
2 changes: 1 addition & 1 deletion configs/conformerppg_uttdvec_none_tacoar.yaml
@@ -16,7 +16,7 @@ decoder: TacoAR
# training
fp16_run: !!bool False
epochs: 200
- save_freq: 4 # save ckpt frequency
+ save_freq: 2 # save ckpt frequency
show_freq: 10
load_only_params: !!bool False
seed: !!int 1234
2 changes: 1 addition & 1 deletion configs/conformerppg_uttdvec_none_tacomol.yaml
@@ -16,7 +16,7 @@ decoder: TacoMOL
# training
fp16_run: !!bool False
epochs: 200
- save_freq: 5 # save ckpt frequency
+ save_freq: 2 # save ckpt frequency
show_freq: 10
load_only_params: !!bool False
seed: !!int 1234
84 changes: 84 additions & 0 deletions configs/hubertsoft_uttdvec_none_fastspeech2.yaml
@@ -0,0 +1,84 @@
# experiment
dataset: libritts
train_meta: data/libritts/train_nodev_clean/metadata.csv
dev_meta: data/libritts/dev_clean/metadata.csv
train_set: train_nodev_clean
dev_set: dev_clean


# encoder-decoder
ling_enc: hubert_soft
spk_enc: utt_dvec
pros_enc: f0
decoder: FastSpeech2


# training
fp16_run: !!bool True
epochs: 200
save_freq: 5 # save ckpt frequency
show_freq: 100 # show training information frequency
load_only_params: !!bool False
seed: !!int 1234
trainer: FS2Trainer
ngpu: 2

#dataloader
sort: !!bool True
dump_dir: dump
num_workers: !!int 8
batch_size: 32
drop_last: !!bool True
rm_long_utt: !!bool True # remove too long utterances from metadata
max_utt_duration: !!float 10.0 # max utterance duration (seconds)


# decoder params
decoder_params:
  max_len: 1000
  max_seq_len: 1000
  spk_emb_dim: 256

  prenet:
    conv_kernel_size: 3
    input_dim: 256
    dropout: 0.1
  postnet:
    idim: 80
    odim: 80
    n_layers: 5
    n_filts: 5
    n_chans: 256
    dropout_rate: 0.5
  transformer:
    encoder_layer: 4
    encoder_head: 2
    encoder_hidden: 256
    decoder_layer: 4
    decoder_head: 2
    decoder_hidden: 256
    conv_filter_size: 1024
    conv_kernel_size: [3, 1]
    encoder_dropout: 0.1
    decoder_dropout: 0.1

#optimizer & scheduler
optimizer:
  init_lr: !!float 1e-2
  betas: [0.9,0.99]
  weight_decay: 0.0
scheduler:
  warm_up_step: 4000
  anneal_steps: [800000, 900000, 1000000]
  anneal_rate: 0.3

# loss hyper-parameters
loss:
  alpha: 1.







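The optimizer/scheduler block above matches the usual FastSpeech2 scheduled learning rate: Noam-style warmup over warm_up_step steps, inverse-square-root decay afterwards, and a multiplication by anneal_rate once each anneal milestone is passed. A minimal sketch of that rule, assuming the trainer follows the standard FastSpeech2 ScheduledOptim behaviour (the actual FS2Trainer implementation is not shown in this commit):

def scheduled_lr(step, init_lr=1e-2, warm_up_step=4000,
                 anneal_steps=(800000, 900000, 1000000), anneal_rate=0.3):
    # Noam-style warmup, then inverse-square-root decay (step counting starts at 1).
    lr = init_lr * min(step ** -0.5, step * warm_up_step ** -1.5)
    # Shrink the rate after each anneal milestone has been passed.
    for milestone in anneal_steps:
        if step > milestone:
            lr *= anneal_rate
    return lr

# e.g. scheduled_lr(4000) is about 1.6e-4 at the end of warmup.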
72 changes: 72 additions & 0 deletions configs/hubertsoft_uttdvec_none_tacoar.yaml
@@ -0,0 +1,72 @@
# experiment
dataset: libritts
train_meta: data/libritts/train_nodev_clean/metadata.csv
dev_meta: data/libritts/dev_clean/metadata.csv
train_set: train_nodev_clean
dev_set: dev_clean


# encoder-decoder
ling_enc: hubert_soft
spk_enc: utt_dvec
pros_enc: f0
decoder: TacoAR


# training
fp16_run: !!bool True
epochs: 200
save_freq: 2 # save ckpt frequency
show_freq: 10
load_only_params: !!bool False
seed: !!int 1234
trainer: TacoARTrainer
ngpu: 2

#dataloader
sort: !!bool True
dump_dir: dump
num_workers: !!int 8
batch_size: 32
drop_last: !!bool True
rm_long_utt: !!bool True # remove too long utterances from metadata
max_utt_duration: !!float 10.0 # max utterance duration (seconds)


# decoder params
decoder_params:
  input_dim: 256
  output_dim: 80
  resample_ratio: 1
  spk_emb_integration_type: concat # add or concat
  spk_emb_dim: 256
  ar: True
  encoder_type: "taco2"
  hidden_dim: 1024
  prenet_layers: 2 # if set 0, no prenet is used
  prenet_dim: 256
  prenet_dropout_rate: 0.5
  lstmp_layers: 2
  lstmp_dropout_rate: 0.2
  lstmp_proj_dim: 256
  lstmp_layernorm: False

#optimizer & scheduler
optimizer:
  weight_decay: 0.0
  betas: [0.9,0.99]
  lr: !!float 1e-4
scheduler:
  num_training_steps: 500000
  num_warmup_steps: 4000

# loss hyper-parameters
loss:
  alpha: 1.







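Unlike the FastSpeech2 config, the TacoAR (and TacoMOL) scheduler block exposes num_warmup_steps / num_training_steps, the signature of a linear warmup-then-linear-decay schedule. A hedged sketch of wiring those fields up; the use of Hugging Face transformers here is an assumption, and the repo may well build an equivalent scheduler itself:

import torch
from transformers import get_linear_schedule_with_warmup  # assumed helper

model = torch.nn.Linear(256, 80)  # stand-in module, not the real TacoAR decoder
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4,
                             betas=(0.9, 0.99), weight_decay=0.0)
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=4000, num_training_steps=500000)

# Call scheduler.step() after every optimizer.step(): the learning rate ramps up
# over the first 4k steps, then decays linearly towards zero at 500k steps.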
60 changes: 60 additions & 0 deletions configs/hubertsoft_uttdvec_none_tacomol.yaml
@@ -0,0 +1,60 @@
# experiment
dataset: libritts
train_meta: data/libritts/train_nodev_clean/metadata.csv
dev_meta: data/libritts/dev_clean/metadata.csv
train_set: train_nodev_clean
dev_set: dev_clean


# encoder-decoder
ling_enc: hubert_soft
spk_enc: utt_dvec
pros_enc: f0
decoder: TacoMOL


# training
fp16_run: !!bool True
epochs: 200
save_freq: 2 # save ckpt frequency
show_freq: 10
load_only_params: !!bool False
seed: !!int 1234
trainer: TacoMOLTrainer
ngpu: 2

#dataloader
sort: !!bool True
dump_dir: dump
num_workers: !!int 8
batch_size: 32
drop_last: !!bool True
rm_long_utt: !!bool True # remove too long utterances from metadata
max_utt_duration: !!float 10.0 # max utterance duration (seconds)
frames_per_step: !!int 4


# decoder params
decoder_params:
  spk_embed_dim: 256
  bottle_neck_feature_dim: 256

#optimizer & scheduler
optimizer:
  weight_decay: !!float 1e-6
  betas: [0.9,0.99]
  lr: !!float 1e-4
scheduler:
  num_training_steps: 500000
  num_warmup_steps: 4000

# loss hyper-parameters
loss:
  alpha: 1.







2 changes: 1 addition & 1 deletion configs/vqw2v_uttdvec_none_tacoar.yaml
@@ -16,7 +16,7 @@ decoder: TacoAR
# training
fp16_run: !!bool False
epochs: 200
- save_freq: 4 # save ckpt frequency
+ save_freq: 2 # save ckpt frequency
show_freq: 10
load_only_params: !!bool False
seed: !!int 1234
2 changes: 1 addition & 1 deletion configs/vqw2v_uttdvec_none_tacomol.yaml
@@ -16,7 +16,7 @@ decoder: TacoMOL
# training
fp16_run: !!bool False
epochs: 200
- save_freq: 5 # save ckpt frequency
+ save_freq: 2 # save ckpt frequency
show_freq: 10
load_only_params: !!bool False
seed: !!int 1234
10 changes: 9 additions & 1 deletion dataset.py
@@ -149,10 +149,18 @@ def __getitem__(self, idx):
        pros_rep = np.expand_dims(np.load(pros_rep_path), axis = 1)
        pros_duration = pros_rep.shape[0]

+        # up_sample ling_rep to 10hz, in case some ling_rep are 50hz or 25hz.
+        factor = int(round(mel_duration / ling_duration))
+        repeated_ling_rep = np.repeat(ling_rep, factor, axis=1)
+        ling_rep = np.reshape(repeated_ling_rep, [ling_duration * factor, ling_rep.shape[1]])
+        ling_duration = ling_rep.shape[0]


        # match length between mel and ling_rep
-        if mel_duration > ling_duration:
+        if mel_duration > ling_duration :
            pad_vec = np.expand_dims(ling_rep[-1,:], axis = 0)
            ling_rep = np.concatenate((ling_rep, np.repeat(pad_vec, mel_duration - ling_duration, 0)),0)

        elif mel_duration < ling_duration:
            ling_rep = ling_rep[:mel_duration,:]

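The new block upsamples ling_rep when the linguistic encoder runs at a lower frame rate than the mel (e.g. HuBERT-soft or vq-wav2vec features at 50 Hz or 25 Hz), then pads or trims so both sequences end up with mel_duration frames. A standalone sketch of that length matching, written here with frame repetition along the time axis; it illustrates the idea and is not a copy of the repository code:

import numpy as np

def match_length(ling_rep: np.ndarray, mel_duration: int) -> np.ndarray:
    """Upsample ling_rep of shape (T_ling, D) by frame repetition, then pad/trim to mel_duration frames."""
    ling_duration = ling_rep.shape[0]
    factor = max(1, int(round(mel_duration / ling_duration)))
    ling_rep = np.repeat(ling_rep, factor, axis=0)   # repeat every frame `factor` times along time
    ling_duration = ling_rep.shape[0]
    if mel_duration > ling_duration:                 # pad with copies of the last frame
        pad_vec = np.expand_dims(ling_rep[-1, :], axis=0)
        pad = np.repeat(pad_vec, mel_duration - ling_duration, axis=0)
        ling_rep = np.concatenate((ling_rep, pad), axis=0)
    elif mel_duration < ling_duration:               # trim any extra frames
        ling_rep = ling_rep[:mel_duration, :]
    return ling_rep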
21 changes: 15 additions & 6 deletions decoder/taco_ar/trainer.py
@@ -143,7 +143,7 @@ def _train_epoch(self):

        train_losses = defaultdict(list)
        self.model.train()
-        #scaler = torch.cuda.amp.GradScaler() if (('cuda' in str(self.device)) and self.fp16_run) else None
+        scaler = torch.cuda.amp.GradScaler() if (('cuda' in str(self.device)) and self.fp16_run) else None


        for train_steps_per_epoch, batch in tqdm(enumerate(self.train_dataloader, 1)):
@@ -160,11 +160,20 @@
            #print(f'shapes {shapes}', flush = True)

            self.optimizer.zero_grad()
-            loss, losses = compute_loss(self.model, _batch, self.objective)
-            self.timer.cnt('fw')
-            loss.backward()
-            self.optimizer.step()
-            self.timer.cnt('bw')
+            if scaler is not None:
+                with torch.cuda.amp.autocast():
+                    loss, losses = compute_loss(self.model, _batch, self.objective)
+                self.timer.cnt('fw')
+                scaler.scale(loss).backward()
+                scaler.step(self.optimizer)
+                scaler.update()
+                self.timer.cnt('bw')
+            else:
+                loss, losses = compute_loss(self.model, _batch, self.objective)
+                self.timer.cnt('fw')
+                loss.backward()
+                self.optimizer.step()
+                self.timer.cnt('bw')

            loss_string = f"epoch: {self.epochs}| iters: {self.iters}| timer: {self.timer.show()}|"
            for key in losses:
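The edit above turns on mixed-precision training: when fp16_run is set and the model is on CUDA, the forward pass runs under torch.cuda.amp.autocast and backward/step go through a GradScaler; otherwise the original fp32 path is kept. The same behaviour can also be written without the if/else branch, since a disabled scaler and autocast fall back to the plain fp32 path. A minimal sketch (compute_loss and the argument order mirror the trainer; the rest is illustrative):

import torch

def train_step(model, batch, objective, optimizer, scaler, compute_loss):
    """One optimization step; with a disabled scaler this reduces to the plain fp32 path."""
    optimizer.zero_grad()
    with torch.cuda.amp.autocast(enabled=scaler.is_enabled()):
        loss, losses = compute_loss(model, batch, objective)
    scaler.scale(loss).backward()   # scaling is a no-op when AMP is disabled
    scaler.step(optimizer)          # falls back to optimizer.step() when disabled
    scaler.update()
    return loss, losses

# created once per run, mirroring the trainer's condition:
# scaler = torch.cuda.amp.GradScaler(enabled=('cuda' in str(device)) and fp16_run)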
23 changes: 17 additions & 6 deletions decoder/taco_mol/trainer.py
@@ -143,7 +143,7 @@ def _train_epoch(self):

        train_losses = defaultdict(list)
        self.model.train()
-        #scaler = torch.cuda.amp.GradScaler() if (('cuda' in str(self.device)) and self.fp16_run) else None
+        scaler = torch.cuda.amp.GradScaler() if (('cuda' in str(self.device)) and self.fp16_run) else None


        for train_steps_per_epoch, batch in tqdm(enumerate(self.train_dataloader, 1)):
@@ -159,12 +159,23 @@
            self.timer.cnt("rd")
            #print(f'shapes {shapes}', flush = True)


            self.optimizer.zero_grad()
-            loss, losses = compute_loss(self.model, _batch, self.objective)
-            self.timer.cnt('fw')
-            loss.backward()
-            self.optimizer.step()
-            self.timer.cnt('bw')

+            if scaler is not None:
+                with torch.cuda.amp.autocast():
+                    loss, losses = compute_loss(self.model, _batch, self.objective)
+                self.timer.cnt('fw')
+                scaler.scale(loss).backward()
+                scaler.step(self.optimizer)
+                scaler.update()
+                self.timer.cnt('bw')
+            else:
+                loss, losses = compute_loss(self.model, _batch, self.objective)
+                self.timer.cnt('fw')
+                loss.backward()
+                self.optimizer.step()
+                self.timer.cnt('bw')

            loss_string = f"epoch: {self.epochs}| iters: {self.iters}| timer: {self.timer.show()}|"
            for key in losses: