Skip to content

Commit

Permalink
data_loader, fs2 trainer, train.py
Browse files Browse the repository at this point in the history
  • Loading branch information
MingjieChen committed Jan 9, 2023
1 parent dab400c commit 10c78dd
Show file tree
Hide file tree
Showing 71 changed files with 1,211 additions and 252 deletions.
6 changes: 3 additions & 3 deletions bin/conformer_ppg_feature_extract.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,9 @@ splits="train_nodev_clean dev_clean eval_clean"
for split in $splits ; do

echo "[conformer_ppg feature extraction]: $split for libritts"
python3 content_encoder/conformer_ppg/conformer_ppg_feature_extract.py \
--conformer_ppg_config content_encoder/conformer_ppg/conformer_ppg_model/en_conformer_ctc_att/config.yaml\
--conformer_ppg_ckpt content_encoder/conformer_ppg/conformer_ppg_model/en_conformer_ctc_att/24epoch.pth \
python3 ling_encoder/conformer_ppg/conformer_ppg_feature_extract.py \
--conformer_ppg_config ling_encoder/conformer_ppg/conformer_ppg_model/en_conformer_ctc_att/config.yaml\
--conformer_ppg_ckpt ling_encoder/conformer_ppg/conformer_ppg_model/en_conformer_ctc_att/24epoch.pth \
--metadata data/libritts/metadata.csv \
--dump_dir dump/libritts \
--split $split \
Expand Down
6 changes: 3 additions & 3 deletions bin/conformer_ppg_feature_extract_multi_jobs.sh
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,9 @@ for split in $splits ; do
cat <<EOF > $b
#!/bin/bash
source $conda/bin/activate $conda_env
python3 content_encoder/conformer_ppg/conformer_ppg_feature_extract.py \
--conformer_ppg_config content_encoder/conformer_ppg/conformer_ppg_model/en_conformer_ctc_att/config.yaml\
--conformer_ppg_ckpt content_encoder/conformer_ppg/conformer_ppg_model/en_conformer_ctc_att/24epoch.pth \
python3 ling_encoder/conformer_ppg/conformer_ppg_feature_extract.py \
--conformer_ppg_config ling_encoder/conformer_ppg/conformer_ppg_model/en_conformer_ctc_att/config.yaml\
--conformer_ppg_ckpt ling_encoder/conformer_ppg/conformer_ppg_model/en_conformer_ctc_att/24epoch.pth \
--metadata data/libritts/$split/metadata.csv \
--dump_dir dump/libritts \
--split $split \
Expand Down
21 changes: 12 additions & 9 deletions bin/d_vector_extract_speaker_embedding.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,17 @@
conda=/share/mini1/sw/std/python/anaconda3-2019.07/v3.7
conda_env=torch_1.7
source $conda/bin/activate $conda_env
root=$PWD
cd model/transformer_adversarial
speaker=1001
python extract_speaker_embed.py \
/share/mini1/res/t/vc/studio/tiresyn-en/libritts/ParallelWaveGAN/egs/libritts/voc1/data/ \
$root/dump/ppg-vc-spks \
speaker_encoder/ckpt/pretrained_bak_5805000.pt \
$speaker \
/share/mini1/res/t/vc/studio/tiresyn-en/libritts/ParallelWaveGAN/egs/libritts/voc1
splits="train_nodev_clean dev_clean eval_clean"

for split in $splits ; do

echo "[d_vector speaker-level extraction]: $split for libritts"
python3 speaker_encoder/d_vector/extract_speaker_embed.py \
--d_vector_ckpt speaker_encoder/d_vector/d_vector_model/ckpt/pretrained_bak_5805000.pt \
--metadata data/libritts/$split/metadata.csv \
--dump_dir dump/libritts \
--split $split \
--max_workers 20
done


32 changes: 32 additions & 0 deletions bin/d_vector_extract_speaker_embedding_multi_job.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
#!/bin/bash
# Generate and submit one cluster job per speaker that extracts
# speaker-level d-vector embeddings for every LibriTTS split.
# Expects: data/libritts/<split>/speakers.txt, per-split metadata.csv,
# the pretrained d-vector checkpoint, and a `submitjob` command on PATH.

# Conda environment activated inside each generated job script.
conda=/share/mini1/sw/std/python/anaconda3-2019.07/v3.7
conda_env=torch_1.7
# Dataset splits to process.
splits="train_nodev_clean dev_clean eval_clean"

# Where generated per-speaker job scripts and their logs are written.
script_dir=scripts/libritts/d_vector_speaker_level
[ ! -e $script_dir ] && mkdir -p $script_dir
[ ! -e logs ] && mkdir logs
for split in $splits ; do

echo "[d_vector speaker-level extraction]: $split for libritts"
speakers=$(cat data/libritts/$split/speakers.txt)
for spk in $speakers ; do
b=$script_dir/d_vector_speaker_level_${spk}.sh
l=logs/enc_dec_d_vector_speaker_level.${spk}.log
# NOTE: the here-doc delimiter is unquoted, so $conda, $conda_env, $split
# and $spk are expanded NOW, baking concrete values into the generated
# job script.
cat <<EOF > $b
#!/bin/bash
source $conda/bin/activate $conda_env
python3 speaker_encoder/d_vector/extract_speaker_embed.py \
--d_vector_ckpt speaker_encoder/d_vector/d_vector_model/ckpt/pretrained_bak_5805000.pt \
--metadata data/libritts/$split/metadata.csv \
--dump_dir dump/libritts \
--split $split \
--max_workers 20 \
--speaker $spk
EOF
chmod +x $b
# -m 10000: requested memory (MB) for the grid job.
submitjob -m 10000 $l $b
echo "submitjob for $spk"
done
done
38 changes: 10 additions & 28 deletions bin/d_vector_extract_utterance_embedding.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,35 +4,17 @@
conda=/share/mini1/sw/std/python/anaconda3-2019.07/v3.7
conda_env=torch_1.7
source $conda/bin/activate $conda_env
root=$PWD
cd model/transformer_adversarial
splits="train_nodev_clean dev_clean eval_clean"

#utters=$( ls /share/mini1/res/t/vc/studio/timap-en/vcc2020/baseline/vcc2020/groundtruth/*/E3*.wav)
#utters=$(ls $root/exp/transformer_adversarial/0310_ta_0/zs_vcc_converted_wavs_100/*/*.wav)
#utters=$(ls $root/exp/transformer_adversarial/0310_ta_0/zs_vcc_mean_wavs_60/*/*.wav)
#for utt in $utters ; do
# spk=$( basename $( dirname $utt )| cut -d'_' -f2 )
# base=$( basename $utt | sed "s/.wav//")
# echo "$utt $spk $base"
# python extract_utter_embed.py \
# $utt \
# $root/exp/transformer_adversarial/0310_ta_0/zs_vcc_converted_wavs_100/vcc_mean_spkembs/${base}.npy \
# speaker_encoder/ckpt/pretrained_bak_5805000.pt \
# #$root/dump/vcc2020-spks/${spk}_${base}.npy \
#done


#scp=/share/mini1/res/t/vc/studio/tiresyn-en/libritts/ParallelWaveGAN/egs/libritts/voc1/data/train_nodev_clean/wav.scp
#data_root=/share/mini1/res/t/vc/studio/tiresyn-en/libritts/ParallelWaveGAN/egs/libritts/voc1
#wavs=$(cat $scp | awk '{print $2}')

#for _wav in $wavs; do
# fid=$(basename $_wav | cut -d '.' -f 1)
# spk=$(echo $fid | cut -d '_' -f 1 )
# echo "$data_root/$_wav $root/dump/utt_d_vec/$spk/${fid}.npy" >> speaker_embedding_meta.txt
for split in $splits ; do

#done

python extract_utter_embed.py speaker_embedding_meta.txt speaker_encoder/ckpt/pretrained_bak_5805000.pt 24000
echo "[d_vector utterance-level extraction]: $split for libritts"
python3 speaker_encoder/d_vector/extract_utter_embed.py \
--d_vector_ckpt speaker_encoder/d_vector/d_vector_model/ckpt/pretrained_bak_5805000.pt \
--metadata data/libritts/$split/metadata.csv \
--dump_dir dump/libritts \
--split $split \
--max_workers 20
done


32 changes: 32 additions & 0 deletions bin/d_vector_extract_utterance_embedding_multi_job.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
#!/bin/bash
# Generate and submit one cluster job per speaker that extracts
# utterance-level d-vector embeddings for every LibriTTS split.
# Expects: data/libritts/<split>/speakers.txt, per-split metadata.csv,
# the pretrained d-vector checkpoint, and a `submitjob` command on PATH.

# Conda environment activated inside each generated job script.
conda=/share/mini1/sw/std/python/anaconda3-2019.07/v3.7
conda_env=torch_1.7
# Dataset splits to process.
splits="train_nodev_clean dev_clean eval_clean"

# Where generated per-speaker job scripts and their logs are written.
script_dir=scripts/libritts/d_vector_utterance_level
[ ! -e $script_dir ] && mkdir -p $script_dir
[ ! -e logs ] && mkdir logs
for split in $splits ; do

echo "[d_vector utterance-level extraction]: $split for libritts"
speakers=$(cat data/libritts/$split/speakers.txt)
for spk in $speakers ; do
b=$script_dir/d_vector_utterance_level_${spk}.sh
l=logs/enc_dec_d_vector_utterance_level.${spk}.log
# NOTE: the here-doc delimiter is unquoted, so $conda, $conda_env, $split
# and $spk are expanded NOW, baking concrete values into the generated
# job script.
cat <<EOF > $b
#!/bin/bash
source $conda/bin/activate $conda_env
python3 speaker_encoder/d_vector/extract_utter_embed.py \
--d_vector_ckpt speaker_encoder/d_vector/d_vector_model/ckpt/pretrained_bak_5805000.pt \
--metadata data/libritts/$split/metadata.csv \
--dump_dir dump/libritts \
--split $split \
--max_workers 20 \
--speaker $spk
EOF
chmod +x $b
# -m 10000: requested memory (MB) for the grid job.
submitjob -m 10000 $l $b
echo "submitjob for $spk"
done
done
4 changes: 2 additions & 2 deletions bin/vqwav2vec_feature_extraction.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,8 @@ splits="train_nodev_clean dev_clean eval_clean"
for split in $splits ; do

echo "[vqwav2vec feature extraction]: $split for libritts"
python3 content_encoder/vqwav2vec/vqwav2vec_feature_extract.py \
--vqwav2vec_ckpt content_encoder/vqwav2vec/vq-wav2vec_kmeans.pt \
python3 ling_encoder/vqwav2vec/vqwav2vec_feature_extract.py \
--vqwav2vec_ckpt ling_encoder/vqwav2vec/vq-wav2vec_kmeans.pt \
--metadata data/libritts/metadata.csv \
--dump_dir dump/libritts \
--split $split \
Expand Down
5 changes: 2 additions & 3 deletions bin/vqwav2vec_feature_extraction_multi_jobs.sh
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@ for split in $splits ; do
cat <<EOF > $b
#!/bin/bash
source $conda/bin/activate $conda_env
python3 content_encoder/vqwav2vec/vqwav2vec_feature_extract.py \
--vqwav2vec_ckpt content_encoder/vqwav2vec/vq-wav2vec_kmeans.pt \
python3 ling_encoder/vqwav2vec/vqwav2vec_feature_extract.py \
--vqwav2vec_ckpt ling_encoder/vqwav2vec/vq-wav2vec_kmeans.pt \
--metadata data/libritts/$split/metadata.csv \
--dump_dir dump/libritts \
--split $split \
Expand All @@ -28,6 +28,5 @@ EOF
chmod +x $b
submitjob -m 10000 $l $b
echo "submitjob for $spk"
break
done
done
6 changes: 6 additions & 0 deletions configs/train_config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
batch_size: !!int 10
drop_last: !!bool False
train_csv: train_nodev.csv
dev_csv: dev.csv
sort_in_batch: !!bool True
speakers: ../../speakers.json # 109 speakers in total, but only 108 are used; speaker p315 is excluded
175 changes: 175 additions & 0 deletions dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,175 @@
from torch.utils import data
import torch
import glob
import os
from os.path import join, basename, dirname, split, exists
import numpy as np
import json
import csv
import random
from torch.utils.data import DataLoader
from collections import defaultdict
def get_dataloader(config):
    """Construct the training and validation DataLoaders.

    The dataset class is resolved by name from config['dataset'].
    NOTE(review): that name is passed through eval() — only safe when the
    config file is trusted; consider a registry lookup instead.

    Returns:
        (train_loader, dev_loader) tuple of torch DataLoader objects.
    """
    dataset_cls = eval(config['dataset'])
    train_set = dataset_cls(config, config['train_csv'], config['train_set'])
    dev_set = dataset_cls(config, config['dev_csv'], config['dev_set'])

    common = dict(
        batch_size=config['batch_size'],
        num_workers=config['num_workers'],
    )
    # Shuffle only the training data; each dataset supplies its own collator.
    train_loader = DataLoader(
        train_set, shuffle=True, collate_fn=train_set.collate_fn, **common
    )
    dev_loader = DataLoader(
        dev_set, shuffle=False, collate_fn=dev_set.collate_fn, **common
    )
    return train_loader, dev_loader

def pad_1D(inputs, length=None, PAD=0):
    """Right-pad a batch of 1-D sequences to a common length.

    Args:
        inputs: iterable of 1-D array-likes (lists or np arrays).
        length: optional minimum target length; the batch is padded to
            max(length, longest sequence). (Fix: the original accepted this
            argument but silently ignored it and always used the batch max.)
        PAD: constant fill value used for padding.

    Returns:
        np.ndarray of shape (batch, target_length).
    """
    def pad_data(x, target, fill):
        # Pad on the right only.
        return np.pad(
            x, (0, target - x.shape[0]), mode='constant', constant_values=fill
        )

    max_len = max(len(x) for x in inputs)
    if length is not None:
        max_len = max(max_len, int(length))
    return np.stack([pad_data(np.asarray(x), max_len, PAD) for x in inputs])

def pad_2D(inputs, maxlen=None):
    """Pad a batch of 2-D arrays (time, dim) along the time axis.

    Args:
        inputs: iterable of 2-D np arrays with equal feature dimension.
        maxlen: optional fixed target length; defaults to the batch maximum.

    Returns:
        np.ndarray of shape (batch, target_len, dim).

    Raises:
        ValueError: if a sequence is longer than the requested max_len.
    """
    def pad(x, max_len):
        n_frames = np.shape(x)[0]
        if n_frames > max_len:
            # Fix: original message had a typo ("excceed") and a
            # hard-coded, misleading "max_len 28,352".
            raise ValueError(
                f'sequence length {n_frames} exceeds max_len {max_len}'
            )
        # Fix: pad only the time axis. The original padded both axes with a
        # scalar pad-width and then sliced the columns back off, which
        # allocated a wider array for no reason.
        return np.pad(
            x,
            ((0, max_len - n_frames), (0, 0)),
            mode='constant',
            constant_values=0,
        )

    if maxlen:
        return np.stack([pad(x, maxlen) for x in inputs])
    max_len = max(x.shape[0] for x in inputs)
    return np.stack([pad(x, max_len) for x in inputs])







class Dataset(data.Dataset):
    """Voice-conversion dataset pairing mel-spectrogram targets with
    pre-dumped linguistic, prosodic and speaker representations.

    Features are expected on disk as .npy files laid out as
    <dump_dir>/<dataset>/<split>/<feature>/<spk>/<ID>.npy, where the
    feature sub-directory names come from the config (mel, ling_enc,
    spk_enc, pros_enc).
    """

    def __init__(self, config, metadata_csv, split):
        """
        Args:
            config: dict-like training configuration.
            metadata_csv: path to a CSV with at least ID, spk, duration.
            split: split name used to locate the dump directories.
        """
        super().__init__()
        self.metadata = []

        # Read metadata; optionally drop utterances that are too long for
        # training. Fix: the original appended rows ONLY inside the
        # rm_long_utt branch, so the dataset was empty when filtering was
        # disabled.
        with open(metadata_csv) as f:
            reader = csv.DictReader(f, delimiter=',')
            for row in reader:
                if config['rm_long_utt']:
                    if float(row['duration']) <= config['max_utt_duration']:
                        self.metadata.append(row)
                else:
                    self.metadata.append(row)

        # Load the speaker inventory (name -> index mapping or list;
        # presumably produced alongside the dumps — TODO confirm format).
        with open(config['speakers']) as f:
            self.speakers = json.load(f)

        self.sort = config['sort_in_batch']
        self.batch_size = config['batch_size']
        self.drop_last = config['drop_last']
        self.use_trg_spk = config['use_trg_spk']

        # Feature directories.
        self.mel_dir = os.path.join(
            config['dump_dir'], config['dataset'], split, 'mel')
        self.ling_enc = config['ling_enc']
        self.ling_rep_dir = os.path.join(
            config['dump_dir'], config['dataset'], split, self.ling_enc)
        self.spk_enc = config['spk_enc']
        self.spk_emb_dir = os.path.join(
            config['dump_dir'], config['dataset'], split, self.spk_enc)
        self.pros_enc = config['pros_enc']
        self.pros_rep_dir = os.path.join(
            config['dump_dir'], config['dataset'], split, self.pros_enc)

    def __len__(self):
        return len(self.metadata)

    def __getitem__(self, idx):
        """Load one example and length-align all frame-level features to
        the mel-spectrogram.

        Returns:
            (mel, ling_rep, pros_rep, spk_emb, mel_duration)
        """
        row = self.metadata[idx]
        file_id = row['ID']
        spk = row['spk']

        # Feature paths. Fix: original referenced an undefined name `ID`
        # and a non-existent attribute `self.f0_dir`.
        mel_path = os.path.join(self.mel_dir, spk, file_id + '.npy')
        ling_rep_path = os.path.join(self.ling_rep_dir, spk, file_id + '.npy')
        spk_emb_path = os.path.join(self.spk_emb_dir, spk, file_id + '.npy')
        pros_rep_path = os.path.join(self.pros_rep_dir, spk, file_id + '.npy')

        assert os.path.exists(mel_path), mel_path
        assert os.path.exists(ling_rep_path), ling_rep_path
        assert os.path.exists(spk_emb_path), spk_emb_path
        # Fix: original asserted a misspelled `pros_reppath`.
        assert os.path.exists(pros_rep_path), pros_rep_path

        # Load features.
        mel = np.load(mel_path)
        mel_duration = mel.shape[0]
        ling_rep = np.load(ling_rep_path)
        ling_duration = ling_rep.shape[0]
        spk_emb = np.load(spk_emb_path)
        pros_rep = np.load(pros_rep_path)
        pros_duration = pros_rep.shape[0]

        # Align ling_rep to the mel length: pad by repeating the last
        # frame, or truncate. Fix: original used np.repeat on a 1-D row
        # (which flattens) and concatenated on axis 1, which raises a
        # shape mismatch whenever padding was needed.
        if mel_duration > ling_duration:
            pad = np.tile(ling_rep[-1:, :], (mel_duration - ling_duration, 1))
            ling_rep = np.concatenate((ling_rep, pad), axis=0)
        elif mel_duration < ling_duration:
            ling_rep = ling_rep[:mel_duration, :]

        # Same alignment for the prosodic representation.
        if mel_duration > pros_duration:
            pad = np.tile(pros_rep[-1:, :], (mel_duration - pros_duration, 1))
            pros_rep = np.concatenate((pros_rep, pad), axis=0)
        elif mel_duration < pros_duration:
            pros_rep = pros_rep[:mel_duration, :]

        return (mel, ling_rep, pros_rep, spk_emb, mel_duration)

    def collate_fn(self, data):
        """Zero-pad a list of __getitem__ tuples into batch tensors.

        Returns:
            (padded_mel, padded_ling_rep, padded_pros_rep, spk_emb_tensor,
             length, max_len)
        """
        batch_size = len(data)
        mel = [data[i][0] for i in range(batch_size)]
        ling_rep = [data[i][1] for i in range(batch_size)]
        # Fix: original re-used index 1 here, batching ling_rep twice and
        # dropping the prosodic features entirely.
        pros_rep = [data[i][2] for i in range(batch_size)]
        spk_emb = [data[i][3] for i in range(batch_size)]
        # Fix: original iterated over an undefined name `idx`.
        length = [data[i][4] for i in range(batch_size)]

        max_len = max(length)
        padded_mel = torch.FloatTensor(pad_2D(mel))
        padded_ling_rep = torch.FloatTensor(pad_2D(ling_rep))
        padded_pros_rep = torch.FloatTensor(pad_2D(pros_rep))
        spk_emb_tensor = torch.FloatTensor(np.array(spk_emb)).unsqueeze(1)
        length = torch.LongTensor(np.array(length))
        # Fix: original appended to an undefined `output` list; return the
        # batch tuple directly.
        return (padded_mel, padded_ling_rep, padded_pros_rep,
                spk_emb_tensor, length, max_len)
Loading

0 comments on commit 10c78dd

Please sign in to comment.