Skip to content

Commit

Permalink
add vcc2020 test set
Browse files Browse the repository at this point in the history
  • Loading branch information
MingjieChen committed Aug 9, 2023
1 parent c17da0c commit 6b295d8
Show file tree
Hide file tree
Showing 14 changed files with 442 additions and 39 deletions.
43 changes: 29 additions & 14 deletions bin/generate_eval_list.sh
Original file line number Diff line number Diff line change
@@ -1,28 +1,42 @@
#!/bin/bash

task=vc
dataset=libritts
split=eval_clean
eval_list=eval_list_a2a_vc_small_oneshot.json
dataset=vcc2020
split=eval
eval_list=eval_list_crosslingual_a2a_vc_oneshot.json
n_trg_spk_samples=1
n_src_spk_samples=4
n_eval_spks=10
n_src_spk_samples=25
n_eval_spks=100000

. ./parse_options.sh || exit 1;

if [ ! -e data/$dataset/$split/metadata_with_wrd.csv ]; then
echo "update eval metadata with wrd"
# update eval metadata with text transcriptions
python3 evaluation/update_metadata_${dataset}.py \
--metadata_path data/${dataset}/${split}/metadata.csv \
--out_path data/${dataset}/${split}/metadata_with_wrd.csv
fi
. ./bin/parse_options.sh || exit 1;


if [ "$dataset" != "vcc2020" ]; then
if [ ! -e data/$dataset/$split/metadata_with_wrd.csv ]; then
echo "update eval metadata with wrd"
# update eval metadata with text transcriptions
python3 evaluation/update_metadata_${dataset}.py \
--metadata_path data/${dataset}/${split}/metadata.csv \
--out_path data/${dataset}/${split}/metadata_with_wrd.csv
fi
else
if [ ! -e data/$dataset/$split/metadata_with_wrd.csv ]; then
cd data/$dataset/$split; ln -s metadata.csv metadata_with_wrd.csv; cd ../../../
fi
fi


echo "done!"

echo "generate eval list for a2a vc"
# generate eval list
opts=""
if [ "$dataset" == "vcc2020" ]; then
opts+=" --src_speakers SEF1 SEF2 SEM1 SEM2\
--trg_speakers TFF1 TFM1 TGM1 TGF1 TMM1 TMF1
"
fi
python3 evaluation/generate_eval_list.py \
--task $task \
--split $split \
Expand All @@ -32,5 +46,6 @@ python3 evaluation/generate_eval_list.py \
--eval_list_out_path data/$dataset/$split/${eval_list} \
--n_samples_per_trg_speaker $n_trg_spk_samples \
--n_eval_speakers $n_eval_spks \
--n_samples_per_src_speaker $n_src_spk_samples
--n_samples_per_src_speaker $n_src_spk_samples \
$opts
echo "done!"
21 changes: 21 additions & 0 deletions bin/preprocess_vcc2020.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
#!/bin/bash
# Run preprocess/preprocess_vcc2020.py once for each VCC2020 split.

# NOTE(review): this Anaconda location looks site-specific -- confirm before
# running on another machine.
conda=/share/mini1/sw/std/python/anaconda3-2019.07/v3.7
conda_env=torch_1.9
source "$conda/bin/activate" "$conda_env"

data_root=downloads/vcc2020
scp_dir=data/vcc2020
splits=(train_nodev dev eval)

for split in "${splits[@]}"; do
    echo "[preprocess]: $split for vcc2020"
    # Metadata for a split is written under that split's own directory.
    python3 preprocess/preprocess_vcc2020.py \
        --data_root "$data_root" \
        --scp_dir "$scp_dir" \
        --metadata_dir "$scp_dir/$split/" \
        --split "$split" \
        --max_workers 20
done


Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ max_utt_duration: !!float 10.0 # max utterance duration (seconds)

# decoder params
decoder_params:
out_dim: 80
max_len: 1000
max_seq_len: 1000
spk_emb_dim: 256
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ frames_per_step: !!int 4

# decoder params
decoder_params:
out_dim: 80
prosodic_rep_type: continuous
prosodic_net:
hidden_dim: 256
Expand Down
6 changes: 3 additions & 3 deletions configs/vctk_conformerppg_uttdvec_ppgvcf0_vits_none.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ show_freq: 100 # show training information frequency
load_only_params: !!bool False
seed: !!int 1234
trainer: VITSTrainer
ngpu: 2
ngpu: 4

#dataloader
dataset_class: VITSDataset
Expand All @@ -31,8 +31,8 @@ vits_hop_size: !!int 240
spec_max_len: !!int 240
sort: !!bool True
dump_dir: dump
num_workers: !!int 4
batch_size: !!int 12
num_workers: !!int 10
batch_size: !!int 24
drop_last: !!bool True
rm_long_utt: !!bool False # remove too long utterances from metadata
max_utt_duration: !!float 10.0 # max utterance duration (seconds)
Expand Down
68 changes: 68 additions & 0 deletions dataset/vcc2020/data_prep.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
import glob
import argparse
import os
import sys
import re


# Transcription file (under <db>/prompts) for each utterance-name prefix.
_TRANSCRIPTION_FILES = {
    'E': 'Eng_transcriptions.txt',
    'G': 'Ger_transcriptions.txt',
    'F': 'Fin_transcriptions.txt',
    'M': 'Man_transcriptions.txt',
}


def _select_wav_paths(db, spk, split, num_dev):
    """Return the wav files of speaker *spk* that belong to *split*.

    ``eval`` takes the ``[EGFM]3*.wav`` files. Every other split starts from
    the sorted ``[EGFM]1*.wav`` files followed by the sorted ``[EGFM]2*.wav``
    files; ``train_nodev`` keeps all but the last *num_dev* of them and
    ``dev`` keeps the last *num_dev*.
    """
    spk_dir = os.path.join(db, spk)
    if split == 'eval':
        return sorted(glob.glob(os.path.join(spk_dir, '[EGFM]3*.wav')))

    wav_paths = (sorted(glob.glob(os.path.join(spk_dir, '[EGFM]1*.wav')))
                 + sorted(glob.glob(os.path.join(spk_dir, '[EGFM]2*.wav'))))
    # Use an explicit cut index: negative slicing breaks for num_dev == 0
    # ([:-0] is empty and [-0:] is the whole list).
    cut = max(len(wav_paths) - num_dev, 0)
    if split == 'train_nodev':
        return wav_paths[:cut]
    if split == 'dev':
        return wav_paths[cut:]
    return wav_paths


def _load_transcriptions(path):
    """Read a transcription file into ``{utterance number: text}``.

    Each non-blank line is ``<number> <word> <word> ...``. Blank lines are
    skipped; if a number occurs more than once the last line wins.
    """
    table = {}
    with open(path) as f_trans:
        for line in f_trans:
            fields = line.split()
            if fields:
                table[fields[0]] = ' '.join(fields[1:])
    return table


def main(argv=None):
    """Append one speaker's entries to ``<data_dir>/<split>/wav.scp`` and ``text``.

    Args:
        argv: command-line arguments; ``None`` means ``sys.argv[1:]``.

    Raises:
        ValueError: if a selected wav file has no transcription.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--split', type=str)
    parser.add_argument('--db', type=str)
    parser.add_argument('--data_dir', type=str)
    parser.add_argument('--spk', type=str)
    parser.add_argument('--num_dev', type=int, default=10)
    args = parser.parse_args(argv)

    scp_path = os.path.join(args.data_dir, args.split, 'wav.scp')
    text_path = os.path.join(args.data_dir, args.split, 'text')
    os.makedirs(os.path.dirname(scp_path), exist_ok=True)

    wav_paths = _select_wav_paths(args.db, args.spk, args.split, args.num_dev)

    # Parse each language's transcription file at most once per run.
    transcriptions = {}

    # Append mode: the script is invoked once per speaker on the same files.
    with open(scp_path, 'a') as f_scp, open(text_path, 'a') as f_text:
        for wav_path in wav_paths:
            basename = os.path.basename(wav_path).split('.')[0]
            lang = basename[0]  # one of E/G/F/M, guaranteed by the glob
            trans_path = os.path.join(
                args.db, 'prompts', _TRANSCRIPTION_FILES[lang])
            if lang not in transcriptions:
                transcriptions[lang] = _load_transcriptions(trans_path)

            # The transcription key is the basename without its prefix letter.
            utt_number = basename[1:]
            if utt_number not in transcriptions[lang]:
                raise ValueError(
                    f'no transcription for {basename} in {trans_path}')

            text = transcriptions[lang][utt_number].upper()
            if lang != 'M':
                # Keep only A-Z and spaces for the non-'M' prompts.
                text = re.sub(r"[^A-Z ]", '', text)

            utt_id = f'{args.spk}_{basename}'
            f_scp.write(f'{utt_id} {wav_path}\n')
            f_text.write(f'{utt_id} {text}\n')
            print(f'vcc2020 {args.split}: ({utt_id} {wav_path}) {text}')


if __name__ == '__main__':
    main()







25 changes: 25 additions & 0 deletions dataset/vcc2020/data_prep.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
#!/bin/bash
# Rebuild the per-split wav.scp and text listings for VCC2020 by running
# dataset/vcc2020/data_prep.py once for every (split, speaker) pair.

db=downloads/vcc2020
data_dir=data/vcc2020

splits=("train_nodev" "dev" "eval")
spks=("SEF1" "SEF2" "SEM1" "SEM2" "TEF1" "TEF2" "TEM1" "TEM2" "TGM1" "TGF1" "TFM1" "TFF1" "TMM1" "TMF1")

for split in "${splits[@]}"; do
    # data_prep.py opens its outputs in append mode, so listings left over
    # from an earlier run have to be removed first.
    for stale in "$data_dir/$split/wav.scp" "$data_dir/$split/text"; do
        if [ -e "$stale" ]; then
            rm "$stale"
        fi
    done

    for spk in "${spks[@]}"; do
        python3 dataset/vcc2020/data_prep.py \
            --db "$db" \
            --split "$split" \
            --spk "$spk" \
            --data_dir "$data_dir"
    done
done



2 changes: 1 addition & 1 deletion decoder/interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,7 @@ def infer_GradTTS(model, ling, pros, spk):


ling_lengths = torch.LongTensor([ling.size(2)]).to(ling.device)
mel = model(ling, ling_lengths, spk, pros, 10)
mel = model.inference(ling, ling_lengths, spk, pros, 10)
mel = mel.transpose(1,2)
return mel

Expand Down
43 changes: 33 additions & 10 deletions evaluation/generate_eval_list.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,12 @@
parser.add_argument('--eval_list_out_path', type = str)
# task setup

n_samples_per_trg_speaker = parser.add_argument('--n_samples_per_trg_speaker', type = int)
n_eval_speakers = parser.add_argument('--n_eval_speakers', type = int)
n_samples_per_src_speaker = parser.add_argument('--n_samples_per_src_speaker', type = int)

parser.add_argument('--n_samples_per_trg_speaker', type = int)
parser.add_argument('--n_eval_speakers', type = int)
parser.add_argument('--n_samples_per_src_speaker', type = int)
parser.add_argument('--src_speakers', type = str, nargs = '+', default = None)
parser.add_argument('--trg_speakers', type = str, nargs = '+', default = None)

args = parser.parse_args()
# load in all speakers in eval set
Expand Down Expand Up @@ -76,26 +78,47 @@
selected_trg_metas = {}

# sample spk metas
for spk in selected_speakers:
if args.src_speakers is None:
src_speakers = selected_speakers[:]
else:
src_speakers = args.src_speakers[:]
if args.trg_speakers is None:
trg_speakers = selected_speakers[:]
else:
trg_speakers = args.trg_speakers[:]


for spk in src_speakers:
print(spk)
selected_src_metas[spk] = []
selected_trg_metas[spk] = []
#selected_trg_metas[spk] = []
_spk_metas = spk2wavs[spk]
_src_spk_metas_idxs = random.sample(range(0,len(_spk_metas)), k = int(args.n_samples_per_src_speaker))
_trg_spk_metas_idxs = random.sample(range(0,len(_spk_metas)), k = int(args.n_samples_per_trg_speaker))
#_trg_spk_metas_idxs = random.sample(range(0,len(_spk_metas)), k = int(args.n_samples_per_trg_speaker))
_selected_src_spk_metas = [ _spk_metas[_i] for _i in _src_spk_metas_idxs]
_selected_trg_spk_metas = [ _spk_metas[_i] for _i in _trg_spk_metas_idxs]
#_selected_trg_spk_metas = [ _spk_metas[_i] for _i in _trg_spk_metas_idxs]
selected_src_metas[spk].extend(_selected_src_spk_metas)
#selected_trg_metas[spk].extend(_selected_trg_spk_metas)
#print(f'spk {spk}| src: {len(_selected_src_spk_metas)}, trg: {len(_selected_trg_spk_metas)}')
print(f'spk {spk}| src: {len(_selected_src_spk_metas)}')

for spk in trg_speakers:
print(spk)
selected_trg_metas[spk] = []
_spk_metas = spk2wavs[spk]
_trg_spk_metas_idxs = random.sample(range(0,len(_spk_metas)), k = int(args.n_samples_per_trg_speaker))
_selected_trg_spk_metas = [ _spk_metas[_i] for _i in _trg_spk_metas_idxs]
selected_trg_metas[spk].extend(_selected_trg_spk_metas)
print(f'spk {spk}| src: {len(_selected_src_spk_metas)}, trg: {len(_selected_trg_spk_metas)}')
print(f'spk {spk}| trg: {len(_selected_trg_spk_metas)}')



# produce eval list
if args.task == 'vc':
for src_spk in selected_speakers:
for src_spk in src_speakers:
src_metas = selected_src_metas[src_spk]
for _meta in src_metas:
for trg_spk in selected_speakers:
for trg_spk in trg_speakers:
if src_spk != trg_spk:
trg_metas = selected_trg_metas[trg_spk]
trg_wavs = [_trg_meta['wav_path'] for _trg_meta in trg_metas]
Expand Down
47 changes: 43 additions & 4 deletions evaluation/speechbrain_asv.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
from speechbrain.pretrained import EncoderClassifier
from speechbrain.utils.data_utils import split_path
from speechbrain.pretrained.fetching import fetch
from copy import copy
import os
from speechbrain.utils.metric_stats import EER
import sys
Expand Down Expand Up @@ -34,11 +37,47 @@ class SpeakerRecognition(EncoderClassifier):
"embedding_model",
"mean_var_norm_emb",
]


def __init__(self, *args, **kwargs):
    """Initialise the parent class and attach a cosine-similarity module.

    All positional and keyword arguments are forwarded unchanged to the
    parent constructor.
    """
    super().__init__(*args, **kwargs)
    # Similarity is computed over the last dimension.
    cosine = torch.nn.CosineSimilarity(dim=-1, eps=1e-6)
    self.similarity = cosine

def new_load_audio(self, path, savedir="audio_cache", **kwargs):
    """Fetch an audio file and load it with this model's input spec.

    The file is located with ``fetch`` (the path can be a local path, a web
    url, or a link to a huggingface repo), read with ``torchaudio.load`` and
    passed through ``self.audio_normalizer``.

    Arguments
    ---------
    path : str
        Location of the audio file; split into source and filename with
        ``split_path``.
    savedir : str
        Directory handed to ``fetch`` as ``savedir``.
    **kwargs
        ``channels_first`` (default ``False``) is passed to
        ``torchaudio.load``. ``overwrite``, ``save_filename``,
        ``use_auth_token``, ``revision``, ``cache_dir`` and
        ``silent_local_fetch`` are forwarded to ``fetch``. Anything else
        goes to ``torchaudio.load``.

    Returns
    -------
    The value returned by ``self.audio_normalizer(signal, sr)``.
    """
    source, fl = split_path(path)

    # Work on a shallow copy so the caller's dict is left untouched.
    load_options = copy(kwargs)
    # False as default value: SB consistent tensor format
    channels_first = load_options.pop("channels_first", False)

    # Move the fetch-only options out of what is left for torchaudio.
    fetch_option_names = (
        "overwrite",
        "save_filename",
        "use_auth_token",
        "revision",
        "cache_dir",
        "silent_local_fetch",
    )
    fetch_kwargs = {
        name: load_options.pop(name)
        for name in fetch_option_names
        if name in load_options
    }

    local_path = fetch(fl, source=source, savedir=savedir, **fetch_kwargs)
    signal, sr = torchaudio.load(
        str(local_path), channels_first=channels_first, **load_options
    )
    return self.audio_normalizer(signal, sr)

def verify_batch(
self, wavs1, wavs2, wav1_lens=None, wav2_lens=None, threshold=0.25
):
Expand Down Expand Up @@ -95,8 +134,8 @@ def verify_files(self, path_x, path_y, save_dir):
speaker and 0 otherwise.
"""

waveform_x = self.load_audio(path_x, savedir = save_dir)
waveform_y = self.load_audio(path_y, savedir = save_dir)
waveform_x = self.new_load_audio(path_x, savedir = save_dir, overwrite = True)
waveform_y = self.new_load_audio(path_y, savedir = save_dir, overwrite = True)
# Fake batches:
batch_x = waveform_x.unsqueeze(0)
batch_y = waveform_y.unsqueeze(0)
Expand Down Expand Up @@ -139,7 +178,7 @@ def verify_files(self, path_x, path_y, save_dir):

score, prediction = verification.verify_files(wav_1, wav_2, save_dir)
positive_scores.append(score)
print(f'{wav_1} {wav_2} {score} {prediction}', flush = True)
print(f'{wav_1} {wav_2} {score.item()} {prediction.item()}', flush = True)

negative_scores = []
for pair in tqdm(negative_pairs, total = len(negative_pairs)):
Expand All @@ -149,7 +188,7 @@ def verify_files(self, path_x, path_y, save_dir):

score, prediction = verification.verify_files(wav_1, wav_2, save_dir)
negative_scores.append(score)
print(f'{wav_1} {wav_2} {score} {prediction}', flush = True)
print(f'{wav_1} {wav_2} {score.item()} {prediction.item()}', flush = True)

positive_scores = torch.tensor(positive_scores)
negative_scores = torch.tensor(negative_scores)
Expand Down
Loading

0 comments on commit 6b295d8

Please sign in to comment.