Commit b982857
1. add asv evaluation; 2. update grad_tts configs; 3. update vits model;
MingjieChen committed Mar 16, 2023
1 parent 04a26db commit b982857
Showing 24 changed files with 379 additions and 90 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -17,6 +17,7 @@ pretrained_models

 # parts that are not public yet
 evaluation/UTMOS-demo
+evaluation/eval_list*.txt

2 changes: 2 additions & 0 deletions bin/compute_statistics.sh
@@ -4,6 +4,8 @@ dataset=$1
 train_split=$2
 feature_type=$3

+
+echo "running compute_statistics for $feature_type $dataset $train_split"
 python preprocess/compute_statistics.py \
     --dump_dir dump/$dataset/ \
     --split $train_split \
1 change: 1 addition & 0 deletions bin/normalize.sh
@@ -7,6 +7,7 @@ feature_type=$3
 stats_path=$4

 for split in $splits ;do
+    echo "running normalize for $feature_type $dataset $split"
     python preprocess/normalize.py \
         --stats_path $stats_path \
         --dump_dir dump/$dataset \
@@ -44,7 +44,7 @@ decoder_params:
   spk_emb_dim: !!int 256
   prosodic_rep_type: continuous
   prosodic_net:
-    hidden_dim: !!int 80
+    hidden_dim: !!int 512
   n_enc_channels: !!int 192
   filter_channels: !!int 768
   filter_channels_dp: !!int 256
8 changes: 4 additions & 4 deletions configs/vctk_vqwav2vec_uttdvec_ppgvcf0_vits_none.yaml
@@ -22,7 +22,7 @@ show_freq: 100 # show training information frequency
 load_only_params: !!bool False
 seed: !!int 1234
 trainer: VITSTrainer
-ngpu: 2
+ngpu: 1

 #dataloader
 dataset_class: VITSDataset
@@ -32,7 +32,7 @@ spec_max_len: !!int 480
 sort: !!bool True
 dump_dir: dump
 num_workers: !!int 4
-batch_size: !!int 32
+batch_size: !!int 24
 drop_last: !!bool True
 rm_long_utt: !!bool False # remove too long utterances from metadata
 max_utt_duration: !!float 10.0 # max utterance duration (seconds)
@@ -56,9 +56,9 @@ decoder_params:
   resblock : 1
   resblock_kernel_sizes: [3,7,11]
   resblock_dilation_sizes: [[1,3,5], [1,3,5], [1,3,5]]
-  upsample_rates: [10,6,2,2]
+  upsample_rates: [8,5,3,2]
   upsample_initial_channel: !!int 512
-  upsample_kernel_sizes: [20, 12, 4, 4]
+  upsample_kernel_sizes: [15, 15, 5, 5]
   n_layers_q: !!int 3
   use_spectral_norm: !!bool False
   filter_length: !!int 1024
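Note on the upsample_rates change: both factorizations multiply out to the same 240-sample hop (vits_hop_size: 240 elsewhere in this commit), so only the per-stage factors and kernel sizes change, not the total upsampling; the identical edit appears again in the second VITS config below. A quick sanity check, as a sketch:

from math import prod

hop_size = 240                            # vits_hop_size used by these configs
assert prod([10, 6, 2, 2]) == hop_size    # old upsample_rates
assert prod([8, 5, 3, 2]) == hop_size     # new upsample_rates
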
@@ -12,7 +12,6 @@ spk_enc: utt_ecapa_tdnn
 pros_enc: norm_fastspeech2_pitch_energy
 decoder: DiffWave
 mel_type: vits_spec # will not use it
-vocoder: none
 pros_stats: dump/vctk/train_nodev_all/fastspeech2_pitch_energy/train_nodev_all.npy

 # training
@@ -23,13 +22,13 @@ show_freq: 100 # show training information frequency
 load_only_params: !!bool False
 seed: !!int 1234
 trainer: DiffWaveTrainer
-ngpu: 2
+ngpu: 1

 #dataloader
 dataset_class: VITSDataset
 sampling_rate: !!int 24000
 vits_hop_size: !!int 240
-spec_max_len: !!int 80
+spec_max_len: !!int 60
 sort: !!bool False
 dump_dir: dump
 num_workers: !!int 8
@@ -41,7 +40,7 @@ max_utt_duration: !!float 10.0 # max utterance duration (seconds)

 # decoder params
 decoder_params:
-  segment_size: 19200
+  segment_size: 14400
   input_dim: !!int 768
   spk_emb_dim: !!int 192
   prosodic_rep_type: discrete
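The new segment_size stays consistent with the shortened spec_max_len above: DiffWave trains on waveform crops of spec_max_len frames times the 240-sample hop. The arithmetic, as a sketch:

vits_hop_size = 240
assert 80 * vits_hop_size == 19200   # old: spec_max_len 80 -> segment_size 19200
assert 60 * vits_hop_size == 14400   # new: spec_max_len 60 -> segment_size 14400
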
@@ -11,9 +11,10 @@ ling_enc: whisper_ppg_small
 spk_enc: utt_ecapa_tdnn
 pros_enc: norm_fastspeech2_pitch_energy
 decoder: GradTTS
-mel_type: bigvgan_mel
+mel_type: norm_bigvgan_mel
 vocoder: bigvgan
 pros_stats: dump/vctk/train_nodev_all/fastspeech2_pitch_energy/train_nodev_all.npy
+mel_stats: dump/vctk/train_nodev_all/bigvgan_mel/train_nodev_all.npy

 # training
 fp16_run: !!bool False
@@ -39,13 +40,13 @@ max_utt_duration: !!float 10.0 # max utterance duration (seconds)

 # decoder params
 decoder_params:
-  use_prior_loss: !!bool False
+  use_prior_loss: !!bool True
   n_feats: !!int 100
   input_dim: !!int 768
   spk_emb_dim: !!int 192
   prosodic_rep_type: discrete
   prosodic_net:
-    hidden_dim: 100
+    hidden_dim: 256
   prosodic_bins: !!int 256
   prosodic_stats_path: dump/vctk/train_nodev_all/fastspeech2_pitch_energy/pitch_energy_min_max.npy
   n_enc_channels: !!int 192
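Switching mel_type from bigvgan_mel to norm_bigvgan_mel, together with the new mel_stats entry, implies the decoder now consumes mel features normalized with statistics computed over the training split. A minimal sketch of that kind of normalization; the stats-file layout (row 0 = per-dim mean, row 1 = per-dim std) is an assumption, not something this diff confirms:

import numpy as np

stats = np.load("dump/vctk/train_nodev_all/bigvgan_mel/train_nodev_all.npy")
mean, std = stats[0], stats[1]               # assumed layout
mel = np.random.randn(200, mean.shape[-1])   # stand-in for a bigvgan_mel feature
norm_mel = (mel - mean) / (std + 1e-9)       # per-dimension z-score
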
@@ -11,9 +11,10 @@ ling_enc: whisper_ppg_small
 spk_enc: utt_ecapa_tdnn
 pros_enc: norm_fastspeech2_pitch_energy
 decoder: TacoAR
-mel_type: bigvgan_mel
+mel_type: norm_bigvgan_mel
 vocoder: bigvgan
 pros_stats: dump/vctk/train_nodev_all/fastspeech2_pitch_energy/train_nodev_all.npy
+mel_stats: dump/vctk/train_nodev_all/bigvgan_mel/train_nodev_all.npy


 # training
@@ -31,7 +32,7 @@ dataset_class: Dataset
 sort: !!bool True
 dump_dir: dump
 num_workers: !!int 8
-batch_size: 16
+batch_size: 32
 drop_last: !!bool True
 rm_long_utt: !!bool True # remove too long utterances from metadata
 max_utt_duration: !!float 10.0 # max utterance duration (seconds)
@@ -33,7 +33,7 @@ spec_max_len: !!int 480
 sort: !!bool True
 dump_dir: dump
 num_workers: !!int 4
-batch_size: !!int 16
+batch_size: !!int 24
 drop_last: !!bool True
 rm_long_utt: !!bool False # remove too long utterances from metadata
 max_utt_duration: !!float 10.0 # max utterance duration (seconds)
@@ -59,9 +59,9 @@ decoder_params:
   resblock : 1
   resblock_kernel_sizes: [3,7,11]
   resblock_dilation_sizes: [[1,3,5], [1,3,5], [1,3,5]]
-  upsample_rates: [10,6,2,2]
+  upsample_rates: [8,5,3,2]
   upsample_initial_channel: !!int 512
-  upsample_kernel_sizes: [20, 12, 4, 4]
+  upsample_kernel_sizes: [15, 15, 5, 5]
   n_layers_q: !!int 3
   use_spectral_norm: !!bool False
   filter_length: !!int 1024
74 changes: 56 additions & 18 deletions decoder/diffwave/model.py
@@ -24,6 +24,8 @@
 from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
 from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d

+from tqdm import tqdm
+
 Linear = nn.Linear
 ConvTranspose2d = nn.ConvTranspose2d

@@ -185,6 +187,7 @@ def _build_embedding(self, max_steps):
 class Upsampler(nn.Module):
     def __init__(self, n_mels):
         super().__init__()
+        # hard-code the total upsampling scale to 240 (strides 12 * 20)
         self.conv1 = ConvTranspose2d(1, 1, [3, 24], stride=[1, 12], padding=[1, 6])
         self.conv2 = ConvTranspose2d(1, 1, [3, 40], stride=[1, 20], padding=[1, 10])
@@ -199,7 +202,7 @@ def forward(self, x):


 class ResidualBlock(nn.Module):
-    def __init__(self, n_mels, residual_channels, dilation, uncond=False):
+    def __init__(self, n_mels, residual_channels, dilation, spk_emb_dim):
         '''
         :param n_mels: inplanes of conv1x1 for spectrogram conditional
         :param residual_channels: audio conv
@@ -209,23 +212,19 @@ def __init__(self, n_mels, residual_channels, dilation, uncond=False):
         super().__init__()
         self.dilated_conv = Conv1d(residual_channels, 2 * residual_channels, 3, padding=dilation, dilation=dilation)
         self.diffusion_projection = Linear(512, residual_channels)
-        if not uncond: # conditional model
-            self.conditioner_projection = Conv1d(n_mels, 2 * residual_channels, 1)
-        else: # unconditional model
-            self.conditioner_projection = None
+        self.local_conditioner_projection = Conv1d(n_mels, 2 * residual_channels, 1)
+        self.global_conditioner_projection = Conv1d(spk_emb_dim, 2 * residual_channels, 1)

         self.output_projection = Conv1d(residual_channels, 2 * residual_channels, 1)

-    def forward(self, x, diffusion_step, conditioner=None):
+    def forward(self, x, diffusion_step, c, g):

         diffusion_step = self.diffusion_projection(diffusion_step).unsqueeze(-1)
         y = x + diffusion_step
-        if self.conditioner_projection is None: # using a unconditional model
-            y = self.dilated_conv(y)
-        else:
-            conditioner = self.conditioner_projection(conditioner)
-            y = self.dilated_conv(y) + conditioner
+        local_condition = self.local_conditioner_projection(c)
+        global_condition = self.global_conditioner_projection(g)
+        y = self.dilated_conv(y) + local_condition + global_condition

         gate, filter = torch.chunk(y, 2, dim=1)
         y = torch.sigmoid(gate) * torch.tanh(filter)
@@ -254,6 +253,7 @@ def __init__(self, config):


         noise_schedule = np.linspace(1e-4, 0.05, 50).tolist()
+        self.noise_schedule = noise_schedule
         self.diffusion_embedding = DiffusionEmbedding(len(noise_schedule))

@@ -279,11 +279,11 @@ def __init__(self, config):
         elif config['prosodic_rep_type'] == 'continuous':
             self.prosodic_net = ContinuousProsodicNet(config['prosodic_net'])

-        self.reduce_proj = nn.Conv1d(self.spk_emb_dim + inter_channels, inter_channels, 1,1,0)
+        #self.reduce_proj = nn.Conv1d(self.spk_emb_dim + inter_channels, inter_channels, 1,1,0)

         self.input_projection = Conv1d(1, residual_channels, 1)
         self.residual_layers = nn.ModuleList([
-            ResidualBlock(inter_channels, residual_channels, 2**(i % dilation_cycle_length), uncond=False)
+            ResidualBlock(inter_channels, residual_channels, 2**(i % dilation_cycle_length), spk_emb_dim = self.spk_emb_dim)
             for i in range(residual_layers)
         ])

@@ -300,9 +300,9 @@ def forward(self, audio, diffusion_step, ling, pros, spk, lengths):
         x += pros

         spk_embeds = F.normalize(
-            spk.squeeze(2)).unsqueeze(2).expand(ling.size(0), self.spk_emb_dim, ling.size(2))
-        x = torch.cat([x, spk_embeds], dim=1)
-        x = self.reduce_proj(x)
+            spk.squeeze(2)).unsqueeze(2).expand(ling.size(0), self.spk_emb_dim, ling.size(2) * 240)
+        #x = torch.cat([x, spk_embeds], dim=1)
+        #x = self.reduce_proj(x)

         x = self.upsampler(x)
@@ -314,11 +314,49 @@ def forward(self, audio, diffusion_step, ling, pros, spk, lengths):

         skip = None
         for layer in self.residual_layers:
-            y, skip_connection = layer(y, diffusion_step, x)
+            y, skip_connection = layer(y, diffusion_step, x, spk_embeds)
             skip = skip_connection if skip is None else skip_connection + skip

         y = skip / sqrt(len(self.residual_layers))
         y = self.skip_projection(y)
         y = F.relu(y)
         y = self.output_projection(y)
         return y

+    def inference(self, ling, pros, spk, lengths):
+        # fast sampling (DiffWave): run a short inference noise schedule and
+        # map each of its steps onto a fractional step of the training schedule
+        fast_sampling = True
+        training_noise_schedule = np.array(self.noise_schedule)
+        inference_noise_schedule = np.array([0.0001, 0.001, 0.01, 0.05, 0.2, 0.5])
+        inference_noise_schedule = np.array(inference_noise_schedule) if fast_sampling else training_noise_schedule
+
+        talpha = 1 - training_noise_schedule
+        talpha_cum = np.cumprod(talpha)
+
+        beta = inference_noise_schedule
+        alpha = 1 - beta
+        alpha_cum = np.cumprod(alpha)
+
+        # locate each inference step between the two training steps whose
+        # cumulative alphas bracket it, and interpolate a fractional index
+        T = []
+        for s in range(len(inference_noise_schedule)):
+            for t in range(len(training_noise_schedule) - 1):
+                if talpha_cum[t+1] <= alpha_cum[s] <= talpha_cum[t]:
+                    twiddle = (talpha_cum[t]**0.5 - alpha_cum[s]**0.5) / (talpha_cum[t]**0.5 - talpha_cum[t+1]**0.5)
+                    T.append(t + twiddle)
+                    break
+        T = np.array(T, dtype=np.float32)
+
+        # hard code hop_size = 240: start from noise, 240 samples per input frame
+        audio = torch.randn(ling.shape[0], 240 * ling.shape[-1], device=ling.device)
+        noise_scale = torch.from_numpy(alpha_cum**0.5).float().unsqueeze(1).to(ling.device)
+        for n in tqdm(range(len(alpha) - 1, -1, -1)):
+            c1 = 1 / alpha[n]**0.5
+            c2 = beta[n] / (1 - alpha_cum[n])**0.5
+            audio = c1 * (audio - c2 * self.forward(audio, torch.tensor([T[n]], device=audio.device), ling, pros, spk, lengths).squeeze(1))
+            if n > 0:
+                noise = torch.randn_like(audio)
+                sigma = ((1.0 - alpha_cum[n-1]) / (1.0 - alpha_cum[n]) * beta[n])**0.5
+                audio += sigma * noise
+        audio = torch.clamp(audio, -1.0, 1.0)
+        return audio
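The step-mapping loop above can be exercised on its own: each entry of T is the fractional index into the 50-step training schedule whose cumulative alpha matches the corresponding fast-sampling step. A standalone sketch using the schedules from this file:

import numpy as np

training = np.linspace(1e-4, 0.05, 50)                       # schedule from __init__
inference = np.array([0.0001, 0.001, 0.01, 0.05, 0.2, 0.5])  # fast schedule

talpha_cum = np.cumprod(1 - training)
alpha_cum = np.cumprod(1 - inference)

T = []
for s in range(len(inference)):
    for t in range(len(training) - 1):
        if talpha_cum[t + 1] <= alpha_cum[s] <= talpha_cum[t]:
            frac = (talpha_cum[t]**0.5 - alpha_cum[s]**0.5) / \
                   (talpha_cum[t]**0.5 - talpha_cum[t + 1]**0.5)
            T.append(t + frac)
            break

print(T)  # six fractional training-step indices, increasing with noise level
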


2 changes: 1 addition & 1 deletion decoder/diffwave/trainer.py
@@ -182,7 +182,7 @@ def _eval_epoch(self):

         eval_losses = defaultdict(list)
         self.model.eval()
-        for eval_steps_per_epoch, batch in enumerate(self.dev_dataloader, 1):
+        for eval_steps_per_epoch, batch in tqdm(enumerate(self.dev_dataloader, 1), total = len(self.dev_dataloader)):
             _batch = []
             for b in batch:
                 if isinstance(b, torch.Tensor):