Commit b982857
1. add asv evaluation; 2. update grad_tts configs; 3. update vits model;
MingjieChen committed Mar 16, 2023
1 parent 04a26db commit b982857
Showing 24 changed files with 379 additions and 90 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -17,6 +17,7 @@ pretrained_models

 # parts that are not public yet
 evaluation/UTMOS-demo
+evaluation/eval_list*.txt

2 changes: 2 additions & 0 deletions bin/compute_statistics.sh
@@ -4,6 +4,8 @@ dataset=$1
 train_split=$2
 feature_type=$3

+
+echo "running compute_statistics for $feature_type $dataset $train_split"
 python preprocess/compute_statistics.py \
     --dump_dir dump/$dataset/ \
     --split $train_split \
1 change: 1 addition & 0 deletions bin/normalize.sh
@@ -7,6 +7,7 @@ feature_type=$3
 stats_path=$4

 for split in $splits ;do
+    echo "running normalize for $feature_type $dataset $split"
     python preprocess/normalize.py \
         --stats_path $stats_path \
         --dump_dir dump/$dataset \
@@ -44,7 +44,7 @@ decoder_params:
   spk_emb_dim: !!int 256
   prosodic_rep_type: continuous
   prosodic_net:
-    hidden_dim: !!int 80
+    hidden_dim: !!int 512
   n_enc_channels: !!int 192
   filter_channels: !!int 768
   filter_channels_dp: !!int 256
8 changes: 4 additions & 4 deletions configs/vctk_vqwav2vec_uttdvec_ppgvcf0_vits_none.yaml
@@ -22,7 +22,7 @@ show_freq: 100 # show training information frequency
 load_only_params: !!bool False
 seed: !!int 1234
 trainer: VITSTrainer
-ngpu: 2
+ngpu: 1

 #dataloader
 dataset_class: VITSDataset
@@ -32,7 +32,7 @@ spec_max_len: !!int 480
 sort: !!bool True
 dump_dir: dump
 num_workers: !!int 4
-batch_size: !!int 32
+batch_size: !!int 24
 drop_last: !!bool True
 rm_long_utt: !!bool False # remove too long utterances from metadata
 max_utt_duration: !!float 10.0 # max utterance duration (seconds)
@@ -56,9 +56,9 @@ decoder_params:
   resblock : 1
   resblock_kernel_sizes: [3,7,11]
   resblock_dilation_sizes: [[1,3,5], [1,3,5], [1,3,5]]
-  upsample_rates: [10,6,2,2]
+  upsample_rates: [8,5,3,2]
   upsample_initial_channel: !!int 512
-  upsample_kernel_sizes: [20, 12, 4, 4]
+  upsample_kernel_sizes: [15, 15, 5, 5]
   n_layers_q: !!int 3
   use_spectral_norm: !!bool False
   filter_length: !!int 1024
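Note on the upsample_rates change: both factorizations multiply out to the same 240-sample hop (vits_hop_size: 240 elsewhere in this commit), so only the per-stage factors and kernel sizes change, not the total upsampling; the identical edit appears again in the second VITS config below. A quick sanity check, as a sketch:

from math import prod

hop_size = 240                            # vits_hop_size used by these configs
assert prod([10, 6, 2, 2]) == hop_size    # old upsample_rates
assert prod([8, 5, 3, 2]) == hop_size     # new upsample_rates
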
@@ -12,7 +12,6 @@ spk_enc: utt_ecapa_tdnn
 pros_enc: norm_fastspeech2_pitch_energy
 decoder: DiffWave
 mel_type: vits_spec # will not use it
-vocoder: none
 pros_stats: dump/vctk/train_nodev_all/fastspeech2_pitch_energy/train_nodev_all.npy

 # training
@@ -23,13 +22,13 @@ show_freq: 100 # show training information frequency
 load_only_params: !!bool False
 seed: !!int 1234
 trainer: DiffWaveTrainer
-ngpu: 2
+ngpu: 1

 #dataloader
 dataset_class: VITSDataset
 sampling_rate: !!int 24000
 vits_hop_size: !!int 240
-spec_max_len: !!int 80
+spec_max_len: !!int 60
 sort: !!bool False
 dump_dir: dump
 num_workers: !!int 8
@@ -41,7 +40,7 @@ max_utt_duration: !!float 10.0 # max utterance duration (seconds)

 # decoder params
 decoder_params:
-  segment_size: 19200
+  segment_size: 14400
   input_dim: !!int 768
   spk_emb_dim: !!int 192
   prosodic_rep_type: discrete
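The new segment_size stays consistent with the shortened spec_max_len above: DiffWave trains on waveform crops of spec_max_len frames times the 240-sample hop. The arithmetic, as a sketch:

vits_hop_size = 240
assert 80 * vits_hop_size == 19200   # old: spec_max_len 80 -> segment_size 19200
assert 60 * vits_hop_size == 14400   # new: spec_max_len 60 -> segment_size 14400
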
@@ -11,9 +11,10 @@ ling_enc: whisper_ppg_small
 spk_enc: utt_ecapa_tdnn
 pros_enc: norm_fastspeech2_pitch_energy
 decoder: GradTTS
-mel_type: bigvgan_mel
+mel_type: norm_bigvgan_mel
 vocoder: bigvgan
 pros_stats: dump/vctk/train_nodev_all/fastspeech2_pitch_energy/train_nodev_all.npy
+mel_stats: dump/vctk/train_nodev_all/bigvgan_mel/train_nodev_all.npy

 # training
 fp16_run: !!bool False
@@ -39,13 +40,13 @@ max_utt_duration: !!float 10.0 # max utterance duration (seconds)

 # decoder params
 decoder_params:
-  use_prior_loss: !!bool False
+  use_prior_loss: !!bool True
   n_feats: !!int 100
   input_dim: !!int 768
   spk_emb_dim: !!int 192
   prosodic_rep_type: discrete
   prosodic_net:
-    hidden_dim: 100
+    hidden_dim: 256
   prosodic_bins: !!int 256
   prosodic_stats_path: dump/vctk/train_nodev_all/fastspeech2_pitch_energy/pitch_energy_min_max.npy
   n_enc_channels: !!int 192
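Switching mel_type from bigvgan_mel to norm_bigvgan_mel, together with the new mel_stats entry, implies the decoder now consumes mel features normalized with statistics computed over the training split. A minimal sketch of that kind of normalization; the stats-file layout (row 0 = per-dim mean, row 1 = per-dim std) is an assumption, not something this diff confirms:

import numpy as np

stats = np.load("dump/vctk/train_nodev_all/bigvgan_mel/train_nodev_all.npy")
mean, std = stats[0], stats[1]               # assumed layout
mel = np.random.randn(200, mean.shape[-1])   # stand-in for a bigvgan_mel feature
norm_mel = (mel - mean) / (std + 1e-9)       # per-dimension z-score
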
@@ -11,9 +11,10 @@ ling_enc: whisper_ppg_small
 spk_enc: utt_ecapa_tdnn
 pros_enc: norm_fastspeech2_pitch_energy
 decoder: TacoAR
-mel_type: bigvgan_mel
+mel_type: norm_bigvgan_mel
 vocoder: bigvgan
 pros_stats: dump/vctk/train_nodev_all/fastspeech2_pitch_energy/train_nodev_all.npy
+mel_stats: dump/vctk/train_nodev_all/bigvgan_mel/train_nodev_all.npy


 # training
@@ -31,7 +32,7 @@ dataset_class: Dataset
 sort: !!bool True
 dump_dir: dump
 num_workers: !!int 8
-batch_size: 16
+batch_size: 32
 drop_last: !!bool True
 rm_long_utt: !!bool True # remove too long utterances from metadata
 max_utt_duration: !!float 10.0 # max utterance duration (seconds)
@@ -33,7 +33,7 @@ spec_max_len: !!int 480
 sort: !!bool True
 dump_dir: dump
 num_workers: !!int 4
-batch_size: !!int 16
+batch_size: !!int 24
 drop_last: !!bool True
 rm_long_utt: !!bool False # remove too long utterances from metadata
 max_utt_duration: !!float 10.0 # max utterance duration (seconds)
@@ -59,9 +59,9 @@ decoder_params:
   resblock : 1
   resblock_kernel_sizes: [3,7,11]
   resblock_dilation_sizes: [[1,3,5], [1,3,5], [1,3,5]]
-  upsample_rates: [10,6,2,2]
+  upsample_rates: [8,5,3,2]
   upsample_initial_channel: !!int 512
-  upsample_kernel_sizes: [20, 12, 4, 4]
+  upsample_kernel_sizes: [15, 15, 5, 5]
   n_layers_q: !!int 3
   use_spectral_norm: !!bool False
   filter_length: !!int 1024
74 changes: 56 additions & 18 deletions decoder/diffwave/model.py
@@ -24,6 +24,8 @@
 from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
 from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d

+from tqdm import tqdm
+
 Linear = nn.Linear
 ConvTranspose2d = nn.ConvTranspose2d

@@ -185,6 +187,7 @@ def _build_embedding(self, max_steps):
 class Upsampler(nn.Module):
     def __init__(self, n_mels):
         super().__init__()
+        # hard-code the total upsampling scale to 240 (strides 12 * 20)
         self.conv1 = ConvTranspose2d(1, 1, [3, 24], stride=[1, 12], padding=[1, 6])
         self.conv2 = ConvTranspose2d(1, 1, [3, 40], stride=[1, 20], padding=[1, 10])
@@ -199,7 +202,7 @@ def forward(self, x):


 class ResidualBlock(nn.Module):
-    def __init__(self, n_mels, residual_channels, dilation, uncond=False):
+    def __init__(self, n_mels, residual_channels, dilation, spk_emb_dim):
         '''
         :param n_mels: inplanes of conv1x1 for spectrogram conditional
         :param residual_channels: audio conv
@@ -209,23 +212,19 @@ def __init__(self, n_mels, residual_channels, dilation, uncond=False):
         super().__init__()
         self.dilated_conv = Conv1d(residual_channels, 2 * residual_channels, 3, padding=dilation, dilation=dilation)
         self.diffusion_projection = Linear(512, residual_channels)
-        if not uncond: # conditional model
-            self.conditioner_projection = Conv1d(n_mels, 2 * residual_channels, 1)
-        else: # unconditional model
-            self.conditioner_projection = None
+        self.local_conditioner_projection = Conv1d(n_mels, 2 * residual_channels, 1)
+        self.global_conditioner_projection = Conv1d(spk_emb_dim, 2 * residual_channels, 1)

         self.output_projection = Conv1d(residual_channels, 2 * residual_channels, 1)

-    def forward(self, x, diffusion_step, conditioner=None):
+    def forward(self, x, diffusion_step, c, g):

         diffusion_step = self.diffusion_projection(diffusion_step).unsqueeze(-1)
         y = x + diffusion_step
-        if self.conditioner_projection is None: # using a unconditional model
-            y = self.dilated_conv(y)
-        else:
-            conditioner = self.conditioner_projection(conditioner)
-            y = self.dilated_conv(y) + conditioner
+        local_condition = self.local_conditioner_projection(c)
+        global_condition = self.global_conditioner_projection(g)
+        y = self.dilated_conv(y) + local_condition + global_condition

         gate, filter = torch.chunk(y, 2, dim=1)
         y = torch.sigmoid(gate) * torch.tanh(filter)
@@ -254,6 +253,7 @@ def __init__(self, config):


         noise_schedule = np.linspace(1e-4, 0.05, 50).tolist()
+        self.noise_schedule = noise_schedule
         self.diffusion_embedding = DiffusionEmbedding(len(noise_schedule))

@@ -279,11 +279,11 @@ def __init__(self, config):
         elif config['prosodic_rep_type'] == 'continuous':
             self.prosodic_net = ContinuousProsodicNet(config['prosodic_net'])

-        self.reduce_proj = nn.Conv1d(self.spk_emb_dim + inter_channels, inter_channels, 1,1,0)
+        #self.reduce_proj = nn.Conv1d(self.spk_emb_dim + inter_channels, inter_channels, 1,1,0)

         self.input_projection = Conv1d(1, residual_channels, 1)
         self.residual_layers = nn.ModuleList([
-            ResidualBlock(inter_channels, residual_channels, 2**(i % dilation_cycle_length), uncond=False)
+            ResidualBlock(inter_channels, residual_channels, 2**(i % dilation_cycle_length), spk_emb_dim = self.spk_emb_dim)
             for i in range(residual_layers)
         ])

@@ -300,9 +300,9 @@ def forward(self, audio, diffusion_step, ling, pros, spk, lengths):
         x += pros

         spk_embeds = F.normalize(
-            spk.squeeze(2)).unsqueeze(2).expand(ling.size(0), self.spk_emb_dim, ling.size(2))
-        x = torch.cat([x, spk_embeds], dim=1)
-        x = self.reduce_proj(x)
+            spk.squeeze(2)).unsqueeze(2).expand(ling.size(0), self.spk_emb_dim, ling.size(2) * 240)
+        #x = torch.cat([x, spk_embeds], dim=1)
+        #x = self.reduce_proj(x)

         x = self.upsampler(x)
@@ -314,11 +314,49 @@ def forward(self, audio, diffusion_step, ling, pros, spk, lengths):

         skip = None
         for layer in self.residual_layers:
-            y, skip_connection = layer(y, diffusion_step, x)
+            y, skip_connection = layer(y, diffusion_step, x, spk_embeds)
             skip = skip_connection if skip is None else skip_connection + skip

         y = skip / sqrt(len(self.residual_layers))
         y = self.skip_projection(y)
         y = F.relu(y)
         y = self.output_projection(y)
         return y

+    def inference(self, ling, pros, spk, lengths):
+        # fast sampling (DiffWave): run a short inference noise schedule and
+        # map each of its steps onto a fractional step of the training schedule
+        fast_sampling = True
+        training_noise_schedule = np.array(self.noise_schedule)
+        inference_noise_schedule = np.array([0.0001, 0.001, 0.01, 0.05, 0.2, 0.5])
+        inference_noise_schedule = np.array(inference_noise_schedule) if fast_sampling else training_noise_schedule
+
+        talpha = 1 - training_noise_schedule
+        talpha_cum = np.cumprod(talpha)
+
+        beta = inference_noise_schedule
+        alpha = 1 - beta
+        alpha_cum = np.cumprod(alpha)
+
+        # locate each inference step between the two training steps whose
+        # cumulative alphas bracket it, and interpolate a fractional index
+        T = []
+        for s in range(len(inference_noise_schedule)):
+            for t in range(len(training_noise_schedule) - 1):
+                if talpha_cum[t+1] <= alpha_cum[s] <= talpha_cum[t]:
+                    twiddle = (talpha_cum[t]**0.5 - alpha_cum[s]**0.5) / (talpha_cum[t]**0.5 - talpha_cum[t+1]**0.5)
+                    T.append(t + twiddle)
+                    break
+        T = np.array(T, dtype=np.float32)
+
+        # hard code hop_size = 240: start from noise, 240 samples per input frame
+        audio = torch.randn(ling.shape[0], 240 * ling.shape[-1], device=ling.device)
+        noise_scale = torch.from_numpy(alpha_cum**0.5).float().unsqueeze(1).to(ling.device)
+        for n in tqdm(range(len(alpha) - 1, -1, -1)):
+            c1 = 1 / alpha[n]**0.5
+            c2 = beta[n] / (1 - alpha_cum[n])**0.5
+            audio = c1 * (audio - c2 * self.forward(audio, torch.tensor([T[n]], device=audio.device), ling, pros, spk, lengths).squeeze(1))
+            if n > 0:
+                noise = torch.randn_like(audio)
+                sigma = ((1.0 - alpha_cum[n-1]) / (1.0 - alpha_cum[n]) * beta[n])**0.5
+                audio += sigma * noise
+        audio = torch.clamp(audio, -1.0, 1.0)
+        return audio
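The step-mapping loop above can be exercised on its own: each entry of T is the fractional index into the 50-step training schedule whose cumulative alpha matches the corresponding fast-sampling step. A standalone sketch using the schedules from this file:

import numpy as np

training = np.linspace(1e-4, 0.05, 50)                       # schedule from __init__
inference = np.array([0.0001, 0.001, 0.01, 0.05, 0.2, 0.5])  # fast schedule

talpha_cum = np.cumprod(1 - training)
alpha_cum = np.cumprod(1 - inference)

T = []
for s in range(len(inference)):
    for t in range(len(training) - 1):
        if talpha_cum[t + 1] <= alpha_cum[s] <= talpha_cum[t]:
            frac = (talpha_cum[t]**0.5 - alpha_cum[s]**0.5) / \
                   (talpha_cum[t]**0.5 - talpha_cum[t + 1]**0.5)
            T.append(t + frac)
            break

print(T)  # six fractional training-step indices, increasing with noise level
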


2 changes: 1 addition & 1 deletion decoder/diffwave/trainer.py
@@ -182,7 +182,7 @@ def _eval_epoch(self):

         eval_losses = defaultdict(list)
         self.model.eval()
-        for eval_steps_per_epoch, batch in enumerate(self.dev_dataloader, 1):
+        for eval_steps_per_epoch, batch in tqdm(enumerate(self.dev_dataloader, 1), total = len(self.dev_dataloader)):
             _batch = []
             for b in batch:
                 if isinstance(b, torch.Tensor):