extract and compute_statistics for fastspeech2_pitch_energy as prosodic_encoder
MingjieChen committed Feb 17, 2023
1 parent 8647381 commit 9ab61ef
Showing 7 changed files with 155 additions and 18 deletions.
3 changes: 2 additions & 1 deletion bin/compute_statistics.sh
@@ -7,4 +7,5 @@ source $conda/bin/activate $conda_env
python preprocess/compute_statistics.py \
--dump_dir dump/vctk/ \
--split train_nodev_all \
-  --metadata data/vctk/train_nodev_all/metadata.csv
+  --metadata data/vctk/train_nodev_all/metadata.csv \
+  --feature_type fastspeech2_pitch_energy
6 changes: 3 additions & 3 deletions bin/feature_extraction_multi_jobs.sh
@@ -6,8 +6,8 @@ conda_env=torch_1.7
# setup

dataset=vctk
-config=configs/preprocess_vctk_ppgvc_mel.yaml
-feature_type=ppgvc_f0
+config=configs/preprocess_vctk.yaml
+feature_type=fastspeech2_pitch_energy
splits="train_nodev_all dev_all"

script_dir=scripts/$dataset/preprocess
@@ -35,6 +35,6 @@ python3 feature_extraction.py \
EOF
chmod +x $b
submitjob -m 10000 $l $b
echo "submitjob for $dataset $split $spk $feature_type"
echo "submitjob for $dataset $split $spk $feature_type see log $l"
done
done
1 change: 1 addition & 0 deletions dataset.py
@@ -10,6 +10,7 @@
from torch.utils.data import DataLoader
from collections import defaultdict
from prosodic_encoder.ppgvc_f0.ppgvc_lf0 import get_cont_lf0 as process_ppgvc_f0
from prosodic_encoder.fastspeech2_pitch_energy.pitch_energy import process_fastspeech2_pitch_energy
def get_dataloader(config):
train_dataset = Dataset(config, config['train_meta'], config['train_set'])
dev_dataset = Dataset(config, config['dev_meta'], config['dev_set'])
67 changes: 63 additions & 4 deletions decoder/fastspeech2/module.py
@@ -10,19 +10,78 @@
Conv1DBlock,
TransformerBlock,
)

class DiscreteProsodicNet(nn.Module):
    def __init__(self, config):
        super().__init__()
        # Config values are read but not yet used: quantization is not
        # implemented, so this module passes prosodic features through unchanged.
        bins = config['prosodic_bins']
        quantize = config['quantize']
        prosodic_stats_path = config['prosodic_stats_path']

    def forward(self, x):
        return x
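(For orientation only: since `DiscreteProsodicNet` is a stub here, the sketch below shows what a quantizing version might look like, assuming the (4, 1) stats layout written by preprocess/compute_statistics.py and a hypothetical class name `DiscreteProsodicNetSketch`; none of this is part of the commit.)

import numpy as np
import torch
import torch.nn as nn

class DiscreteProsodicNetSketch(nn.Module):
    # Hypothetical illustration, not the committed implementation.
    def __init__(self, config):
        super().__init__()
        n_bins = config['prosodic_bins']
        stats = np.load(config['prosodic_stats_path'])  # rows: pitch mean/scale, energy mean/scale
        self.register_buffer('mean', torch.tensor([stats[0, 0], stats[2, 0]], dtype=torch.float))
        self.register_buffer('scale', torch.tensor([stats[1, 0], stats[3, 0]], dtype=torch.float))
        # Equal-width bins spanning roughly +/-3 standard deviations.
        self.register_buffer('bins', torch.linspace(-3.0, 3.0, n_bins - 1))
        self.pitch_embedding = nn.Embedding(n_bins, config['hidden_dim'])
        self.energy_embedding = nn.Embedding(n_bins, config['hidden_dim'])

    def forward(self, x):
        # x: (batch, frames, 2) with raw [pitch, energy] tracks.
        normed = (x - self.mean) / self.scale
        pitch_ids = torch.bucketize(normed[..., 0], self.bins)
        energy_ids = torch.bucketize(normed[..., 1], self.bins)
        return self.pitch_embedding(pitch_ids) + self.energy_embedding(energy_ids)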
class ContinuousProsodicNet(nn.Module):
    def __init__(self, config):
        super().__init__()
        hidden_dim = config['hidden_dim']
        # Conv stack over the two prosodic channels (pitch, energy).
        self.pitch_convs = torch.nn.Sequential(
            torch.nn.Conv1d(2, hidden_dim, kernel_size=1, bias=False),
            torch.nn.LeakyReLU(0.1),
            torch.nn.InstanceNorm1d(hidden_dim, affine=False),
            torch.nn.Conv1d(
                hidden_dim, hidden_dim,
                kernel_size=3,
                stride=1,
                padding=1,
            ),
            torch.nn.LeakyReLU(0.1),
            torch.nn.InstanceNorm1d(hidden_dim, affine=False),
            torch.nn.Conv1d(
                hidden_dim, hidden_dim,
                kernel_size=3,
                stride=1,
                padding=1,
            ),
            torch.nn.LeakyReLU(0.1),
            torch.nn.InstanceNorm1d(hidden_dim, affine=False),
        )

    def forward(self, x):
        # x: (batch, frames, 2) -> (batch, frames, hidden_dim)
        out = x.transpose(1, 2)
        out = self.pitch_convs(out)
        out = out.transpose(1, 2)
        return out


class VarianceAdaptor(nn.Module):
""" Variance Adaptor """

def __init__(self, model_config):
super(VarianceAdaptor, self).__init__()

self.d_model = model_config["transformer"]["encoder_hidden"]
self.projection = nn.Linear(model_config['spk_emb_dim'],model_config['transformer']['encoder_hidden'])
self.reduce_projection = nn.Linear(model_config['transformer']['encoder_hidden'] + model_config['spk_emb_dim'], model_config['transformer']['encoder_hidden'])
if model_config['prosodic_rep_type'] == 'continues':
self.pros_net = ContinuousProsodicNet(model_config['pitch_net'])
elif model_config['prosodic_rep_type'] == 'discrete':
self.pros_net = DiscreteProsodicNet(model_config['pitch_net'])

-    def forward(self, x, spk_emb, f0, mask, max_len):
+    def forward(self, x, spk_emb, pros_rep, mask, max_len):
        batch_size = x.size(0)
-        spk_emb = self.projection(spk_emb.squeeze(1)).unsqueeze(1)
-        x = x + spk_emb.expand(batch_size, max_len, self.d_model)
+        # integrate speaker embedding
+        spk_emb = F.normalize(spk_emb.squeeze(1)).unsqueeze(1)
+        x = torch.cat([x, spk_emb.expand(batch_size, max_len, self.d_model)], dim=-1)
+        x = self.reduce_projection(x)
+
+        # integrate prosodic rep
+        processed_pros_rep = self.pros_net(pros_rep)
+        x = x + processed_pros_rep

        if mask is not None:
            x = x.masked_fill(mask.unsqueeze(-1), 0)

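A quick shape check of the continuous path, with hypothetical sizes; it assumes pros_rep stacks pitch and energy as (batch, frames, 2) and that hidden_dim matches the transformer's encoder_hidden, so the residual addition in VarianceAdaptor.forward lines up:

import torch

B, T, H = 4, 120, 256  # hypothetical batch, frame count, hidden size
pros_net = ContinuousProsodicNet({'hidden_dim': H})
pros_rep = torch.randn(B, T, 2)  # (batch, frames, [pitch, energy])
out = pros_net(pros_rep)
print(out.shape)  # torch.Size([4, 120, 256]), added to x in VarianceAdaptor.forward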
5 changes: 4 additions & 1 deletion feature_extraction.py
@@ -5,6 +5,7 @@
import glob
from preprocess.audio_utils import mel_spectrogram, normalize
from prosodic_encoder.ppgvc_f0.ppgvc_lf0 import compute_f0 as compute_ppgvc_f0
from prosodic_encoder.fastspeech2_pitch_energy.pitch_energy import extract_pitch_energy as compute_fastspeech2_pitch_energy
import pyworld as pw
import librosa
import numpy as np
@@ -127,6 +128,8 @@ def process_speaker(spk_meta, spk, config, args):
feature = ppgvc_hifigan_logmelspectrogram(audio, config)
elif args.feature_type == 'ppgvc_f0':
feature = compute_ppgvc_f0(audio, sr = config['sampling_rate'], frame_period = 10.0)
elif args.feature_type == 'fastspeech2_pitch_energy':
feature = compute_fastspeech2_pitch_energy(audio, config)
feature_path = os.path.join(args.dump_dir, args.split, args.feature_type, spk, ID+'.npy')
os.makedirs(os.path.dirname(feature_path), exist_ok = True)
np.save(feature_path, feature)
@@ -143,7 +146,7 @@ def process_speaker(spk_meta, spk, config, args):
parser.add_argument('--split', type = str)
parser.add_argument('--max_workers', type = int, default = 20)
parser.add_argument('--speaker', type = str, default = None)
-parser.add_argument('--feature_type', type = str, default = 'mel', choices = ['mel', 'ppgvc_mel', 'ppgvc_f0', 'fastspeech2_f0'])
+parser.add_argument('--feature_type', type = str, default = 'mel', choices = ['mel', 'ppgvc_mel', 'ppgvc_f0', 'fastspeech2_pitch_energy'])
parser.add_argument('--pitch', default = False, action = 'store_true')
args = parser.parse_args()

33 changes: 24 additions & 9 deletions preprocess/compute_statistics.py
@@ -17,11 +17,15 @@
parser.add_argument('--dump_dir', type = str)
parser.add_argument('--metadata', type = str)
parser.add_argument('--split', type = str)
parser.add_argument('--feature_type', type = str, default = 'mel')

args = parser.parse_args()
# create scaler
-scaler = StandardScaler()
+if args.feature_type == 'fastspeech2_pitch_energy':
+    scaler_pitch = StandardScaler()
+    scaler_energy = StandardScaler()
+else:
+    scaler = StandardScaler()
metadata = []
# load metadata
with open(args.metadata) as f:
@@ -32,13 +36,24 @@
for _meta in tqdm(metadata):
ID = _meta['ID']
spk = _meta['spk']
-    mel_path = os.path.join(args.dump_dir, args.split, 'mel', spk, ID+'.npy')
-    assert os.path.exists(mel_path), f'{mel_path}'
-    mel = np.load(mel_path)
-    scaler.partial_fit(mel)
-
-out_path = os.path.join(args.dump_dir, args.split, 'mel', args.split + '.npy')
-stats = np.stack([scaler.mean_, scaler.scale_], axis=0)
+    feature_path = os.path.join(args.dump_dir, args.split, args.feature_type, spk, ID+'.npy')
+    assert os.path.exists(feature_path), f'{feature_path}'
+    feature = np.load(feature_path)
+    if args.feature_type == 'fastspeech2_pitch_energy':
+        pitch = feature[0, :]
+        energy = feature[1, :]
+        scaler_pitch.partial_fit(pitch.reshape(-1, 1))
+        scaler_energy.partial_fit(energy.reshape(-1, 1))
+    else:
+        scaler.partial_fit(feature)
+
+out_path = os.path.join(args.dump_dir, args.split, args.feature_type, args.split + '.npy')
+
+if args.feature_type == 'fastspeech2_pitch_energy':
+    stats = np.stack([scaler_pitch.mean_, scaler_pitch.scale_, scaler_energy.mean_, scaler_energy.scale_], axis=0)
+else:
+    stats = np.stack([scaler.mean_, scaler.scale_], axis=0)
np.save(
out_path,
stats.astype(np.float32),
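With this change the saved stats file has two layouts: (2, n_mels) for spectrogram features, and (4, 1) for fastspeech2_pitch_energy (pitch mean, pitch scale, energy mean, energy scale). A small sketch of normalizing an extracted feature against it; the paths are placeholders:

import numpy as np

# Placeholder paths, for illustration only.
stats = np.load('dump/vctk/train_nodev_all/fastspeech2_pitch_energy/train_nodev_all.npy')
feature = np.load('dump/vctk/train_nodev_all/fastspeech2_pitch_energy/p225/p225_001.npy')

pitch_mean, pitch_scale, energy_mean, energy_scale = stats[:, 0]
normed = np.stack([
    (feature[0] - pitch_mean) / pitch_scale,    # normalized pitch track
    (feature[1] - energy_mean) / energy_scale,  # normalized energy track
])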
58 changes: 58 additions & 0 deletions prosodic_encoder/fastspeech2_pitch_energy/pitch_energy.py
@@ -0,0 +1,58 @@
import os
import random
import json

import librosa
import numpy as np
import pyworld as pw

def process_fastspeech2_pitch_energy(pitch_energy):
    # Imported by dataset.py; currently a no-op pass-through.
    return pitch_energy

def extract_energy(
audio,
sampling_rate,
fft_size=1024,
hop_size=256,
win_length=None,
window="hann",
    num_mels=80,  # num_mels/fmin/fmax/eps/log_base are accepted but unused here
    fmin=None,
    fmax=None,
    eps=1e-10,
    log_base=10.0,
):
# get amplitude spectrogram
x_stft = librosa.stft(
audio,
n_fft=fft_size,
hop_length=hop_size,
win_length=win_length,
window=window,
pad_mode="reflect",
)
spc = np.abs(x_stft).T # (#frames, #bins)
energy = np.linalg.norm(spc, axis = 1, ord = 2)
return energy

def extract_pitch_energy(audio, config):
pitch, t = pw.dio(
audio.astype(np.float64),
config['sampling_rate'],
frame_period=config['hop_size'] / config['sampling_rate'] * 1000,
)
pitch = pw.stonemask(audio.astype(np.float64), pitch, t, config['sampling_rate'])
energy = extract_energy(
audio,
sampling_rate=config['sampling_rate'],
hop_size=config['hop_size'],
fft_size=config["fft_size"],
win_length=config["win_length"],
window=config["window"],
num_mels=config["num_mels"],
fmin=config["fmin"],
fmax=config["fmax"]
)
pitch_energy = np.array([pitch, energy])
return pitch_energy
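
A minimal sanity check for the new extractor; the config values below are illustrative stand-ins for whatever configs/preprocess_vctk.yaml actually specifies:

import numpy as np
from prosodic_encoder.fastspeech2_pitch_energy.pitch_energy import extract_pitch_energy

config = {  # illustrative values, not the project's real preprocessing config
    'sampling_rate': 24000,
    'hop_size': 256,
    'fft_size': 1024,
    'win_length': None,
    'window': 'hann',
    'num_mels': 80,
    'fmin': 0,
    'fmax': 8000,
}
audio = np.random.randn(24000).astype(np.float32)  # one second of noise as a stand-in
pitch_energy = extract_pitch_energy(audio, config)
print(pitch_energy.shape)  # (2, n_frames): row 0 is pitch, row 1 is energy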

