extract and compute_statistics for fastspeech2_pitch_energy as prosodic_encoder
MingjieChen committed Feb 17, 2023
1 parent 8647381 commit 9ab61ef
Showing 7 changed files with 155 additions and 18 deletions.
3 changes: 2 additions & 1 deletion bin/compute_statistics.sh
@@ -7,4 +7,5 @@ source $conda/bin/activate $conda_env
python preprocess/compute_statistics.py \
--dump_dir dump/vctk/ \
--split train_nodev_all \
-  --metadata data/vctk/train_nodev_all/metadata.csv
+  --metadata data/vctk/train_nodev_all/metadata.csv \
+  --feature_type fastspeech2_pitch_energy
6 changes: 3 additions & 3 deletions bin/feature_extraction_multi_jobs.sh
@@ -6,8 +6,8 @@ conda_env=torch_1.7
# setup

dataset=vctk
-config=configs/preprocess_vctk_ppgvc_mel.yaml
-feature_type=ppgvc_f0
+config=configs/preprocess_vctk.yaml
+feature_type=fastspeech2_pitch_energy
splits="train_nodev_all dev_all"

script_dir=scripts/$dataset/preprocess
@@ -35,6 +35,6 @@ python3 feature_extraction.py \
EOF
chmod +x $b
submitjob -m 10000 $l $b
echo "submitjob for $dataset $split $spk $feature_type"
echo "submitjob for $dataset $split $spk $feature_type see log $l"
done
done
1 change: 1 addition & 0 deletions dataset.py
@@ -10,6 +10,7 @@
from torch.utils.data import DataLoader
from collections import defaultdict
from prosodic_encoder.ppgvc_f0.ppgvc_lf0 import get_cont_lf0 as process_ppgvc_f0
from prosodic_encoder.fastspeech2_pitch_energy.pitch_energy import process_fastspeech2_pitch_energy
def get_dataloader(config):
train_dataset = Dataset(config, config['train_meta'], config['train_set'])
dev_dataset = Dataset(config, config['dev_meta'], config['dev_set'])
67 changes: 63 additions & 4 deletions decoder/fastspeech2/module.py
@@ -10,19 +10,78 @@
Conv1DBlock,
TransformerBlock,
)

class DiscreteProsodicNet(nn.Module):
    def __init__(self, config):
        super().__init__()
        # Config values are read but not yet used: quantization is not
        # implemented, so this module passes prosodic features through unchanged.
        bins = config['prosodic_bins']
        quantize = config['quantize']
        prosodic_stats_path = config['prosodic_stats_path']

    def forward(self, x):
        return x
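(For orientation only: since `DiscreteProsodicNet` is a stub here, the sketch below shows what a quantizing version might look like, assuming the (4, 1) stats layout written by preprocess/compute_statistics.py and a hypothetical class name `DiscreteProsodicNetSketch`; none of this is part of the commit.)

import numpy as np
import torch
import torch.nn as nn

class DiscreteProsodicNetSketch(nn.Module):
    # Hypothetical illustration, not the committed implementation.
    def __init__(self, config):
        super().__init__()
        n_bins = config['prosodic_bins']
        stats = np.load(config['prosodic_stats_path'])  # rows: pitch mean/scale, energy mean/scale
        self.register_buffer('mean', torch.tensor([stats[0, 0], stats[2, 0]], dtype=torch.float))
        self.register_buffer('scale', torch.tensor([stats[1, 0], stats[3, 0]], dtype=torch.float))
        # Equal-width bins spanning roughly +/-3 standard deviations.
        self.register_buffer('bins', torch.linspace(-3.0, 3.0, n_bins - 1))
        self.pitch_embedding = nn.Embedding(n_bins, config['hidden_dim'])
        self.energy_embedding = nn.Embedding(n_bins, config['hidden_dim'])

    def forward(self, x):
        # x: (batch, frames, 2) with raw [pitch, energy] tracks.
        normed = (x - self.mean) / self.scale
        pitch_ids = torch.bucketize(normed[..., 0], self.bins)
        energy_ids = torch.bucketize(normed[..., 1], self.bins)
        return self.pitch_embedding(pitch_ids) + self.energy_embedding(energy_ids)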
class ContinuousProsodicNet(nn.Module):
    def __init__(self, config):
        super().__init__()
        hidden_dim = config['hidden_dim']
        # Conv stack over the two prosodic channels (pitch, energy).
        self.pitch_convs = torch.nn.Sequential(
            torch.nn.Conv1d(2, hidden_dim, kernel_size=1, bias=False),
            torch.nn.LeakyReLU(0.1),
            torch.nn.InstanceNorm1d(hidden_dim, affine=False),
            torch.nn.Conv1d(
                hidden_dim, hidden_dim,
                kernel_size=3,
                stride=1,
                padding=1,
            ),
            torch.nn.LeakyReLU(0.1),
            torch.nn.InstanceNorm1d(hidden_dim, affine=False),
            torch.nn.Conv1d(
                hidden_dim, hidden_dim,
                kernel_size=3,
                stride=1,
                padding=1,
            ),
            torch.nn.LeakyReLU(0.1),
            torch.nn.InstanceNorm1d(hidden_dim, affine=False),
        )

    def forward(self, x):
        # x: (batch, frames, 2) -> (batch, frames, hidden_dim)
        out = x.transpose(1, 2)
        out = self.pitch_convs(out)
        out = out.transpose(1, 2)
        return out


class VarianceAdaptor(nn.Module):
""" Variance Adaptor """

def __init__(self, model_config):
super(VarianceAdaptor, self).__init__()

self.d_model = model_config["transformer"]["encoder_hidden"]
self.projection = nn.Linear(model_config['spk_emb_dim'],model_config['transformer']['encoder_hidden'])
self.reduce_projection = nn.Linear(model_config['transformer']['encoder_hidden'] + model_config['spk_emb_dim'], model_config['transformer']['encoder_hidden'])
if model_config['prosodic_rep_type'] == 'continues':
self.pros_net = ContinuousProsodicNet(model_config['pitch_net'])
elif model_config['prosodic_rep_type'] == 'discrete':
self.pros_net = DiscreteProsodicNet(model_config['pitch_net'])

-    def forward(self, x, spk_emb, f0, mask, max_len):
+    def forward(self, x, spk_emb, pros_rep, mask, max_len):
        batch_size = x.size(0)
-        spk_emb = self.projection(spk_emb.squeeze(1)).unsqueeze(1)
-        x = x + spk_emb.expand(batch_size, max_len, self.d_model)
+        # integrate speaker embedding
+        spk_emb = F.normalize(spk_emb.squeeze(1)).unsqueeze(1)
+        x = torch.cat([x, spk_emb.expand(batch_size, max_len, self.d_model)], dim=-1)
+        x = self.reduce_projection(x)
+
+        # integrate prosodic rep
+        processed_pros_rep = self.pros_net(pros_rep)
+        x = x + processed_pros_rep

        if mask is not None:
            x = x.masked_fill(mask.unsqueeze(-1), 0)

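A quick shape check of the continuous path, with hypothetical sizes; it assumes pros_rep stacks pitch and energy as (batch, frames, 2) and that hidden_dim matches the transformer's encoder_hidden, so the residual addition in VarianceAdaptor.forward lines up:

import torch

B, T, H = 4, 120, 256  # hypothetical batch, frame count, hidden size
pros_net = ContinuousProsodicNet({'hidden_dim': H})
pros_rep = torch.randn(B, T, 2)  # (batch, frames, [pitch, energy])
out = pros_net(pros_rep)
print(out.shape)  # torch.Size([4, 120, 256]), added to x in VarianceAdaptor.forward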
5 changes: 4 additions & 1 deletion feature_extraction.py
@@ -5,6 +5,7 @@
import glob
from preprocess.audio_utils import mel_spectrogram, normalize
from prosodic_encoder.ppgvc_f0.ppgvc_lf0 import compute_f0 as compute_ppgvc_f0
from prosodic_encoder.fastspeech2_pitch_energy.pitch_energy import extract_pitch_energy as compute_fastspeech2_pitch_energy
import pyworld as pw
import librosa
import numpy as np
@@ -127,6 +128,8 @@ def process_speaker(spk_meta, spk, config, args):
feature = ppgvc_hifigan_logmelspectrogram(audio, config)
elif args.feature_type == 'ppgvc_f0':
feature = compute_ppgvc_f0(audio, sr = config['sampling_rate'], frame_period = 10.0)
elif args.feature_type == 'fastspeech2_pitch_energy':
feature = compute_fastspeech2_pitch_energy(audio, config)
feature_path = os.path.join(args.dump_dir, args.split, args.feature_type, spk, ID+'.npy')
os.makedirs(os.path.dirname(feature_path), exist_ok = True)
np.save(feature_path, feature)
@@ -143,7 +146,7 @@ def process_speaker(spk_meta, spk, config, args):
parser.add_argument('--split', type = str)
parser.add_argument('--max_workers', type = int, default = 20)
parser.add_argument('--speaker', type = str, default = None)
-parser.add_argument('--feature_type', type = str, default = 'mel', choices = ['mel', 'ppgvc_mel', 'ppgvc_f0', 'fastspeech2_f0'])
+parser.add_argument('--feature_type', type = str, default = 'mel', choices = ['mel', 'ppgvc_mel', 'ppgvc_f0', 'fastspeech2_pitch_energy'])
parser.add_argument('--pitch', default = False, action = 'store_true')
args = parser.parse_args()

33 changes: 24 additions & 9 deletions preprocess/compute_statistics.py
@@ -17,11 +17,15 @@
parser.add_argument('--dump_dir', type = str)
parser.add_argument('--metadata', type = str)
parser.add_argument('--split', type = str)
parser.add_argument('--feature_type', type = str, default = 'mel')

args = parser.parse_args()
# create scaler
-scaler = StandardScaler()
+if args.feature_type == 'fastspeech2_pitch_energy':
+    scaler_pitch = StandardScaler()
+    scaler_energy = StandardScaler()
+else:
+    scaler = StandardScaler()
metadata = []
# load metadata
with open(args.metadata) as f:
@@ -32,13 +36,24 @@
for _meta in tqdm(metadata):
ID = _meta['ID']
spk = _meta['spk']
-    mel_path = os.path.join(args.dump_dir, args.split, 'mel', spk, ID+'.npy')
-    assert os.path.exists(mel_path), f'{mel_path}'
-    mel = np.load(mel_path)
-    scaler.partial_fit(mel)
-
-out_path = os.path.join(args.dump_dir, args.split, 'mel', args.split + '.npy')
-stats = np.stack([scaler.mean_, scaler.scale_], axis=0)
+    feature_path = os.path.join(args.dump_dir, args.split, args.feature_type, spk, ID+'.npy')
+    assert os.path.exists(feature_path), f'{feature_path}'
+    feature = np.load(feature_path)
+    if args.feature_type == 'fastspeech2_pitch_energy':
+        pitch = feature[0, :]
+        energy = feature[1, :]
+        scaler_pitch.partial_fit(pitch.reshape(-1, 1))
+        scaler_energy.partial_fit(energy.reshape(-1, 1))
+    else:
+        scaler.partial_fit(feature)
+
+out_path = os.path.join(args.dump_dir, args.split, args.feature_type, args.split + '.npy')
+
+if args.feature_type == 'fastspeech2_pitch_energy':
+    stats = np.stack([scaler_pitch.mean_, scaler_pitch.scale_, scaler_energy.mean_, scaler_energy.scale_], axis=0)
+else:
+    stats = np.stack([scaler.mean_, scaler.scale_], axis=0)
np.save(
out_path,
stats.astype(np.float32),
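With this change the saved stats file has two layouts: (2, n_mels) for spectrogram features, and (4, 1) for fastspeech2_pitch_energy (pitch mean, pitch scale, energy mean, energy scale). A small sketch of normalizing an extracted feature against it; the paths are placeholders:

import numpy as np

# Placeholder paths, for illustration only.
stats = np.load('dump/vctk/train_nodev_all/fastspeech2_pitch_energy/train_nodev_all.npy')
feature = np.load('dump/vctk/train_nodev_all/fastspeech2_pitch_energy/p225/p225_001.npy')

pitch_mean, pitch_scale, energy_mean, energy_scale = stats[:, 0]
normed = np.stack([
    (feature[0] - pitch_mean) / pitch_scale,    # normalized pitch track
    (feature[1] - energy_mean) / energy_scale,  # normalized energy track
])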
58 changes: 58 additions & 0 deletions prosodic_encoder/fastspeech2_pitch_energy/pitch_energy.py
@@ -0,0 +1,58 @@
import os
import random
import json

import librosa
import numpy as np
import pyworld as pw

def process_fastspeech2_pitch_energy(pitch_energy):
    # Imported by dataset.py; currently a no-op pass-through.
    return pitch_energy

def extract_energy(
audio,
sampling_rate,
fft_size=1024,
hop_size=256,
win_length=None,
window="hann",
    num_mels=80,  # num_mels/fmin/fmax/eps/log_base are accepted but unused here
    fmin=None,
    fmax=None,
    eps=1e-10,
    log_base=10.0,
):
# get amplitude spectrogram
x_stft = librosa.stft(
audio,
n_fft=fft_size,
hop_length=hop_size,
win_length=win_length,
window=window,
pad_mode="reflect",
)
spc = np.abs(x_stft).T # (#frames, #bins)
energy = np.linalg.norm(spc, axis = 1, ord = 2)
return energy

def extract_pitch_energy(audio, config):
pitch, t = pw.dio(
audio.astype(np.float64),
config['sampling_rate'],
frame_period=config['hop_size'] / config['sampling_rate'] * 1000,
)
pitch = pw.stonemask(audio.astype(np.float64), pitch, t, config['sampling_rate'])
energy = extract_energy(
audio,
sampling_rate=config['sampling_rate'],
hop_size=config['hop_size'],
fft_size=config["fft_size"],
win_length=config["win_length"],
window=config["window"],
num_mels=config["num_mels"],
fmin=config["fmin"],
fmax=config["fmax"]
)
pitch_energy = np.array([pitch, energy])
return pitch_energy
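
A minimal sanity check for the new extractor; the config values below are illustrative stand-ins for whatever configs/preprocess_vctk.yaml actually specifies:

import numpy as np
from prosodic_encoder.fastspeech2_pitch_energy.pitch_energy import extract_pitch_energy

config = {  # illustrative values, not the project's real preprocessing config
    'sampling_rate': 24000,
    'hop_size': 256,
    'fft_size': 1024,
    'win_length': None,
    'window': 'hann',
    'num_mels': 80,
    'fmin': 0,
    'fmax': 8000,
}
audio = np.random.randn(24000).astype(np.float32)  # one second of noise as a stand-in
pitch_energy = extract_pitch_energy(audio, config)
print(pitch_energy.shape)  # (2, n_frames): row 0 is pitch, row 1 is energy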

