diff --git a/.gitignore b/.gitignore
deleted file mode 100644
index d223112..0000000
--- a/.gitignore
+++ /dev/null
@@ -1,3 +0,0 @@
-.idea
-monotonic_align/monotonic_align
-build
diff --git a/LICENSE b/LICENSE
index 6a6c318..c7202d4 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,6 +1,6 @@
 MIT License
 
-Copyright (c) 2021 Jaehyeon Kim
+Copyright (c) 2021 Jingyi Li
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/app.py b/app.py
deleted file mode 100644
index bfd3ab8..0000000
--- a/app.py
+++ /dev/null
@@ -1,103 +0,0 @@
-import gradio as gr
-import os
-os.system('cd monotonic_align && python setup.py build_ext --inplace && cd ..')
-
-import logging
-
-numba_logger = logging.getLogger('numba')
-numba_logger.setLevel(logging.WARNING)
-
-import librosa
-import torch
-
-import commons
-import utils
-from models import SynthesizerTrn
-from text.symbols import symbols
-from text import text_to_sequence
-def resize2d(source, target_len):
-    source[source<0.001] = np.nan
-    target = np.interp(np.arange(0, len(source)*target_len, len(source))/ target_len, np.arange(0, len(source)), source)
-    return np.nan_to_num(target)
-def convert_wav_22050_to_f0(audio):
-    tmp = librosa.pyin(audio,
-                       fmin=librosa.note_to_hz('C0'),
-                       fmax=librosa.note_to_hz('C7'),
-                       frame_length=1780)[0]
-    f0 = np.zeros_like(tmp)
-    f0[tmp>0] = tmp[tmp>0]
-    return f0
-
-def get_text(text, hps):
-    text_norm = text_to_sequence(text, hps.data.text_cleaners)
-    if hps.data.add_blank:
-        text_norm = commons.intersperse(text_norm, 0)
-    text_norm = torch.LongTensor(text_norm)
-    print(text_norm.shape)
-    return text_norm
-
-
-hps = utils.get_hparams_from_file("configs/ljs_base.json")
-hps_ms = utils.get_hparams_from_file("configs/vctk_base.json")
-net_g_ms = SynthesizerTrn(
-    len(symbols),
-    hps_ms.data.filter_length // 2 + 1,
-    hps_ms.train.segment_size // hps.data.hop_length,
-    n_speakers=hps_ms.data.n_speakers,
-    **hps_ms.model)
-
-import numpy as np
-
-hubert = torch.hub.load("bshall/hubert:main", "hubert_soft")
-
-_ = utils.load_checkpoint("G_312000.pth", net_g_ms, None)
-
-def vc_fn(input_audio,vc_transform):
-    if input_audio is None:
-        return "You need to upload an audio", None
-    sampling_rate, audio = input_audio
-    # print(audio.shape,sampling_rate)
-    duration = audio.shape[0] / sampling_rate
-    if duration > 30:
-        return "Error: Audio is too long", None
-    audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
-    if len(audio.shape) > 1:
-        audio = librosa.to_mono(audio.transpose(1, 0))
-    if sampling_rate != 16000:
-        audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)
-
-    audio22050 = librosa.resample(audio, orig_sr=16000, target_sr=22050)
-    f0 = convert_wav_22050_to_f0(audio22050)
-
-    source = torch.FloatTensor(audio).unsqueeze(0).unsqueeze(0)
-    print(source.shape)
-    with torch.inference_mode():
-        units = hubert.units(source)
-        soft = units.squeeze(0).numpy()
-        print(sampling_rate)
-        f0 = resize2d(f0, len(soft[:, 0])) * vc_transform
-        soft[:, 0] = f0 / 10
-    sid = torch.LongTensor([0])
-    stn_tst = torch.FloatTensor(soft)
-    with torch.no_grad():
-        x_tst = stn_tst.unsqueeze(0)
-        x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
-        audio = net_g_ms.infer(x_tst, x_tst_lengths,sid=sid, noise_scale=0.1, noise_scale_w=0.1, length_scale=1)[0][
-            0, 0].data.float().numpy()
-
-    return "Success", (hps.data.sampling_rate, audio)
-
-
-
-app = gr.Blocks()
-with app:
-    with gr.Tabs():
-        with gr.TabItem("Basic"):
-
vc_input3 = gr.Audio(label="Input Audio (30s limitation)") - vc_transform = gr.Number(label="transform",value=1.0) - vc_submit = gr.Button("Convert", variant="primary") - vc_output1 = gr.Textbox(label="Output Message") - vc_output2 = gr.Audio(label="Output Audio") - vc_submit.click(vc_fn, [ vc_input3,vc_transform], [vc_output1, vc_output2]) - - app.launch() \ No newline at end of file diff --git a/commons.py b/commons.py index 9ad0444..0748880 100644 --- a/commons.py +++ b/commons.py @@ -4,6 +4,23 @@ from torch import nn from torch.nn import functional as F +def slice_pitch_segments(x, ids_str, segment_size=4): + ret = torch.zeros_like(x[:, :segment_size]) + for i in range(x.size(0)): + idx_str = ids_str[i] + idx_end = idx_str + segment_size + ret[i] = x[i, idx_str:idx_end] + return ret + +def rand_slice_segments_with_pitch(x, pitch, x_lengths=None, segment_size=4): + b, d, t = x.size() + if x_lengths is None: + x_lengths = t + ids_str_max = x_lengths - segment_size + 1 + ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long) + ret = slice_segments(x, ids_str, segment_size) + ret_pitch = slice_pitch_segments(pitch, ids_str, segment_size) + return ret, ret_pitch, ids_str def init_weights(m, mean=0.0, std=0.01): classname = m.__class__.__name__ @@ -64,6 +81,16 @@ def rand_slice_segments(x, x_lengths=None, segment_size=4): return ret, ids_str +def rand_spec_segments(x, x_lengths=None, segment_size=4): + b, d, t = x.size() + if x_lengths is None: + x_lengths = t + ids_str_max = x_lengths - segment_size + ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long) + ret = slice_segments(x, ids_str, segment_size) + return ret, ids_str + + def get_timing_signal_1d( length, channels, min_timescale=1.0, max_timescale=1.0e4): position = torch.arange(length, dtype=torch.float) diff --git a/configs/nyarumul.json b/configs/config.json similarity index 58% rename from configs/nyarumul.json rename to configs/config.json index 446378c..4dcaf50 100644 --- a/configs/nyarumul.json +++ b/configs/config.json @@ -1,40 +1,39 @@ { "train": { "log_interval": 200, - "eval_interval": 2000, + "eval_interval": 200, "seed": 1234, "epochs": 10000, "learning_rate": 2e-4, "betas": [0.8, 0.99], "eps": 1e-9, "batch_size": 16, - "fp16_run": true, + "fp16_run": false, "lr_decay": 0.999875, - "segment_size": 8192, + "segment_size": 17920, "init_lr_ratio": 1, "warmup_epochs": 0, "c_mel": 45, - "c_kl": 1.0 + "c_kl": 1.0, + "use_sr": true, + "max_speclen": 384, + "port": "8001" }, "data": { - "training_files":"/content/drive/MyDrive/SingingVC/trainmul.txt", - "validation_files":"/content/drive/MyDrive/SingingVC/valmul.txt", - "text_cleaners":["english_cleaners2"], + "training_files":"filelists/train.txt", + "validation_files":"filelists/val.txt", "max_wav_value": 32768.0, - "sampling_rate": 22050, - "filter_length": 1024, - "hop_length": 256, - "win_length": 1024, + "sampling_rate": 48000, + "filter_length": 1280, + "hop_length": 320, + "win_length": 1280, "n_mel_channels": 80, "mel_fmin": 0.0, - "mel_fmax": null, - "add_blank": true, - "n_speakers": 3, - "cleaned_text": true + "mel_fmax": null }, "model": { "inter_channels": 192, - "hidden_channels": 256, + "hidden_channels": 192, "filter_channels": 768, "n_heads": 2, "n_layers": 6, @@ -43,11 +42,17 @@ "resblock": "1", "resblock_kernel_sizes": [3,7,11], "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], - "upsample_rates": [8,8,2,2], + "upsample_rates": [10,8,2,2], "upsample_initial_channel": 512, "upsample_kernel_sizes": 
[16,16,4,4], "n_layers_q": 3, "use_spectral_norm": false, - "gin_channels": 256 + "gin_channels": 256, + "ssl_dim": 256 + }, + "spk":{ + "nen": 0, + "paimon": 1, + "yunhao": 2 } } diff --git a/configs/nyarusing.json b/configs/nyarusing.json deleted file mode 100644 index f05fba2..0000000 --- a/configs/nyarusing.json +++ /dev/null @@ -1,52 +0,0 @@ -{ - "train": { - "log_interval": 200, - "eval_interval": 2000, - "seed": 1234, - "epochs": 20000, - "learning_rate": 2e-4, - "betas": [0.8, 0.99], - "eps": 1e-9, - "batch_size": 24, - "fp16_run": true, - "lr_decay": 0.999875, - "segment_size": 8192, - "init_lr_ratio": 1, - "warmup_epochs": 0, - "c_mel": 45, - "c_kl": 1.0 - }, - "data": { - "training_files":"/content/train.txt", - "validation_files":"/content/nyarusing/val.txt", - "text_cleaners":["english_cleaners2"], - "max_wav_value": 32768.0, - "sampling_rate": 22050, - "filter_length": 1024, - "hop_length": 256, - "win_length": 1024, - "n_mel_channels": 80, - "mel_fmin": 0.0, - "mel_fmax": null, - "add_blank": true, - "n_speakers": 0, - "cleaned_text": true - }, - "model": { - "inter_channels": 192, - "hidden_channels": 256, - "filter_channels": 768, - "n_heads": 2, - "n_layers": 6, - "kernel_size": 3, - "p_dropout": 0.1, - "resblock": "1", - "resblock_kernel_sizes": [3,7,11], - "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], - "upsample_rates": [8,8,2,2], - "upsample_initial_channel": 512, - "upsample_kernel_sizes": [16,16,4,4], - "n_layers_q": 3, - "use_spectral_norm": false - } -} diff --git a/convert.py b/convert.py new file mode 100644 index 0000000..1059d56 --- /dev/null +++ b/convert.py @@ -0,0 +1,142 @@ +import os +import argparse + +import numpy +import torch +import librosa +import time +from scipy.io.wavfile import write +from tqdm import tqdm + +import utils +from models import SynthesizerTrn +from mel_processing import mel_spectrogram_torch +from speaker_encoder.voice_encoder import SpeakerEncoder +import logging + +import parselmouth +import numpy as np + + +def stft(y): + return librosa.stft( + y=y, + n_fft=1280, + hop_length=160, + win_length=1280, + ) + + +def energy(y): + # Extract energy + S = librosa.magphase(stft(y))[0] + e = np.sqrt(np.sum(S ** 2, axis=0)) # np.linalg.norm(S, axis=0) + return e.squeeze() # (Number of frames) => (654,) + + +def get_energy(path, p_len=None): + wav, sr = librosa.load(path, 16000) + e = energy(wav) + if p_len is None: + p_len = wav.shape[0] // 160 + assert e.shape[0] - p_len < 2, (e.shape[0], p_len) + e = e[: p_len] + return e + + +def get_f0(path, p_len=None, f0_up_key=0): + x, _ = librosa.load(path, 16000) + if p_len is None: + p_len = x.shape[0] // 160 + else: + assert abs(p_len - x.shape[0] // 160) < 2, (path, p_len, x.shape) + time_step = 160 / 16000 * 1000 + f0_min = 50 + f0_max = 1100 + f0_mel_min = 1127 * np.log(1 + f0_min / 700) + f0_mel_max = 1127 * np.log(1 + f0_max / 700) + + f0 = parselmouth.Sound(x, 16000).to_pitch_ac( + time_step=time_step / 1000, voicing_threshold=0.6, + pitch_floor=f0_min, pitch_ceiling=f0_max).selected_array['frequency'] + + pad_size = (p_len - len(f0) + 1) // 2 + if (pad_size > 0 or p_len - len(f0) - pad_size > 0): + f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode='constant') + + f0bak = f0.copy() + f0 *= pow(2, f0_up_key / 12) + f0_mel = 1127 * np.log(1 + f0 / 700) + f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (f0_mel_max - f0_mel_min) + 1 + f0_mel[f0_mel <= 1] = 1 + f0_mel[f0_mel > 255] = 255 + f0_coarse = np.rint(f0_mel).astype(np.int) + return f0_coarse, f0bak 
+ + +logging.getLogger('numba').setLevel(logging.WARNING) + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--hpfile", type=str, default="configs/freevc.json", help="path to json config file") + parser.add_argument("--ptfile", type=str, default="logs/freevc/G_14000.pth", help="path to pth file") + parser.add_argument("--outdir", type=str, default="output", help="path to output dir") + parser.add_argument("--use_timestamp", default=False, action="store_true") + args = parser.parse_args() + + os.makedirs(args.outdir, exist_ok=True) + hps = utils.get_hparams_from_file(args.hpfile) + + print("Loading model...") + net_g = SynthesizerTrn( + hps.data.filter_length // 2 + 1, + hps.train.segment_size // hps.data.hop_length, + **hps.model).cuda() + _ = net_g.eval() + print("Loading checkpoint...") + _ = utils.load_checkpoint(args.ptfile, net_g, None) + + print("Loading WavLM for content...") + cmodel = utils.get_hubert_model(0) + + print("Processing text...") + titles, srcs, tgts, pshifts, eshifts = [], [], [], [], [] + + for line in open("convert.txt").readlines(): + sample, i, pshift, eshift = line.strip().split("|") + title = f"{sample[:-4]}-{i}-{pshift}-{eshift}" + src = f"sample/{sample}" + tgt = int(i) + titles.append(title) + srcs.append(src) + tgts.append(tgt) + + pshifts.append(int(pshift)) + eshifts.append(float(eshift)) + + print("Synthesizing...") + with torch.no_grad(): + for line in tqdm(zip(titles, srcs, tgts, pshifts, eshifts)): + title, src, tgt, pshift, eshift = line + # src + wav_src, _ = librosa.load(src, sr=16000) + wav_src = torch.from_numpy(wav_src).unsqueeze(0).cuda() + c = utils.get_hubert_content(cmodel, wav_src) + c = torch.repeat_interleave(c, repeats=2, dim=2) + # print(c.shape) + g = torch.LongTensor([[tgt]]).cuda() + cf0, f0bk = get_f0(src, c.shape[-1], f0_up_key=pshift) + f0 = torch.LongTensor(cf0).unsqueeze(0).cuda() + + e = get_energy(src, c.shape[-1]) * eshift + e = torch.LongTensor(e).unsqueeze(0).cuda() + + audio = net_g.infer(c, f0=f0, energy=e, g=g) + audio = audio[0][0].data.cpu().float().numpy() + if args.use_timestamp: + timestamp = time.strftime("%m-%d_%H-%M", time.localtime()) + write(os.path.join(args.outdir, "{}.wav".format(timestamp + "_" + title)), hps.data.sampling_rate, + audio) + else: + write(os.path.join(args.outdir, f"{title}.wav"), hps.data.sampling_rate, audio) + diff --git a/convert.txt b/convert.txt new file mode 100644 index 0000000..5b5219e --- /dev/null +++ b/convert.txt @@ -0,0 +1 @@ +cxk.wav|0|12|0.5 diff --git a/data_utils.py b/data_utils.py index e2f1645..aa7ace8 100644 --- a/data_utils.py +++ b/data_utils.py @@ -4,75 +4,43 @@ import numpy as np import torch import torch.utils.data -import numpy as np + import commons -from mel_processing import spectrogram_torch -from utils import load_wav_to_torch, load_filepaths_and_text -from text import text_to_sequence, cleaned_text_to_sequence +from mel_processing import spectrogram_torch, spec_to_mel_torch +from utils import load_wav_to_torch, load_filepaths_and_text, transform +# import h5py -def dropout1d(myarray, ratio=0.5): - indices = np.random.choice(np.arange(myarray.size), replace=False, - size=int(myarray.size * ratio)) - myarray[indices] = 0 - return myarray +"""Multi speaker version""" -class TextAudioLoader(torch.utils.data.Dataset): + +class TextAudioSpeakerLoader(torch.utils.data.Dataset): """ - 1) loads audio, text pairs + 1) loads audio, speaker_id, text pairs 2) normalizes text and converts them to sequences of integers 3) computes 
spectrograms from audio files. """ - def __init__(self, audiopaths_and_text, hparams): - self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text) - self.text_cleaners = hparams.text_cleaners - self.max_wav_value = hparams.max_wav_value - self.sampling_rate = hparams.sampling_rate - self.filter_length = hparams.filter_length - self.hop_length = hparams.hop_length - self.win_length = hparams.win_length - self.sampling_rate = hparams.sampling_rate - - self.cleaned_text = getattr(hparams, "cleaned_text", False) - - self.add_blank = hparams.add_blank - self.min_text_len = getattr(hparams, "min_text_len", 1) - self.max_text_len = getattr(hparams, "max_text_len", 190) + def __init__(self, audiopaths, hparams): + self.audiopaths = load_filepaths_and_text(audiopaths) + self.max_wav_value = hparams.data.max_wav_value + self.sampling_rate = hparams.data.sampling_rate + self.filter_length = hparams.data.filter_length + self.hop_length = hparams.data.hop_length + self.win_length = hparams.data.win_length + self.sampling_rate = hparams.data.sampling_rate + self.use_sr = hparams.train.use_sr + self.spec_len = hparams.train.max_speclen + self.spk_map = hparams.spk random.seed(1234) - random.shuffle(self.audiopaths_and_text) - self._filter() - - def _filter(self): - """ - Filter text & store spec lengths - """ - # Store spectrogram lengths for Bucketing - # wav_length ~= file_size / (wav_channels * Bytes per dim) = file_size / (1 * 2) - # spec_length = wav_length // hop_length - lengths = [] - for audiopath, text, pitch in self.audiopaths_and_text: - lengths.append(os.path.getsize(audiopath) // (2 * self.hop_length)) - self.lengths = lengths - - def get_audio_text_pair(self, audiopath_and_text): - # separate filename and text - audiopath, text, pitch = audiopath_and_text[0], audiopath_and_text[1],audiopath_and_text[2] - text = self.get_text(text) - spec, wav = self.get_audio(audiopath) - pitch = self.get_pitch(pitch) - return (text, spec, wav, pitch) - - def get_pitch(self, pitch): - - return torch.LongTensor(np.load(pitch)) + random.shuffle(self.audiopaths) def get_audio(self, filename): audio, sampling_rate = load_wav_to_torch(filename) if sampling_rate != self.sampling_rate: - raise ValueError("{} {} SR doesn't match target {} SR".format( + raise ValueError("{} SR doesn't match target {} SR".format( sampling_rate, self.sampling_rate)) audio_norm = audio / self.max_wav_value audio_norm = audio_norm.unsqueeze(0) @@ -85,138 +53,67 @@ def get_audio(self, filename): center=False) spec = torch.squeeze(spec, 0) torch.save(spec, spec_filename) - return spec, audio_norm - def get_text(self, text): - soft = np.load(text) - text_norm = torch.FloatTensor(soft) - return text_norm + spk = filename.split("/")[-2] + spk = torch.LongTensor([self.spk_map[spk]]) + + c = torch.load(filename + ".soft.pt").squeeze(0) + c = torch.repeat_interleave(c, repeats=3, dim=1) + + f0 = np.load(filename + ".f0.npy") + f0 = torch.FloatTensor(f0) + lmin = min(c.size(-1), spec.size(-1), f0.shape[0]) + assert abs(c.size(-1) - spec.size(-1)) < 4, (c.size(-1), spec.size(-1), f0.shape, filename) + assert abs(lmin - spec.size(-1)) < 4, (c.size(-1), spec.size(-1), f0.shape) + assert abs(lmin - c.size(-1)) < 4, (c.size(-1), spec.size(-1), f0.shape) + spec, c, f0 = spec[:, :lmin], c[:, :lmin], f0[:lmin] + audio_norm = audio_norm[:, :lmin * self.hop_length] + _spec, _c, _audio_norm, _f0 = spec, c, audio_norm, f0 + while spec.size(-1) < self.spec_len: + spec = torch.cat((spec, _spec), -1) + c = torch.cat((c, _c), -1) + f0 = 
torch.cat((f0, _f0), -1) + audio_norm = torch.cat((audio_norm, _audio_norm), -1) + start = random.randint(0, spec.size(-1) - self.spec_len) + end = start + self.spec_len + spec = spec[:, start:end] + c = c[:, start:end] + f0 = f0[start:end] + audio_norm = audio_norm[:, start * self.hop_length:end * self.hop_length] + + return c, f0, spec, audio_norm, spk def __getitem__(self, index): - return self.get_audio_text_pair(self.audiopaths_and_text[index]) + return self.get_audio(self.audiopaths[index][0]) def __len__(self): - return len(self.audiopaths_and_text) + return len(self.audiopaths) -class TextAudioCollate(): - """ Zero-pads model inputs and targets - """ - - def __init__(self, return_ids=False): - self.return_ids = return_ids - - def __call__(self, batch): - """Collate's training batch from normalized text and aduio - PARAMS - ------ - batch: [text_normalized, spec_normalized, wav_normalized] - """ - # Right zero-pad all one-hot text sequences to max input length - _, ids_sorted_decreasing = torch.sort( - torch.LongTensor([x[1].size(1) for x in batch]), - dim=0, descending=True) - - max_text_len = max([len(x[0]) for x in batch]) - max_spec_len = max([x[1].size(1) for x in batch]) - max_wav_len = max([x[2].size(1) for x in batch]) - max_pitch_len = max([x[3].shape[0] for x in batch]) - # print(batch) - - - text_lengths = torch.LongTensor(len(batch)) - spec_lengths = torch.LongTensor(len(batch)) - wav_lengths = torch.LongTensor(len(batch)) - - text_padded = torch.FloatTensor(len(batch), max_text_len, 256) - spec_padded = torch.FloatTensor(len(batch), batch[0][1].size(0), max_spec_len) - wav_padded = torch.FloatTensor(len(batch), 1, max_wav_len) - pitch_padded = torch.LongTensor(len(batch), max_pitch_len) - - text_padded.zero_() - spec_padded.zero_() - wav_padded.zero_() - pitch_padded.zero_() - for i in range(len(ids_sorted_decreasing)): - row = batch[ids_sorted_decreasing[i]] - - text = row[0] - text_padded[i, :text.size(0), :] = text - text_lengths[i] = text.size(0) - - spec = row[1] - spec_padded[i, :, :spec.size(1)] = spec - spec_lengths[i] = spec.size(1) - - wav = row[2] - wav_padded[i, :, :wav.size(1)] = wav - wav_lengths[i] = wav.size(1) - - pitch = row[3] - pitch_padded[i, :pitch.size(0)] = pitch - - if self.return_ids: - return text_padded, text_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths, ids_sorted_decreasing, pitch_padded - return text_padded, text_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths, pitch_padded - - -"""Multi speaker version""" - - -class TextAudioSpeakerLoader(torch.utils.data.Dataset): +class EvalDataLoader(torch.utils.data.Dataset): """ 1) loads audio, speaker_id, text pairs 2) normalizes text and converts them to sequences of integers 3) computes spectrograms from audio files. 
""" - def __init__(self, audiopaths_sid_text, hparams): - self.audiopaths_sid_text = load_filepaths_and_text(audiopaths_sid_text) - self.text_cleaners = hparams.text_cleaners - self.max_wav_value = hparams.max_wav_value - self.sampling_rate = hparams.sampling_rate - self.filter_length = hparams.filter_length - self.hop_length = hparams.hop_length - self.win_length = hparams.win_length - self.sampling_rate = hparams.sampling_rate - - self.cleaned_text = getattr(hparams, "cleaned_text", False) - - self.add_blank = hparams.add_blank - self.min_text_len = getattr(hparams, "min_text_len", 1) - self.max_text_len = getattr(hparams, "max_text_len", 190) - - random.seed(1234) - random.shuffle(self.audiopaths_sid_text) - self._filter() + def __init__(self, audiopaths, hparams): + self.audiopaths = load_filepaths_and_text(audiopaths) + self.max_wav_value = hparams.data.max_wav_value + self.sampling_rate = hparams.data.sampling_rate + self.filter_length = hparams.data.filter_length + self.hop_length = hparams.data.hop_length + self.win_length = hparams.data.win_length + self.sampling_rate = hparams.data.sampling_rate + self.use_sr = hparams.train.use_sr + self.audiopaths = self.audiopaths[:10] + self.spk_map = hparams.spk - def _filter(self): - """ - Filter text & store spec lengths - """ - # Store spectrogram lengths for Bucketing - # wav_length ~= file_size / (wav_channels * Bytes per dim) = file_size / (1 * 2) - # spec_length = wav_length // hop_length - - lengths = [] - for audiopath, sid, text, pitch in self.audiopaths_sid_text: - lengths.append(os.path.getsize(audiopath) // (2 * self.hop_length)) - self.lengths = lengths - - def get_audio_text_speaker_pair(self, audiopath_sid_text): - # separate filename, speaker_id and text - audiopath, sid, text, pitch = audiopath_sid_text[0], audiopath_sid_text[1], audiopath_sid_text[2], audiopath_sid_text[3] - text = self.get_text(text) - spec, wav = self.get_audio(audiopath) - sid = self.get_sid(sid) - pitch = self.get_pitch(pitch) - - return (text, spec, wav, pitch, sid) def get_audio(self, filename): audio, sampling_rate = load_wav_to_torch(filename) if sampling_rate != self.sampling_rate: - raise ValueError("{} {} SR doesn't match target {} SR".format( + raise ValueError("{} SR doesn't match target {} SR".format( sampling_rate, self.sampling_rate)) audio_norm = audio / self.max_wav_value audio_norm = audio_norm.unsqueeze(0) @@ -229,187 +126,27 @@ def get_audio(self, filename): center=False) spec = torch.squeeze(spec, 0) torch.save(spec, spec_filename) - return spec, audio_norm - - def get_text(self, text): - soft = np.load(text) - text_norm = torch.FloatTensor(soft) - return text_norm - - def get_pitch(self, pitch): - return torch.LongTensor(np.load(pitch)) - def get_sid(self, sid): - sid = torch.LongTensor([int(sid)]) - return sid + spk = filename.split("/")[-2] + spk = torch.LongTensor([self.spk_map[spk]]) - def __getitem__(self, index): - return self.get_audio_text_speaker_pair(self.audiopaths_sid_text[index]) - - def __len__(self): - return len(self.audiopaths_sid_text) - - -class TextAudioSpeakerCollate(): - """ Zero-pads model inputs and targets - """ - - def __init__(self, return_ids=False): - self.return_ids = return_ids - - def __call__(self, batch): - """Collate's training batch from normalized text, audio and speaker identities - PARAMS - ------ - batch: [text_normalized, spec_normalized, wav_normalized, sid] - """ - # Right zero-pad all one-hot text sequences to max input length - _, ids_sorted_decreasing = torch.sort( - 
torch.LongTensor([x[1].size(1) for x in batch]), - dim=0, descending=True) - - max_text_len = max([len(x[0]) for x in batch]) - max_spec_len = max([x[1].size(1) for x in batch]) - max_wav_len = max([x[2].size(1) for x in batch]) - max_pitch_len = max([x[3].shape[0] for x in batch]) - - text_lengths = torch.LongTensor(len(batch)) - spec_lengths = torch.LongTensor(len(batch)) - wav_lengths = torch.LongTensor(len(batch)) - sid = torch.LongTensor(len(batch)) + c = torch.load(filename + ".soft.pt").squeeze(0) - text_padded = torch.FloatTensor(len(batch), max_text_len, 256) - spec_padded = torch.FloatTensor(len(batch), batch[0][1].size(0), max_spec_len) - wav_padded = torch.FloatTensor(len(batch), 1, max_wav_len) - pitch_padded = torch.LongTensor(len(batch), max_pitch_len) + c = torch.repeat_interleave(c, repeats=3, dim=1) - text_padded.zero_() - spec_padded.zero_() - wav_padded.zero_() - pitch_padded.zero_() + f0 = np.load(filename + ".f0.npy") + f0 = torch.FloatTensor(f0) + lmin = min(c.size(-1), spec.size(-1), f0.shape[0]) + assert abs(c.size(-1) - spec.size(-1)) < 4, (c.size(-1), spec.size(-1), f0.shape) + assert abs(f0.shape[0] - spec.shape[-1]) < 4, (c.size(-1), spec.size(-1), f0.shape) + spec, c, f0 = spec[:, :lmin], c[:, :lmin], f0[:lmin] + audio_norm = audio_norm[:, :lmin * self.hop_length] - for i in range(len(ids_sorted_decreasing)): - row = batch[ids_sorted_decreasing[i]] + return c, f0, spec, audio_norm, spk - text = row[0] - text_padded[i, :text.size(0)] = text - text_lengths[i] = text.size(0) - - spec = row[1] - spec_padded[i, :, :spec.size(1)] = spec - spec_lengths[i] = spec.size(1) - - wav = row[2] - wav_padded[i, :, :wav.size(1)] = wav - wav_lengths[i] = wav.size(1) - - pitch = row[3] - pitch_padded[i, :pitch.size(0)] = pitch - - sid[i] = row[4] - - if self.return_ids: - return text_padded, text_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths, pitch_padded, sid, ids_sorted_decreasing - return text_padded, text_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths,pitch_padded , sid - - -class DistributedBucketSampler(torch.utils.data.distributed.DistributedSampler): - """ - Maintain similar input lengths in a batch. - Length groups are specified by boundaries. - Ex) boundaries = [b1, b2, b3] -> any batch is included either {x | b1 < length(x) <=b2} or {x | b2 < length(x) <= b3}. - - It removes samples which are not included in the boundaries. - Ex) boundaries = [b1, b2, b3] -> any x s.t. length(x) <= b1 or length(x) > b3 are discarded. 
- """ - - def __init__(self, dataset, batch_size, boundaries, num_replicas=None, rank=None, shuffle=True): - super().__init__(dataset, num_replicas=num_replicas, rank=rank, shuffle=shuffle) - self.lengths = dataset.lengths - self.batch_size = batch_size - self.boundaries = boundaries - - self.buckets, self.num_samples_per_bucket = self._create_buckets() - self.total_size = sum(self.num_samples_per_bucket) - self.num_samples = self.total_size // self.num_replicas - - def _create_buckets(self): - buckets = [[] for _ in range(len(self.boundaries) - 1)] - for i in range(len(self.lengths)): - length = self.lengths[i] - idx_bucket = self._bisect(length) - if idx_bucket != -1: - buckets[idx_bucket].append(i) - - for i in range(len(buckets) - 1, 0, -1): - if len(buckets[i]) == 0: - buckets.pop(i) - self.boundaries.pop(i + 1) - - num_samples_per_bucket = [] - for i in range(len(buckets)): - len_bucket = len(buckets[i]) - total_batch_size = self.num_replicas * self.batch_size - rem = (total_batch_size - (len_bucket % total_batch_size)) % total_batch_size - num_samples_per_bucket.append(len_bucket + rem) - return buckets, num_samples_per_bucket - - def __iter__(self): - # deterministically shuffle based on epoch - g = torch.Generator() - g.manual_seed(self.epoch) - - indices = [] - if self.shuffle: - for bucket in self.buckets: - indices.append(torch.randperm(len(bucket), generator=g).tolist()) - else: - for bucket in self.buckets: - indices.append(list(range(len(bucket)))) - - batches = [] - for i in range(len(self.buckets)): - bucket = self.buckets[i] - len_bucket = len(bucket) - if len_bucket == 0: - continue - ids_bucket = indices[i] - num_samples_bucket = self.num_samples_per_bucket[i] - - # add extra samples to make it evenly divisible - rem = num_samples_bucket - len_bucket - ids_bucket = ids_bucket + ids_bucket * (rem // len_bucket) + ids_bucket[:(rem % len_bucket)] - - # subsample - ids_bucket = ids_bucket[self.rank::self.num_replicas] - - # batching - for j in range(len(ids_bucket) // self.batch_size): - batch = [bucket[idx] for idx in ids_bucket[j * self.batch_size:(j + 1) * self.batch_size]] - batches.append(batch) - - if self.shuffle: - batch_ids = torch.randperm(len(batches), generator=g).tolist() - batches = [batches[i] for i in batch_ids] - self.batches = batches - - assert len(self.batches) * self.batch_size == self.num_samples - return iter(self.batches) - - def _bisect(self, x, lo=0, hi=None): - if hi is None: - hi = len(self.boundaries) - 1 - - if hi > lo: - mid = (hi + lo) // 2 - if self.boundaries[mid] < x and x <= self.boundaries[mid + 1]: - return mid - elif x <= self.boundaries[mid]: - return self._bisect(x, lo, mid) - else: - return self._bisect(x, mid + 1, hi) - else: - return -1 + def __getitem__(self, index): + return self.get_audio(self.audiopaths[index][0]) def __len__(self): - return self.num_samples // self.batch_size + return len(self.audiopaths) + diff --git a/downsample.py b/downsample.py new file mode 100644 index 0000000..8ea37f7 --- /dev/null +++ b/downsample.py @@ -0,0 +1,47 @@ +import os +import argparse +import librosa +import numpy as np +from multiprocessing import Pool, cpu_count +from scipy.io import wavfile +from tqdm import tqdm + + +def process(item): + spkdir, wav_name, args = item + # speaker 's5', 'p280', 'p315' are excluded, + speaker = spkdir.split("/")[-1] + wav_path = os.path.join(args.in_dir, speaker, wav_name) + if os.path.exists(wav_path) and '.wav' in wav_path: + os.makedirs(os.path.join(args.out_dir2, speaker), exist_ok=True) + wav, sr = 
librosa.load(wav_path, None) + wav, _ = librosa.effects.trim(wav, top_db=20) + peak = np.abs(wav).max() + if peak > 1.0: + wav = 0.98 * wav / peak + wav2 = librosa.resample(wav, orig_sr=sr, target_sr=args.sr2) + save_name = wav_name + save_path2 = os.path.join(args.out_dir2, speaker, save_name) + wavfile.write( + save_path2, + args.sr2, + (wav2 * np.iinfo(np.int16).max).astype(np.int16) + ) + + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--sr2", type=int, default=48000, help="sampling rate") + parser.add_argument("--in_dir", type=str, default="./raw", help="path to source dir") + parser.add_argument("--out_dir2", type=str, default="./dataset/48k", help="path to target dir") + args = parser.parse_args() + processs = cpu_count()-2 if cpu_count() >4 else 1 + pool = Pool(processes=processs) + + for speaker in os.listdir(args.in_dir): + spk_dir = os.path.join(args.in_dir, speaker) + if os.path.isdir(spk_dir): + print([(spk_dir, i) for i in os.listdir(spk_dir) if i.endswith("wav")]) + for _ in tqdm(pool.imap_unordered(process, [(spk_dir, i, args) for i in os.listdir(spk_dir) if i.endswith("wav")])): + pass diff --git a/filelists/test.txt b/filelists/test.txt new file mode 100644 index 0000000..f2cfa66 --- /dev/null +++ b/filelists/test.txt @@ -0,0 +1,12 @@ +./dataset/48k/paimon/vo_ABLQ004_5_paimon_02.wav +./dataset/48k/paimon/vo_ABLQ005_2_paimon_01.wav +./dataset/48k/nen/kne110_005.wav +./dataset/48k/paimon/vo_ABLQ004_6_paimon_02.wav +./dataset/48k/paimon/vo_ABLQ004_6_paimon_01.wav +./dataset/48k/nen/kne110_003.wav +./dataset/48k/paimon/vo_ABLQ004_7_paimon_01.wav +./dataset/48k/nen/kne110_004.wav +./dataset/48k/paimon/vo_ABLQ005_2_paimon_02.wav +./dataset/48k/nen/kne110_001.wav +./dataset/48k/nen/kne110_006.wav +./dataset/48k/nen/kne110_002.wav diff --git a/filelists/train.txt b/filelists/train.txt new file mode 100644 index 0000000..e69de29 diff --git a/filelists/val.txt b/filelists/val.txt new file mode 100644 index 0000000..5b080dd --- /dev/null +++ b/filelists/val.txt @@ -0,0 +1,4 @@ +./dataset/48k/paimon/vo_ABLQ005_2_paimon_02.wav +./dataset/48k/nen/kne110_006.wav +./dataset/48k/nen/kne110_002.wav +./dataset/48k/paimon/vo_ABLQ004_5_paimon_02.wav diff --git a/harmof0/__init__.py b/harmof0/__init__.py deleted file mode 100644 index d983b03..0000000 --- a/harmof0/__init__.py +++ /dev/null @@ -1,18 +0,0 @@ -# from https://github.com/wx-wei/harmof0 -from .network import HarmoF0 -from .pitch_tracker import PitchTracker -import torchaudio -import torch -pit = PitchTracker() - - -def extract_file_f0(path): - waveform, sr = torchaudio.load(path) - time, freq, activation, activation_map = pit.pred(waveform, sr) - return freq - - -def extract_wav_f0(wav_1d, sr): - wav = torch.FloatTensor(wav_1d).unsqueeze(0) - time, freq, activation, activation_map = pit.pred(wav, sr) - return freq diff --git a/harmof0/checkpoints/checkpoint_mir-1k.pth b/harmof0/checkpoints/checkpoint_mir-1k.pth deleted file mode 100644 index cc32126..0000000 Binary files a/harmof0/checkpoints/checkpoint_mir-1k.pth and /dev/null differ diff --git a/harmof0/checkpoints/mdb-stem-synth.pth b/harmof0/checkpoints/mdb-stem-synth.pth deleted file mode 100644 index df6eeb3..0000000 Binary files a/harmof0/checkpoints/mdb-stem-synth.pth and /dev/null differ diff --git a/harmof0/layers.py b/harmof0/layers.py deleted file mode 100644 index 3f72249..0000000 --- a/harmof0/layers.py +++ /dev/null @@ -1,113 +0,0 @@ -import torch -import torch.nn as nn -import torch.nn.functional as F -import torchaudio - 
-import numpy as np -from torchaudio.transforms import Spectrogram - -# Multiple Rate Dilated Convolution -class MRDConv(nn.Module): - def __init__(self, in_channels, out_channels, dilation_list = [0, 12, 19, 24, 28, 31, 34, 36]): - super().__init__() - self.dilation_list = dilation_list - self.conv_list = [] - for i in range(len(dilation_list)): - self.conv_list += [nn.Conv2d(in_channels, out_channels, kernel_size = [1, 1])] - self.conv_list = nn.ModuleList(self.conv_list) - - def forward(self, specgram): - # input [b x C x T x n_freq] - # output: [b x C x T x n_freq] - specgram - dilation = self.dilation_list[0] - y = self.conv_list[0](specgram) - y = F.pad(y, pad=[0, dilation]) - y = y[:, :, :, dilation:] - for i in range(1, len(self.conv_list)): - dilation = self.dilation_list[i] - x = self.conv_list[i](specgram) - # => [b x T x (n_freq + dilation)] - # x = F.pad(x, pad=[0, dilation]) - x = x[:, :, :, dilation:] - n_freq = x.size()[3] - y[:, :, :, :n_freq] += x - - return y - -# Fixed Rate Dilated Casual Convolution -class FRDConv(nn.Module): - def __init__(self, in_channels, out_channels, kernel_size=[1,3], dilation=[1, 1]) -> None: - super().__init__() - right = (kernel_size[1]-1) * dilation[1] - bottom = (kernel_size[0]-1) * dilation[0] - self.padding = nn.ZeroPad2d([0, right, 0 , bottom]) - self.conv2d = nn.Conv2d(in_channels, out_channels, kernel_size, dilation=dilation) - - def forward(self,x): - x = self.padding(x) - x = self.conv2d(x) - return x - - -class WaveformToLogSpecgram(nn.Module): - def __init__(self, sample_rate, n_fft, fmin, bins_per_octave, freq_bins, hop_length, logspecgram_type): #, device - super().__init__() - - e = freq_bins/bins_per_octave - fmax = fmin * (2 ** e) - - self.logspecgram_type = logspecgram_type - self.n_fft = n_fft - hamming_window = torch.hann_window(self.n_fft)#.to(device) - # => [1 x 1 x n_fft] - hamming_window = hamming_window[None, None, :] - self.register_buffer("hamming_window", hamming_window, persistent=False) - - # torch.hann_window() - - fre_resolution = sample_rate/n_fft - - idxs = torch.arange(0, freq_bins) #, device=device - - log_idxs = fmin * (2**(idxs/bins_per_octave)) / fre_resolution - - # Linear interpolation: y_k = y_i * (k-i) + y_{i+1} * ((i+1)-k) - log_idxs_floor = torch.floor(log_idxs).long() - log_idxs_floor_w = (log_idxs - log_idxs_floor).reshape([1, 1, freq_bins]) - log_idxs_ceiling = torch.ceil(log_idxs).long() - log_idxs_ceiling_w = (log_idxs_ceiling - log_idxs).reshape([1, 1, freq_bins]) - self.register_buffer("log_idxs_floor", log_idxs_floor, persistent=False) - self.register_buffer("log_idxs_floor_w", log_idxs_floor_w, persistent=False) - self.register_buffer("log_idxs_ceiling", log_idxs_ceiling, persistent=False) - self.register_buffer("log_idxs_ceiling_w", log_idxs_ceiling_w, persistent=False) - - self.waveform_to_specgram = torchaudio.transforms.Spectrogram(n_fft, hop_length=hop_length)#.to(device) - - assert(bins_per_octave % 12 == 0) - bins_per_semitone = bins_per_octave // 12 - - self.amplitude_to_db = torchaudio.transforms.AmplitudeToDB(top_db=80) - - def forward(self, waveforms): - # inputs: [b x num_frames x frame_len] - # outputs: [b x num_frames x n_bins] - - if(self.logspecgram_type == 'logharmgram'): - waveforms = waveforms * self.hamming_window - specgram = torch.fft.fft(waveforms) - specgram = torch.abs(specgram[:, :, :self.n_fft//2 + 1]) - specgram = specgram * specgram - # => [num_frames x n_fft//2 x 1] - # specgram = torch.unsqueeze(specgram, dim=2) - - # => [b x freq_bins x T] - specgram = 
specgram[:,:, self.log_idxs_floor] * self.log_idxs_floor_w + specgram[:, :, self.log_idxs_ceiling] * self.log_idxs_ceiling_w - - specgram_db = self.amplitude_to_db(specgram) - # specgram_db = specgram_db[:, :, :-1] # remove the last frame. - # specgram_db = specgram_db.permute([0, 2, 1]) - return specgram_db - - - diff --git a/harmof0/main.py b/harmof0/main.py deleted file mode 100644 index 9c91868..0000000 --- a/harmof0/main.py +++ /dev/null @@ -1,46 +0,0 @@ -import argparse -from .pitch_tracker import PitchTracker -import torch -import os - -# @ex.automain -# def main( -# audio_path = 'wav/a.mp3', -# device = "cuda" if torch.cuda.is_available() else "cpu", -# checkpoint_path = 'checkpoints/checkpoint_mdb-stem-synth.pth', -# output_dir = None, -# save_activation = True, -# frames_per_step = 6000, -# ): -# pitch_tracker = PitchTracker(checkpoint_path, device=device, frames_per_step=frames_per_step) -# pitch_tracker.pred_file(audio_path, output_dir, save_activation) - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument('audio_path', type=str,) - parser.add_argument('--output-dir', type=str, default=None) - parser.add_argument('--device', type=str, default="cuda" if torch.cuda.is_available() else "cpu", help="cpu or cuda") - parser.add_argument('--save-activation', type=eval, default=True, help="Save the activation as png.") - parser.add_argument('--frames-per-step', type=int, default=1000, help="The number of frames for a step.") - parser.add_argument('--hop-length', type=int, default=160, help="The sample rate is 16000, so the default 160 means 10 milliseconds.") - parser.add_argument('--post-processing', type=eval, default=True, help="use post processing.") - parser.add_argument('--high-threshold', type=float, default=0.8, help="high threshold for post processing.") - parser.add_argument('--low-threshold', type=float, default=0.1, help="low threshold for post processing.") - parser.add_argument('--min-pitch-dur', type=float, default=0.1, help="min pitch duration for post processing.") - parser.add_argument("--n-beam", type=int, default=5, help="beam number of post processing.") - parser.add_argument('--checkpoint-path', type=str, default=None, help="The path to pretrained model weight.") - - args = parser.parse_args() - - pitch_tracker = PitchTracker( - args.checkpoint_path, - hop_length=args.hop_length, - device=args.device, - frames_per_step=args.frames_per_step, - post_processing=args.post_processing, - high_threshold=args.high_threshold, - low_threshold=args.low_threshold, - min_pitch_dur=args.min_pitch_dur, - n_beam=args.n_beam, - ) - pitch_tracker.pred_file(args.audio_path, args.output_dir, args.save_activation) diff --git a/harmof0/network.py b/harmof0/network.py deleted file mode 100644 index ecfce9a..0000000 --- a/harmof0/network.py +++ /dev/null @@ -1,121 +0,0 @@ -from math import sin -import torch -from torch._C import has_openmp -import torch.nn as nn -import torch.nn.functional as F - -import numpy as np - -from .layers import MRDConv, FRDConv, WaveformToLogSpecgram - -def dila_conv_block( - in_channel, out_channel, - bins_per_octave, - n_har, - dilation_mode, - dilation_rate, - dil_kernel_size, - kernel_size = [1,3], - padding = [0,1], -): - - conv = nn.Conv2d(in_channel, out_channel, kernel_size=kernel_size, padding=padding) - batch_norm = nn.BatchNorm2d(out_channel) - - # dilation mode: 'log_scale', 'fixed' - if(dilation_mode == 'log_scale'): - a = np.log(np.arange(1, n_har + 1))/np.log(2**(1.0/bins_per_octave)) - dilation_list = 
a.round().astype(np.int) - conv_log_dil = MRDConv(out_channel, out_channel, dilation_list) - return nn.Sequential( - conv,nn.ReLU(), - conv_log_dil,nn.ReLU(), - batch_norm, - # pool - ) - elif(dilation_mode == 'fixed_causal'): - dilation_list = np.array([i * dil_kernel_size[1] for i in range(dil_kernel_size[1])]) - causal_conv = FRDConv(out_channel, out_channel, dil_kernel_size, dilation=[1, dilation_rate]) - return nn.Sequential( - conv,nn.ReLU(), - causal_conv,nn.ReLU(), - batch_norm, - # pool - ) - elif(dilation_mode == 'fixed'): - conv_dil = nn.Conv2d(out_channel, out_channel, kernel_size=dil_kernel_size, padding='same', dilation=[1, dilation_rate]) - - return nn.Sequential( - conv,nn.ReLU(), - conv_dil,nn.ReLU(), - batch_norm, - # pool - ) - else: - assert False, "unknown dilation type: " + dilation_mode - - -class HarmoF0(nn.Module): - def __init__(self, - sample_rate=16000, - n_freq=512, - n_har=12, - bins_per_octave=12 * 4, - dilation_modes=['log_scale', 'fixed', 'fixed', 'fixed'], - dilation_rates=[48, 48, 48, 48], - logspecgram_type='logharmgram', - channels=[32, 64, 128, 128], - fmin=27.5, - freq_bins=88 * 4, - dil_kernel_sizes= [[1, 3], [1,3], [1,3], [1,3]], - ): - super().__init__() - self.logspecgram_type = logspecgram_type - - n_fft = n_freq * 2 - self.n_freq = n_freq - self.freq_bins = freq_bins - - self.waveform_to_logspecgram = WaveformToLogSpecgram(sample_rate, n_fft, fmin, bins_per_octave, freq_bins, n_freq, logspecgram_type) #, device - - bins = bins_per_octave - - # [b x 1 x T x 88*8] => [b x 32 x T x 88*4] - self.block_1 = dila_conv_block(1, channels[0], bins, n_har=n_har, dilation_mode=dilation_modes[0], dilation_rate=dilation_rates[0], dil_kernel_size=dil_kernel_sizes[0], kernel_size=[3, 3], padding=[1,1]) - - bins = bins // 2 - # => [b x 64 x T x 88*4] - self.block_2 = dila_conv_block(channels[0], channels[1], bins, 3, dilation_mode=dilation_modes[1], dilation_rate=dilation_rates[1], dil_kernel_size=dil_kernel_sizes[1], kernel_size=[3, 3], padding=[1,1]) - # => [b x 128 x T x 88*4] - self.block_3 = dila_conv_block(channels[1], channels[2], bins, 3, dilation_mode=dilation_modes[2], dilation_rate=dilation_rates[2], dil_kernel_size=dil_kernel_sizes[2], kernel_size=[3, 3], padding=[1,1]) - # => [b x 128 x T x 88*4] - self.block_4 = dila_conv_block(channels[2], channels[3], bins, 3, dilation_mode=dilation_modes[3], dilation_rate=dilation_rates[3], dil_kernel_size=dil_kernel_sizes[3], kernel_size=[3, 3], padding=[1,1]) - - self.conv_5 = nn.Conv2d(channels[3], channels[3]//2, kernel_size=[1,1]) - self.conv_6 = nn.Conv2d(channels[3]//2, 1, kernel_size=[1,1]) - - def forward(self, waveforms): - # input: [b x num_frames x frame_len] - # output: [b x num_frames x 352], [b x num_frames x 352] - - specgram = self.waveform_to_logspecgram(waveforms).float() - # => [b x 1 x num_frames x n_bins] - x = specgram[None, :] - - x = self.block_1(x) - x = self.block_2(x) - x = self.block_3(x) - x = self.block_4(x) - - # [b x 128 x T x 352] => [b x 64 x T x 352] - x = self.conv_5(x) - x = torch.relu(x) - x = self.conv_6(x) - x = torch.sigmoid(x) - - x = torch.squeeze(x, dim=1) - # x = torch.clip(x, 1e-4, 1 - 1e-4) - # => [num_frames x n_bins] - return x, specgram - - diff --git a/harmof0/pitch_tracker.py b/harmof0/pitch_tracker.py deleted file mode 100644 index 5e9b8d7..0000000 --- a/harmof0/pitch_tracker.py +++ /dev/null @@ -1,252 +0,0 @@ -# monophonic pitch estimator using harmonic_net. 
- -# torch -from random import shuffle -import torch -import torch.cuda -import torch.nn.functional as F -import torchaudio -from torch.utils.data import DataLoader - -# system -from tqdm import tqdm -from datetime import datetime -import os -from glob import glob - -import numpy as np -import matplotlib.pyplot as plt - -from .network import HarmoF0 - -# os.environ["CUDA_VISIBLE_DEVICES"] = "0" - -class PitchTracker(): - def __init__(self, - checkpoint_path = None, - fmin = 27.5, - sample_rate = 16000, - hop_length = 160, - frame_len = 1024, - frames_per_step = 1000, - post_processing = True, - high_threshold=0.8, - low_threshold=0.1, - n_beam = 5, - min_pitch_dur = 0.1, - freq_bins_in = 88*4, - freq_bins_out = 88*4, - bins_per_octave_in = 48, - bins_per_octave_out = 48, - device = "cuda" if torch.cuda.is_available() else "cpu", - ) -> None: - - # Load Model - harmonic_f0 = HarmoF0() - - if(checkpoint_path == None): - package_dir = os.path.dirname(os.path.realpath(__file__)) - weights_name = "mdb-stem-synth.pth" - checkpoint_path = os.path.join(package_dir, 'checkpoints' , weights_name) - - # Load checkpoint - if(checkpoint_path): - harmonic_f0.load_state_dict(torch.load(checkpoint_path, map_location=device)) - harmonic_f0 = harmonic_f0.to(device) - self.net = harmonic_f0 - - self.hop_length = hop_length - self.frame_len = frame_len - self.frames_per_step = frames_per_step - # post processing - self.min_pitch_len = min_pitch_dur * sample_rate / hop_length - self.post_processing = post_processing - self.high_threshold = high_threshold - self.low_threshold = low_threshold - self.n_beam = n_beam - - self.device = device - - self.freq_bins_in = freq_bins_in - self.freq_bins_out = freq_bins_out - self.bins_per_octave_in = bins_per_octave_in - self.bins_per_octave_out = bins_per_octave_out - self.fmin = fmin - self.sample_rate = sample_rate - - def visit(self, activation_map, low_map, out_map, t, pitch, visited_set, sub_set, n_beam): - if(t, pitch) in visited_set or low_map[t, pitch] < 1: - return - out_map[t, pitch] = activation_map[t, pitch] - visited_set.add((t, pitch)) - sub_set.add((t, pitch)) - - low = max(0, pitch - n_beam) - high = min(low_map.shape[1], pitch + n_beam) - # visit left - if t > 0: - for p in range(low, high): - self.visit(activation_map, low_map, out_map, t-1, p, visited_set, sub_set, n_beam) - #visit right - if(t < low_map.shape[0] -1): - for p in range(low, high): - self.visit(activation_map, low_map, out_map, t+1, p, visited_set, sub_set, n_beam) - - def postProcessing(self, activation_map, high_threshold=0.8, low_threshold=0.1): - ''' - - Parameters - ------- - activation_map: ndarray [T x 352] - - Returns - ------- - ''' - high_map = activation_map >= high_threshold - low_map = activation_map >= low_threshold - out_map = np.zeros_like(activation_map) - - visited_set = set() - rows, cols = high_map.nonzero() - for t, pitch in zip(rows, cols): - sub_set = set() - self.visit(activation_map, low_map, out_map, t, pitch, visited_set, sub_set, self.n_beam) - # remove the region that has length < self.min_pitch_len - if len(sub_set) > 0: - pit_len = max([x[0] for x in sub_set]) - min([x[0] for x in sub_set]) - if pit_len < self.min_pitch_len: - for t, pitch in sub_set: - out_map[t, pitch] = 0 - return out_map - - - - def pred(self, waveform, sr): - # inputs: - # waveform: - # sr: 16000 - # returns: - # time, freq, activation, activation_map - # [T], [T], [T], [T x 352] - - if isinstance(waveform,np.ndarray): - waveform = torch.tensor(waveform) - if(len(waveform.size()) == 1): 
- waveform = waveform[None, :] - - if(sr != self.sample_rate): - print("convert sr from %d to %d"%(sr, self.sample_rate)) - resampler = torchaudio.transforms.Resample(sr, self.sample_rate).to(self.device) - waveform = waveform.to(self.device) - waveform = resampler(waveform) - - # start from the 0 - waveform = F.pad(waveform, [self.frame_len//2, 0], mode='reflect') - b, wav_len = waveform.shape - assert b == 1 - num_frames = int((wav_len - self.frame_len)//self.hop_length) + 1 - batch = torch.zeros([1, num_frames, self.frame_len]) - for i in range(num_frames): - begin = i * self.hop_length - end = begin + self.frame_len - batch[:, i, :] = waveform[:, begin:end] - batch = batch.to(self.device) - - times = np.arange(num_frames) * (self.hop_length/self.sample_rate) - - result_dict = { - # 'pred_freqs':[], - # 'pred_activations':[], - 'pred_activations_map':[], - } - - - steps = int(np.ceil(num_frames / self.frames_per_step)) - for i in tqdm(range(steps)): - begin = i * self.frames_per_step - end = begin + self.frames_per_step - waveforms = batch[:, begin:end ] - with torch.no_grad(): - # => [b x num_frames x (88*4)], [b x num_frames x (88*4)] - est_onehot, specgram = self.net.eval()(waveforms) - - result_dict['pred_activations_map'] += [est_onehot.squeeze(0).cpu()] - - pred_activation_map = torch.concat(result_dict['pred_activations_map'], dim=0).cpu().numpy() - - if(self.post_processing): - pred_activation_map = self.postProcessing(pred_activation_map, self.high_threshold, self.low_threshold) - - # => [num_frames ] - est_freqs, est_activations = self.onehot_to_hz(torch.tensor(pred_activation_map)[None,:], self.bins_per_octave_out, threshold=0.0) - pred_freq = est_freqs.flatten().cpu().numpy() - pred_activation = est_activations.flatten().cpu().numpy() - - return times, pred_freq, pred_activation, pred_activation_map - - def pred_file(self, audio_path, output_dir=None, save_activation=True): - wav_path_list = [] - if os.path.isdir(audio_path): - all_files = glob(os.path.join(audio_path, "*")) - for path in all_files: - _, ext = os.path.splitext(path) - if ext.lower() in ['.wav', '.mp3', '.flac']: - wav_path_list.append(path) - else: - wav_path_list.append(audio_path) - - for i, wav_path in enumerate(wav_path_list): - - result_dir, basename = os.path.split(wav_path) - if(output_dir != None): - result_dir = str(output_dir) - os.makedirs(result_dir, exist_ok=True) - wav_name, ext = os.path.splitext(basename) - pred_path = os.path.join(result_dir, wav_name + ".f0.txt") - - waveform, sr = torchaudio.load(wav_path) - waveform = torch.sum(waveform, dim=0, keepdim=True) - print(f'audio {i+1} of {len(wav_path_list)}') - - pred_time, pred_freq, activation, activation_map = self.pred(waveform, sr) - - pred_table = np.stack([pred_time, pred_freq, activation], axis=1) - np.savetxt(pred_path, pred_table, header='time frequency activation', fmt="%.03f") - if(save_activation): - if self.post_processing == False: - activation_path = os.path.join(result_dir, wav_name + ".activation.png") - else: - activation_path = os.path.join(result_dir, wav_name + ".activation.post.png") - plt.imsave(activation_path, activation_map.T[::-1]) - # activation_map_post = self.postProcessing(activation_map) - # plt.imsave(activation_post_path, activation_map_post.T[::-1]) - - def hz_to_onehot(self, hz, freq_bins, bins_per_octave): - # input: [b x T] - # output: [b x T x freq_bins] - - fmin = self.fmin - - indexs = ( torch.log((hz+0.0000001)/fmin) / np.log(2.0**(1.0/bins_per_octave)) + 0.5 ).long() - assert(torch.max(indexs) < 
freq_bins) - mask = (indexs >= 0).long() - # => [b x T x 1] - mask = torch.unsqueeze(mask, dim=2) - # => [b x T x freq_bins] - onehot = F.one_hot(torch.clip(indexs, 0), freq_bins) - onehot = onehot * mask # mask the freq below fmin - return onehot - - def onehot_to_hz(self, onehot, bins_per_octave, threshold = 0.6): - # input: [b x T x freq_bins] - # output: [b x T] - fmin = self.fmin - max_onehot = torch.max(onehot, dim=2) - indexs = max_onehot[1] - mask = (max_onehot[0] > threshold).float() - - hz = fmin * (2**(indexs/bins_per_octave)) - hz = hz * mask # set freq to 0 if activate val below threshold - - return hz, max_onehot[0] - diff --git a/hubert/__init__.py b/hubert/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/hubert/hubert_model.py b/hubert/hubert_model.py new file mode 100644 index 0000000..7fb642d --- /dev/null +++ b/hubert/hubert_model.py @@ -0,0 +1,222 @@ +import copy +import random +from typing import Optional, Tuple + +import torch +import torch.nn as nn +import torch.nn.functional as t_func +from torch.nn.modules.utils import consume_prefix_in_state_dict_if_present + + +class Hubert(nn.Module): + def __init__(self, num_label_embeddings: int = 100, mask: bool = True): + super().__init__() + self._mask = mask + self.feature_extractor = FeatureExtractor() + self.feature_projection = FeatureProjection() + self.positional_embedding = PositionalConvEmbedding() + self.norm = nn.LayerNorm(768) + self.dropout = nn.Dropout(0.1) + self.encoder = TransformerEncoder( + nn.TransformerEncoderLayer( + 768, 12, 3072, activation="gelu", batch_first=True + ), + 12, + ) + self.proj = nn.Linear(768, 256) + + self.masked_spec_embed = nn.Parameter(torch.FloatTensor(768).uniform_()) + self.label_embedding = nn.Embedding(num_label_embeddings, 256) + + def mask(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + mask = None + if self.training and self._mask: + mask = _compute_mask((x.size(0), x.size(1)), 0.8, 10, x.device, 2) + x[mask] = self.masked_spec_embed.to(x.dtype) + return x, mask + + def encode( + self, x: torch.Tensor, layer: Optional[int] = None + ) -> Tuple[torch.Tensor, torch.Tensor]: + x = self.feature_extractor(x) + x = self.feature_projection(x.transpose(1, 2)) + x, mask = self.mask(x) + x = x + self.positional_embedding(x) + x = self.dropout(self.norm(x)) + x = self.encoder(x, output_layer=layer) + return x, mask + + def logits(self, x: torch.Tensor) -> torch.Tensor: + logits = torch.cosine_similarity( + x.unsqueeze(2), + self.label_embedding.weight.unsqueeze(0).unsqueeze(0), + dim=-1, + ) + return logits / 0.1 + + def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + x, mask = self.encode(x) + x = self.proj(x) + logits = self.logits(x) + return logits, mask + + +class HubertSoft(Hubert): + def __init__(self): + super().__init__() + + @torch.inference_mode() + def units(self, wav: torch.Tensor) -> torch.Tensor: + wav = t_func.pad(wav, ((400 - 320) // 2, (400 - 320) // 2)) + x, _ = self.encode(wav) + return self.proj(x) + + +class FeatureExtractor(nn.Module): + def __init__(self): + super().__init__() + self.conv0 = nn.Conv1d(1, 512, 10, 5, bias=False) + self.norm0 = nn.GroupNorm(512, 512) + self.conv1 = nn.Conv1d(512, 512, 3, 2, bias=False) + self.conv2 = nn.Conv1d(512, 512, 3, 2, bias=False) + self.conv3 = nn.Conv1d(512, 512, 3, 2, bias=False) + self.conv4 = nn.Conv1d(512, 512, 3, 2, bias=False) + self.conv5 = nn.Conv1d(512, 512, 2, 2, bias=False) + self.conv6 = nn.Conv1d(512, 512, 2, 2, bias=False) + + def forward(self, x: 
torch.Tensor) -> torch.Tensor: + x = t_func.gelu(self.norm0(self.conv0(x))) + x = t_func.gelu(self.conv1(x)) + x = t_func.gelu(self.conv2(x)) + x = t_func.gelu(self.conv3(x)) + x = t_func.gelu(self.conv4(x)) + x = t_func.gelu(self.conv5(x)) + x = t_func.gelu(self.conv6(x)) + return x + + +class FeatureProjection(nn.Module): + def __init__(self): + super().__init__() + self.norm = nn.LayerNorm(512) + self.projection = nn.Linear(512, 768) + self.dropout = nn.Dropout(0.1) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.norm(x) + x = self.projection(x) + x = self.dropout(x) + return x + + +class PositionalConvEmbedding(nn.Module): + def __init__(self): + super().__init__() + self.conv = nn.Conv1d( + 768, + 768, + kernel_size=128, + padding=128 // 2, + groups=16, + ) + self.conv = nn.utils.weight_norm(self.conv, name="weight", dim=2) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.conv(x.transpose(1, 2)) + x = t_func.gelu(x[:, :, :-1]) + return x.transpose(1, 2) + + +class TransformerEncoder(nn.Module): + def __init__( + self, encoder_layer: nn.TransformerEncoderLayer, num_layers: int + ) -> None: + super(TransformerEncoder, self).__init__() + self.layers = nn.ModuleList( + [copy.deepcopy(encoder_layer) for _ in range(num_layers)] + ) + self.num_layers = num_layers + + def forward( + self, + src: torch.Tensor, + mask: torch.Tensor = None, + src_key_padding_mask: torch.Tensor = None, + output_layer: Optional[int] = None, + ) -> torch.Tensor: + output = src + for layer in self.layers[:output_layer]: + output = layer( + output, src_mask=mask, src_key_padding_mask=src_key_padding_mask + ) + return output + + +def _compute_mask( + shape: Tuple[int, int], + mask_prob: float, + mask_length: int, + device: torch.device, + min_masks: int = 0, +) -> torch.Tensor: + batch_size, sequence_length = shape + + if mask_length < 1: + raise ValueError("`mask_length` has to be bigger than 0.") + + if mask_length > sequence_length: + raise ValueError( + f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length} and `sequence_length`: {sequence_length}`" + ) + + # compute number of masked spans in batch + num_masked_spans = int(mask_prob * sequence_length / mask_length + random.random()) + num_masked_spans = max(num_masked_spans, min_masks) + + # make sure num masked indices <= sequence_length + if num_masked_spans * mask_length > sequence_length: + num_masked_spans = sequence_length // mask_length + + # SpecAugment mask to fill + mask = torch.zeros((batch_size, sequence_length), device=device, dtype=torch.bool) + + # uniform distribution to sample from, make sure that offset samples are < sequence_length + uniform_dist = torch.ones( + (batch_size, sequence_length - (mask_length - 1)), device=device + ) + + # get random indices to mask + mask_indices = torch.multinomial(uniform_dist, num_masked_spans) + + # expand masked indices to masked spans + mask_indices = ( + mask_indices.unsqueeze(dim=-1) + .expand((batch_size, num_masked_spans, mask_length)) + .reshape(batch_size, num_masked_spans * mask_length) + ) + offsets = ( + torch.arange(mask_length, device=device)[None, None, :] + .expand((batch_size, num_masked_spans, mask_length)) + .reshape(batch_size, num_masked_spans * mask_length) + ) + mask_idxs = mask_indices + offsets + + # scatter indices to mask + mask = mask.scatter(1, mask_idxs, True) + + return mask + + +def hubert_soft( + path: str, +) -> HubertSoft: + r"""HuBERT-Soft from `"A Comparison of Discrete and Soft Speech Units for 
Improved Voice Conversion"`. + Args: + path (str): path of a pretrained model + """ + hubert = HubertSoft() + checkpoint = torch.load(path) + consume_prefix_in_state_dict_if_present(checkpoint, "module.") + hubert.load_state_dict(checkpoint) + hubert.eval() + return hubert diff --git a/hubert/put_hubert_ckpt_here b/hubert/put_hubert_ckpt_here new file mode 100644 index 0000000..e69de29 diff --git a/logs/48k/put_pretrained_model_here b/logs/48k/put_pretrained_model_here new file mode 100644 index 0000000..e69de29 diff --git a/losses.py b/losses.py index fb22a0e..41f9be6 100644 --- a/losses.py +++ b/losses.py @@ -53,7 +53,7 @@ def kl_loss(z_p, logs_q, m_p, logs_p, z_mask): m_p = m_p.float() logs_p = logs_p.float() z_mask = z_mask.float() - + #print(logs_p) kl = logs_p - logs_q - 0.5 kl += 0.5 * ((z_p - m_p)**2) * torch.exp(-2. * logs_p) kl = torch.sum(kl * z_mask) diff --git a/mel_processing.py b/mel_processing.py index 817f037..99c5b35 100644 --- a/mel_processing.py +++ b/mel_processing.py @@ -64,7 +64,7 @@ def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False) y = y.squeeze(1) spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device], - center=center, pad_mode='reflect', normalized=False, onesided=True) + center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=False) spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) return spec @@ -75,7 +75,7 @@ def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax): dtype_device = str(spec.dtype) + '_' + str(spec.device) fmax_dtype_device = str(fmax) + '_' + dtype_device if fmax_dtype_device not in mel_basis: - mel = librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax) + mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax) mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=spec.dtype, device=spec.device) spec = torch.matmul(mel_basis[fmax_dtype_device], spec) spec = spectral_normalize_torch(spec) @@ -93,7 +93,7 @@ def mel_spectrogram_torch(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmax_dtype_device = str(fmax) + '_' + dtype_device wnsize_dtype_device = str(win_size) + '_' + dtype_device if fmax_dtype_device not in mel_basis: - mel = librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax) + mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax) mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=y.dtype, device=y.device) if wnsize_dtype_device not in hann_window: hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device) @@ -102,7 +102,7 @@ def mel_spectrogram_torch(y, n_fft, num_mels, sampling_rate, hop_size, win_size, y = y.squeeze(1) spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device], - center=center, pad_mode='reflect', normalized=False, onesided=True) + center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=False) spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) diff --git a/models.py b/models.py index 0f165c3..eb67458 100644 --- a/models.py +++ b/models.py @@ -3,342 +3,127 @@ import torch from torch import nn from torch.nn import functional as F -import numpy as np + +import attentions import commons import modules -import attentions -import monotonic_align from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm from 
commons import init_weights, get_padding +from vdecoder.hifigan.models import Generator +from utils import f0_to_coarse - -class StochasticDurationPredictor(nn.Module): - def __init__(self, in_channels, filter_channels, kernel_size, p_dropout, n_flows=4, gin_channels=0): - super().__init__() - filter_channels = in_channels # it needs to be removed from future version. - self.in_channels = in_channels - self.filter_channels = filter_channels - self.kernel_size = kernel_size - self.p_dropout = p_dropout - self.n_flows = n_flows - self.gin_channels = gin_channels - - self.log_flow = modules.Log() - self.flows = nn.ModuleList() - self.flows.append(modules.ElementwiseAffine(2)) - for i in range(n_flows): - self.flows.append(modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3)) - self.flows.append(modules.Flip()) - - self.post_pre = nn.Conv1d(1, filter_channels, 1) - self.post_proj = nn.Conv1d(filter_channels, filter_channels, 1) - self.post_convs = modules.DDSConv(filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout) - self.post_flows = nn.ModuleList() - self.post_flows.append(modules.ElementwiseAffine(2)) - for i in range(4): - self.post_flows.append(modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3)) - self.post_flows.append(modules.Flip()) - - self.pre = nn.Conv1d(in_channels, filter_channels, 1) - self.proj = nn.Conv1d(filter_channels, filter_channels, 1) - self.convs = modules.DDSConv(filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout) - if gin_channels != 0: - self.cond = nn.Conv1d(gin_channels, filter_channels, 1) - - def forward(self, x, x_mask, w=None, g=None, reverse=False, noise_scale=1.0): - x = torch.detach(x) - x = self.pre(x) - if g is not None: - g = torch.detach(g) - x = x + self.cond(g) - x = self.convs(x, x_mask) - x = self.proj(x) * x_mask - - if not reverse: - flows = self.flows - assert w is not None - - logdet_tot_q = 0 - h_w = self.post_pre(w) - h_w = self.post_convs(h_w, x_mask) - h_w = self.post_proj(h_w) * x_mask - e_q = torch.randn(w.size(0), 2, w.size(2)).to(device=x.device, dtype=x.dtype) * x_mask - z_q = e_q - for flow in self.post_flows: - z_q, logdet_q = flow(z_q, x_mask, g=(x + h_w)) - logdet_tot_q += logdet_q - z_u, z1 = torch.split(z_q, [1, 1], 1) - u = torch.sigmoid(z_u) * x_mask - z0 = (w - u) * x_mask - logdet_tot_q += torch.sum((F.logsigmoid(z_u) + F.logsigmoid(-z_u)) * x_mask, [1, 2]) - logq = torch.sum(-0.5 * (math.log(2 * math.pi) + (e_q ** 2)) * x_mask, [1, 2]) - logdet_tot_q - - logdet_tot = 0 - z0, logdet = self.log_flow(z0, x_mask) - logdet_tot += logdet - z = torch.cat([z0, z1], 1) - for flow in flows: - z, logdet = flow(z, x_mask, g=x, reverse=reverse) - logdet_tot = logdet_tot + logdet - nll = torch.sum(0.5 * (math.log(2 * math.pi) + (z ** 2)) * x_mask, [1, 2]) - logdet_tot - return nll + logq # [b] - else: - flows = list(reversed(self.flows)) - flows = flows[:-2] + [flows[-1]] # remove a useless vflow - z = torch.randn(x.size(0), 2, x.size(2)).to(device=x.device, dtype=x.dtype) * noise_scale - for flow in flows: - z = flow(z, x_mask, g=x, reverse=reverse) - z0, z1 = torch.split(z, [1, 1], 1) - logw = z0 - return logw - - -class DurationPredictor(nn.Module): - def __init__(self, in_channels, filter_channels, kernel_size, p_dropout, gin_channels=0): - super().__init__() - - self.in_channels = in_channels - self.filter_channels = filter_channels - self.kernel_size = kernel_size - self.p_dropout = p_dropout - self.gin_channels = gin_channels - - self.drop = nn.Dropout(p_dropout) - self.conv_1 = 
nn.Conv1d(in_channels, filter_channels, kernel_size, padding=kernel_size // 2) - self.norm_1 = modules.LayerNorm(filter_channels) - self.conv_2 = nn.Conv1d(filter_channels, filter_channels, kernel_size, padding=kernel_size // 2) - self.norm_2 = modules.LayerNorm(filter_channels) - self.proj = nn.Conv1d(filter_channels, 1, 1) - - if gin_channels != 0: - self.cond = nn.Conv1d(gin_channels, in_channels, 1) - - def forward(self, x, x_mask, g=None): - x = torch.detach(x) - if g is not None: - g = torch.detach(g) - x = x + self.cond(g) - x = self.conv_1(x * x_mask) - x = torch.relu(x) - x = self.norm_1(x) - x = self.drop(x) - x = self.conv_2(x * x_mask) - x = torch.relu(x) - x = self.norm_2(x) - x = self.drop(x) - x = self.proj(x * x_mask) - return x * x_mask - - -class PitchPredictor(nn.Module): - def __init__(self, - n_vocab, - out_channels, - hidden_channels, - filter_channels, - n_heads, - n_layers, - kernel_size, - p_dropout): - super().__init__() - self.n_vocab = n_vocab # 音素的个数,中文和英文不同 - self.out_channels = out_channels - self.hidden_channels = hidden_channels - self.filter_channels = filter_channels - self.n_heads = n_heads - self.n_layers = n_layers - self.kernel_size = kernel_size - self.p_dropout = p_dropout - - self.pitch_net = attentions.Encoder( - hidden_channels, - filter_channels, - n_heads, - n_layers, - kernel_size, - p_dropout) - self.proj = nn.Conv1d(hidden_channels, 1, 1) - - def forward(self, x, x_mask): - pitch_embedding = self.pitch_net(x * x_mask, x_mask) - pitch_embedding = pitch_embedding * x_mask - pred_pitch = self.proj(pitch_embedding) - return pred_pitch, pitch_embedding +class ResidualCouplingBlock(nn.Module): + def __init__(self, + channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + n_flows=4, + gin_channels=0): + super().__init__() + self.channels = channels + self.hidden_channels = hidden_channels + self.kernel_size = kernel_size + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.n_flows = n_flows + self.gin_channels = gin_channels + + self.flows = nn.ModuleList() + for i in range(n_flows): + self.flows.append(modules.ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels, mean_only=True)) + self.flows.append(modules.Flip()) + + def forward(self, x, x_mask, g=None, reverse=False): + if not reverse: + for flow in self.flows: + x, _ = flow(x, x_mask, g=g, reverse=reverse) + else: + for flow in reversed(self.flows): + x = flow(x, x_mask, g=g, reverse=reverse) + return x + + +class Encoder(nn.Module): + def __init__(self, + in_channels, + out_channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + gin_channels=0): + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.hidden_channels = hidden_channels + self.kernel_size = kernel_size + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.gin_channels = gin_channels + + self.pre = nn.Conv1d(in_channels, hidden_channels, 1) + self.enc = modules.WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels) + self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) + + def forward(self, x, x_lengths, g=None): + # print(x.shape,x_lengths.shape) + x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype) + x = self.pre(x) * x_mask + x = self.enc(x, x_mask, g=g) + stats = self.proj(x) * x_mask + m, logs = torch.split(stats, self.out_channels, dim=1) + z = (m + torch.randn_like(m) * 
torch.exp(logs)) * x_mask + return z, m, logs, x_mask class TextEncoder(nn.Module): - def __init__(self, - n_vocab, - out_channels, - hidden_channels, - filter_channels, - n_heads, - n_layers, - kernel_size, - p_dropout): - super().__init__() - self.n_vocab = n_vocab - self.out_channels = out_channels - self.hidden_channels = hidden_channels - self.filter_channels = filter_channels - self.n_heads = n_heads - self.n_layers = n_layers - self.kernel_size = kernel_size - self.p_dropout = p_dropout - - # self.emb = nn.Embedding(n_vocab, hidden_channels) - # nn.init.normal_(self.emb.weight, 0.0, hidden_channels**-0.5) - self.emb_pitch = nn.Embedding(256, hidden_channels) - nn.init.normal_(self.emb_pitch.weight, 0.0, hidden_channels ** -0.5) - - self.encoder = attentions.Encoder( - hidden_channels, - filter_channels, - n_heads, - n_layers, - kernel_size, - p_dropout) - self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) - - def forward(self, x, x_lengths, pitch): - # x = x.transpose(1,2) - # x = self.emb(x) * math.sqrt(self.hidden_channels) # [b, t, h] - # print(x.shape) - x = x + self.emb_pitch(pitch) - x = torch.transpose(x, 1, -1) # [b, h, t] - x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype) - - x = self.encoder(x * x_mask, x_mask) - stats = self.proj(x) * x_mask - - m, logs = torch.split(stats, self.out_channels, dim=1) - return x, m, logs, x_mask + def __init__(self, + in_channels, + out_channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + gin_channels=0, + filter_channels=None, + n_heads=None, + p_dropout=None): + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.hidden_channels = hidden_channels + self.kernel_size = kernel_size + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.gin_channels = gin_channels + self.pre = nn.Conv1d(in_channels, hidden_channels, 1) + self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) + self.f0_emb = nn.Embedding(256, hidden_channels) + + self.enc_ = attentions.Encoder( + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout) + + def forward(self, x, x_lengths, f0=None): + x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype) + x = self.pre(x) * x_mask + x = x + self.f0_emb(f0).transpose(1,2) + x = self.enc_(x * x_mask, x_mask) + stats = self.proj(x) * x_mask + m, logs = torch.split(stats, self.out_channels, dim=1) + z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask + + return z, m, logs, x_mask -class ResidualCouplingBlock(nn.Module): - def __init__(self, - channels, - hidden_channels, - kernel_size, - dilation_rate, - n_layers, - n_flows=4, - gin_channels=0): - super().__init__() - self.channels = channels - self.hidden_channels = hidden_channels - self.kernel_size = kernel_size - self.dilation_rate = dilation_rate - self.n_layers = n_layers - self.n_flows = n_flows - self.gin_channels = gin_channels - - self.flows = nn.ModuleList() - for i in range(n_flows): - self.flows.append( - modules.ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers, - gin_channels=gin_channels, mean_only=True)) - self.flows.append(modules.Flip()) - - def forward(self, x, x_mask, g=None, reverse=False): - if not reverse: - for flow in self.flows: - x, _ = flow(x, x_mask, g=g, reverse=reverse) - else: - for flow in reversed(self.flows): - x = flow(x, x_mask, g=g, reverse=reverse) - return x - - -class PosteriorEncoder(nn.Module): - def 
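# --- Illustration (not part of the patch): the sampling step shared by Encoder and TextEncoder ---
# Both encoders predict per-frame Gaussian parameters (m, logs) and draw the latent with the
# reparameterisation trick, as in the "z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask"
# lines above; a stand-alone sketch with made-up shapes:
import torch
m    = torch.zeros(1, 192, 100)      # per-frame means, [B, inter_channels, T]
logs = torch.zeros(1, 192, 100)      # per-frame log standard deviations
mask = torch.ones(1, 1, 100)         # 1 for valid frames, 0 for padding
z = (m + torch.randn_like(m) * torch.exp(logs)) * mask    # z ~ N(m, exp(logs)**2), masked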
__init__(self, - in_channels, - out_channels, - hidden_channels, - kernel_size, - dilation_rate, - n_layers, - gin_channels=0): - super().__init__() - self.in_channels = in_channels - self.out_channels = out_channels - self.hidden_channels = hidden_channels - self.kernel_size = kernel_size - self.dilation_rate = dilation_rate - self.n_layers = n_layers - self.gin_channels = gin_channels - - self.pre = nn.Conv1d(in_channels, hidden_channels, 1) - self.enc = modules.WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels) - self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) - - def forward(self, x, x_lengths, g=None): - x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype) - x = self.pre(x) * x_mask - x = self.enc(x, x_mask, g=g) - stats = self.proj(x) * x_mask - m, logs = torch.split(stats, self.out_channels, dim=1) - z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask - return z, m, logs, x_mask - - -class Generator(torch.nn.Module): - def __init__(self, initial_channel, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, - upsample_initial_channel, upsample_kernel_sizes, gin_channels=0): - super(Generator, self).__init__() - self.num_kernels = len(resblock_kernel_sizes) - self.num_upsamples = len(upsample_rates) - self.conv_pre = Conv1d(initial_channel, upsample_initial_channel, 7, 1, padding=3) - resblock = modules.ResBlock1 if resblock == '1' else modules.ResBlock2 - - self.ups = nn.ModuleList() - for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): - self.ups.append(weight_norm( - ConvTranspose1d(upsample_initial_channel // (2 ** i), upsample_initial_channel // (2 ** (i + 1)), - k, u, padding=(k - u) // 2))) - - self.resblocks = nn.ModuleList() - for i in range(len(self.ups)): - ch = upsample_initial_channel // (2 ** (i + 1)) - for j, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)): - self.resblocks.append(resblock(ch, k, d)) - - self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False) - self.ups.apply(init_weights) - - if gin_channels != 0: - self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1) - - def forward(self, x, g=None): - x = self.conv_pre(x) - if g is not None: - x = x + self.cond(g) - - for i in range(self.num_upsamples): - x = F.leaky_relu(x, modules.LRELU_SLOPE) - x = self.ups[i](x) - xs = None - for j in range(self.num_kernels): - if xs is None: - xs = self.resblocks[i * self.num_kernels + j](x) - else: - xs += self.resblocks[i * self.num_kernels + j](x) - x = xs / self.num_kernels - x = F.leaky_relu(x) - x = self.conv_post(x) - x = torch.tanh(x) - - return x - - def remove_weight_norm(self): - print('Removing weight norm...') - for l in self.ups: - remove_weight_norm(l) - for l in self.resblocks: - l.remove_weight_norm() - class DiscriminatorP(torch.nn.Module): def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False): @@ -360,7 +145,7 @@ def forward(self, x): # 1d to 2d b, c, t = x.shape - if t % self.period != 0: # pad first + if t % self.period != 0: # pad first n_pad = self.period - (t % self.period) x = F.pad(x, (0, n_pad), "reflect") t = t + n_pad @@ -408,7 +193,7 @@ def forward(self, x): class MultiPeriodDiscriminator(torch.nn.Module): def __init__(self, use_spectral_norm=False): super(MultiPeriodDiscriminator, self).__init__() - periods = [2, 3, 5, 7, 11] + periods = [2,3,5,7,11] discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)] discs = discs + [DiscriminatorP(i, 
use_spectral_norm=use_spectral_norm) for i in periods] @@ -428,185 +213,138 @@ def forward(self, y, y_hat): fmap_gs.append(fmap_g) return y_d_rs, y_d_gs, fmap_rs, fmap_gs - - -class SynthesizerTrn(nn.Module): - """ - Synthesizer for Training - """ - - def __init__(self, - n_vocab, - spec_channels, - segment_size, - inter_channels, - hidden_channels, - filter_channels, - n_heads, - n_layers, - kernel_size, - p_dropout, - resblock, - resblock_kernel_sizes, - resblock_dilation_sizes, - upsample_rates, - upsample_initial_channel, - upsample_kernel_sizes, - n_speakers=0, - gin_channels=0, - use_sdp=True, - **kwargs): - - super().__init__() - self.n_vocab = n_vocab - self.spec_channels = spec_channels - self.inter_channels = inter_channels - self.hidden_channels = hidden_channels - self.filter_channels = filter_channels - self.n_heads = n_heads - self.n_layers = n_layers - self.kernel_size = kernel_size - self.p_dropout = p_dropout - self.resblock = resblock - self.resblock_kernel_sizes = resblock_kernel_sizes - self.resblock_dilation_sizes = resblock_dilation_sizes - self.upsample_rates = upsample_rates - self.upsample_initial_channel = upsample_initial_channel - self.upsample_kernel_sizes = upsample_kernel_sizes - self.segment_size = segment_size - self.n_speakers = n_speakers - self.gin_channels = gin_channels - - self.use_sdp = use_sdp - - self.enc_p = TextEncoder(n_vocab, - inter_channels, - hidden_channels, - filter_channels, - n_heads, - n_layers, - kernel_size, - p_dropout) - self.dec = Generator(inter_channels, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, - upsample_initial_channel, upsample_kernel_sizes, gin_channels=gin_channels) - self.enc_q = PosteriorEncoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16, - gin_channels=gin_channels) - self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels) - # self.pitch_net = PitchPredictor(n_vocab, inter_channels, hidden_channels, filter_channels, n_heads, n_layers, - # kernel_size, p_dropout) - - if use_sdp: - self.dp = StochasticDurationPredictor(hidden_channels, 192, 3, 0.5, 4, gin_channels=gin_channels) - else: - self.dp = DurationPredictor(hidden_channels, 256, 3, 0.5, gin_channels=gin_channels) - - if n_speakers > 1: - self.emb_g = nn.Embedding(n_speakers, gin_channels) - - def forward(self, x, x_lengths, y, y_lengths, pitch, sid=None): - - x, m_p, logs_p, x_mask = self.enc_p(x, x_lengths, pitch) - - if self.n_speakers > 0: - g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1] - else: - g = None - - z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g) - # print(f"z: {z.shape}") - - z_p = self.flow(z, y_mask, g=g) - # print(f"z_p: {z_p.shape}") - - with torch.no_grad(): - # negative cross-entropy - s_p_sq_r = torch.exp(-2 * logs_p) # [b, d, t] - neg_cent1 = torch.sum(-0.5 * math.log(2 * math.pi) - logs_p, [1], keepdim=True) # [b, 1, t_s] - neg_cent2 = torch.matmul(-0.5 * (z_p ** 2).transpose(1, 2), - s_p_sq_r) # [b, t_t, d] x [b, d, t_s] = [b, t_t, t_s] - neg_cent3 = torch.matmul(z_p.transpose(1, 2), (m_p * s_p_sq_r)) # [b, t_t, d] x [b, d, t_s] = [b, t_t, t_s] - neg_cent4 = torch.sum(-0.5 * (m_p ** 2) * s_p_sq_r, [1], keepdim=True) # [b, 1, t_s] - neg_cent = neg_cent1 + neg_cent2 + neg_cent3 + neg_cent4 - - attn_mask = torch.unsqueeze(x_mask, 2) * torch.unsqueeze(y_mask, -1) - attn = monotonic_align.maximum_path(neg_cent, attn_mask.squeeze(1)).unsqueeze(1).detach() - - w = attn.sum(2) - if self.use_sdp: - l_length = self.dp(x, x_mask, w, g=g) - l_length = 
l_length / torch.sum(x_mask) + + +class SpeakerEncoder(torch.nn.Module): + def __init__(self, mel_n_channels=80, model_num_layers=3, model_hidden_size=256, model_embedding_size=256): + super(SpeakerEncoder, self).__init__() + self.lstm = nn.LSTM(mel_n_channels, model_hidden_size, model_num_layers, batch_first=True) + self.linear = nn.Linear(model_hidden_size, model_embedding_size) + self.relu = nn.ReLU() + + def forward(self, mels): + self.lstm.flatten_parameters() + _, (hidden, _) = self.lstm(mels) + embeds_raw = self.relu(self.linear(hidden[-1])) + return embeds_raw / torch.norm(embeds_raw, dim=1, keepdim=True) + + def compute_partial_slices(self, total_frames, partial_frames, partial_hop): + mel_slices = [] + for i in range(0, total_frames-partial_frames, partial_hop): + mel_range = torch.arange(i, i+partial_frames) + mel_slices.append(mel_range) + + return mel_slices + + def embed_utterance(self, mel, partial_frames=128, partial_hop=64): + mel_len = mel.size(1) + last_mel = mel[:,-partial_frames:] + + if mel_len > partial_frames: + mel_slices = self.compute_partial_slices(mel_len, partial_frames, partial_hop) + mels = list(mel[:,s] for s in mel_slices) + mels.append(last_mel) + mels = torch.stack(tuple(mels), 0).squeeze(1) + + with torch.no_grad(): + partial_embeds = self(mels) + embed = torch.mean(partial_embeds, axis=0).unsqueeze(0) + #embed = embed / torch.linalg.norm(embed, 2) else: - logw_ = torch.log(w + 1e-6) * x_mask - logw = self.dp(x, x_mask, g=g) - l_length = torch.sum((logw - logw_) ** 2, [1, 2]) / torch.sum(x_mask) # for averaging - - # expand prior - # print() - # print(f"attn: {attn.shape}") - # print(f"m_p: {m_p.shape}") - # print(f"logs_p: {logs_p.shape}") - - m_p = torch.matmul(attn.squeeze(1), m_p.transpose(1, 2)).transpose(1, 2) - logs_p = torch.matmul(attn.squeeze(1), logs_p.transpose(1, 2)).transpose(1, 2) - # print(f"m_p: {m_p.shape}") - # print(f"logs_p: {logs_p.shape}") - - z_slice, ids_slice = commons.rand_slice_segments(z, y_lengths, self.segment_size) - # print(f"z_slice: {z_slice.shape}") - - o = self.dec(z_slice, g=g) - return o, l_length, attn, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q) - - def infer(self, x, x_lengths, pitch, sid=None, noise_scale=1, length_scale=1, noise_scale_w=1., max_len=None): - x, m_p, logs_p, x_mask = self.enc_p(x, x_lengths, pitch) - if self.n_speakers > 0: - g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1] - else: - g = None - - if self.use_sdp: - logw = self.dp(x, x_mask, g=g, reverse=True, noise_scale=noise_scale_w) - else: - logw = self.dp(x, x_mask, g=g) - w = torch.exp(logw) * x_mask * length_scale - w_ceil = torch.ceil(w) - - w_ceil = w_ceil * 0 + 2 - # for index in range(w_ceil.shape[2]): - # if index%4 == 0: - # w_ceil[0,0,index] = 1.0 + with torch.no_grad(): + embed = self(last_mel) + + return embed - for i in range(w_ceil.shape[2]): - sep = 1 / 0.14 - if i * sep >= w_ceil.shape[2] * 2: - break - w_ceil[0, 0, int(i * sep / 2)] = 1 - - # print(w_ceil) - y_lengths = torch.clamp_min(torch.sum(w_ceil, [1, 2]), 1).long() - y_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, None), 1).to(x_mask.dtype) - - attn_mask = torch.unsqueeze(x_mask, 2) * torch.unsqueeze(y_mask, -1) - - attn = commons.generate_path(w_ceil, attn_mask) - - m_p = torch.matmul(attn.squeeze(1), m_p.transpose(1, 2)).transpose(1, 2) # [b, t', t], [b, t, d] -> [b, d, t'] - logs_p = torch.matmul(attn.squeeze(1), logs_p.transpose(1, 2)).transpose(1, - 2) # [b, t', t], [b, t, d] -> [b, d, t'] - - z_p = m_p + torch.randn_like(m_p) * 
torch.exp(logs_p) * noise_scale - - z = self.flow(z_p, y_mask, g=g, reverse=True) - o = self.dec((z * y_mask)[:, :, :max_len], g=g) - return o, attn, y_mask, (z, z_p, m_p, logs_p) - - def voice_conversion(self, y, y_lengths, sid_src, sid_tgt): - assert self.n_speakers > 0, "n_speakers have to be larger than 0." - g_src = self.emb_g(sid_src).unsqueeze(-1) - g_tgt = self.emb_g(sid_tgt).unsqueeze(-1) - z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g_src) - z_p = self.flow(z, y_mask, g=g_src) - z_hat = self.flow(z_p, y_mask, g=g_tgt, reverse=True) - o_hat = self.dec(z_hat * y_mask, g=g_tgt) - return o_hat, y_mask, (z, z_p, z_hat) +class SynthesizerTrn(nn.Module): + """ + Synthesizer for Training + """ + + def __init__(self, + spec_channels, + segment_size, + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + gin_channels, + ssl_dim, + **kwargs): + + super().__init__() + self.spec_channels = spec_channels + self.inter_channels = inter_channels + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.resblock = resblock + self.resblock_kernel_sizes = resblock_kernel_sizes + self.resblock_dilation_sizes = resblock_dilation_sizes + self.upsample_rates = upsample_rates + self.upsample_initial_channel = upsample_initial_channel + self.upsample_kernel_sizes = upsample_kernel_sizes + self.segment_size = segment_size + self.gin_channels = gin_channels + self.ssl_dim = ssl_dim + self.emb_g = nn.Embedding(10, gin_channels) + + self.enc_p_ = TextEncoder(ssl_dim, inter_channels, hidden_channels, 5, 1, 16,0, filter_channels, n_heads, p_dropout) + hps = { + "sampling_rate": 48000, + "inter_channels": 192, + "resblock": "1", + "resblock_kernel_sizes": [3, 7, 11], + "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]], + "upsample_rates": [10, 8, 2, 2], + "upsample_initial_channel": 512, + "upsample_kernel_sizes": [16, 16, 4, 4], + "gin_channels": 256, + } + self.dec = Generator(h=hps) + self.enc_q = Encoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16, gin_channels=gin_channels) + self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels) + + def forward(self, c, f0, spec, g=None, mel=None, c_lengths=None, spec_lengths=None): + if c_lengths == None: + c_lengths = (torch.ones(c.size(0)) * c.size(-1)).to(c.device) + if spec_lengths == None: + spec_lengths = (torch.ones(spec.size(0)) * spec.size(-1)).to(spec.device) + + g = self.emb_g(g).transpose(1,2) + + z_ptemp, m_p, logs_p, _ = self.enc_p_(c, c_lengths, f0=f0_to_coarse(f0)) + z, m_q, logs_q, spec_mask = self.enc_q(spec, spec_lengths, g=g) + + z_p = self.flow(z, spec_mask, g=g) + z_slice, pitch_slice, ids_slice = commons.rand_slice_segments_with_pitch(z, f0, spec_lengths, self.segment_size) + + # o = self.dec(z_slice, g=g) + o = self.dec(z_slice, g=g, f0=pitch_slice) + + return o, ids_slice, spec_mask, (z, z_p, m_p, logs_p, m_q, logs_q) + + def infer(self, c, f0, g=None, mel=None, c_lengths=None): + if c_lengths == None: + c_lengths = (torch.ones(c.size(0)) * c.size(-1)).to(c.device) + g = self.emb_g(g).transpose(1,2) + + z_p, m_p, logs_p, c_mask = self.enc_p_(c, c_lengths, f0=f0_to_coarse(f0)) + z = self.flow(z_p, c_mask, g=g, reverse=True) + # o = self.dec(z 
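# --- Illustration (not part of the patch): inputs expected by SynthesizerTrn.infer() in this hunk ---
# A minimal sketch with made-up shapes, assuming a constructed (or checkpoint-loaded) model
# `net_g` and the repo on the import path; it only uses the signature visible in this diff:
# content features c, frame-level F0 in Hz, and a speaker-id tensor g for the emb_g lookup.
import torch
c  = torch.randn(1, 256, 200)        # HuBERT-soft content features, [B, ssl_dim, T]
f0 = torch.full((1, 200), 220.0)     # F0 per frame in Hz (0 where unvoiced), same T as c
g  = torch.LongTensor([[0]])         # speaker id, shape [B, 1] so emb_g(g) is [B, 1, gin_channels]
# audio = net_g.infer(c, f0, g=g)    # -> waveform from the f0-conditioned HiFi-GAN decoder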
* c_mask, g=g) + o = self.dec(z * c_mask, g=g, f0=f0) + + return o diff --git a/modules.py b/modules.py index 9c7fd9c..52ee14e 100644 --- a/modules.py +++ b/modules.py @@ -11,7 +11,6 @@ import commons from commons import init_weights, get_padding -from transforms import piecewise_rational_quadratic_transform LRELU_SLOPE = 0.1 @@ -341,50 +340,3 @@ def forward(self, x, x_mask, g=None, reverse=False): x1 = (x1 - m) * torch.exp(-logs) * x_mask x = torch.cat([x0, x1], 1) return x - - -class ConvFlow(nn.Module): - def __init__(self, in_channels, filter_channels, kernel_size, n_layers, num_bins=10, tail_bound=5.0): - super().__init__() - self.in_channels = in_channels - self.filter_channels = filter_channels - self.kernel_size = kernel_size - self.n_layers = n_layers - self.num_bins = num_bins - self.tail_bound = tail_bound - self.half_channels = in_channels // 2 - - self.pre = nn.Conv1d(self.half_channels, filter_channels, 1) - self.convs = DDSConv(filter_channels, kernel_size, n_layers, p_dropout=0.) - self.proj = nn.Conv1d(filter_channels, self.half_channels * (num_bins * 3 - 1), 1) - self.proj.weight.data.zero_() - self.proj.bias.data.zero_() - - def forward(self, x, x_mask, g=None, reverse=False): - x0, x1 = torch.split(x, [self.half_channels]*2, 1) - h = self.pre(x0) - h = self.convs(h, x_mask, g=g) - h = self.proj(h) * x_mask - - b, c, t = x0.shape - h = h.reshape(b, c, -1, t).permute(0, 1, 3, 2) # [b, cx?, t] -> [b, c, t, ?] - - unnormalized_widths = h[..., :self.num_bins] / math.sqrt(self.filter_channels) - unnormalized_heights = h[..., self.num_bins:2*self.num_bins] / math.sqrt(self.filter_channels) - unnormalized_derivatives = h[..., 2 * self.num_bins:] - - x1, logabsdet = piecewise_rational_quadratic_transform(x1, - unnormalized_widths, - unnormalized_heights, - unnormalized_derivatives, - inverse=reverse, - tails='linear', - tail_bound=self.tail_bound - ) - - x = torch.cat([x0, x1], 1) * x_mask - logdet = torch.sum(logabsdet * x_mask, [1,2]) - if not reverse: - return x, logdet - else: - return x diff --git a/monotonic_align/__init__.py b/monotonic_align/__init__.py deleted file mode 100644 index 3d7009c..0000000 --- a/monotonic_align/__init__.py +++ /dev/null @@ -1,19 +0,0 @@ -import numpy as np -import torch -from .monotonic_align.core import maximum_path_c - - -def maximum_path(neg_cent, mask): - """ Cython optimized version. - neg_cent: [b, t_t, t_s] - mask: [b, t_t, t_s] - """ - device = neg_cent.device - dtype = neg_cent.dtype - neg_cent = neg_cent.data.cpu().numpy().astype(np.float32) - path = np.zeros(neg_cent.shape, dtype=np.int32) - - t_t_max = mask.sum(1)[:, 0].data.cpu().numpy().astype(np.int32) - t_s_max = mask.sum(2)[:, 0].data.cpu().numpy().astype(np.int32) - maximum_path_c(path, neg_cent, t_t_max, t_s_max) - return torch.from_numpy(path).to(device=device, dtype=dtype) diff --git a/monotonic_align/core.pyx b/monotonic_align/core.pyx deleted file mode 100644 index bfaabd4..0000000 --- a/monotonic_align/core.pyx +++ /dev/null @@ -1,42 +0,0 @@ -cimport cython -from cython.parallel import prange - - -@cython.boundscheck(False) -@cython.wraparound(False) -cdef void maximum_path_each(int[:,::1] path, float[:,::1] value, int t_y, int t_x, float max_neg_val=-1e9) nogil: - cdef int x - cdef int y - cdef float v_prev - cdef float v_cur - cdef float tmp - cdef int index = t_x - 1 - - for y in range(t_y): - for x in range(max(0, t_x + y - t_y), min(t_x, y + 1)): - if x == y: - v_cur = max_neg_val - else: - v_cur = value[y-1, x] - if x == 0: - if y == 0: - v_prev = 0. 
- else: - v_prev = max_neg_val - else: - v_prev = value[y-1, x-1] - value[y, x] += max(v_prev, v_cur) - - for y in range(t_y - 1, -1, -1): - path[y, index] = 1 - if index != 0 and (index == y or value[y-1, index] < value[y-1, index-1]): - index = index - 1 - - -@cython.boundscheck(False) -@cython.wraparound(False) -cpdef void maximum_path_c(int[:,:,::1] paths, float[:,:,::1] values, int[::1] t_ys, int[::1] t_xs) nogil: - cdef int b = paths.shape[0] - cdef int i - for i in prange(b, nogil=True): - maximum_path_each(paths[i], values[i], t_ys[i], t_xs[i]) diff --git a/monotonic_align/setup.py b/monotonic_align/setup.py deleted file mode 100644 index 30c2248..0000000 --- a/monotonic_align/setup.py +++ /dev/null @@ -1,9 +0,0 @@ -from distutils.core import setup -from Cython.Build import cythonize -import numpy - -setup( - name = 'monotonic_align', - ext_modules = cythonize("core.pyx"), - include_dirs=[numpy.get_include()] -) diff --git a/preprocess.py b/preprocess.py deleted file mode 100644 index aaedbf0..0000000 --- a/preprocess.py +++ /dev/null @@ -1,25 +0,0 @@ -import argparse -import text -from utils import load_filepaths_and_text - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument("--out_extension", default="cleaned") - parser.add_argument("--text_index", default=1, type=int) - parser.add_argument("--filelists", nargs="+", default=["filelists/ljs_audio_text_val_filelist.txt", "filelists/ljs_audio_text_test_filelist.txt"]) - parser.add_argument("--text_cleaners", nargs="+", default=["english_cleaners2"]) - - args = parser.parse_args() - - - for filelist in args.filelists: - print("START:", filelist) - filepaths_and_text = load_filepaths_and_text(filelist) - for i in range(len(filepaths_and_text)): - original_text = filepaths_and_text[i][args.text_index] - cleaned_text = text._clean_text(original_text, args.text_cleaners) - filepaths_and_text[i][args.text_index] = cleaned_text - - new_filelist = filelist + "." 
+ args.out_extension - with open(new_filelist, "w", encoding="utf-8") as f: - f.writelines(["|".join(x) + "\n" for x in filepaths_and_text]) diff --git a/preprocess_flist.py b/preprocess_flist.py new file mode 100644 index 0000000..47600ab --- /dev/null +++ b/preprocess_flist.py @@ -0,0 +1,49 @@ +import os +import argparse +from tqdm import tqdm +from random import shuffle + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--train_list", type=str, default="./filelists/train.txt", help="path to train list") + parser.add_argument("--val_list", type=str, default="./filelists/val.txt", help="path to val list") + parser.add_argument("--test_list", type=str, default="./filelists/test.txt", help="path to test list") + parser.add_argument("--source_dir", type=str, default="./dataset/48k", help="path to source dir") + args = parser.parse_args() + + train = [] + val = [] + test = [] + idx = 0 + + for speaker in tqdm(os.listdir(args.source_dir)): + wavs = [os.path.join(args.source_dir, speaker, i)for i in os.listdir(os.path.join(args.source_dir, speaker))] + wavs = [i for i in wavs if i.endswith("wav")] + shuffle(wavs) + train += wavs[2:-10] + val += wavs[:2] + test += wavs[-10:] + + shuffle(train) + shuffle(val) + shuffle(test) + + print("Writing", args.train_list) + with open(args.train_list, "w") as f: + for fname in tqdm(train): + wavpath = fname + f.write(wavpath + "\n") + + print("Writing", args.val_list) + with open(args.val_list, "w") as f: + for fname in tqdm(val): + wavpath = fname + f.write(wavpath + "\n") + + print("Writing", args.test_list) + with open(args.test_list, "w") as f: + for fname in tqdm(test): + wavpath = fname + f.write(wavpath + "\n") + \ No newline at end of file diff --git a/preprocess_hubert_f0.py b/preprocess_hubert_f0.py new file mode 100644 index 0000000..9a770d6 --- /dev/null +++ b/preprocess_hubert_f0.py @@ -0,0 +1,129 @@ +import os +import argparse + +import torch +import json +from glob import glob + +from pyworld import pyworld +from tqdm import tqdm +from scipy.io import wavfile + +import utils +from mel_processing import mel_spectrogram_torch +#import h5py +import logging +logging.getLogger('numba').setLevel(logging.WARNING) + +import parselmouth +import librosa +import numpy as np +def stft(y): + return librosa.stft( + y=y, + n_fft=1280, + hop_length=320, + win_length=1280, + ) + +def energy(y): + # Extract energy + S = librosa.magphase(stft(y))[0] + e = np.sqrt(np.sum(S ** 2, axis=0)) # np.linalg.norm(S, axis=0) + return e.squeeze() # (Number of frames) => (654,) + +def get_energy(path, p_len=None): + wav, sr = librosa.load(path, 48000) + e = energy(wav) + if p_len is None: + p_len = wav.shape[0] // 320 + assert e.shape[0] -p_len <2 ,(e.shape[0] ,p_len) + e = e[: p_len] + return e + + + +def get_f0(path,p_len=None, f0_up_key=0): + x, _ = librosa.load(path, 48000) + if p_len is None: + p_len = x.shape[0]//320 + else: + assert abs(p_len-x.shape[0]//320) < 3, (path, p_len, x.shape) + time_step = 320 / 48000 * 1000 + f0_min = 50 + f0_max = 1100 + f0_mel_min = 1127 * np.log(1 + f0_min / 700) + f0_mel_max = 1127 * np.log(1 + f0_max / 700) + + f0 = parselmouth.Sound(x, 48000).to_pitch_ac( + time_step=time_step / 1000, voicing_threshold=0.6, + pitch_floor=f0_min, pitch_ceiling=f0_max).selected_array['frequency'] + + pad_size=(p_len - len(f0) + 1) // 2 + if(pad_size>0 or p_len - len(f0) - pad_size>0): + f0 = np.pad(f0,[[pad_size,p_len - len(f0) - pad_size]], mode='constant') + + f0bak = f0.copy() + f0 *= pow(2, f0_up_key / 12) 
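# --- Illustration (not part of the patch): the coarse-F0 mapping that follows in get_f0() ---
# A stand-alone sketch of the next few lines: F0 in Hz is converted to the mel scale and mapped
# linearly onto integer bins 1..255 (unvoiced frames, f0 == 0, end up in bin 1), using the same
# constants defined above (f0_min = 50 Hz, f0_max = 1100 Hz). Printed values are approximate.
import numpy as np
f0 = np.array([0.0, 110.0, 440.0])                    # an unvoiced frame, A2 and A4
f0_mel = 1127 * np.log(1 + f0 / 700)
f0_mel_min = 1127 * np.log(1 + 50 / 700)
f0_mel_max = 1127 * np.log(1 + 1100 / 700)
f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (f0_mel_max - f0_mel_min) + 1
f0_coarse = np.rint(np.clip(f0_mel, 1, 255)).astype(np.int64)
print(f0_coarse)                                      # approximately [  1  23 122]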
+ f0_mel = 1127 * np.log(1 + f0 / 700) + f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (f0_mel_max - f0_mel_min) + 1 + f0_mel[f0_mel <= 1] = 1 + f0_mel[f0_mel > 255] = 255 + f0_coarse = np.rint(f0_mel).astype(np.int) + return f0_coarse, f0bak + +def resize2d(x, target_len): + source = np.array(x) + source[source<0.001] = np.nan + target = np.interp(np.arange(0, len(source)*target_len, len(source))/ target_len, np.arange(0, len(source)), source) + res = np.nan_to_num(target) + return res + +def compute_f0(path, c_len): + x, sr = librosa.load(path, sr=48000) + f0, t = pyworld.dio( + x.astype(np.double), + fs=sr, + f0_ceil=800, + frame_period=1000 * 320 / sr, + ) + f0 = pyworld.stonemask(x.astype(np.double), f0, t, 48000) + for index, pitch in enumerate(f0): + f0[index] = round(pitch, 1) + assert abs(c_len - x.shape[0]//320) < 3, (c_len, f0.shape) + + return None, resize2d(f0, c_len) + + +def process(filename): + print(filename) + save_name = filename+".soft.pt" + if not os.path.exists(save_name): + devive = torch.device("cuda" if torch.cuda.is_available() else "cpu") + wav, _ = librosa.load(filename, sr=16000) + wav = torch.from_numpy(wav).unsqueeze(0).to(devive) + c = utils.get_hubert_content(hmodel, wav) + torch.save(c.cpu(), save_name) + else: + c = torch.load(save_name) + f0path = filename+".f0.npy" + if not os.path.exists(f0path): + cf0, f0 = compute_f0(filename, c.shape[-1] * 3) + np.save(f0path, f0) + + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--in_dir", type=str, default="dataset/48k", help="path to input dir") + args = parser.parse_args() + + print("Loading hubert for content...") + hmodel = utils.get_hubert_model(0 if torch.cuda.is_available() else None) + print("Loaded hubert.") + + filenames = glob(f'{args.in_dir}/*/*.wav', recursive=True)#[:10] + + for filename in tqdm(filenames): + process(filename) + \ No newline at end of file diff --git a/preprocess_wave.py b/preprocess_wave.py deleted file mode 100644 index a557aa2..0000000 --- a/preprocess_wave.py +++ /dev/null @@ -1,116 +0,0 @@ -import os -import librosa -import pyworld -import utils -import numpy as np -from scipy.io import wavfile - - -class FeatureInput(object): - def __init__(self, samplerate=16000, hop_size=160): - self.fs = samplerate - self.hop = hop_size - - self.f0_bin = 256 - self.f0_max = 1100.0 - self.f0_min = 50.0 - self.f0_mel_min = 1127 * np.log(1 + self.f0_min / 700) - self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700) - - def compute_f0(self, path): - x, sr = librosa.load(path, sr=self.fs) - assert sr == self.fs - f0, t = pyworld.dio( - x.astype(np.double), - fs=sr, - f0_ceil=800, - frame_period=1000 * self.hop / sr, - ) - f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.fs) - for index, pitch in enumerate(f0): - f0[index] = round(pitch, 1) - return f0 - - # for numpy # code from diffsinger - def coarse_f0(self, f0): - f0_mel = 1127 * np.log(1 + f0 / 700) - f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - self.f0_mel_min) * ( - self.f0_bin - 2 - ) / (self.f0_mel_max - self.f0_mel_min) + 1 - - # use 0 or 1 - f0_mel[f0_mel <= 1] = 1 - f0_mel[f0_mel > self.f0_bin - 1] = self.f0_bin - 1 - f0_coarse = np.rint(f0_mel).astype(np.int) - assert f0_coarse.max() <= 255 and f0_coarse.min() >= 1, ( - f0_coarse.max(), - f0_coarse.min(), - ) - return f0_coarse - - # for tensor # code from diffsinger - def coarse_f0_ts(self, f0): - f0_mel = 1127 * (1 + f0 / 700).log() - f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - self.f0_mel_min) * ( - self.f0_bin - 2 
- ) / (self.f0_mel_max - self.f0_mel_min) + 1 - - # use 0 or 1 - f0_mel[f0_mel <= 1] = 1 - f0_mel[f0_mel > self.f0_bin - 1] = self.f0_bin - 1 - f0_coarse = (f0_mel + 0.5).long() - assert f0_coarse.max() <= 255 and f0_coarse.min() >= 1, ( - f0_coarse.max(), - f0_coarse.min(), - ) - return f0_coarse - - def save_wav(self, wav, path): - wav *= 32767 / max(0.01, np.max(np.abs(wav))) * 0.6 - wavfile.write(path, self.fs, wav.astype(np.int16)) - - -if __name__ == "__main__": - wavPath = "./data/waves" - outPath = "./data/label" - if not os.path.exists("./data/label"): - os.mkdir("./data/label") - - # define model and load checkpoint - hps = utils.get_hparams_from_file("./configs/singing_base.json") - featureInput = FeatureInput(hps.data.sampling_rate, hps.data.hop_length) - vits_file = open("./filelists/vc_file.txt", "w", encoding="utf-8") - - for spks in os.listdir(wavPath): - if os.path.isdir(f"./{wavPath}/{spks}"): - os.makedirs(f"./{outPath}/{spks}") - for file in os.listdir(f"./{wavPath}/{spks}"): - if file.endswith(".wav"): - file = file[:-4] - audio_path = f"./{wavPath}/{spks}/{file}.wav" - featur_pit = featureInput.compute_f0(audio_path) - coarse_pit = featureInput.coarse_f0(featur_pit) - np.save( - f"{outPath}/{spks}/{file}_pitch.npy", - coarse_pit, - allow_pickle=False, - ) - np.save( - f"{outPath}/{spks}/{file}_nsff0.npy", - featur_pit, - allow_pickle=False, - ) - - path_audio = f"./data/waves/{spks}/{file}.wav" - path_spkid = f"./data/spkid/{spks}.npy" - path_label = ( - f"./data/phone/{spks}/{file}.npy" # phone means ppg & hubert - ) - path_pitch = f"./data/label/{spks}/{file}_pitch.npy" - path_nsff0 = f"./data/label/{spks}/{file}_nsff0.npy" - print( - f"{path_audio}|{path_spkid}|{path_label}|{path_pitch}|{path_nsff0}", - file=vits_file, - ) - - vits_file.close() diff --git a/raw/wav_structure.txt b/raw/wav_structure.txt new file mode 100644 index 0000000..68cee4e --- /dev/null +++ b/raw/wav_structure.txt @@ -0,0 +1,20 @@ +数据集准备 + +raw +├───speaker0 +│ ├───xxx1-xxx1.wav +│ ├───... +│ └───Lxx-0xx8.wav +└───speaker1 + ├───xx2-0xxx2.wav + ├───... 
+ └───xxx7-xxx007.wav + +此外还需要编辑config.json + +"n_speakers": 10 + +"spk":{ + "speaker0": 0, + "speaker1": 1, +} diff --git a/requirements.txt b/requirements.txt index 0c57847..2ea87af 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,14 +1,8 @@ -Cython -librosa -matplotlib -numpy -phonemizer -scipy -tensorboard -torch -torchvision -Unidecode -torchaudio -pyworld -sacred -tqdm +glob2==0.7 +tqdm==4.62.3 +librosa==0.8.1 +numpy==1.21.6 +scipy==1.7.2 +tensorboard==2.7.0 +torch==1.10.0 +torchvision==0.9.0 \ No newline at end of file diff --git a/samples/sample_out1.wav b/samples/sample_out1.wav deleted file mode 100644 index 54717e1..0000000 Binary files a/samples/sample_out1.wav and /dev/null differ diff --git a/samples/sample_out2+12keys.wav b/samples/sample_out2+12keys.wav deleted file mode 100644 index de13dca..0000000 Binary files a/samples/sample_out2+12keys.wav and /dev/null differ diff --git a/samples/sample_out3-2keys.wav b/samples/sample_out3-2keys.wav deleted file mode 100644 index 907000e..0000000 Binary files a/samples/sample_out3-2keys.wav and /dev/null differ diff --git a/samples/sample_out4.wav b/samples/sample_out4.wav deleted file mode 100644 index a320664..0000000 Binary files a/samples/sample_out4.wav and /dev/null differ diff --git a/samples/sample_out5.wav b/samples/sample_out5.wav deleted file mode 100644 index 9fdfa0b..0000000 Binary files a/samples/sample_out5.wav and /dev/null differ diff --git a/samples/sample_out6_male_target.wav b/samples/sample_out6_male_target.wav deleted file mode 100644 index 1e65aca..0000000 Binary files a/samples/sample_out6_male_target.wav and /dev/null differ diff --git a/samples/sample_out7.wav b/samples/sample_out7.wav deleted file mode 100644 index 9ca2b2e..0000000 Binary files a/samples/sample_out7.wav and /dev/null differ diff --git a/samples/sample_src1.wav b/samples/sample_src1.wav deleted file mode 100644 index d17ff15..0000000 Binary files a/samples/sample_src1.wav and /dev/null differ diff --git a/samples/sample_src2.wav b/samples/sample_src2.wav deleted file mode 100644 index 00f3019..0000000 Binary files a/samples/sample_src2.wav and /dev/null differ diff --git a/samples/sample_src3.wav b/samples/sample_src3.wav deleted file mode 100644 index 8cc697c..0000000 Binary files a/samples/sample_src3.wav and /dev/null differ diff --git a/samples/sample_src4.wav b/samples/sample_src4.wav deleted file mode 100644 index 9a8ed13..0000000 Binary files a/samples/sample_src4.wav and /dev/null differ diff --git a/samples/sample_src5.wav b/samples/sample_src5.wav deleted file mode 100644 index a9a6a5c..0000000 Binary files a/samples/sample_src5.wav and /dev/null differ diff --git a/samples/sample_src6.wav b/samples/sample_src6.wav deleted file mode 100644 index 8cc697c..0000000 Binary files a/samples/sample_src6.wav and /dev/null differ diff --git a/samples/sample_src7.wav b/samples/sample_src7.wav deleted file mode 100644 index 288e9c0..0000000 Binary files a/samples/sample_src7.wav and /dev/null differ diff --git a/text/LICENSE b/text/LICENSE deleted file mode 100644 index 4ad4ed1..0000000 --- a/text/LICENSE +++ /dev/null @@ -1,19 +0,0 @@ -Copyright (c) 2017 Keith Ito - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the 
Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. diff --git a/text/__init__.py b/text/__init__.py deleted file mode 100644 index 4ac41f9..0000000 --- a/text/__init__.py +++ /dev/null @@ -1,54 +0,0 @@ -""" from https://github.com/keithito/tacotron """ -from text import cleaners -from text.symbols import symbols - - -# Mappings from symbol to numeric ID and vice versa: -_symbol_to_id = {s: i for i, s in enumerate(symbols)} -_id_to_symbol = {i: s for i, s in enumerate(symbols)} - - -def text_to_sequence(text, cleaner_names): - '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text. - Args: - text: string to convert to a sequence - cleaner_names: names of the cleaner functions to run the text through - Returns: - List of integers corresponding to the symbols in the text - ''' - sequence = [] - - clean_text = _clean_text(text, cleaner_names) - for symbol in clean_text: - symbol_id = _symbol_to_id[symbol] - sequence += [symbol_id] - return sequence - - -def cleaned_text_to_sequence(cleaned_text): - '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text. - Args: - text: string to convert to a sequence - Returns: - List of integers corresponding to the symbols in the text - ''' - sequence = [_symbol_to_id[symbol] for symbol in cleaned_text] - return sequence - - -def sequence_to_text(sequence): - '''Converts a sequence of IDs back to a string''' - result = '' - for symbol_id in sequence: - s = _id_to_symbol[symbol_id] - result += s - return result - - -def _clean_text(text, cleaner_names): - for name in cleaner_names: - cleaner = getattr(cleaners, name) - if not cleaner: - raise Exception('Unknown cleaner: %s' % name) - text = cleaner(text) - return text diff --git a/text/cleaners.py b/text/cleaners.py deleted file mode 100644 index 2658f66..0000000 --- a/text/cleaners.py +++ /dev/null @@ -1,100 +0,0 @@ -""" from https://github.com/keithito/tacotron """ - -''' -Cleaners are transformations that run over the input text at both training and eval time. - -Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners" -hyperparameter. Some cleaners are English-specific. You'll typically want to use: - 1. "english_cleaners" for English text - 2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using - the Unidecode library (https://pypi.python.org/pypi/Unidecode) - 3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update - the symbols in symbols.py to match your data). -''' - -import re -from unidecode import unidecode -from phonemizer import phonemize - - -# Regular expression matching whitespace: -_whitespace_re = re.compile(r'\s+') - -# List of (regular expression, replacement) pairs for abbreviations: -_abbreviations = [(re.compile('\\b%s\\.' 
% x[0], re.IGNORECASE), x[1]) for x in [ - ('mrs', 'misess'), - ('mr', 'mister'), - ('dr', 'doctor'), - ('st', 'saint'), - ('co', 'company'), - ('jr', 'junior'), - ('maj', 'major'), - ('gen', 'general'), - ('drs', 'doctors'), - ('rev', 'reverend'), - ('lt', 'lieutenant'), - ('hon', 'honorable'), - ('sgt', 'sergeant'), - ('capt', 'captain'), - ('esq', 'esquire'), - ('ltd', 'limited'), - ('col', 'colonel'), - ('ft', 'fort'), -]] - - -def expand_abbreviations(text): - for regex, replacement in _abbreviations: - text = re.sub(regex, replacement, text) - return text - - -def expand_numbers(text): - return normalize_numbers(text) - - -def lowercase(text): - return text.lower() - - -def collapse_whitespace(text): - return re.sub(_whitespace_re, ' ', text) - - -def convert_to_ascii(text): - return unidecode(text) - - -def basic_cleaners(text): - '''Basic pipeline that lowercases and collapses whitespace without transliteration.''' - text = lowercase(text) - text = collapse_whitespace(text) - return text - - -def transliteration_cleaners(text): - '''Pipeline for non-English text that transliterates to ASCII.''' - text = convert_to_ascii(text) - text = lowercase(text) - text = collapse_whitespace(text) - return text - - -def english_cleaners(text): - '''Pipeline for English text, including abbreviation expansion.''' - text = convert_to_ascii(text) - text = lowercase(text) - text = expand_abbreviations(text) - phonemes = phonemize(text, language='en-us', backend='espeak', strip=True) - phonemes = collapse_whitespace(phonemes) - return phonemes - - -def english_cleaners2(text): - '''Pipeline for English text, including abbreviation expansion. + punctuation + stress''' - text = convert_to_ascii(text) - text = lowercase(text) - text = expand_abbreviations(text) - phonemes = phonemize(text, language='en-us', backend='espeak', strip=True, preserve_punctuation=True, with_stress=True) - phonemes = collapse_whitespace(phonemes) - return phonemes diff --git a/text/symbols.py b/text/symbols.py deleted file mode 100644 index 869a53e..0000000 --- a/text/symbols.py +++ /dev/null @@ -1,16 +0,0 @@ -""" from https://github.com/keithito/tacotron """ - -''' -Defines the set of symbols used in text input to the model. 
-''' -_pad = '_' -_punctuation = ';:,.!?¡¿—…"«»“” ' -_letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz' -_letters_ipa = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ" - - -# Export all symbols: -symbols = [_pad] + list(_punctuation) + list(_letters) + list(_letters_ipa) - -# Special symbol ids -SPACE_ID = symbols.index(" ") diff --git a/train.py b/train.py index 40cc7be..410a75c 100644 --- a/train.py +++ b/train.py @@ -1,3 +1,5 @@ +import logging +logging.getLogger('matplotlib').setLevel(logging.WARNING) import os import json import argparse @@ -13,283 +15,272 @@ from torch.nn.parallel import DistributedDataParallel as DDP from torch.cuda.amp import autocast, GradScaler -import librosa -import logging - -logging.getLogger('numba').setLevel(logging.WARNING) - import commons import utils -from data_utils import ( - TextAudioLoader, - TextAudioCollate, - DistributedBucketSampler -) +from data_utils import TextAudioSpeakerLoader, EvalDataLoader from models import ( - SynthesizerTrn, - MultiPeriodDiscriminator, + SynthesizerTrn, + MultiPeriodDiscriminator, ) from losses import ( - generator_loss, - discriminator_loss, - feature_loss, - kl_loss + kl_loss, + generator_loss, discriminator_loss, feature_loss ) -from mel_processing import mel_spectrogram_torch, spec_to_mel_torch -from text.symbols import symbols +from mel_processing import mel_spectrogram_torch, spec_to_mel_torch torch.backends.cudnn.benchmark = True global_step = 0 +# os.environ['TORCH_DISTRIBUTED_DEBUG'] = 'INFO' + + def main(): - """Assume Single Node Multi GPUs Training Only""" - assert torch.cuda.is_available(), "CPU training is not allowed." + """Assume Single Node Multi GPUs Training Only""" + assert torch.cuda.is_available(), "CPU training is not allowed." 
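# --- Annotation (not part of the patch): how training is launched in main() ---
# main() asserts that CUDA is available, reads the hyper-parameters with utils.get_hparams(),
# and spawns one run(rank, n_gpus, hps) process per visible GPU via torch.multiprocessing.spawn;
# the processes rendezvous through init_method='env://' using MASTER_ADDR=localhost and the
# MASTER_PORT taken from hps.train.port, and rank 0 additionally owns logging, the TensorBoard
# writers and evaluation.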
+ hps = utils.get_hparams() - n_gpus = torch.cuda.device_count() - os.environ['MASTER_ADDR'] = 'localhost' - os.environ['MASTER_PORT'] = '25565' + n_gpus = torch.cuda.device_count() + os.environ['MASTER_ADDR'] = 'localhost' + os.environ['MASTER_PORT'] = hps.train.port - hps = utils.get_hparams() - mp.spawn(run, nprocs=n_gpus, args=(n_gpus, hps,)) + mp.spawn(run, nprocs=n_gpus, args=(n_gpus, hps,)) def run(rank, n_gpus, hps): - global global_step - if rank == 0: - logger = utils.get_logger(hps.model_dir) - logger.info(hps) - utils.check_git_hash(hps.model_dir) - writer = SummaryWriter(log_dir=hps.model_dir) - writer_eval = SummaryWriter(log_dir=os.path.join(hps.model_dir, "eval")) - - dist.init_process_group(backend='nccl', init_method='env://', world_size=n_gpus, rank=rank) - torch.manual_seed(hps.train.seed) - torch.cuda.set_device(rank) - - train_dataset = TextAudioLoader(hps.data.training_files, hps.data) - train_sampler = DistributedBucketSampler( - train_dataset, - hps.train.batch_size, - [32,300,400,500,600,700,800,900,1000], - num_replicas=n_gpus, - rank=rank, - shuffle=True) - collate_fn = TextAudioCollate() - train_loader = DataLoader(train_dataset, num_workers=8, shuffle=False, pin_memory=True, - collate_fn=collate_fn, batch_sampler=train_sampler) - if rank == 0: - eval_dataset = TextAudioLoader(hps.data.validation_files, hps.data) - eval_loader = DataLoader(eval_dataset, num_workers=8, shuffle=False, - batch_size=hps.train.batch_size, pin_memory=True, - drop_last=False, collate_fn=collate_fn) - - net_g = SynthesizerTrn( - len(symbols), - hps.data.filter_length // 2 + 1, - hps.train.segment_size // hps.data.hop_length, - **hps.model).cuda(rank) - net_d = MultiPeriodDiscriminator(hps.model.use_spectral_norm).cuda(rank) - optim_g = torch.optim.AdamW( - net_g.parameters(), - hps.train.learning_rate, - betas=hps.train.betas, - eps=hps.train.eps) - optim_d = torch.optim.AdamW( - net_d.parameters(), - hps.train.learning_rate, - betas=hps.train.betas, - eps=hps.train.eps) - net_g = DDP(net_g, device_ids=[rank]) - net_d = DDP(net_d, device_ids=[rank]) - - try: - _, _, _, epoch_str = utils.load_checkpoint(utils.latest_checkpoint_path(hps.model_dir, "G_*.pth"), net_g, optim_g) - _, _, _, epoch_str = utils.load_checkpoint(utils.latest_checkpoint_path(hps.model_dir, "D_*.pth"), net_d, optim_d) - global_step = (epoch_str - 1) * len(train_loader) - except: - epoch_str = 1 - global_step = 0 - - scheduler_g = torch.optim.lr_scheduler.ExponentialLR(optim_g, gamma=hps.train.lr_decay, last_epoch=epoch_str-2) - scheduler_d = torch.optim.lr_scheduler.ExponentialLR(optim_d, gamma=hps.train.lr_decay, last_epoch=epoch_str-2) - - scaler = GradScaler(enabled=hps.train.fp16_run) - - for epoch in range(epoch_str, hps.train.epochs + 1): - if rank==0: - train_and_evaluate(rank, epoch, hps, [net_g, net_d], [optim_g, optim_d], [scheduler_g, scheduler_d], scaler, [train_loader, eval_loader], logger, [writer, writer_eval]) - else: - train_and_evaluate(rank, epoch, hps, [net_g, net_d], [optim_g, optim_d], [scheduler_g, scheduler_d], scaler, [train_loader, None], None, None) - scheduler_g.step() - scheduler_d.step() + global global_step + if rank == 0: + logger = utils.get_logger(hps.model_dir) + logger.info(hps) + utils.check_git_hash(hps.model_dir) + writer = SummaryWriter(log_dir=hps.model_dir) + writer_eval = SummaryWriter(log_dir=os.path.join(hps.model_dir, "eval")) + + dist.init_process_group(backend='nccl', init_method='env://', world_size=n_gpus, rank=rank) + torch.manual_seed(hps.train.seed) + 
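# --- Annotation (not part of the patch): data flow in run() / train_and_evaluate() below ---
# TextAudioSpeakerLoader replaces the original text/audio loader: every item bundles the cached
# HuBERT content features (c), the frame-level F0 (f0), the linear spectrogram (spec), the raw
# waveform (y) and a speaker id (spk), as unpacked in the training loop further down. The
# DistributedBucketSampler and collate function of the original recipe are gone, and a plain
# DataLoader with a fixed batch_size is used instead.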
torch.cuda.set_device(rank) + + train_dataset = TextAudioSpeakerLoader(hps.data.training_files, hps) + train_loader = DataLoader(train_dataset, num_workers=8, shuffle=False, pin_memory=True, + batch_size=hps.train.batch_size) + if rank == 0: + eval_dataset = EvalDataLoader(hps.data.validation_files, hps) + eval_loader = DataLoader(eval_dataset, num_workers=1, shuffle=False, + batch_size=1, pin_memory=False, + drop_last=False) + + net_g = SynthesizerTrn( + hps.data.filter_length // 2 + 1, + hps.train.segment_size // hps.data.hop_length, + **hps.model).cuda(rank) + net_d = MultiPeriodDiscriminator(hps.model.use_spectral_norm).cuda(rank) + optim_g = torch.optim.AdamW( + net_g.parameters(), + hps.train.learning_rate, + betas=hps.train.betas, + eps=hps.train.eps) + optim_d = torch.optim.AdamW( + net_d.parameters(), + hps.train.learning_rate, + betas=hps.train.betas, + eps=hps.train.eps) + net_g = DDP(net_g, device_ids=[rank]) # , find_unused_parameters=True) + net_d = DDP(net_d, device_ids=[rank]) + + try: + _, _, _, epoch_str = utils.load_checkpoint(utils.latest_checkpoint_path(hps.model_dir, "G_*.pth"), net_g, + optim_g) + _, _, _, epoch_str = utils.load_checkpoint(utils.latest_checkpoint_path(hps.model_dir, "D_*.pth"), net_d, + optim_d) + global_step = (epoch_str - 1) * len(train_loader) + except: + epoch_str = 1 + global_step = 0 + + scheduler_g = torch.optim.lr_scheduler.ExponentialLR(optim_g, gamma=hps.train.lr_decay, last_epoch=epoch_str - 2) + scheduler_d = torch.optim.lr_scheduler.ExponentialLR(optim_d, gamma=hps.train.lr_decay, last_epoch=epoch_str - 2) + + scaler = GradScaler(enabled=hps.train.fp16_run) + + for epoch in range(epoch_str, hps.train.epochs + 1): + if rank == 0: + train_and_evaluate(rank, epoch, hps, [net_g, net_d], [optim_g, optim_d], [scheduler_g, scheduler_d], scaler, + [train_loader, eval_loader], logger, [writer, writer_eval]) + else: + train_and_evaluate(rank, epoch, hps, [net_g, net_d], [optim_g, optim_d], [scheduler_g, scheduler_d], scaler, + [train_loader, None], None, None) + scheduler_g.step() + scheduler_d.step() def train_and_evaluate(rank, epoch, hps, nets, optims, schedulers, scaler, loaders, logger, writers): - net_g, net_d = nets - optim_g, optim_d = optims - scheduler_g, scheduler_d = schedulers - train_loader, eval_loader = loaders - if writers is not None: - writer, writer_eval = writers - - train_loader.batch_sampler.set_epoch(epoch) - global global_step - - net_g.train() - net_d.train() - for batch_idx, (x, x_lengths, spec, spec_lengths, y, y_lengths, pitch) in enumerate(train_loader): - x, x_lengths = x.cuda(rank, non_blocking=True), x_lengths.cuda(rank, non_blocking=True) - spec, spec_lengths = spec.cuda(rank, non_blocking=True), spec_lengths.cuda(rank, non_blocking=True) - y, y_lengths = y.cuda(rank, non_blocking=True), y_lengths.cuda(rank, non_blocking=True) - pitch = pitch.cuda(rank, non_blocking=True) - with autocast(enabled=hps.train.fp16_run): - y_hat, l_length, attn, ids_slice, x_mask, z_mask,\ - (z, z_p, m_p, logs_p, m_q, logs_q) = net_g(x, x_lengths, spec, spec_lengths, pitch) - - mel = spec_to_mel_torch( - spec, - hps.data.filter_length, - hps.data.n_mel_channels, - hps.data.sampling_rate, - hps.data.mel_fmin, - hps.data.mel_fmax) - y_mel = commons.slice_segments(mel, ids_slice, hps.train.segment_size // hps.data.hop_length) - y_hat_mel = mel_spectrogram_torch( - y_hat.squeeze(1), - hps.data.filter_length, - hps.data.n_mel_channels, - hps.data.sampling_rate, - hps.data.hop_length, - hps.data.win_length, - hps.data.mel_fmin, - 
hps.data.mel_fmax - ) - - y = commons.slice_segments(y, ids_slice * hps.data.hop_length, hps.train.segment_size) # slice - - # Discriminator - y_d_hat_r, y_d_hat_g, _, _ = net_d(y, y_hat.detach()) - with autocast(enabled=False): - loss_disc, losses_disc_r, losses_disc_g = discriminator_loss(y_d_hat_r, y_d_hat_g) - loss_disc_all = loss_disc - optim_d.zero_grad() - scaler.scale(loss_disc_all).backward() - scaler.unscale_(optim_d) - grad_norm_d = commons.clip_grad_value_(net_d.parameters(), None) - scaler.step(optim_d) - - with autocast(enabled=hps.train.fp16_run): - # Generator - y_d_hat_r, y_d_hat_g, fmap_r, fmap_g = net_d(y, y_hat) - with autocast(enabled=False): - loss_dur = torch.sum(l_length.float()) - loss_mel = F.l1_loss(y_mel, y_hat_mel) * hps.train.c_mel - loss_kl = kl_loss(z_p, logs_q, m_p, logs_p, z_mask) * hps.train.c_kl - - loss_fm = feature_loss(fmap_r, fmap_g) - loss_gen, losses_gen = generator_loss(y_d_hat_g) - loss_gen_all = loss_gen + loss_fm + loss_mel + loss_dur + loss_kl - optim_g.zero_grad() - scaler.scale(loss_gen_all).backward() - scaler.unscale_(optim_g) - grad_norm_g = commons.clip_grad_value_(net_g.parameters(), None) - scaler.step(optim_g) - scaler.update() - - if rank==0: - if global_step % hps.train.log_interval == 0: - lr = optim_g.param_groups[0]['lr'] - losses = [loss_disc, loss_gen, loss_fm, loss_mel, loss_dur, loss_kl] - logger.info('Train Epoch: {} [{:.0f}%]'.format( - epoch, - 100. * batch_idx / len(train_loader))) - logger.info([x.item() for x in losses] + [global_step, lr]) - - scalar_dict = {"loss/g/total": loss_gen_all, "loss/d/total": loss_disc_all, "learning_rate": lr, "grad_norm_d": grad_norm_d, "grad_norm_g": grad_norm_g} - scalar_dict.update({"loss/g/fm": loss_fm, "loss/g/mel": loss_mel, "loss/g/dur": loss_dur, "loss/g/kl": loss_kl}) - - scalar_dict.update({"loss/g/{}".format(i): v for i, v in enumerate(losses_gen)}) - scalar_dict.update({"loss/d_r/{}".format(i): v for i, v in enumerate(losses_disc_r)}) - scalar_dict.update({"loss/d_g/{}".format(i): v for i, v in enumerate(losses_disc_g)}) - image_dict = { - "slice/mel_org": utils.plot_spectrogram_to_numpy(y_mel[0].data.cpu().numpy()), - "slice/mel_gen": utils.plot_spectrogram_to_numpy(y_hat_mel[0].data.cpu().numpy()), - "all/mel": utils.plot_spectrogram_to_numpy(mel[0].data.cpu().numpy()), - "all/attn": utils.plot_alignment_to_numpy(attn[0,0].data.cpu().numpy()) - } - utils.summarize( - writer=writer, - global_step=global_step, - images=image_dict, - scalars=scalar_dict) - - if global_step % hps.train.eval_interval == 0: - evaluate(hps, net_g, eval_loader, writer_eval) - utils.save_checkpoint(net_g, optim_g, hps.train.learning_rate, epoch, os.path.join(hps.model_dir, "G_{}.pth".format(global_step))) - utils.save_checkpoint(net_d, optim_d, hps.train.learning_rate, epoch, os.path.join(hps.model_dir, "D_{}.pth".format(global_step))) - global_step += 1 - - if rank == 0: - logger.info('====> Epoch: {}'.format(epoch)) - - + net_g, net_d = nets + optim_g, optim_d = optims + scheduler_g, scheduler_d = schedulers + train_loader, eval_loader = loaders + if writers is not None: + writer, writer_eval = writers + + # train_loader.batch_sampler.set_epoch(epoch) + global global_step + + net_g.train() + net_d.train() + for batch_idx, items in enumerate(train_loader): + c, f0, spec, y, spk = items + g = spk.cuda(rank, non_blocking=True) + spec, y = spec.cuda(rank, non_blocking=True), y.cuda(rank, non_blocking=True) + c = c.cuda(rank, non_blocking=True) + f0 = f0.cuda(rank, non_blocking=True) + mel = 
spec_to_mel_torch( + spec, + hps.data.filter_length, + hps.data.n_mel_channels, + hps.data.sampling_rate, + hps.data.mel_fmin, + hps.data.mel_fmax) + + with autocast(enabled=hps.train.fp16_run): + y_hat, ids_slice, z_mask, \ + (z, z_p, m_p, logs_p, m_q, logs_q) = net_g(c, f0, spec, g=g, mel=mel) + + y_mel = commons.slice_segments(mel, ids_slice, hps.train.segment_size // hps.data.hop_length) + y_hat_mel = mel_spectrogram_torch( + y_hat.squeeze(1), + hps.data.filter_length, + hps.data.n_mel_channels, + hps.data.sampling_rate, + hps.data.hop_length, + hps.data.win_length, + hps.data.mel_fmin, + hps.data.mel_fmax + ) + y = commons.slice_segments(y, ids_slice * hps.data.hop_length, hps.train.segment_size) # slice + + # Discriminator + y_d_hat_r, y_d_hat_g, _, _ = net_d(y, y_hat.detach()) + + with autocast(enabled=False): + loss_disc, losses_disc_r, losses_disc_g = discriminator_loss(y_d_hat_r, y_d_hat_g) + loss_disc_all = loss_disc + + optim_d.zero_grad() + scaler.scale(loss_disc_all).backward() + scaler.unscale_(optim_d) + grad_norm_d = commons.clip_grad_value_(net_d.parameters(), None) + scaler.step(optim_d) + + with autocast(enabled=hps.train.fp16_run): + # Generator + y_d_hat_r, y_d_hat_g, fmap_r, fmap_g = net_d(y, y_hat) + with autocast(enabled=False): + loss_mel = F.l1_loss(y_mel, y_hat_mel) * hps.train.c_mel + loss_kl = kl_loss(z_p, logs_q, m_p, logs_p, z_mask) * hps.train.c_kl + loss_fm = feature_loss(fmap_r, fmap_g) + loss_gen, losses_gen = generator_loss(y_d_hat_g) + loss_gen_all = loss_gen + loss_fm + loss_mel + loss_kl + optim_g.zero_grad() + scaler.scale(loss_gen_all).backward() + scaler.unscale_(optim_g) + grad_norm_g = commons.clip_grad_value_(net_g.parameters(), None) + scaler.step(optim_g) + scaler.update() + + if rank == 0: + if global_step % hps.train.log_interval == 0: + lr = optim_g.param_groups[0]['lr'] + losses = [loss_disc, loss_gen, loss_fm, loss_mel, loss_kl] + logger.info('Train Epoch: {} [{:.0f}%]'.format( + epoch, + 100. 
* batch_idx / len(train_loader))) + logger.info([x.item() for x in losses] + [global_step, lr]) + + scalar_dict = {"loss/g/total": loss_gen_all, "loss/d/total": loss_disc_all, "learning_rate": lr, + "grad_norm_d": grad_norm_d, "grad_norm_g": grad_norm_g} + scalar_dict.update({"loss/g/fm": loss_fm, "loss/g/mel": loss_mel, "loss/g/kl": loss_kl}) + + scalar_dict.update({"loss/g/{}".format(i): v for i, v in enumerate(losses_gen)}) + scalar_dict.update({"loss/d_r/{}".format(i): v for i, v in enumerate(losses_disc_r)}) + scalar_dict.update({"loss/d_g/{}".format(i): v for i, v in enumerate(losses_disc_g)}) + image_dict = { + "slice/mel_org": utils.plot_spectrogram_to_numpy(y_mel[0].data.cpu().numpy()), + "slice/mel_gen": utils.plot_spectrogram_to_numpy(y_hat_mel[0].data.cpu().numpy()), + "all/mel": utils.plot_spectrogram_to_numpy(mel[0].data.cpu().numpy()), + } + audio_dict={ + f"train/gen": y_hat[0], + f"train/gt": y[0], + } + utils.summarize( + writer=writer, + global_step=global_step, + images=image_dict, + scalars=scalar_dict, + audios=audio_dict, + audio_sampling_rate = hps.data.sampling_rate + ) + + if global_step % hps.train.eval_interval == 0: + evaluate(hps, net_g, eval_loader, writer_eval) + utils.save_checkpoint(net_g, optim_g, hps.train.learning_rate, epoch, + os.path.join(hps.model_dir, "G_{}.pth".format(global_step))) + utils.save_checkpoint(net_d, optim_d, hps.train.learning_rate, epoch, + os.path.join(hps.model_dir, "D_{}.pth".format(global_step))) + global_step += 1 + + if rank == 0: + logger.info('====> Epoch: {}'.format(epoch)) + + def evaluate(hps, generator, eval_loader, writer_eval): generator.eval() + image_dict = {} + audio_dict = {} with torch.no_grad(): - for batch_idx, (x, x_lengths, spec, spec_lengths, y, y_lengths, pitch) in enumerate(eval_loader): - x, x_lengths = x.cuda(0), x_lengths.cuda(0) - spec, spec_lengths = spec.cuda(0), spec_lengths.cuda(0) - y, y_lengths = y.cuda(0), y_lengths.cuda(0) - pitch = pitch.cuda(0) - # remove else - x = x[:1] - x_lengths = x_lengths[:1] - spec = spec[:1] - spec_lengths = spec_lengths[:1] - y = y[:1] - y_lengths = y_lengths[:1] - break - y_hat, attn, mask, *_ = generator.module.infer(x, x_lengths, pitch, max_len=1000) - y_hat_lengths = mask.sum([1,2]).long() * hps.data.hop_length - - mel = spec_to_mel_torch( - spec, - hps.data.filter_length, - hps.data.n_mel_channels, - hps.data.sampling_rate, - hps.data.mel_fmin, - hps.data.mel_fmax) - y_hat_mel = mel_spectrogram_torch( - y_hat.squeeze(1).float(), - hps.data.filter_length, - hps.data.n_mel_channels, - hps.data.sampling_rate, - hps.data.hop_length, - hps.data.win_length, - hps.data.mel_fmin, - hps.data.mel_fmax - ) - image_dict = { - "gen/mel": utils.plot_spectrogram_to_numpy(y_hat_mel[0].cpu().numpy()) - } - audio_dict = { - "gen/audio": y_hat[0,:,:y_hat_lengths[0]] - } - if global_step == 0: - image_dict.update({"gt/mel": utils.plot_spectrogram_to_numpy(mel[0].cpu().numpy())}) - audio_dict.update({"gt/audio": y[0,:,:y_lengths[0]]}) - + for batch_idx, items in enumerate(eval_loader): + c, f0, spec, y, spk = items + g = spk[:1].cuda(0) + spec, y = spec[:1].cuda(0), y[:1].cuda(0) + c = c[:1].cuda(0) + f0 = f0[:1].cuda(0) + mel = spec_to_mel_torch( + spec, + hps.data.filter_length, + hps.data.n_mel_channels, + hps.data.sampling_rate, + hps.data.mel_fmin, + hps.data.mel_fmax) + y_hat = generator.module.infer(c, f0, g=g, mel=mel) + + y_hat_mel = mel_spectrogram_torch( + y_hat.squeeze(1).float(), + hps.data.filter_length, + hps.data.n_mel_channels, + hps.data.sampling_rate, + 
hps.data.hop_length, + hps.data.win_length, + hps.data.mel_fmin, + hps.data.mel_fmax + ) + + audio_dict.update({ + f"gen/audio_{batch_idx}": y_hat[0], + f"gt/audio_{batch_idx}": y[0] + }) + image_dict.update({ + f"gen/mel": utils.plot_spectrogram_to_numpy(y_hat_mel[0].cpu().numpy()), + "gt/mel": utils.plot_spectrogram_to_numpy(mel[0].cpu().numpy()) + }) utils.summarize( - writer=writer_eval, - global_step=global_step, - images=image_dict, - audios=audio_dict, - audio_sampling_rate=hps.data.sampling_rate + writer=writer_eval, + global_step=global_step, + images=image_dict, + audios=audio_dict, + audio_sampling_rate=hps.data.sampling_rate ) generator.train() - + if __name__ == "__main__": - main() + main() diff --git a/train_ms.py b/train_ms.py deleted file mode 100644 index aee43d3..0000000 --- a/train_ms.py +++ /dev/null @@ -1,303 +0,0 @@ -import os -import json -import argparse -import itertools -import math -import torch -from torch import nn, optim -from torch.nn import functional as F -from torch.utils.data import DataLoader -from torch.utils.tensorboard import SummaryWriter -import torch.multiprocessing as mp -import torch.distributed as dist -from torch.nn.parallel import DistributedDataParallel as DDP -from torch.cuda.amp import autocast, GradScaler - -import commons -import utils -from data_utils import ( - TextAudioSpeakerLoader, - TextAudioSpeakerCollate, - DistributedBucketSampler -) -from models import ( - SynthesizerTrn, - MultiPeriodDiscriminator, -) -from losses import ( - generator_loss, - discriminator_loss, - feature_loss, - kl_loss -) -from mel_processing import mel_spectrogram_torch, spec_to_mel_torch -from text.symbols import symbols - -torch.backends.cudnn.benchmark = True -global_step = 0 - - -def main(): - """Assume Single Node Multi GPUs Training Only""" - assert torch.cuda.is_available(), "CPU training is not allowed." 
- - n_gpus = torch.cuda.device_count() - os.environ['MASTER_ADDR'] = 'localhost' - os.environ['MASTER_PORT'] = '25565' - - hps = utils.get_hparams() - mp.spawn(run, nprocs=n_gpus, args=(n_gpus, hps,)) - - -def run(rank, n_gpus, hps): - global global_step - if rank == 0: - logger = utils.get_logger(hps.model_dir) - logger.info(hps) - utils.check_git_hash(hps.model_dir) - writer = SummaryWriter(log_dir=hps.model_dir) - writer_eval = SummaryWriter(log_dir=os.path.join(hps.model_dir, "eval")) - - dist.init_process_group(backend='nccl', init_method='env://', world_size=n_gpus, rank=rank) - torch.manual_seed(hps.train.seed) - torch.cuda.set_device(rank) - - train_dataset = TextAudioSpeakerLoader(hps.data.training_files, hps.data) - train_sampler = DistributedBucketSampler( - train_dataset, - hps.train.batch_size, - [32, 300, 400, 500, 600, 700, 800, 900, 1000], - num_replicas=n_gpus, - rank=rank, - shuffle=True) - collate_fn = TextAudioSpeakerCollate() - train_loader = DataLoader(train_dataset, num_workers=8, shuffle=False, pin_memory=True, - collate_fn=collate_fn, batch_sampler=train_sampler) - if rank == 0: - eval_dataset = TextAudioSpeakerLoader(hps.data.validation_files, hps.data) - eval_loader = DataLoader(eval_dataset, num_workers=8, shuffle=False, - batch_size=hps.train.batch_size, pin_memory=True, - drop_last=False, collate_fn=collate_fn) - - net_g = SynthesizerTrn( - len(symbols), - hps.data.filter_length // 2 + 1, - hps.train.segment_size // hps.data.hop_length, - n_speakers=hps.data.n_speakers, - **hps.model).cuda(rank) - net_d = MultiPeriodDiscriminator(hps.model.use_spectral_norm).cuda(rank) - optim_g = torch.optim.AdamW( - net_g.parameters(), - hps.train.learning_rate, - betas=hps.train.betas, - eps=hps.train.eps) - optim_d = torch.optim.AdamW( - net_d.parameters(), - hps.train.learning_rate, - betas=hps.train.betas, - eps=hps.train.eps) - net_g = DDP(net_g, device_ids=[rank]) - net_d = DDP(net_d, device_ids=[rank]) - - try: - _, _, _, epoch_str = utils.load_checkpoint(utils.latest_checkpoint_path(hps.model_dir, "G_*.pth"), net_g, - optim_g) - _, _, _, epoch_str = utils.load_checkpoint(utils.latest_checkpoint_path(hps.model_dir, "D_*.pth"), net_d, - optim_d) - global_step = (epoch_str - 1) * len(train_loader) - except: - epoch_str = 1 - global_step = 0 - # epoch_str = 1 - # global_step = 0 - scheduler_g = torch.optim.lr_scheduler.ExponentialLR(optim_g, gamma=hps.train.lr_decay, last_epoch=epoch_str - 2) - scheduler_d = torch.optim.lr_scheduler.ExponentialLR(optim_d, gamma=hps.train.lr_decay, last_epoch=epoch_str - 2) - - scaler = GradScaler(enabled=hps.train.fp16_run) - - for epoch in range(epoch_str, hps.train.epochs + 1): - if rank == 0: - train_and_evaluate(rank, epoch, hps, [net_g, net_d], [optim_g, optim_d], [scheduler_g, scheduler_d], scaler, - [train_loader, eval_loader], logger, [writer, writer_eval]) - else: - train_and_evaluate(rank, epoch, hps, [net_g, net_d], [optim_g, optim_d], [scheduler_g, scheduler_d], scaler, - [train_loader, None], None, None) - scheduler_g.step() - scheduler_d.step() - - -def train_and_evaluate(rank, epoch, hps, nets, optims, schedulers, scaler, loaders, logger, writers): - net_g, net_d = nets - optim_g, optim_d = optims - scheduler_g, scheduler_d = schedulers - train_loader, eval_loader = loaders - if writers is not None: - writer, writer_eval = writers - - train_loader.batch_sampler.set_epoch(epoch) - global global_step - - net_g.train() - net_d.train() - for batch_idx, (x, x_lengths, spec, spec_lengths, y, y_lengths, pitch, speakers) in 
enumerate(train_loader): - x, x_lengths = x.cuda(rank, non_blocking=True), x_lengths.cuda(rank, non_blocking=True) - spec, spec_lengths = spec.cuda(rank, non_blocking=True), spec_lengths.cuda(rank, non_blocking=True) - y, y_lengths = y.cuda(rank, non_blocking=True), y_lengths.cuda(rank, non_blocking=True) - speakers = speakers.cuda(rank, non_blocking=True) - pitch = pitch.cuda(rank, non_blocking=True) - - with autocast(enabled=hps.train.fp16_run): - y_hat, l_length, attn, ids_slice, x_mask, z_mask, \ - (z, z_p, m_p, logs_p, m_q, logs_q) = net_g(x, x_lengths, spec, spec_lengths, pitch, speakers) - - mel = spec_to_mel_torch( - spec, - hps.data.filter_length, - hps.data.n_mel_channels, - hps.data.sampling_rate, - hps.data.mel_fmin, - hps.data.mel_fmax) - y_mel = commons.slice_segments(mel, ids_slice, hps.train.segment_size // hps.data.hop_length) - y_hat_mel = mel_spectrogram_torch( - y_hat.squeeze(1), - hps.data.filter_length, - hps.data.n_mel_channels, - hps.data.sampling_rate, - hps.data.hop_length, - hps.data.win_length, - hps.data.mel_fmin, - hps.data.mel_fmax - ) - - y = commons.slice_segments(y, ids_slice * hps.data.hop_length, hps.train.segment_size) # slice - - # Discriminator - y_d_hat_r, y_d_hat_g, _, _ = net_d(y, y_hat.detach()) - with autocast(enabled=False): - loss_disc, losses_disc_r, losses_disc_g = discriminator_loss(y_d_hat_r, y_d_hat_g) - loss_disc_all = loss_disc - optim_d.zero_grad() - scaler.scale(loss_disc_all).backward() - scaler.unscale_(optim_d) - grad_norm_d = commons.clip_grad_value_(net_d.parameters(), None) - scaler.step(optim_d) - - with autocast(enabled=hps.train.fp16_run): - # Generator - y_d_hat_r, y_d_hat_g, fmap_r, fmap_g = net_d(y, y_hat) - with autocast(enabled=False): - loss_dur = torch.sum(l_length.float()) - loss_mel = F.l1_loss(y_mel, y_hat_mel) * hps.train.c_mel - loss_kl = kl_loss(z_p, logs_q, m_p, logs_p, z_mask) * hps.train.c_kl - - loss_fm = feature_loss(fmap_r, fmap_g) - loss_gen, losses_gen = generator_loss(y_d_hat_g) - loss_gen_all = loss_gen + loss_fm + loss_mel + loss_dur + loss_kl - optim_g.zero_grad() - scaler.scale(loss_gen_all).backward() - scaler.unscale_(optim_g) - grad_norm_g = commons.clip_grad_value_(net_g.parameters(), None) - scaler.step(optim_g) - scaler.update() - - if rank == 0: - if global_step % hps.train.log_interval == 0: - lr = optim_g.param_groups[0]['lr'] - losses = [loss_disc, loss_gen, loss_fm, loss_mel, loss_dur, loss_kl] - logger.info('Train Epoch: {} [{:.0f}%]'.format( - epoch, - 100. 
* batch_idx / len(train_loader))) - logger.info([x.item() for x in losses] + [global_step, lr]) - - scalar_dict = {"loss/g/total": loss_gen_all, "loss/d/total": loss_disc_all, "learning_rate": lr, - "grad_norm_d": grad_norm_d, "grad_norm_g": grad_norm_g} - scalar_dict.update( - {"loss/g/fm": loss_fm, "loss/g/mel": loss_mel, "loss/g/dur": loss_dur, "loss/g/kl": loss_kl}) - - scalar_dict.update({"loss/g/{}".format(i): v for i, v in enumerate(losses_gen)}) - scalar_dict.update({"loss/d_r/{}".format(i): v for i, v in enumerate(losses_disc_r)}) - scalar_dict.update({"loss/d_g/{}".format(i): v for i, v in enumerate(losses_disc_g)}) - image_dict = { - "slice/mel_org": utils.plot_spectrogram_to_numpy(y_mel[0].data.cpu().numpy()), - "slice/mel_gen": utils.plot_spectrogram_to_numpy(y_hat_mel[0].data.cpu().numpy()), - "all/mel": utils.plot_spectrogram_to_numpy(mel[0].data.cpu().numpy()), - "all/attn": utils.plot_alignment_to_numpy(attn[0, 0].data.cpu().numpy()) - } - utils.summarize( - writer=writer, - global_step=global_step, - images=image_dict, - scalars=scalar_dict) - - if global_step % hps.train.eval_interval == 0: - evaluate(hps, net_g, eval_loader, writer_eval) - utils.save_checkpoint(net_g, optim_g, hps.train.learning_rate, epoch, - os.path.join(hps.model_dir, "G_{}.pth".format(global_step))) - utils.save_checkpoint(net_d, optim_d, hps.train.learning_rate, epoch, - os.path.join(hps.model_dir, "D_{}.pth".format(global_step))) - global_step += 1 - - if rank == 0: - logger.info('====> Epoch: {}'.format(epoch)) - - -def evaluate(hps, generator, eval_loader, writer_eval): - generator.eval() - with torch.no_grad(): - for batch_idx, (x, x_lengths, spec, spec_lengths, y, y_lengths, pitch, speakers) in enumerate(eval_loader): - x, x_lengths = x.cuda(0), x_lengths.cuda(0) - spec, spec_lengths = spec.cuda(0), spec_lengths.cuda(0) - y, y_lengths = y.cuda(0), y_lengths.cuda(0) - speakers = speakers.cuda(0) - pitch = pitch.cuda(0) - # remove else - x = x[:1] - x_lengths = x_lengths[:1] - spec = spec[:1] - spec_lengths = spec_lengths[:1] - y = y[:1] - y_lengths = y_lengths[:1] - speakers = speakers[:1] - break - y_hat, attn, mask, *_ = generator.module.infer(x, x_lengths, pitch, speakers, max_len=1000) - y_hat_lengths = mask.sum([1, 2]).long() * hps.data.hop_length - - mel = spec_to_mel_torch( - spec, - hps.data.filter_length, - hps.data.n_mel_channels, - hps.data.sampling_rate, - hps.data.mel_fmin, - hps.data.mel_fmax) - y_hat_mel = mel_spectrogram_torch( - y_hat.squeeze(1).float(), - hps.data.filter_length, - hps.data.n_mel_channels, - hps.data.sampling_rate, - hps.data.hop_length, - hps.data.win_length, - hps.data.mel_fmin, - hps.data.mel_fmax - ) - image_dict = { - "gen/mel": utils.plot_spectrogram_to_numpy(y_hat_mel[0].cpu().numpy()) - } - audio_dict = { - "gen/audio": y_hat[0, :, :y_hat_lengths[0]] - } - if global_step == 0: - image_dict.update({"gt/mel": utils.plot_spectrogram_to_numpy(mel[0].cpu().numpy())}) - audio_dict.update({"gt/audio": y[0, :, :y_lengths[0]]}) - - utils.summarize( - writer=writer_eval, - global_step=global_step, - images=image_dict, - audios=audio_dict, - audio_sampling_rate=hps.data.sampling_rate - ) - generator.train() - - -if __name__ == "__main__": - main() diff --git a/transforms.py b/transforms.py deleted file mode 100644 index 4793d67..0000000 --- a/transforms.py +++ /dev/null @@ -1,193 +0,0 @@ -import torch -from torch.nn import functional as F - -import numpy as np - - -DEFAULT_MIN_BIN_WIDTH = 1e-3 -DEFAULT_MIN_BIN_HEIGHT = 1e-3 -DEFAULT_MIN_DERIVATIVE = 1e-3 - - -def 
piecewise_rational_quadratic_transform(inputs, - unnormalized_widths, - unnormalized_heights, - unnormalized_derivatives, - inverse=False, - tails=None, - tail_bound=1., - min_bin_width=DEFAULT_MIN_BIN_WIDTH, - min_bin_height=DEFAULT_MIN_BIN_HEIGHT, - min_derivative=DEFAULT_MIN_DERIVATIVE): - - if tails is None: - spline_fn = rational_quadratic_spline - spline_kwargs = {} - else: - spline_fn = unconstrained_rational_quadratic_spline - spline_kwargs = { - 'tails': tails, - 'tail_bound': tail_bound - } - - outputs, logabsdet = spline_fn( - inputs=inputs, - unnormalized_widths=unnormalized_widths, - unnormalized_heights=unnormalized_heights, - unnormalized_derivatives=unnormalized_derivatives, - inverse=inverse, - min_bin_width=min_bin_width, - min_bin_height=min_bin_height, - min_derivative=min_derivative, - **spline_kwargs - ) - return outputs, logabsdet - - -def searchsorted(bin_locations, inputs, eps=1e-6): - bin_locations[..., -1] += eps - return torch.sum( - inputs[..., None] >= bin_locations, - dim=-1 - ) - 1 - - -def unconstrained_rational_quadratic_spline(inputs, - unnormalized_widths, - unnormalized_heights, - unnormalized_derivatives, - inverse=False, - tails='linear', - tail_bound=1., - min_bin_width=DEFAULT_MIN_BIN_WIDTH, - min_bin_height=DEFAULT_MIN_BIN_HEIGHT, - min_derivative=DEFAULT_MIN_DERIVATIVE): - inside_interval_mask = (inputs >= -tail_bound) & (inputs <= tail_bound) - outside_interval_mask = ~inside_interval_mask - - outputs = torch.zeros_like(inputs) - logabsdet = torch.zeros_like(inputs) - - if tails == 'linear': - unnormalized_derivatives = F.pad(unnormalized_derivatives, pad=(1, 1)) - constant = np.log(np.exp(1 - min_derivative) - 1) - unnormalized_derivatives[..., 0] = constant - unnormalized_derivatives[..., -1] = constant - - outputs[outside_interval_mask] = inputs[outside_interval_mask] - logabsdet[outside_interval_mask] = 0 - else: - raise RuntimeError('{} tails are not implemented.'.format(tails)) - - outputs[inside_interval_mask], logabsdet[inside_interval_mask] = rational_quadratic_spline( - inputs=inputs[inside_interval_mask], - unnormalized_widths=unnormalized_widths[inside_interval_mask, :], - unnormalized_heights=unnormalized_heights[inside_interval_mask, :], - unnormalized_derivatives=unnormalized_derivatives[inside_interval_mask, :], - inverse=inverse, - left=-tail_bound, right=tail_bound, bottom=-tail_bound, top=tail_bound, - min_bin_width=min_bin_width, - min_bin_height=min_bin_height, - min_derivative=min_derivative - ) - - return outputs, logabsdet - -def rational_quadratic_spline(inputs, - unnormalized_widths, - unnormalized_heights, - unnormalized_derivatives, - inverse=False, - left=0., right=1., bottom=0., top=1., - min_bin_width=DEFAULT_MIN_BIN_WIDTH, - min_bin_height=DEFAULT_MIN_BIN_HEIGHT, - min_derivative=DEFAULT_MIN_DERIVATIVE): - if torch.min(inputs) < left or torch.max(inputs) > right: - raise ValueError('Input to a transform is not within its domain') - - num_bins = unnormalized_widths.shape[-1] - - if min_bin_width * num_bins > 1.0: - raise ValueError('Minimal bin width too large for the number of bins') - if min_bin_height * num_bins > 1.0: - raise ValueError('Minimal bin height too large for the number of bins') - - widths = F.softmax(unnormalized_widths, dim=-1) - widths = min_bin_width + (1 - min_bin_width * num_bins) * widths - cumwidths = torch.cumsum(widths, dim=-1) - cumwidths = F.pad(cumwidths, pad=(1, 0), mode='constant', value=0.0) - cumwidths = (right - left) * cumwidths + left - cumwidths[..., 0] = left - cumwidths[..., 
-1] = right - widths = cumwidths[..., 1:] - cumwidths[..., :-1] - - derivatives = min_derivative + F.softplus(unnormalized_derivatives) - - heights = F.softmax(unnormalized_heights, dim=-1) - heights = min_bin_height + (1 - min_bin_height * num_bins) * heights - cumheights = torch.cumsum(heights, dim=-1) - cumheights = F.pad(cumheights, pad=(1, 0), mode='constant', value=0.0) - cumheights = (top - bottom) * cumheights + bottom - cumheights[..., 0] = bottom - cumheights[..., -1] = top - heights = cumheights[..., 1:] - cumheights[..., :-1] - - if inverse: - bin_idx = searchsorted(cumheights, inputs)[..., None] - else: - bin_idx = searchsorted(cumwidths, inputs)[..., None] - - input_cumwidths = cumwidths.gather(-1, bin_idx)[..., 0] - input_bin_widths = widths.gather(-1, bin_idx)[..., 0] - - input_cumheights = cumheights.gather(-1, bin_idx)[..., 0] - delta = heights / widths - input_delta = delta.gather(-1, bin_idx)[..., 0] - - input_derivatives = derivatives.gather(-1, bin_idx)[..., 0] - input_derivatives_plus_one = derivatives[..., 1:].gather(-1, bin_idx)[..., 0] - - input_heights = heights.gather(-1, bin_idx)[..., 0] - - if inverse: - a = (((inputs - input_cumheights) * (input_derivatives - + input_derivatives_plus_one - - 2 * input_delta) - + input_heights * (input_delta - input_derivatives))) - b = (input_heights * input_derivatives - - (inputs - input_cumheights) * (input_derivatives - + input_derivatives_plus_one - - 2 * input_delta)) - c = - input_delta * (inputs - input_cumheights) - - discriminant = b.pow(2) - 4 * a * c - assert (discriminant >= 0).all() - - root = (2 * c) / (-b - torch.sqrt(discriminant)) - outputs = root * input_bin_widths + input_cumwidths - - theta_one_minus_theta = root * (1 - root) - denominator = input_delta + ((input_derivatives + input_derivatives_plus_one - 2 * input_delta) - * theta_one_minus_theta) - derivative_numerator = input_delta.pow(2) * (input_derivatives_plus_one * root.pow(2) - + 2 * input_delta * theta_one_minus_theta - + input_derivatives * (1 - root).pow(2)) - logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator) - - return outputs, -logabsdet - else: - theta = (inputs - input_cumwidths) / input_bin_widths - theta_one_minus_theta = theta * (1 - theta) - - numerator = input_heights * (input_delta * theta.pow(2) - + input_derivatives * theta_one_minus_theta) - denominator = input_delta + ((input_derivatives + input_derivatives_plus_one - 2 * input_delta) - * theta_one_minus_theta) - outputs = input_cumheights + numerator / denominator - - derivative_numerator = input_delta.pow(2) * (input_derivatives_plus_one * theta.pow(2) - + 2 * input_delta * theta_one_minus_theta - + input_derivatives * (1 - theta).pow(2)) - logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator) - - return outputs, logabsdet diff --git a/utils.py b/utils.py index c60894b..a89edee 100644 --- a/utils.py +++ b/utils.py @@ -5,15 +5,84 @@ import logging import json import subprocess + +import librosa import numpy as np +import torchaudio from scipy.io.wavfile import read import torch - +import torchvision +from torch.nn import functional as F +from commons import sequence_mask +from hubert import hubert_model MATPLOTLIB_FLAG = False logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) logger = logging +f0_bin = 256 +f0_max = 1100.0 +f0_min = 50.0 +f0_mel_min = 1127 * np.log(1 + f0_min / 700) +f0_mel_max = 1127 * np.log(1 + f0_max / 700) + +def f0_to_coarse(f0): + is_torch = isinstance(f0, torch.Tensor) + f0_mel = 1127 * (1 + f0 / 
700).log() if is_torch else 1127 * np.log(1 + f0 / 700) + f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * (f0_bin - 2) / (f0_mel_max - f0_mel_min) + 1 + + f0_mel[f0_mel <= 1] = 1 + f0_mel[f0_mel > f0_bin - 1] = f0_bin - 1 + f0_coarse = (f0_mel + 0.5).long() if is_torch else np.rint(f0_mel).astype(np.int) + assert f0_coarse.max() <= 255 and f0_coarse.min() >= 1, (f0_coarse.max(), f0_coarse.min()) + return f0_coarse + + +def get_hubert_model(rank=None): + + hubert_soft = hubert_model.hubert_soft("hubert/hubert-soft-0d54a1f4.pt") + if rank is not None: + hubert_soft = hubert_soft.cuda(rank) + return hubert_soft + +def get_hubert_content(hmodel, y=None, path=None): + if path is not None: + source, sr = torchaudio.load(path) + source = torchaudio.functional.resample(source, sr, 16000) + if len(source.shape) == 2 and source.shape[1] >= 2: + source = torch.mean(source, dim=0).unsqueeze(0) + else: + source = y + source = source.unsqueeze(0) + with torch.inference_mode(): + units = hmodel.units(source) + return units.transpose(1,2) + + +def get_content(cmodel, y): + with torch.no_grad(): + c = cmodel.extract_features(y.squeeze(1))[0] + c = c.transpose(1, 2) + return c + + + +def transform(mel, height): # 68-92 + #r = np.random.random() + #rate = r * 0.3 + 0.85 # 0.85-1.15 + #height = int(mel.size(-2) * rate) + tgt = torchvision.transforms.functional.resize(mel, (height, mel.size(-1))) + if height >= mel.size(-2): + return tgt[:, :mel.size(-2), :] + else: + silence = tgt[:,-1:,:].repeat(1,mel.size(-2)-height,1) + silence += torch.randn_like(silence) / 10 + return torch.cat((tgt, silence), 1) + + +def stretch(mel, width): # 0.5-2 + return torchvision.transforms.functional.resize(mel, (mel.size(-2), width)) + def load_checkpoint(checkpoint_path, model, optimizer=None): assert os.path.isfile(checkpoint_path) @@ -22,10 +91,7 @@ def load_checkpoint(checkpoint_path, model, optimizer=None): learning_rate = checkpoint_dict['learning_rate'] if optimizer is not None: optimizer.load_state_dict(checkpoint_dict['optimizer']) - # print(1111) saved_state_dict = checkpoint_dict['model'] - # print(1111) - if hasattr(model, 'module'): state_dict = model.module.state_dict() else: @@ -47,6 +113,12 @@ def load_checkpoint(checkpoint_path, model, optimizer=None): def save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path): + ckptname = checkpoint_path.split("/")[-1] + newest_step = int(ckptname.split(".")[0].split("_")[1]) + val_steps = 2000 + last_ckptname = checkpoint_path.replace(str(newest_step), str(newest_step - val_steps*3)) + if newest_step >= val_steps*3: + os.system(f"rm {last_ckptname}") logger.info("Saving model and optimizer state at iteration {} to {}".format( iteration, checkpoint_path)) if hasattr(model, 'module'): @@ -88,7 +160,7 @@ def plot_spectrogram_to_numpy(spectrogram): mpl_logger.setLevel(logging.WARNING) import matplotlib.pylab as plt import numpy as np - + fig, ax = plt.subplots(figsize=(10,2)) im = ax.imshow(spectrogram, aspect="auto", origin="lower", interpolation='none') @@ -150,7 +222,7 @@ def get_hparams(init=True): help='JSON file for configuration') parser.add_argument('-m', '--model', type=str, required=True, help='Model name') - + args = parser.parse_args() model_dir = os.path.join("./logs", args.model) @@ -168,7 +240,7 @@ def get_hparams(init=True): with open(config_save_path, "r") as f: data = f.read() config = json.loads(data) - + hparams = HParams(**config) hparams.model_dir = model_dir return hparams @@ -218,7 +290,7 @@ def get_logger(model_dir, 
filename="train.log"): global logger logger = logging.getLogger(os.path.basename(model_dir)) logger.setLevel(logging.DEBUG) - + formatter = logging.Formatter("%(asctime)s\t%(name)s\t%(levelname)s\t%(message)s") if not os.path.exists(model_dir): os.makedirs(model_dir) @@ -235,7 +307,7 @@ def __init__(self, **kwargs): if type(v) == dict: v = HParams(**v) self[k] = v - + def keys(self): return self.__dict__.keys() @@ -259,3 +331,4 @@ def __contains__(self, key): def __repr__(self): return self.__dict__.__repr__() + diff --git a/vdecoder/__init__.py b/vdecoder/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/vdecoder/hifigan/env.py b/vdecoder/hifigan/env.py new file mode 100644 index 0000000..2bdbc95 --- /dev/null +++ b/vdecoder/hifigan/env.py @@ -0,0 +1,15 @@ +import os +import shutil + + +class AttrDict(dict): + def __init__(self, *args, **kwargs): + super(AttrDict, self).__init__(*args, **kwargs) + self.__dict__ = self + + +def build_env(config, config_name, path): + t_path = os.path.join(path, config_name) + if config != t_path: + os.makedirs(path, exist_ok=True) + shutil.copyfile(config, os.path.join(path, config_name)) diff --git a/vdecoder/hifigan/models.py b/vdecoder/hifigan/models.py new file mode 100644 index 0000000..bdc3fa2 --- /dev/null +++ b/vdecoder/hifigan/models.py @@ -0,0 +1,500 @@ +import os +import json +from .env import AttrDict +import numpy as np +import torch +import torch.nn.functional as F +import torch.nn as nn +from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d +from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm +from .utils import init_weights, get_padding + +LRELU_SLOPE = 0.1 + + +def load_model(model_path, device='cuda'): + config_file = os.path.join(os.path.split(model_path)[0], 'config.json') + with open(config_file) as f: + data = f.read() + + global h + json_config = json.loads(data) + h = AttrDict(json_config) + + generator = Generator(h).to(device) + + cp_dict = torch.load(model_path) + generator.load_state_dict(cp_dict['generator']) + generator.eval() + generator.remove_weight_norm() + del cp_dict + return generator, h + + +class ResBlock1(torch.nn.Module): + def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5)): + super(ResBlock1, self).__init__() + self.h = h + self.convs1 = nn.ModuleList([ + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], + padding=get_padding(kernel_size, dilation[0]))), + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], + padding=get_padding(kernel_size, dilation[1]))), + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2], + padding=get_padding(kernel_size, dilation[2]))) + ]) + self.convs1.apply(init_weights) + + self.convs2 = nn.ModuleList([ + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, + padding=get_padding(kernel_size, 1))), + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, + padding=get_padding(kernel_size, 1))), + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, + padding=get_padding(kernel_size, 1))) + ]) + self.convs2.apply(init_weights) + + def forward(self, x): + for c1, c2 in zip(self.convs1, self.convs2): + xt = F.leaky_relu(x, LRELU_SLOPE) + xt = c1(xt) + xt = F.leaky_relu(xt, LRELU_SLOPE) + xt = c2(xt) + x = xt + x + return x + + def remove_weight_norm(self): + for l in self.convs1: + remove_weight_norm(l) + for l in self.convs2: + remove_weight_norm(l) + + +class ResBlock2(torch.nn.Module): + def __init__(self, 
h, channels, kernel_size=3, dilation=(1, 3)): + super(ResBlock2, self).__init__() + self.h = h + self.convs = nn.ModuleList([ + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], + padding=get_padding(kernel_size, dilation[0]))), + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], + padding=get_padding(kernel_size, dilation[1]))) + ]) + self.convs.apply(init_weights) + + def forward(self, x): + for c in self.convs: + xt = F.leaky_relu(x, LRELU_SLOPE) + xt = c(xt) + x = xt + x + return x + + def remove_weight_norm(self): + for l in self.convs: + remove_weight_norm(l) + + +class SineGen(torch.nn.Module): + """ Definition of sine generator + SineGen(samp_rate, harmonic_num = 0, + sine_amp = 0.1, noise_std = 0.003, + voiced_threshold = 0, + flag_for_pulse=False) + samp_rate: sampling rate in Hz + harmonic_num: number of harmonic overtones (default 0) + sine_amp: amplitude of sine-wavefrom (default 0.1) + noise_std: std of Gaussian noise (default 0.003) + voiced_thoreshold: F0 threshold for U/V classification (default 0) + flag_for_pulse: this SinGen is used inside PulseGen (default False) + Note: when flag_for_pulse is True, the first time step of a voiced + segment is always sin(np.pi) or cos(0) + """ + + def __init__(self, samp_rate, harmonic_num=0, + sine_amp=0.1, noise_std=0.003, + voiced_threshold=0, + flag_for_pulse=False): + super(SineGen, self).__init__() + self.sine_amp = sine_amp + self.noise_std = noise_std + self.harmonic_num = harmonic_num + self.dim = self.harmonic_num + 1 + self.sampling_rate = samp_rate + self.voiced_threshold = voiced_threshold + self.flag_for_pulse = flag_for_pulse + + def _f02uv(self, f0): + # generate uv signal + uv = (f0 > self.voiced_threshold).type(torch.float32) + return uv + + def _f02sine(self, f0_values): + """ f0_values: (batchsize, length, dim) + where dim indicates fundamental tone and overtones + """ + # convert to F0 in rad. The interger part n can be ignored + # because 2 * np.pi * n doesn't affect phase + rad_values = (f0_values / self.sampling_rate) % 1 + + # initial phase noise (no noise for fundamental component) + rand_ini = torch.rand(f0_values.shape[0], f0_values.shape[2], \ + device=f0_values.device) + rand_ini[:, 0] = 0 + rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini + + # instantanouse phase sine[t] = sin(2*pi \sum_i=1 ^{t} rad) + if not self.flag_for_pulse: + # for normal case + + # To prevent torch.cumsum numerical overflow, + # it is necessary to add -1 whenever \sum_k=1^n rad_value_k > 1. + # Buffer tmp_over_one_idx indicates the time step to add -1. 
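+                # (Concretely: rad_values holds per-sample phase increments f0 / fs in cycles;
+                # the wrapped running sum tmp_over_one drops each time a full cycle completes,
+                # and cumsum_shift subtracts 1 at exactly those steps before the final cumsum.)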
+ # This will not change F0 of sine because (x-1) * 2*pi = x * 2*pi + tmp_over_one = torch.cumsum(rad_values, 1) % 1 + tmp_over_one_idx = (torch.diff(tmp_over_one, dim=1)) < 0 + cumsum_shift = torch.zeros_like(rad_values) + cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0 + + sines = torch.sin(torch.cumsum(rad_values + cumsum_shift, dim=1) + * 2 * np.pi) + else: + # If necessary, make sure that the first time step of every + # voiced segments is sin(pi) or cos(0) + # This is used for pulse-train generation + + # identify the last time step in unvoiced segments + uv = self._f02uv(f0_values) + uv_1 = torch.roll(uv, shifts=-1, dims=1) + uv_1[:, -1, :] = 1 + u_loc = (uv < 1) * (uv_1 > 0) + + # get the instantanouse phase + tmp_cumsum = torch.cumsum(rad_values, dim=1) + # different batch needs to be processed differently + for idx in range(f0_values.shape[0]): + temp_sum = tmp_cumsum[idx, u_loc[idx, :, 0], :] + temp_sum[1:, :] = temp_sum[1:, :] - temp_sum[0:-1, :] + # stores the accumulation of i.phase within + # each voiced segments + tmp_cumsum[idx, :, :] = 0 + tmp_cumsum[idx, u_loc[idx, :, 0], :] = temp_sum + + # rad_values - tmp_cumsum: remove the accumulation of i.phase + # within the previous voiced segment. + i_phase = torch.cumsum(rad_values - tmp_cumsum, dim=1) + + # get the sines + sines = torch.cos(i_phase * 2 * np.pi) + return sines + + def forward(self, f0): + """ sine_tensor, uv = forward(f0) + input F0: tensor(batchsize=1, length, dim=1) + f0 for unvoiced steps should be 0 + output sine_tensor: tensor(batchsize=1, length, dim) + output uv: tensor(batchsize=1, length, 1) + """ + with torch.no_grad(): + f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, + device=f0.device) + # fundamental component + fn = torch.multiply(f0, torch.FloatTensor([[range(1, self.harmonic_num + 2)]]).to(f0.device)) + + # generate sine waveforms + sine_waves = self._f02sine(fn) * self.sine_amp + + # generate uv signal + # uv = torch.ones(f0.shape) + # uv = uv * (f0 > self.voiced_threshold) + uv = self._f02uv(f0) + + # noise: for unvoiced should be similar to sine_amp + # std = self.sine_amp/3 -> max value ~ self.sine_amp + # . 
for voiced regions is self.noise_std + noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3 + noise = noise_amp * torch.randn_like(sine_waves) + + # first: set the unvoiced part to 0 by uv + # then: additive noise + sine_waves = sine_waves * uv + noise + return sine_waves, uv, noise + + +class SourceModuleHnNSF(torch.nn.Module): + """ SourceModule for hn-nsf + SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1, + add_noise_std=0.003, voiced_threshod=0) + sampling_rate: sampling_rate in Hz + harmonic_num: number of harmonic above F0 (default: 0) + sine_amp: amplitude of sine source signal (default: 0.1) + add_noise_std: std of additive Gaussian noise (default: 0.003) + note that amplitude of noise in unvoiced is decided + by sine_amp + voiced_threshold: threhold to set U/V given F0 (default: 0) + Sine_source, noise_source = SourceModuleHnNSF(F0_sampled) + F0_sampled (batchsize, length, 1) + Sine_source (batchsize, length, 1) + noise_source (batchsize, length 1) + uv (batchsize, length, 1) + """ + + def __init__(self, sampling_rate, harmonic_num=0, sine_amp=0.1, + add_noise_std=0.003, voiced_threshod=0): + super(SourceModuleHnNSF, self).__init__() + + self.sine_amp = sine_amp + self.noise_std = add_noise_std + + # to produce sine waveforms + self.l_sin_gen = SineGen(sampling_rate, harmonic_num, + sine_amp, add_noise_std, voiced_threshod) + + # to merge source harmonics into a single excitation + self.l_linear = torch.nn.Linear(harmonic_num + 1, 1) + self.l_tanh = torch.nn.Tanh() + + def forward(self, x): + """ + Sine_source, noise_source = SourceModuleHnNSF(F0_sampled) + F0_sampled (batchsize, length, 1) + Sine_source (batchsize, length, 1) + noise_source (batchsize, length 1) + """ + # source for harmonic branch + sine_wavs, uv, _ = self.l_sin_gen(x) + sine_merge = self.l_tanh(self.l_linear(sine_wavs)) + + # source for noise branch, in the same shape as uv + noise = torch.randn_like(uv) * self.sine_amp / 3 + return sine_merge, noise, uv + + +class Generator(torch.nn.Module): + def __init__(self, h): + super(Generator, self).__init__() + self.h = h + + self.num_kernels = len(h["resblock_kernel_sizes"]) + self.num_upsamples = len(h["upsample_rates"]) + self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(h["upsample_rates"])) + self.m_source = SourceModuleHnNSF( + sampling_rate=h["sampling_rate"], + harmonic_num=8) + self.noise_convs = nn.ModuleList() + self.conv_pre = weight_norm(Conv1d(h["inter_channels"], h["upsample_initial_channel"], 7, 1, padding=3)) + resblock = ResBlock1 if h["resblock"] == '1' else ResBlock2 + self.ups = nn.ModuleList() + for i, (u, k) in enumerate(zip(h["upsample_rates"], h["upsample_kernel_sizes"])): + c_cur = h["upsample_initial_channel"] // (2 ** (i + 1)) + self.ups.append(weight_norm( + ConvTranspose1d(h["upsample_initial_channel"] // (2 ** i), h["upsample_initial_channel"] // (2 ** (i + 1)), + k, u, padding=(k - u) // 2))) + if i + 1 < len(h["upsample_rates"]): # + stride_f0 = np.prod(h["upsample_rates"][i + 1:]) + self.noise_convs.append(Conv1d( + 1, c_cur, kernel_size=stride_f0 * 2, stride=stride_f0, padding=stride_f0 // 2)) + else: + self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1)) + self.resblocks = nn.ModuleList() + for i in range(len(self.ups)): + ch = h["upsample_initial_channel"] // (2 ** (i + 1)) + for j, (k, d) in enumerate(zip(h["resblock_kernel_sizes"], h["resblock_dilation_sizes"])): + self.resblocks.append(resblock(h, ch, k, d)) + + self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3)) + 
self.ups.apply(init_weights) + self.conv_post.apply(init_weights) + self.cond = nn.Conv1d(h['gin_channels'], h['upsample_initial_channel'], 1) + + def forward(self, x, f0, g=None): + # print(1,x.shape,f0.shape,f0[:, None].shape) + f0 = self.f0_upsamp(f0[:, None]).transpose(1, 2) # bs,n,t + # print(2,f0.shape) + har_source, noi_source, uv = self.m_source(f0) + har_source = har_source.transpose(1, 2) + x = self.conv_pre(x) + x = x + self.cond(g) + # print(124,x.shape,har_source.shape) + for i in range(self.num_upsamples): + x = F.leaky_relu(x, LRELU_SLOPE) + # print(3,x.shape) + x = self.ups[i](x) + x_source = self.noise_convs[i](har_source) + # print(4,x_source.shape,har_source.shape,x.shape) + x = x + x_source + xs = None + for j in range(self.num_kernels): + if xs is None: + xs = self.resblocks[i * self.num_kernels + j](x) + else: + xs += self.resblocks[i * self.num_kernels + j](x) + x = xs / self.num_kernels + x = F.leaky_relu(x) + x = self.conv_post(x) + x = torch.tanh(x) + + return x + + def remove_weight_norm(self): + print('Removing weight norm...') + for l in self.ups: + remove_weight_norm(l) + for l in self.resblocks: + l.remove_weight_norm() + remove_weight_norm(self.conv_pre) + remove_weight_norm(self.conv_post) + + +class DiscriminatorP(torch.nn.Module): + def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False): + super(DiscriminatorP, self).__init__() + self.period = period + norm_f = weight_norm if use_spectral_norm == False else spectral_norm + self.convs = nn.ModuleList([ + norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))), + norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))), + norm_f(Conv2d(128, 512, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))), + norm_f(Conv2d(512, 1024, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))), + norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(2, 0))), + ]) + self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0))) + + def forward(self, x): + fmap = [] + + # 1d to 2d + b, c, t = x.shape + if t % self.period != 0: # pad first + n_pad = self.period - (t % self.period) + x = F.pad(x, (0, n_pad), "reflect") + t = t + n_pad + x = x.view(b, c, t // self.period, self.period) + + for l in self.convs: + x = l(x) + x = F.leaky_relu(x, LRELU_SLOPE) + fmap.append(x) + x = self.conv_post(x) + fmap.append(x) + x = torch.flatten(x, 1, -1) + + return x, fmap + + +class MultiPeriodDiscriminator(torch.nn.Module): + def __init__(self, periods=None): + super(MultiPeriodDiscriminator, self).__init__() + self.periods = periods if periods is not None else [2, 3, 5, 7, 11] + self.discriminators = nn.ModuleList() + for period in self.periods: + self.discriminators.append(DiscriminatorP(period)) + + def forward(self, y, y_hat): + y_d_rs = [] + y_d_gs = [] + fmap_rs = [] + fmap_gs = [] + for i, d in enumerate(self.discriminators): + y_d_r, fmap_r = d(y) + y_d_g, fmap_g = d(y_hat) + y_d_rs.append(y_d_r) + fmap_rs.append(fmap_r) + y_d_gs.append(y_d_g) + fmap_gs.append(fmap_g) + + return y_d_rs, y_d_gs, fmap_rs, fmap_gs + + +class DiscriminatorS(torch.nn.Module): + def __init__(self, use_spectral_norm=False): + super(DiscriminatorS, self).__init__() + norm_f = weight_norm if use_spectral_norm == False else spectral_norm + self.convs = nn.ModuleList([ + norm_f(Conv1d(1, 128, 15, 1, padding=7)), + norm_f(Conv1d(128, 128, 41, 2, groups=4, padding=20)), + norm_f(Conv1d(128, 256, 41, 2, groups=16, padding=20)), + norm_f(Conv1d(256, 512, 
41, 4, groups=16, padding=20)), + norm_f(Conv1d(512, 1024, 41, 4, groups=16, padding=20)), + norm_f(Conv1d(1024, 1024, 41, 1, groups=16, padding=20)), + norm_f(Conv1d(1024, 1024, 5, 1, padding=2)), + ]) + self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1)) + + def forward(self, x): + fmap = [] + for l in self.convs: + x = l(x) + x = F.leaky_relu(x, LRELU_SLOPE) + fmap.append(x) + x = self.conv_post(x) + fmap.append(x) + x = torch.flatten(x, 1, -1) + + return x, fmap + + +class MultiScaleDiscriminator(torch.nn.Module): + def __init__(self): + super(MultiScaleDiscriminator, self).__init__() + self.discriminators = nn.ModuleList([ + DiscriminatorS(use_spectral_norm=True), + DiscriminatorS(), + DiscriminatorS(), + ]) + self.meanpools = nn.ModuleList([ + AvgPool1d(4, 2, padding=2), + AvgPool1d(4, 2, padding=2) + ]) + + def forward(self, y, y_hat): + y_d_rs = [] + y_d_gs = [] + fmap_rs = [] + fmap_gs = [] + for i, d in enumerate(self.discriminators): + if i != 0: + y = self.meanpools[i - 1](y) + y_hat = self.meanpools[i - 1](y_hat) + y_d_r, fmap_r = d(y) + y_d_g, fmap_g = d(y_hat) + y_d_rs.append(y_d_r) + fmap_rs.append(fmap_r) + y_d_gs.append(y_d_g) + fmap_gs.append(fmap_g) + + return y_d_rs, y_d_gs, fmap_rs, fmap_gs + + +def feature_loss(fmap_r, fmap_g): + loss = 0 + for dr, dg in zip(fmap_r, fmap_g): + for rl, gl in zip(dr, dg): + loss += torch.mean(torch.abs(rl - gl)) + + return loss * 2 + + +def discriminator_loss(disc_real_outputs, disc_generated_outputs): + loss = 0 + r_losses = [] + g_losses = [] + for dr, dg in zip(disc_real_outputs, disc_generated_outputs): + r_loss = torch.mean((1 - dr) ** 2) + g_loss = torch.mean(dg ** 2) + loss += (r_loss + g_loss) + r_losses.append(r_loss.item()) + g_losses.append(g_loss.item()) + + return loss, r_losses, g_losses + + +def generator_loss(disc_outputs): + loss = 0 + gen_losses = [] + for dg in disc_outputs: + l = torch.mean((1 - dg) ** 2) + gen_losses.append(l) + loss += l + + return loss, gen_losses diff --git a/vdecoder/hifigan/nvSTFT.py b/vdecoder/hifigan/nvSTFT.py new file mode 100644 index 0000000..ec90bb1 --- /dev/null +++ b/vdecoder/hifigan/nvSTFT.py @@ -0,0 +1,111 @@ +import math +import os +os.environ["LRU_CACHE_CAPACITY"] = "3" +import random +import torch +import torch.utils.data +import numpy as np +import librosa +from librosa.util import normalize +from librosa.filters import mel as librosa_mel_fn +from scipy.io.wavfile import read +import soundfile as sf + +def load_wav_to_torch(full_path, target_sr=None, return_empty_on_exception=False): + sampling_rate = None + try: + data, sampling_rate = sf.read(full_path, always_2d=True)# than soundfile. 
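+        # soundfile returns data shaped (num_samples, num_channels) together with the file's
+        # native sample rate; any read failure is handled below and, when
+        # return_empty_on_exception is set, yields an empty list instead of raising.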
+ except Exception as ex: + print(f"'{full_path}' failed to load.\nException:") + print(ex) + if return_empty_on_exception: + return [], sampling_rate or target_sr or 48000 + else: + raise Exception(ex) + + if len(data.shape) > 1: + data = data[:, 0] + assert len(data) > 2# check duration of audio file is > 2 samples (because otherwise the slice operation was on the wrong dimension) + + if np.issubdtype(data.dtype, np.integer): # if audio data is type int + max_mag = -np.iinfo(data.dtype).min # maximum magnitude = min possible value of intXX + else: # if audio data is type fp32 + max_mag = max(np.amax(data), -np.amin(data)) + max_mag = (2**31)+1 if max_mag > (2**15) else ((2**15)+1 if max_mag > 1.01 else 1.0) # data should be either 16-bit INT, 32-bit INT or [-1 to 1] float32 + + data = torch.FloatTensor(data.astype(np.float32))/max_mag + + if (torch.isinf(data) | torch.isnan(data)).any() and return_empty_on_exception:# resample will crash with inf/NaN inputs. return_empty_on_exception will return empty arr instead of except + return [], sampling_rate or target_sr or 48000 + if target_sr is not None and sampling_rate != target_sr: + data = torch.from_numpy(librosa.core.resample(data.numpy(), orig_sr=sampling_rate, target_sr=target_sr)) + sampling_rate = target_sr + + return data, sampling_rate + +def dynamic_range_compression(x, C=1, clip_val=1e-5): + return np.log(np.clip(x, a_min=clip_val, a_max=None) * C) + +def dynamic_range_decompression(x, C=1): + return np.exp(x) / C + +def dynamic_range_compression_torch(x, C=1, clip_val=1e-5): + return torch.log(torch.clamp(x, min=clip_val) * C) + +def dynamic_range_decompression_torch(x, C=1): + return torch.exp(x) / C + +class STFT(): + def __init__(self, sr=22050, n_mels=80, n_fft=1024, win_size=1024, hop_length=256, fmin=20, fmax=11025, clip_val=1e-5): + self.target_sr = sr + + self.n_mels = n_mels + self.n_fft = n_fft + self.win_size = win_size + self.hop_length = hop_length + self.fmin = fmin + self.fmax = fmax + self.clip_val = clip_val + self.mel_basis = {} + self.hann_window = {} + + def get_mel(self, y, center=False): + sampling_rate = self.target_sr + n_mels = self.n_mels + n_fft = self.n_fft + win_size = self.win_size + hop_length = self.hop_length + fmin = self.fmin + fmax = self.fmax + clip_val = self.clip_val + + if torch.min(y) < -1.: + print('min value is ', torch.min(y)) + if torch.max(y) > 1.: + print('max value is ', torch.max(y)) + + if fmax not in self.mel_basis: + mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax) + self.mel_basis[str(fmax)+'_'+str(y.device)] = torch.from_numpy(mel).float().to(y.device) + self.hann_window[str(y.device)] = torch.hann_window(self.win_size).to(y.device) + + y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_length)/2), int((n_fft-hop_length)/2)), mode='reflect') + y = y.squeeze(1) + + spec = torch.stft(y, n_fft, hop_length=hop_length, win_length=win_size, window=self.hann_window[str(y.device)], + center=center, pad_mode='reflect', normalized=False, onesided=True) + # print(111,spec) + spec = torch.sqrt(spec.pow(2).sum(-1)+(1e-9)) + # print(222,spec) + spec = torch.matmul(self.mel_basis[str(fmax)+'_'+str(y.device)], spec) + # print(333,spec) + spec = dynamic_range_compression_torch(spec, clip_val=clip_val) + # print(444,spec) + return spec + + def __call__(self, audiopath): + audio, sr = load_wav_to_torch(audiopath, target_sr=self.target_sr) + spect = self.get_mel(audio.unsqueeze(0)).squeeze(0) + return spect + +stft = STFT() diff --git 
a/vdecoder/hifigan/utils.py b/vdecoder/hifigan/utils.py new file mode 100644 index 0000000..84bff02 --- /dev/null +++ b/vdecoder/hifigan/utils.py @@ -0,0 +1,68 @@ +import glob +import os +import matplotlib +import torch +from torch.nn.utils import weight_norm +matplotlib.use("Agg") +import matplotlib.pylab as plt + + +def plot_spectrogram(spectrogram): + fig, ax = plt.subplots(figsize=(10, 2)) + im = ax.imshow(spectrogram, aspect="auto", origin="lower", + interpolation='none') + plt.colorbar(im, ax=ax) + + fig.canvas.draw() + plt.close() + + return fig + + +def init_weights(m, mean=0.0, std=0.01): + classname = m.__class__.__name__ + if classname.find("Conv") != -1: + m.weight.data.normal_(mean, std) + + +def apply_weight_norm(m): + classname = m.__class__.__name__ + if classname.find("Conv") != -1: + weight_norm(m) + + +def get_padding(kernel_size, dilation=1): + return int((kernel_size*dilation - dilation)/2) + + +def load_checkpoint(filepath, device): + assert os.path.isfile(filepath) + print("Loading '{}'".format(filepath)) + checkpoint_dict = torch.load(filepath, map_location=device) + print("Complete.") + return checkpoint_dict + + +def save_checkpoint(filepath, obj): + print("Saving checkpoint to {}".format(filepath)) + torch.save(obj, filepath) + print("Complete.") + + +def del_old_checkpoints(cp_dir, prefix, n_models=2): + pattern = os.path.join(cp_dir, prefix + '????????') + cp_list = glob.glob(pattern) # get checkpoint paths + cp_list = sorted(cp_list)# sort by iter + if len(cp_list) > n_models: # if more than n_models models are found + for cp in cp_list[:-n_models]:# delete the oldest models other than lastest n_models + open(cp, 'w').close()# empty file contents + os.unlink(cp)# delete file (move to trash when using Colab) + + +def scan_checkpoint(cp_dir, prefix): + pattern = os.path.join(cp_dir, prefix + '????????') + cp_list = glob.glob(pattern) + if len(cp_list) == 0: + return None + return sorted(cp_list)[-1] +
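An illustrative usage sketch of the checkpoint helpers added in vdecoder/hifigan/utils.py, showing how the newest vocoder checkpoint can be located, loaded, and older ones pruned; the directory path and the "g_" filename prefix are assumptions for the example, not values taken from this diff.

from vdecoder.hifigan.utils import scan_checkpoint, load_checkpoint, del_old_checkpoints

cp_dir = "logs/hifigan"                        # hypothetical checkpoint directory
latest = scan_checkpoint(cp_dir, "g_")         # newest file matching "g_????????", or None
if latest is not None:
    state = load_checkpoint(latest, device="cpu")
    # generator.load_state_dict(state["generator"])  # as done by load_model() in models.py
del_old_checkpoints(cp_dir, "g_", n_models=2)  # keep only the two most recent checkpoints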