forked from PlayVoice/whisper-vits-svc
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request PlayVoice#19 from NaruseMioShirakana/32k
Fix Readme, onnx_export.py and Gradio
- Loading branch information
Showing
4 changed files
with
246 additions
and
3 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,160 @@ | ||
import hashlib | ||
import json | ||
import logging | ||
import os | ||
import time | ||
from pathlib import Path | ||
import io | ||
import librosa | ||
import maad | ||
import numpy as np | ||
from inference import slicer | ||
import parselmouth | ||
import soundfile | ||
import torch | ||
import torchaudio | ||
|
||
from hubert import hubert_model | ||
import utils | ||
from models import SynthesizerTrn | ||
logging.getLogger('numba').setLevel(logging.WARNING) | ||
logging.getLogger('matplotlib').setLevel(logging.WARNING) | ||
|
||
def resize2d_f0(x, target_len):
    """Resample an F0 contour to ``target_len`` frames by linear interpolation.

    Values below 0.001 are replaced by NaN before interpolating and every NaN
    in the interpolated result is turned back into 0 by ``np.nan_to_num``.

    Args:
        x: One-dimensional sequence of F0 values (list or array, any numeric
            dtype).
        target_len: Number of frames in the returned contour.

    Returns:
        A float array of length ``target_len``.
    """
    # Force a float copy: an integer array cannot hold NaN, so the masking
    # below would raise ValueError for integer input.
    source = np.array(x, dtype=float)
    source[source < 0.001] = np.nan
    # Fractional positions in the source contour for each target frame.
    positions = np.arange(0, len(source) * target_len, len(source)) / target_len
    target = np.interp(positions, np.arange(0, len(source)), source)
    return np.nan_to_num(target)
|
||
def get_f0(x, p_len, f0_up_key=0):
    """Extract an F0 contour with Praat and quantise it onto a coarse mel scale.

    Args:
        x: Waveform samples, analysed as 16 kHz audio.
        p_len: Frame count the contour is zero-padded to when the extracted
            contour is shorter.
        f0_up_key: Transposition in semitones applied to the contour.

    Returns:
        ``(f0_coarse, f0)`` where ``f0`` is the transposed contour in Hz and
        ``f0_coarse`` is its mel-scaled version rounded to integers in
        ``[1, 255]``.
    """
    time_step = 160 / 16000 * 1000  # frame step in milliseconds
    f0_min = 50
    f0_max = 1100
    f0_mel_min = 1127 * np.log(1 + f0_min / 700)
    f0_mel_max = 1127 * np.log(1 + f0_max / 700)

    f0 = parselmouth.Sound(x, 16000).to_pitch_ac(
        time_step=time_step / 1000, voicing_threshold=0.6,
        pitch_floor=f0_min, pitch_ceiling=f0_max).selected_array['frequency']

    # Split the missing frames between the front and the back of the contour.
    pad_size = (p_len - len(f0) + 1) // 2
    if pad_size > 0 or p_len - len(f0) - pad_size > 0:
        f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode='constant')

    f0 *= pow(2, f0_up_key / 12)
    f0_mel = 1127 * np.log(1 + f0 / 700)
    f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (f0_mel_max - f0_mel_min) + 1
    f0_mel[f0_mel <= 1] = 1
    f0_mel[f0_mel > 255] = 255
    # ``np.int`` was only an alias of the builtin ``int`` and has been removed
    # from NumPy; the builtin keeps the same resulting dtype.
    f0_coarse = np.rint(f0_mel).astype(int)
    return f0_coarse, f0
|
||
def clean_pitch(input_pitch):
    """Flatten a contour to all ones when more than 90% of it already equals 1.

    The array is modified in place and also returned.
    """
    is_one = input_pitch == 1
    if np.sum(is_one) / len(input_pitch) > 0.9:
        # Overwrite the few remaining entries that are not already 1.
        input_pitch[~is_one] = 1
    return input_pitch
|
||
|
||
def plt_pitch(input_pitch):
    """Return a float copy of ``input_pitch`` with every value equal to 1 set to NaN.

    The input array itself is left untouched.
    """
    plot_values = input_pitch.astype(float)
    np.putmask(plot_values, plot_values == 1, np.nan)
    return plot_values
|
||
|
||
def f0_to_pitch(ff):
    """Map a frequency ``ff`` to ``69 + 12 * log2(ff / 440)``.

    With this formula 440 maps to 69 and each doubling of ``ff`` adds 12.
    """
    offset_from_440 = 12 * np.log2(ff / 440)
    return 69 + offset_from_440
|
||
|
||
def fill_a_to_b(a, b):
    """Grow list ``a`` in place to the length of ``b`` by repeating ``a[0]``.

    Nothing happens when ``a`` is already at least as long as ``b``.
    """
    shortfall = len(b) - len(a)
    if shortfall > 0:
        a.extend([a[0]] * shortfall)
|
||
|
||
def mkdir(paths: list):
    """Create each directory in ``paths``, skipping those that already exist.

    Only the last path component is created (``os.mkdir``); a missing parent
    still raises ``FileNotFoundError``.
    """
    for path in paths:
        # Attempt the creation directly instead of checking first: a separate
        # existence check can race with another process creating the path.
        try:
            os.mkdir(path)
        except FileExistsError:
            continue
|
||
|
||
class VitsSvc(object):
    """Voice-conversion pipeline built from a HuBERT unit encoder and a
    ``SynthesizerTrn`` model.

    Typical use: ``set_device`` -> ``loadCheckpoint`` -> ``inference``.
    """

    def __init__(self):
        # Prefer CUDA when available; set_device() can override this later.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.SVCVITS = None  # synthesizer, populated by loadCheckpoint()
        self.hps = None  # hyper-parameters read from the checkpoint's config.json
        self.speakers = None  # speaker table (hps.spk), used to resolve speaker ids
        # NOTE(review): the encoder is not moved to self.device here; that only
        # happens in set_device().
        self.hubert_soft = hubert_model.hubert_soft("hubert/model.pt")

    def set_device(self, device):
        """Move the unit encoder and, if loaded, the synthesizer to ``device``."""
        self.device = torch.device(device)
        self.hubert_soft.to(self.device)
        if self.SVCVITS is not None:
            self.SVCVITS.to(self.device)

    def loadCheckpoint(self, path):
        """Load ``checkpoints/<path>/config.json`` and ``model.pth``.

        Rebuilds the synthesizer from the config, restores its weights, puts
        it in eval mode on the current device and records the speaker table.
        """
        self.hps = utils.get_hparams_from_file(f"checkpoints/{path}/config.json")
        self.SVCVITS = SynthesizerTrn(
            self.hps.data.filter_length // 2 + 1,
            self.hps.train.segment_size // self.hps.data.hop_length,
            **self.hps.model)
        _ = utils.load_checkpoint(f"checkpoints/{path}/model.pth", self.SVCVITS, None)
        _ = self.SVCVITS.eval().to(self.device)
        self.speakers = self.hps.spk

    def get_units(self, source, sr):
        """Return the encoder's units for ``source``.

        ``sr`` is accepted for interface compatibility but is not used.
        """
        source = source.unsqueeze(0).to(self.device)
        with torch.inference_mode():
            units = self.hubert_soft.units(source)
            return units

    def get_unit_pitch(self, in_path, tran):
        """Load audio from ``in_path`` and return ``(units, f0)``.

        The audio is resampled to 16 kHz and down-mixed to one channel; ``f0``
        is transposed by ``tran`` semitones and has twice as many frames as
        the unit sequence.
        """
        source, sr = torchaudio.load(in_path)
        source = torchaudio.functional.resample(source, sr, 16000)
        # torchaudio returns (channels, frames): the channel count lives in
        # dim 0. Averaging a single channel is a no-op, so mono is skipped.
        if source.dim() == 2 and source.shape[0] > 1:
            source = torch.mean(source, dim=0).unsqueeze(0)
        soft = self.get_units(source, sr).squeeze(0).cpu().numpy()
        _, f0 = get_f0(source.cpu().numpy()[0], soft.shape[0] * 2, tran)
        return soft, f0

    def infer(self, speaker_id, tran, raw_path):
        """Convert one audio clip to the voice of ``speaker_id``.

        Args:
            speaker_id: Key into the loaded speaker table.
            tran: Transposition in semitones.
            raw_path: Path or file-like object readable by ``torchaudio.load``.

        Returns:
            ``(audio, n_samples)`` with ``audio`` a 1-D float tensor.
        """
        speaker_id = self.speakers[speaker_id]
        sid = torch.LongTensor([int(speaker_id)]).to(self.device).unsqueeze(0)
        soft, pitch = self.get_unit_pitch(raw_path, tran)
        f0 = torch.FloatTensor(clean_pitch(pitch)).unsqueeze(0).to(self.device)
        stn_tst = torch.FloatTensor(soft)
        with torch.no_grad():
            x_tst = stn_tst.unsqueeze(0).to(self.device)
            # Repeat every unit frame twice so the units line up with the f0
            # contour, which get_unit_pitch() requests at twice the unit length.
            x_tst = torch.repeat_interleave(x_tst, repeats=2, dim=1).transpose(1, 2)
            audio = self.SVCVITS.infer(x_tst, f0=f0, g=sid)[0, 0].data.float()
        return audio, audio.shape[-1]

    @staticmethod
    def _to_float32(audio):
        """Return ``audio`` as float32; integer PCM is scaled by its dtype maximum."""
        if np.issubdtype(audio.dtype, np.integer):
            return (audio / np.iinfo(audio.dtype).max).astype(np.float32)
        # Float input is taken as-is; np.iinfo would raise on it.
        return audio.astype(np.float32)

    @staticmethod
    def _to_int16_pcm(samples):
        """Scale float samples by 32768 and clip to the int16 range."""
        scaled = np.asarray(samples) * 32768.0
        # Without clipping, a full-scale sample (1.0 -> 32768) overflows int16.
        return np.clip(scaled, -32768, 32767).astype('int16')

    def inference(self, srcaudio, chara, tran, slice_db):
        """Convert a whole recording, slicing it on silence first.

        Args:
            srcaudio: ``(sampling_rate, samples)`` pair; ``samples`` is a NumPy
                array, optionally with a second (channel) axis.
            chara: Speaker key passed on to ``infer``.
            tran: Transposition in semitones.
            slice_db: Threshold handed to ``slicer.cut`` as ``db_thresh``.

        Returns:
            ``(self.hps.data.sampling_rate, int16_samples)``.
        """
        import tempfile

        sampling_rate, audio = srcaudio
        audio = self._to_float32(audio)
        if len(audio.shape) > 1:
            audio = librosa.to_mono(audio.transpose(1, 0))
        if sampling_rate != 16000:
            audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)

        pieces = []
        # The slicer works on a file path. A private temporary directory keeps
        # concurrent requests from overwriting each other's file and removes
        # it afterwards.
        with tempfile.TemporaryDirectory() as tmp_dir:
            tmp_wav = os.path.join(tmp_dir, "tmpwav.wav")
            soundfile.write(tmp_wav, audio, 16000, format="wav")
            chunks = slicer.cut(tmp_wav, db_thresh=slice_db)
            audio_data, audio_sr = slicer.chunks2audio(tmp_wav, chunks)
            for slice_tag, data in audio_data:
                if slice_tag:
                    # Tagged chunk: emit zeros of the matching duration at the
                    # model's sampling rate instead of running the model.
                    length = int(np.ceil(len(data) / audio_sr * self.hps.data.sampling_rate))
                    pieces.append(np.zeros(length))
                    continue
                raw_path = io.BytesIO()
                soundfile.write(raw_path, data, audio_sr, format="wav")
                raw_path.seek(0)
                out_audio, _ = self.infer(chara, tran, raw_path)
                pieces.append(out_audio.cpu().numpy())

        samples = np.concatenate(pieces) if pieces else np.zeros(0)
        return (self.hps.data.sampling_rate, self._to_int16_pcm(samples))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
from inference.infer_tool_grad import VitsSvc | ||
import gradio as gr | ||
import os | ||
|
||
class VitsGradio:
    """Gradio front-end around ``VitsSvc``: one tab to load a model, one to convert audio."""

    def __init__(self):
        self.so = VitsSvc()
        self.lspk = []  # speaker names of the currently loaded model
        self.modelPaths = []  # sub-directory names found under "checkpoints"
        # Only immediate children are valid model names, because
        # loadCheckpoint() resolves "checkpoints/<name>/config.json".
        for _root, subdirs, _files in os.walk("checkpoints"):
            self.modelPaths.extend(subdirs)
            break
        with gr.Blocks() as self.Vits:
            with gr.Tab("VoiceConversion"):
                # Hidden until a model has been loaded (see loadModel).
                with gr.Row(visible=False) as self.VoiceConversion:
                    with gr.Column():
                        with gr.Row():
                            with gr.Column():
                                # Labels: "input audio" / "speaker conversion".
                                self.srcaudio = gr.Audio(label = "输入音频")
                                self.btnVC = gr.Button("说话人转换")
                            with gr.Column():
                                # Labels: "target character" / "pitch shift" /
                                # "slice threshold".
                                self.dsid = gr.Dropdown(label = "目标角色", choices = self.lspk)
                                self.tran = gr.Slider(label = "升降调", maximum = 60, minimum = -60, step = 1, value = 0)
                                self.th = gr.Slider(label = "切片阈值", maximum = 32767, minimum = -32768, step = 0.1, value = -40)
                        with gr.Row():
                            self.VCOutputs = gr.Audio()
                        self.btnVC.click(self.so.inference, inputs=[self.srcaudio,self.dsid,self.tran,self.th], outputs=[self.VCOutputs])
            with gr.Tab("SelectModel"):
                with gr.Column():
                    # Labels: "model" / "device" / "load model".
                    # With no checkpoints on disk there is no first entry to
                    # preselect, so fall back to an empty selection.
                    default_model = self.modelPaths[0] if self.modelPaths else None
                    modelstrs = gr.Dropdown(label = "模型", choices = self.modelPaths, value = default_model, type = "value")
                    devicestrs = gr.Dropdown(label = "设备", choices = ["cpu","cuda"], value = "cpu", type = "value")
                    btnMod = gr.Button("载入模型")
                    btnMod.click(self.loadModel, inputs=[modelstrs,devicestrs], outputs = [self.dsid,self.VoiceConversion])

    def loadModel(self, path, device):
        """Load checkpoint ``path`` on ``device`` and refresh the UI.

        Returns:
            Two Gradio updates: the speaker dropdown's new choices and the
            conversion row made visible.
        """
        self.so.set_device(device)
        self.so.loadCheckpoint(path)
        self.lspk = [spk for spk, _sid in self.so.hps.spk.items()]
        VChange = gr.update(visible = True)
        # Guard against a checkpoint that declares no speakers.
        first_speaker = self.lspk[0] if self.lspk else None
        SDChange = gr.update(choices = self.lspk, value = first_speaker)
        return [SDChange,VChange]
|
||
# Build the interface at import time and start serving it; both steps are
# module-level side effects of running this script.
grVits = VitsGradio()
grVits.Vits.launch()