Skip to content

Commit

Permalink
update README.md
Browse files Browse the repository at this point in the history
  • Loading branch information
innnky committed Dec 7, 2022
1 parent 21f8509 commit e1700a8
Show file tree
Hide file tree
Showing 10 changed files with 245 additions and 92 deletions.
67 changes: 59 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,12 +1,63 @@
# SoftVC VITS Singing Voice Conversion
## 重大BUG修复
+ 断音问题已解决,音质提升了一个档次
+ 2.0版本已经移至 sovits_2.0分支
+ 3.0版本使用FreeVC的代码结构,与旧版本不通用

## Update
> 断音问题已解决,音质提升了一个档次\
> 2.0版本已经移至 sovits_2.0分支\
> 3.0版本使用FreeVC的代码结构,与旧版本不通用
## 模型简介
歌声音色转换模型,通过SoftVC内容编码器提取源音频语音特征,与F0同时输入VITS替换原本的文本输入达到歌声转换的效果。
> 目前模型使用 [coarse F0](https://github.com/PlayVoice/VI-SVC/blob/main/svc/prepare/preprocess_wave.py) ,尝试使用[HarmoF0](https://github.com/wx-wei/harmof0) 进行f0提取但效果不佳,尝试使用[icassp2022-vocal-transcription](https://github.com/keums/icassp2022-vocal-transcription)提取midi替换f0输入但效果不佳
歌声音色转换模型,通过SoftVC内容编码器提取源音频语音特征,与F0同时输入VITS替换原本的文本输入达到歌声转换的效果。同时,更换声码器为 [NSF HiFiGAN](https://github.com/openvpi/DiffSinger/tree/refactor/modules/nsf_hifigan) 解决断音问题

## 数据集准备
```shell
仅需要以以下文件结构将数据集放入raw目录即可

raw
├───speaker0
│ ├───xxx1-xxx1.wav
│ ├───...
│ └───Lxx-0xx8.wav
└───speaker1
├───xx2-0xxx2.wav
├───...
└───xxx7-xxx007.wav
```

## 预先下载的模型文件
+ soft vc hubert:[hubert-soft-0d54a1f4.pt](https://github.com/bshall/hubert/releases/download/v0.1/hubert-soft-0d54a1f4.pt)
+ 放在hubert目录下
+ 预训练模型文件 [G_0.pth D_0.pth](https://)
+ 放在logs/48k 目录下
+ 预训练模型为必选项,因为据测试从零开始训练有概率不收敛,同时也能加快训练速度
+ 预训练模型删除了optimizer flow speakerembedding 等无关权重,因此可以认为基本剔除了旧的音色信息
```shell
# 一键下载
# hubert
wget -P hubert/ https://github.com/bshall/hubert/releases/download/v0.1/hubert-soft-0d54a1f4.pt
# G与D预训练模型
wget -P logs/48k/ https://
wget -P logs/48k/ https://

```
## 数据预处理
1. 重采样至 48kHz

```shell
python resample.py
```
2. 自动划分训练集 验证集 测试集 以及配置文件
```shell
python preprocess_flist_config.py
```
3. 生成hubert与f0
```shell
python preprocess_hubert_f0.py
```
执行完以上步骤后 dataset 目录便是预处理完成的数据,可以删除raw文件夹了

## 训练
```shell
python train.py -c configs/config.json -m 48k
```

## 推理

模型推理、训练、一键脚本汇总整理仓库 [sovits_guide](https://github.com/IceKyrin/sovits_guide)

63 changes: 48 additions & 15 deletions configs/config.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,12 @@
"eval_interval": 200,
"seed": 1234,
"epochs": 10000,
"learning_rate": 2e-4,
"betas": [0.8, 0.99],
"eps": 1e-9,
"learning_rate": 0.0002,
"betas": [
0.8,
0.99
],
"eps": 1e-09,
"batch_size": 16,
"fp16_run": false,
"lr_decay": 0.999875,
Expand All @@ -20,8 +23,8 @@
"port": "8001"
},
"data": {
"training_files":"filelists/train.txt",
"validation_files":"filelists/val.txt",
"training_files": "filelists/train.txt",
"validation_files": "filelists/val.txt",
"max_wav_value": 32768.0,
"sampling_rate": 48000,
"filter_length": 1280,
Expand All @@ -40,19 +43,49 @@
"kernel_size": 3,
"p_dropout": 0.1,
"resblock": "1",
"resblock_kernel_sizes": [3,7,11],
"resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
"upsample_rates": [10,8,2,2],
"resblock_kernel_sizes": [
3,
7,
11
],
"resblock_dilation_sizes": [
[
1,
3,
5
],
[
1,
3,
5
],
[
1,
3,
5
]
],
"upsample_rates": [
10,
8,
2,
2
],
"upsample_initial_channel": 512,
"upsample_kernel_sizes": [16,16,4,4],
"upsample_kernel_sizes": [
16,
16,
4,
4
],
"n_layers_q": 3,
"use_spectral_norm": false,
"gin_channels": 256,
"ssl_dim": 256
"ssl_dim": 256,
"n_speakers": 4
},
"spk":{
"nen": 0,
"paimon": 1,
"yunhao": 2
"spk": {
"paimon": 0,
"nen": 1
}
}
}
18 changes: 9 additions & 9 deletions filelists/test.txt
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
./dataset/48k/paimon/vo_ABLQ004_5_paimon_02.wav
./dataset/48k/paimon/vo_ABLQ005_2_paimon_01.wav
./dataset/48k/nen/kne110_005.wav
./dataset/48k/paimon/vo_ABLQ004_6_paimon_02.wav
./dataset/48k/paimon/vo_ABLQ004_6_paimon_01.wav
./dataset/48k/nen/kne110_003.wav
./dataset/48k/paimon/vo_ABLQ004_7_paimon_01.wav
./dataset/48k/nen/kne110_004.wav
./dataset/48k/nen/kne110_006.wav
./dataset/48k/paimon/vo_ABLQ005_2_paimon_02.wav
./dataset/48k/paimon/vo_ABLQ005_2_paimon_01.wav
./dataset/48k/nen/kne110_001.wav
./dataset/48k/nen/kne110_006.wav
./dataset/48k/nen/kne110_003.wav
./dataset/48k/nen/kne110_002.wav
./dataset/48k/paimon/vo_ABLQ004_6_paimon_01.wav
./dataset/48k/paimon/vo_ABLQ004_6_paimon_02.wav
./dataset/48k/paimon/vo_ABLQ004_5_paimon_02.wav
./dataset/48k/nen/kne110_004.wav
./dataset/48k/paimon/vo_ABLQ004_7_paimon_01.wav
./dataset/48k/nen/kne110_005.wav
6 changes: 3 additions & 3 deletions filelists/val.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
./dataset/48k/paimon/vo_ABLQ005_2_paimon_02.wav
./dataset/48k/paimon/vo_ABLQ004_7_paimon_01.wav
./dataset/48k/nen/kne110_006.wav
./dataset/48k/nen/kne110_002.wav
./dataset/48k/paimon/vo_ABLQ004_5_paimon_02.wav
./dataset/48k/paimon/vo_ABLQ005_2_paimon_02.wav
./dataset/48k/nen/kne110_003.wav
3 changes: 2 additions & 1 deletion models.py
Original file line number Diff line number Diff line change
Expand Up @@ -280,6 +280,7 @@ def __init__(self,
upsample_kernel_sizes,
gin_channels,
ssl_dim,
n_speakers,
**kwargs):

super().__init__()
Expand All @@ -300,7 +301,7 @@ def __init__(self,
self.segment_size = segment_size
self.gin_channels = gin_channels
self.ssl_dim = ssl_dim
self.emb_g = nn.Embedding(10, gin_channels)
self.emb_g = nn.Embedding(n_speakers, gin_channels)

self.enc_p_ = TextEncoder(ssl_dim, inter_channels, hidden_channels, 5, 1, 16,0, filter_channels, n_heads, p_dropout)
hps = {
Expand Down
49 changes: 0 additions & 49 deletions preprocess_flist.py

This file was deleted.

117 changes: 117 additions & 0 deletions preprocess_flist_config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
import os
import argparse
from tqdm import tqdm
from random import shuffle
import json
# Default configuration written out as configs/config.json.
# The "n_speakers" and "spk" entries are placeholders: they are replaced
# at runtime with values derived from the dataset before being dumped.
config_template = {
    # Optimizer / training-loop hyperparameters.
    "train": {
        "log_interval": 200,
        "eval_interval": 200,
        "seed": 1234,
        "epochs": 10000,
        "learning_rate": 0.0002,
        "betas": [0.8, 0.99],
        "eps": 1e-09,
        "batch_size": 16,
        "fp16_run": False,
        "lr_decay": 0.999875,
        "segment_size": 17920,
        "init_lr_ratio": 1,
        "warmup_epochs": 0,
        "c_mel": 45,
        "c_kl": 1.0,
        "use_sr": True,
        "max_speclen": 384,
        "port": "8001",
    },
    # Dataset locations and audio/spectrogram parameters.
    "data": {
        "training_files": "filelists/train.txt",
        "validation_files": "filelists/val.txt",
        "max_wav_value": 32768.0,
        "sampling_rate": 48000,
        "filter_length": 1280,
        "hop_length": 320,
        "win_length": 1280,
        "n_mel_channels": 80,
        "mel_fmin": 0.0,
        "mel_fmax": None,
    },
    # Model architecture (encoder, decoder/vocoder upsampling stack).
    "model": {
        "inter_channels": 192,
        "hidden_channels": 192,
        "filter_channels": 768,
        "n_heads": 2,
        "n_layers": 6,
        "kernel_size": 3,
        "p_dropout": 0.1,
        "resblock": "1",
        "resblock_kernel_sizes": [3, 7, 11],
        "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
        "upsample_rates": [10, 8, 2, 2],
        "upsample_initial_channel": 512,
        "upsample_kernel_sizes": [16, 16, 4, 4],
        "n_layers_q": 3,
        "use_spectral_norm": False,
        "gin_channels": 256,
        "ssl_dim": 256,
        "n_speakers": 0,  # placeholder; set from the dataset at runtime
    },
    # Placeholder speaker-name -> id map; replaced at runtime.
    "spk": {
        "nen": 0,
        "paimon": 1,
        "yunhao": 2,
    },
}


def _write_filelist(list_path, wav_paths):
    """Write one wav path per line to *list_path*."""
    print("Writing", list_path)
    with open(list_path, "w") as f:
        for wavpath in tqdm(wav_paths):
            f.write(wavpath + "\n")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--train_list", type=str, default="./filelists/train.txt", help="path to train list")
    parser.add_argument("--val_list", type=str, default="./filelists/val.txt", help="path to val list")
    parser.add_argument("--test_list", type=str, default="./filelists/test.txt", help="path to test list")
    parser.add_argument("--source_dir", type=str, default="./dataset/48k", help="path to source dir")
    args = parser.parse_args()

    # Per-speaker split: first 2 files -> val, last 10 -> test, rest -> train.
    train = []
    val = []
    test = []
    spk_dict = {}
    for spk_id, speaker in enumerate(tqdm(os.listdir(args.source_dir))):
        spk_dict[speaker] = spk_id
        speaker_dir = os.path.join(args.source_dir, speaker)
        wavs = [
            os.path.join(speaker_dir, fname)
            for fname in os.listdir(speaker_dir)
            if fname.endswith("wav")
        ]
        if len(wavs) < 12:
            # With fewer than 12 files wavs[2:-10] is empty and the
            # val/test slices overlap — warn so the user notices.
            print("WARNING: speaker %r has only %d wav files; at least 12 are "
                  "needed for a non-overlapping train/val/test split" % (speaker, len(wavs)))
        shuffle(wavs)
        train += wavs[2:-10]
        val += wavs[:2]
        test += wavs[-10:]

    # Reserve twice as many embedding slots as detected speakers —
    # presumably headroom for adding speakers later (feeds the emb_g
    # size in models.py).  TODO confirm the intent of the factor of 2.
    n_speakers = len(spk_dict) * 2

    shuffle(train)
    shuffle(val)
    shuffle(test)

    _write_filelist(args.train_list, train)
    _write_filelist(args.val_list, val)
    _write_filelist(args.test_list, test)

    # Patch the template with dataset-derived values and dump it.
    config_template["model"]["n_speakers"] = n_speakers
    config_template["spk"] = spk_dict
    print("Writing configs/config.json")
    with open("configs/config.json", "w") as f:
        json.dump(config_template, f, indent=2)
2 changes: 1 addition & 1 deletion preprocess_hubert_f0.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ def process(filename):
c = torch.load(save_name)
f0path = filename+".f0.npy"
if not os.path.exists(f0path):
cf0, f0 = compute_f0(filename, c.shape[-1] * 3)
cf0, f0 = get_f0(filename, c.shape[-1] * 3)
np.save(f0path, f0)


Expand Down
File renamed without changes.
Loading

0 comments on commit e1700a8

Please sign in to comment.