# acoustic.yaml — DiffSinger acoustic model training configuration.
# Forked from MoonInTheRiver/DiffSinger.
# Base configuration and training task class.
base_config:
- configs/base.yaml
task_cls: training.acoustic_task.AcousticTask
# Speaker setup: single speaker (opencpop); no explicit speaker-id mapping.
num_spk: 1
speakers:
- opencpop
spk_ids: []
# Utterance-name prefixes held out for validation/testing.
test_prefixes: [
'2044',
'2086',
'2092',
'2093',
'2100',
]
# Vocoder used to reconstruct waveforms from predicted mel spectrograms.
vocoder: NsfHifiGAN
vocoder_ckpt: checkpoints/nsf_hifigan_44.1k_hop512_128bin_2024.02/model.ckpt
# Mel/STFT analysis parameters. The checkpoint name above indicates the
# vocoder was trained at 44.1 kHz, hop 512, 128 mel bins — keep these in sync.
audio_sample_rate: 44100
audio_num_mel_bins: 128
hop_size: 512 # Hop size.
fft_size: 2048 # FFT size.
win_size: 2048 # Window size.
fmin: 40
fmax: 16000
# Binarization (dataset preprocessing) options.
# NOTE(review): nesting reconstructed — the extracted copy had all leading
# indentation stripped, which flattened these mappings and produced duplicate
# top-level keys (enabled/range/scale).
binarization_args:
  shuffle: true
  num_workers: 0

# Data augmentation. All modes are disabled by default; `scale` presumably
# controls how much augmented data is generated relative to the original
# dataset — confirm against the binarizer implementation.
augmentation_args:
  random_pitch_shifting:
    enabled: false
    range: [-5., 5.]
    scale: 0.75
  fixed_pitch_shifting:
    enabled: false
    targets: [-5., 5.]
    scale: 0.5
  random_time_stretching:
    enabled: false
    range: [0.5, 2.]
    scale: 0.75
# Dataset locations, binarizer, and phoneme dictionary.
raw_data_dir: 'data/opencpop/raw'
binary_data_dir: 'data/opencpop/binary'
binarizer_cls: preprocessing.acoustic_binarizer.AcousticBinarizer
dictionary: dictionaries/opencpop-extension.txt
# Mel-spectrogram normalization bounds (single value, presumably applied to
# all bins — confirm against the normalization code).
spec_min: [-12]
spec_max: [0]
# Mel visualization range and log base.
mel_vmin: -14.
mel_vmax: 4.
mel_base: 'e'
# Smoothing widths for variance parameter curves — units (likely seconds)
# TODO confirm against the binarizer.
energy_smooth_width: 0.12
breathiness_smooth_width: 0.12
voicing_smooth_width: 0.12
tension_smooth_width: 0.12
# Optional conditioning inputs; all disabled for this single-speaker setup.
use_spk_id: false
use_energy_embed: false
use_breathiness_embed: false
use_voicing_embed: false
use_tension_embed: false
use_key_shift_embed: false
use_speed_embed: false
# Diffusion / rectified-flow decoder settings.
diffusion_type: reflow
time_scale_factor: 1000
timesteps: 1000
max_beta: 0.02
rel_pos: true
sampling_algorithm: euler
sampling_steps: 20
# Acceleration settings — presumably used by the DDPM path rather than
# reflow; confirm against the inference code.
diff_accelerator: ddim
diff_speedup: 10
# Denoiser backbone hyperparameters.
hidden_size: 256
residual_layers: 20
residual_channels: 512
dilation_cycle_length: 4 # Dilation cycle length of the residual stack.
backbone_type: 'wavenet'
main_loss_type: l2
main_loss_log_norm: false
schedule_type: 'linear'
# shallow diffusion
use_shallow_diffusion: true
T_start: 0.4
T_start_infer: 0.4
K_step: 400
K_step_infer: 400
# Shallow-diffusion options: an auxiliary (ConvNeXt) decoder is trained
# alongside the diffusion decoder.
# NOTE(review): nesting reconstructed — the extracted copy had all leading
# indentation stripped, which flattened this mapping.
shallow_diffusion_args:
  train_aux_decoder: true
  train_diffusion: true
  val_gt_start: false
  aux_decoder_arch: convnext
  aux_decoder_args:
    num_channels: 512
    num_layers: 6
    kernel_size: 7
    dropout_rate: 0.1
  aux_decoder_grad: 0.1 # presumably a gradient scale on the aux decoder — confirm
lambda_aux_mel_loss: 0.2 # weight of the auxiliary mel loss term
# train and eval
num_sanity_val_steps: 1
# NOTE(review): nesting reconstructed — the extracted copy had indentation
# stripped; `lr` belongs under optimizer_args, and `step_size`/`gamma`
# under lr_scheduler_args.
optimizer_args:
  lr: 0.0006
lr_scheduler_args:
  step_size: 10000
  gamma: 0.75
# Batching limits: total frames per batch and utterances per batch.
max_batch_frames: 50000
max_batch_size: 64
dataset_size_key: 'lengths'
# Validation: every 2000 steps, rendering audio through the vocoder and
# plotting up to 10 samples.
val_with_vocoder: true
val_check_interval: 2000
num_valid_plots: 10
# Checkpointing: keep the 5 most recent checkpoints, plus a permanent
# snapshot every 20000 steps starting at step 80000.
max_updates: 160000
num_ckpt_keep: 5
permanent_ckpt_start: 80000
permanent_ckpt_interval: 20000
# Finetuning (disabled): listed parameters are presumably skipped when
# loading the finetune checkpoint — confirm against the training code.
finetune_enabled: false
finetune_ckpt_path: null
finetune_ignored_params:
- model.fs2.encoder.embed_tokens
- model.fs2.txt_embed
- model.fs2.spk_embed
finetune_strict_shapes: true
# Parameter freezing (disabled).
freezing_enabled: false
frozen_params: []