Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

update modules #59

Merged
merged 31 commits into from
Apr 6, 2023
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
c8d6f86
VALLE add continual inference
lifeiteng Mar 19, 2023
91ecd50
separate text embedding & position of AR and NAR Decoders
lifeiteng Mar 19, 2023
e34c101
Separate Modules of AR and NAR Decoders
lifeiteng Mar 19, 2023
486898d
Support train AR Decoder and NAR Decoder separately
lifeiteng Mar 19, 2023
1297357
Copy transformer modules from pytorch
lifeiteng Mar 20, 2023
b6a824c
update trainer.py
lifeiteng Mar 20, 2023
aced965
Implement InputStrategy PromptedPrecomputedFeatures
lifeiteng Mar 20, 2023
7afedd5
VALL-E Add prefix_mode=4
lifeiteng Mar 20, 2023
fbb3fbc
Fix InputStrategy PromptedPrecomputedFeatures
lifeiteng Mar 20, 2023
4c05d68
Fix InputStrategy PromptedPrecomputedFeatures
lifeiteng Mar 20, 2023
cfe4965
LibriTTS update README
lifeiteng Mar 21, 2023
5c4f85f
use load_manifest_lazy
lifeiteng Mar 22, 2023
0f0c7fd
Fix index of PromptedPrecomputedFeatures
lifeiteng Mar 22, 2023
db5997c
Trainer - Add config --filter-min-duration
lifeiteng Mar 22, 2023
e7162e5
Unify Prefix Mode 2 and 4
lifeiteng Mar 22, 2023
751c226
update trainer
lifeiteng Mar 26, 2023
637c476
Add Hparam --share-embedding
lifeiteng Mar 26, 2023
a50b5b4
Merge branch 'prefix4' into stage
lifeiteng Mar 26, 2023
f6f3017
Fix Hparam --share-embedding
lifeiteng Mar 26, 2023
140a0b9
Fix MultiGPU load_checkpoint
lifeiteng Mar 31, 2023
7657ef6
Tune prefix_mode 1
lifeiteng Mar 31, 2023
a952f95
valid every epoch
lifeiteng Mar 31, 2023
51a6955
update --train-stage logic
lifeiteng Mar 31, 2023
e55582f
set NUM_TEXT_TOKENS=512 for multi-language models
lifeiteng Mar 31, 2023
d34b025
VALLF support --train-stage
lifeiteng Mar 31, 2023
8a8facf
VALLF support --prefix-mode
lifeiteng Mar 31, 2023
7e3bb2f
Fix VALL-F test
lifeiteng Apr 3, 2023
7d6b721
Fix DDP --train-stage
lifeiteng Apr 4, 2023
9acece1
Add model hparam --scale-factor
lifeiteng Apr 4, 2023
5154048
VALL-E & F update embedding sharing and inference sampling
lifeiteng Apr 6, 2023
cf9f26c
egs rename run.sh to prepare.sh and simplify README
lifeiteng Apr 6, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
VALL-E & F update embedding sharing and inference sampling
  • Loading branch information
lifeiteng committed Apr 6, 2023
commit 5154048140aea7efffb1b6063000b074c742a4f6
13 changes: 10 additions & 3 deletions valle/models/valle.py
Original file line number Diff line number Diff line change
Expand Up @@ -231,7 +231,9 @@ def __init__(

if share_embedding:
# We share the parameters of the output projection layer with the parameters of the acoustic embedding Wa
self.ar_predict_layer.weight = self.ar_audio_embedding.weight
# NOTE(Feiteng): In the experiment, this undermines accuracy
# self.ar_predict_layer.weight = self.ar_audio_embedding.weight

# We also share the parameters of the acoustic embedding layer and the output prediction layer,
# which means the weights of the j-th prediction layer are the same as the (j + 1)-th acoustic embedding layer.
for j in range(0, 6):
Expand Down Expand Up @@ -573,9 +575,13 @@ def inference(
)

if (
samples[0, 0] == NUM_AUDIO_TOKENS
torch.argmax(logits, dim=-1)[0] == NUM_AUDIO_TOKENS
or samples[0, 0] == NUM_AUDIO_TOKENS
or (y.shape[1] - prefix_len) > x_lens.max() * 16
):
if prompts.shape[1] == y.shape[1]:
y = torch.concat([y, samples], dim=1)

print(f"VALL-F EOS [{prefix_len} -> {y.shape[1]}]")
break

Expand Down Expand Up @@ -940,7 +946,8 @@ def inference(
)

if (
samples[0, 0] == NUM_AUDIO_TOKENS
torch.argmax(logits, dim=-1)[0] == NUM_AUDIO_TOKENS
or samples[0, 0] == NUM_AUDIO_TOKENS
or (y.shape[1] - prompts.shape[1]) > x_lens.max() * 16
):
if prompts.shape[1] == y.shape[1]:
Expand Down
2 changes: 1 addition & 1 deletion valle/modules/embedding.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ def __init__(
super().__init__()
self.dim_model = dim_model
self.x_scale = math.sqrt(dim_model) if scale else 1.0
self.alpha = nn.Parameter(torch.ones(1)) if alpha else 1.0
self.alpha = nn.Parameter(torch.ones(1), requires_grad=alpha)
self.dropout = torch.nn.Dropout(p=dropout)

self.reverse = False
Expand Down