Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

update modules #59

Merged
merged 31 commits into from
Apr 6, 2023
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
c8d6f86
VALLE add continual inference
lifeiteng Mar 19, 2023
91ecd50
separate text embedding & position of AR and NAR Decoders
lifeiteng Mar 19, 2023
e34c101
Separate Modules of AR and NAR Decoders
lifeiteng Mar 19, 2023
486898d
Support train AR Decoder and NAR Decoder separately
lifeiteng Mar 19, 2023
1297357
Copy transformer modules from pytorch
lifeiteng Mar 20, 2023
b6a824c
update trainer.py
lifeiteng Mar 20, 2023
aced965
Implement InputStrategy PromptedPrecomputedFeatures
lifeiteng Mar 20, 2023
7afedd5
VALL-E Add prefix_mode=4
lifeiteng Mar 20, 2023
fbb3fbc
Fix InputStrategy PromptedPrecomputedFeatures
lifeiteng Mar 20, 2023
4c05d68
Fix InputStrategy PromptedPrecomputedFeatures
lifeiteng Mar 20, 2023
cfe4965
LibriTTS update README
lifeiteng Mar 21, 2023
5c4f85f
use load_manifest_lazy
lifeiteng Mar 22, 2023
0f0c7fd
Fix index of PromptedPrecomputedFeatures
lifeiteng Mar 22, 2023
db5997c
Trainer - Add config --filter-min-duration
lifeiteng Mar 22, 2023
e7162e5
Unify Prefix Mode 2 and 4
lifeiteng Mar 22, 2023
751c226
update trainer
lifeiteng Mar 26, 2023
637c476
Add Hparam --share-embedding
lifeiteng Mar 26, 2023
a50b5b4
Merge branch 'prefix4' into stage
lifeiteng Mar 26, 2023
f6f3017
Fix Hparam --share-embedding
lifeiteng Mar 26, 2023
140a0b9
Fix MultiGPU load_checkpoint
lifeiteng Mar 31, 2023
7657ef6
Tune prefix_mode 1
lifeiteng Mar 31, 2023
a952f95
valid every epoch
lifeiteng Mar 31, 2023
51a6955
update --train-stage logic
lifeiteng Mar 31, 2023
e55582f
set NUM_TEXT_TOKENS=512 for multi-language models
lifeiteng Mar 31, 2023
d34b025
VALLF support --train-stage
lifeiteng Mar 31, 2023
8a8facf
VALLF support --prefix-mode
lifeiteng Mar 31, 2023
7e3bb2f
Fix VALL-F test
lifeiteng Apr 3, 2023
7d6b721
Fix DDP --train-stage
lifeiteng Apr 4, 2023
9acece1
Add model hparam --scale-factor
lifeiteng Apr 4, 2023
5154048
VALL-E & F update embedding sharing and inference sampling
lifeiteng Apr 6, 2023
cf9f26c
egs rename run.sh to prepare.sh and simplify README
lifeiteng Apr 6, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
VALL-E Add prefix_mode=4
  • Loading branch information
lifeiteng committed Mar 20, 2023
commit 7afedd5bb460ebf1af168be75645371690d8b07f
46 changes: 40 additions & 6 deletions valle/models/valle.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
from icefall.utils import make_pad_mask
from torchmetrics.classification import MulticlassAccuracy

from valle.data.input_strategies import PromptedFeatures
from valle.modules.embedding import SinePositionalEmbedding, TokenEmbedding
from valle.modules.transformer import (
AdaptiveLayerNorm,
Expand Down Expand Up @@ -287,8 +288,8 @@ def forward(
self,
x: torch.Tensor,
x_lens: torch.Tensor,
y: torch.Tensor,
y_lens: torch.Tensor,
y: Union[torch.Tensor, PromptedFeatures],
y_lens: Union[torch.Tensor, PromptedFeatures],
reduction: str = "sum",
train_stage: int = 0,
) -> Tuple[torch.Tensor, Union[torch.Tensor, None]]:
Expand Down Expand Up @@ -570,8 +571,8 @@ def forward(
self,
x: torch.Tensor,
x_lens: torch.Tensor,
y: torch.Tensor,
y_lens: torch.Tensor,
y: Union[torch.Tensor, PromptedFeatures],
y_lens: Union[torch.Tensor, PromptedFeatures],
reduction: str = "sum",
train_stage: int = 0,
) -> Tuple[torch.Tensor, Union[torch.Tensor, None]]:
Expand All @@ -594,9 +595,17 @@ def forward(
"""
assert x.ndim == 2, x.shape
assert x_lens.ndim == 1, x_lens.shape

y_prompts_codes = None
if isinstance(y, PromptedFeatures):
y_prompts_codes, y = y.data
prompts_len, y_lens = y_lens.data
assert prompts_len.min() == prompts_len.max()
assert self.prefix_mode == 4
y_prompts_codes = y_prompts_codes.type(torch.int64)

assert y.ndim == 3, y.shape
assert y_lens.ndim == 1, y_lens.shape

assert torch.all(x_lens > 0)

# NOTE: x has been padded in TextTokenCollater
Expand Down Expand Up @@ -756,6 +765,31 @@ def pad_y_eos(y, eos_id):
)

prefix_len = 0
elif self.prefix_mode == 4:
assert y_prompts_codes is not None
y_prompts = self.nar_audio_embeddings[0](
y_prompts_codes[..., 0]
)
y_emb = self.nar_audio_embeddings[0](y)
for j in range(1, 8):
y_prompts += self.nar_audio_embeddings[j](
y_prompts_codes[..., j]
)
if j < nar_stage:
y_emb += self.nar_audio_embeddings[j](codes[..., j])
y_emb = torch.concat([y_prompts, y_emb], axis=1)

prompts_len += y_prompts.shape[1]
xy_padding_mask = torch.concat(
[
x_mask,
F.pad(y_mask, (y_prompts.shape[1], 0), value=False),
],
dim=1,
)

prefix_len = 0

else:
raise ValueError

Expand Down Expand Up @@ -902,7 +936,7 @@ def inference(
# Non-AR Decoders
y_emb = self.nar_audio_embeddings[0](y)

if self.prefix_mode == 2: # Exclude enrolled_phonemes
if self.prefix_mode in [2, 4]: # Exclude enrolled_phonemes
enrolled_len = enroll_x_lens.max().item()
# SOS + Synthesis Text + EOS
x = torch.concat(
Expand Down
44 changes: 44 additions & 0 deletions valle/tests/model_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
from icefall.utils import AttributeDict
from torchmetrics.classification import MulticlassAccuracy

from valle.data.input_strategies import PromptedFeatures
from valle.models import NUM_MEL_BINS, get_model


Expand Down Expand Up @@ -107,6 +108,49 @@ def test_valle(self):
x[-1:], x_lens[-1:], y[-1:], enroll_x_lens=enroll_x_lens
)

def test_valle_prefix4(self):
    """Smoke-test VALL-E training and inference with prefix_mode=4.

    prefix_mode=4 feeds precomputed acoustic prompts alongside the
    target codes, so ``y``/``y_lens`` are wrapped in ``PromptedFeatures``
    (prompt tensor paired with target tensor) instead of plain tensors.
    """
    params = AttributeDict()
    params.decoder_dim = 64
    params.nhead = 16
    params.num_decoder_layers = 4

    # Text tokens: batch of 4, max length 8; force the last row to the
    # max so padding logic is exercised with at least one full-length row.
    x = torch.from_numpy(np.random.randint(0, 100, size=[4, 8]))
    x_lens = torch.from_numpy(np.random.randint(4, 8, size=[4]))
    x_lens[-1] = 8
    enroll_x_lens = torch.from_numpy(np.random.randint(1, 3, size=[4]))

    # Target audio codes: [batch, frames, 8 codebooks].
    y = torch.from_numpy(np.random.randint(0, 1000, size=[4, 16, 8]))
    y_lens = torch.from_numpy(np.random.randint(8, 16, size=[4]))
    y_lens[-1] = 16

    # Acoustic prompts: all rows share one length (randint(12, 13) is
    # always 12) — the model asserts prompts_len.min() == prompts_len.max().
    prompts = torch.from_numpy(np.random.randint(0, 1000, size=[4, 12, 8]))
    prompts_lens = torch.from_numpy(np.random.randint(12, 13, size=[4]))

    params.norm_first = False
    params.add_prenet = True
    params.model_name = "VALL-E"

    for device in self.devices:
        for mode in [4]:
            params.prefix_mode = mode
            # VALL-E
            model = get_model(params)
            model.to(device)
            x = x.to(device)
            x_lens = x_lens.to(device)
            y = y.to(device)
            # Fix: enroll_x_lens previously stayed on CPU, which makes
            # the inference call below fail with a device mismatch when
            # self.devices includes a CUDA device.
            enroll_x_lens = enroll_x_lens.to(device)

            _y = PromptedFeatures(prompts, y).to(device)
            _y_lens = PromptedFeatures(prompts_lens, y_lens).to(device)

            # Training
            codes, loss, metrics = model(x, x_lens, _y, _y_lens)
            # Inference
            model.eval()
            codes = model.inference(
                x[-1:], x_lens[-1:], y[-1:], enroll_x_lens=enroll_x_lens
            )

def test_topmetric(self):
metric_top10 = MulticlassAccuracy(1024, top_k=10, average="micro")
metric_top1 = MulticlassAccuracy(1024, top_k=1, average="micro")
Expand Down