[Model] Add PaliGemma #5189

Merged 41 commits on Jul 7, 2024
Commits (41):

e6352e5  initial (ywang96, Jun 2, 2024)
7dfbe44  remove lm head (ywang96, Jun 2, 2024)
3fd77fe  Merge branch 'main' into paligemma (ywang96, Jun 7, 2024)
ccb0f25  Merge branch 'main' into paligemma (ywang96, Jun 8, 2024)
9b5269d  update tests (ywang96, Jun 9, 2024)
af11afa  fix test (ywang96, Jun 9, 2024)
a465e85  format (ywang96, Jun 9, 2024)
3e9a12b  fix model loading (ywang96, Jun 9, 2024)
c734a17  fix input args (ywang96, Jun 9, 2024)
2d7de4d  fix model loading (ywang96, Jun 9, 2024)
2f65bf7  add embedding method to gemma (ywang96, Jun 9, 2024)
04e4ace  fix linear output (ywang96, Jun 9, 2024)
4a9551d  update gemma forward (ywang96, Jun 9, 2024)
6fd10f1  update (ywang96, Jun 9, 2024)
d08db94  fix test (ywang96, Jun 9, 2024)
e325630  remove extra bos (ywang96, Jun 10, 2024)
cbb7c49  format (ywang96, Jun 10, 2024)
7ea7265  add gemma to model test (ywang96, Jun 10, 2024)
9a8cd85  try normal caption (ywang96, Jun 11, 2024)
9069831  Merge branch 'main' into paligemma (ywang96, Jun 12, 2024)
7cb1cbb  Merge branch 'main' into paligemma (ywang96, Jun 25, 2024)
7db6122  [Model] Add Gemma 2 (WoosukKwon, Jun 27, 2024)
df2c007  Remove supports_lora=True (WoosukKwon, Jun 27, 2024)
9ba7aac  [Bugfix] Fix precision issues in Gemma 1 (WoosukKwon, Jun 27, 2024)
6b32a1e  Minor (WoosukKwon, Jun 27, 2024)
6bfba0a  Merge branch 'main' into woosuk-gemma1 (WoosukKwon, Jun 27, 2024)
bdf9334  Merge branch 'woosuk-gemma1' of https://github.com/vllm-project/vllm … (WoosukKwon, Jun 27, 2024)
524db49  Merge branch 'main' into woosuk-gemma1 (WoosukKwon, Jun 28, 2024)
7e6f0fd  Merge branch 'main' into paligemma (ywang96, Jun 28, 2024)
50ae420  Merge remote-tracking branch 'upstream/woosuk-gemma1' into paligemma (ywang96, Jun 28, 2024)
e0828b0  Merge branch 'main' into paligemma (ywang96, Jul 5, 2024)
c4fa37f  update paligemma (ywang96, Jul 6, 2024)
b09066e  update test (ywang96, Jul 6, 2024)
bf4bb58  update (ywang96, Jul 6, 2024)
c1b9ebf  update (ywang96, Jul 6, 2024)
5c0d2ec  add model to doc (ywang96, Jul 6, 2024)
0b76ac1  address comments (ywang96, Jul 6, 2024)
4823852  fix eos (ywang96, Jul 6, 2024)
02b7c21  Merge branch 'main' into paligemma (ywang96, Jul 6, 2024)
1651b15  move doc (ywang96, Jul 6, 2024)
2f94007  Update docs/source/models/supported_models.rst (ywang96, Jul 6, 2024)
Changes from 1 commit: 9b5269d67456516f197e4cd3fb1c9a8ece9550d8 ("update tests", committed by ywang96 on Jun 9, 2024)
6 changes: 3 additions & 3 deletions .buildkite/test-pipeline.yaml
@@ -94,13 +94,13 @@ steps:
 - label: Models Test
   #mirror_hardwares: [amd]
   commands:
-  - pytest -v -s models -m \"not llava\"
+  - pytest -v -s models -m \"not vlm\"

-- label: Llava Test
+- label: Vision Model Test
   mirror_hardwares: [amd]
   commands:
   - bash ../.buildkite/download-images.sh
-  - pytest -v -s models -m llava
+  - pytest -v -s models -m vlm

 - label: Prefix Caching Test
   mirror_hardwares: [amd]
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -71,5 +71,5 @@ markers = [
     "skip_global_cleanup",
     "llm: run tests for vLLM API only",
     "openai: run tests for OpenAI API only",
-    "llava: run tests for LLaVA models only",
+    "vlm: run tests for vision language models only",
 ]
2 changes: 1 addition & 1 deletion tests/models/test_llava.py
@@ -7,7 +7,7 @@

 from ..conftest import IMAGE_FILES

-pytestmark = pytest.mark.llava
+pytestmark = pytest.mark.vlm

 # The image token is placed before "user" on purpose so that the test can pass
 HF_IMAGE_PROMPTS = [
111 changes: 111 additions & 0 deletions tests/models/text_paligemma.py
@@ -0,0 +1,111 @@
from typing import List, Tuple

import pytest
from transformers import AutoTokenizer

from vllm.config import VisionLanguageConfig

from ..conftest import IMAGE_FILES

pytestmark = pytest.mark.vlm

# The image token is placed before "user" on purpose so that the test can pass
HF_IMAGE_PROMPTS = [
    "<image>\nUSER: What's the content of the image?\nASSISTANT:",
    "<image>\nUSER: What is the season?\nASSISTANT:",
]

assert len(HF_IMAGE_PROMPTS) == len(IMAGE_FILES)


def iter_paligemma_configs(model_name: str):
    # Maps (height, width) of the input image to the vision feature size.
    image_hw_to_feature_size = {
        (224, 224): 256,
    }

    for (h, w), f in image_hw_to_feature_size.items():
        for input_type, input_shape in [
            (VisionLanguageConfig.ImageInputType.PIXEL_VALUES, (1, 3, h, w)),
            (VisionLanguageConfig.ImageInputType.IMAGE_FEATURES, (1, f, 1152)),
        ]:
            yield (model_name,
                   VisionLanguageConfig(image_input_type=input_type,
                                        image_feature_size=f,
                                        image_token_id=257152,
                                        image_input_shape=input_shape,
                                        image_processor=model_name,
                                        image_processor_revision=None))


model_and_vl_config = [
    *iter_paligemma_configs("google/paligemma-3b-pt-224"),
]


def vllm_to_hf_output(vllm_output: Tuple[List[int], str],
                      vlm_config: VisionLanguageConfig, model_id: str):
    """Sanitize vllm output to be comparable with hf output."""
    input_ids, output_str = vllm_output
    image_token_id = vlm_config.image_token_id

    tokenizer = AutoTokenizer.from_pretrained(model_id)
    image_token_str = tokenizer.decode(image_token_id)

    # Remove the image tokens, the bos token (id 2), and the trailing
    # newline token (id 108).
    hf_input_ids = [
        input_id for input_id in input_ids
        if input_id != image_token_id and input_id != 2
    ]
    if hf_input_ids[-1] == 108:
        hf_input_ids = hf_input_ids[:-1]

    # Remove the image tokens from the output string.
    hf_output_str = output_str \
        .replace(image_token_str * vlm_config.image_feature_size, "")

    return hf_input_ids, hf_output_str


@pytest.mark.parametrize("model_and_config", model_and_vl_config)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [128])
def test_models(hf_runner, vllm_runner, hf_images, vllm_images,
                model_and_config, dtype: str, max_tokens: int) -> None:
    """Inference result should be the same between hf and vllm.

    All the image fixtures for the test are under tests/images.
    For the huggingface runner, we provide the PIL images as input.
    For the vllm runner, we provide MultiModalData objects and the
    corresponding vision language config as input.
    Note that the text input is also adjusted to abide by the vllm
    contract, and the text output is sanitized so it can be compared
    with the hf output.
    """
    model_id, vlm_config = model_and_config

    with hf_runner(model_id, dtype=dtype, is_vision_model=True) as hf_model:
        hf_outputs = hf_model.generate_greedy(HF_IMAGE_PROMPTS,
                                              max_tokens,
                                              images=hf_images)

    vllm_image_prompts = [
        '<image>' * vlm_config.image_feature_size + '<bos>' + p + '\n'
        for p in HF_IMAGE_PROMPTS
    ]

    with vllm_runner(model_id,
                     dtype=dtype,
                     enforce_eager=True,
                     **vlm_config.as_cli_args_dict()) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy(vllm_image_prompts,
                                                  max_tokens,
                                                  images=vllm_images)

    for i in range(len(HF_IMAGE_PROMPTS)):
        hf_output_ids, hf_output_str = hf_outputs[i]
        vllm_output_ids, vllm_output_str = vllm_to_hf_output(
            vllm_outputs[i], vlm_config, model_id)
        assert hf_output_str == vllm_output_str, (
            f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
        assert hf_output_ids == vllm_output_ids, (
            f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
12 changes: 11 additions & 1 deletion vllm/model_executor/models/paligemma.py
@@ -2,7 +2,7 @@

 import torch
 from torch import nn
-from transformers import SiglipVisionModel, PaliGemmaConfig
+from transformers import PaliGemmaConfig, SiglipVisionModel

 from vllm.attention import AttentionMetadata
 from vllm.config import CacheConfig, VisionLanguageConfig
@@ -14,6 +14,8 @@
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.models.gemma import GemmaModel
 from vllm.model_executor.sampling_metadata import SamplingMetadata
+from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal.image import get_dummy_image_data
 from vllm.sequence import SamplerOutput

 from .vlm_base import VisionLanguageModelBase
@@ -53,6 +55,9 @@ class PaliGemmaImageFeatureInputs(TypedDict):
                              PaliGemmaImageFeatureInputs]


+@MULTIMODAL_REGISTRY.register_image_feature_input()
+@MULTIMODAL_REGISTRY.register_image_pixel_input()
+@MULTIMODAL_REGISTRY.register_dummy_data(get_dummy_image_data)
 class PaliGemmaForConditionalGeneration(VisionLanguageModelBase):

     def __init__(self,
@@ -182,7 +187,12 @@ def forward(self,
                 kv_caches: List[torch.Tensor],
                 attn_metadata: AttentionMetadata,
                 image_input: Optional[torch.Tensor] = None) -> SamplerOutput:
+        """
+        The correct prompt format needs to be:
+        '<image>' * image_feature_size + '<bos>' + prompt + '\n'
+        See https://github.com/huggingface/transformers/blob/25245ec26dc29bcf6102e1b4ddd0dfd02e720cf5/src/transformers/models/paligemma/processing_paligemma.py#L55
+        """  # noqa
         parsed_image_input = self._parse_and_validate_image_input(image_input)

         if parsed_image_input is not None:
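For readers unfamiliar with why the prompt must contain exactly image_feature_size placeholder tokens: at forward time, the positions holding the image token id (257152 for PaliGemma, per the test config) are overwritten with vision features. The sketch below is illustrative only, under that assumption; it is not the helper vLLM itself uses.

import torch


def merge_vision_embeddings(input_ids: torch.Tensor,
                            inputs_embeds: torch.Tensor,
                            vision_embeds: torch.Tensor,
                            image_token_id: int) -> torch.Tensor:
    """Replace placeholder-token embeddings with image features in place."""
    mask = input_ids == image_token_id
    flat = vision_embeds.reshape(-1, vision_embeds.shape[-1])
    # One vision feature vector per placeholder token, hence the strict
    # requirement on the prompt format documented in forward().
    assert mask.sum() == flat.shape[0], "feature/placeholder count mismatch"
    inputs_embeds[mask] = flat.to(inputs_embeds.dtype)
    return inputs_embeds

In the real model, vision_embeds would typically come from the SiglipVisionModel imported above, projected to the language model's hidden size.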
2 changes: 2 additions & 0 deletions vllm/multimodal/utils.py
@@ -79,6 +79,8 @@ def get_full_image_text_prompt(image_prompt: str, text_prompt: str,

     if config.hf_config.model_type == "llava":
         full_prompt = f"{image_prompt}\n{text_prompt}"
+    elif config.hf_config.model_type == "paligemma":
+        full_prompt = f"{image_prompt}<bos>{text_prompt}\n"
     else:
         raise ValueError(
             f"Unsupported model type: {config.hf_config.model_type}")
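To make the effect of the new branch concrete, here is a standalone illustration with the config plumbing elided; "caption en" is an assumed example prompt, and the 256 repetitions match the 224x224 test config rather than anything this diff fixes.

image_prompt = "<image>" * 256  # placeholder tokens supplied by the caller
text_prompt = "caption en"      # assumed example prompt

llava_full_prompt = f"{image_prompt}\n{text_prompt}"
paligemma_full_prompt = f"{image_prompt}<bos>{text_prompt}\n"

# LLaVA separates image and text with a newline; PaliGemma instead inserts
# <bos> between them and requires a trailing newline.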