Update phi3v examples with num crops overrides
Signed-off-by: Alex-Brooks <Alex.Brooks@ibm.com>
alex-jw-brooks committed Sep 23, 2024
1 parent a3ab6cb commit 049dbe6
Showing 2 changed files with 17 additions and 1 deletion.
10 changes: 9 additions & 1 deletion examples/offline_inference_vision_language.py
@@ -83,11 +83,19 @@ def run_phi3v(question, modality):
 
     # In this example, we override max_num_seqs to 5 while
     # keeping the original context length of 128k.
+
+    # num_crops is an override kwarg to the multimodal image processor;
+    # For some models, e.g., Phi-3.5-vision-instruct, it is recommended
+    # to use 16 for single frame scenarios, and 4 for multi-frame.
+    #
+    # Generally speaking, a larger value for num_crops results in more
+    # tokens per image instance.
+    # https://huggingface.co/microsoft/Phi-3.5-vision-instruct#loading-the-model-locally
     llm = LLM(
         model="microsoft/Phi-3-vision-128k-instruct",
         trust_remote_code=True,
         max_num_seqs=5,
-        processor_kwargs={"num_crops": 16},
+        mm_processor_kwargs={"num_crops": 16},
     )
     stop_token_ids = None
     return llm, prompt, stop_token_ids
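For reference, a minimal usage sketch (not part of this commit) showing how the llm, prompt, and stop_token_ids returned by run_phi3v above could drive single-image generation; the image path and question are placeholder assumptions, and the generate call follows the pattern used elsewhere in this example script.

```python
# Hedged usage sketch; assumes run_phi3v from the example above is in scope.
from PIL import Image

from vllm import SamplingParams

image = Image.open("example.jpg").convert("RGB")  # placeholder image path
question = "What is shown in this image?"  # placeholder question

llm, prompt, stop_token_ids = run_phi3v(question, modality="image")

# Pass the prompt together with the image as multi-modal data.
outputs = llm.generate(
    {
        "prompt": prompt,
        "multi_modal_data": {"image": image},
    },
    sampling_params=SamplingParams(temperature=0.0,
                                   max_tokens=64,
                                   stop_token_ids=stop_token_ids),
)
print(outputs[0].outputs[0].text)
```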
8 changes: 8 additions & 0 deletions examples/offline_inference_vision_language_multi_image.py
@@ -67,11 +67,19 @@ def load_qwenvl_chat(question: str, image_urls: List[str]) -> ModelRequestData:
 
 
 def load_phi3v(question: str, image_urls: List[str]) -> ModelRequestData:
+    # num_crops is an override kwarg to the multimodal image processor;
+    # For some models, e.g., Phi-3.5-vision-instruct, it is recommended
+    # to use 16 for single frame scenarios, and 4 for multi-frame.
+    #
+    # Generally speaking, a larger value for num_crops results in more
+    # tokens per image instance.
+    # https://huggingface.co/microsoft/Phi-3.5-vision-instruct#loading-the-model-locally
     llm = LLM(
         model="microsoft/Phi-3.5-vision-instruct",
         trust_remote_code=True,
         max_model_len=4096,
         limit_mm_per_prompt={"image": len(image_urls)},
+        mm_processor_kwargs={"num_crops": 4},
     )
     placeholders = "\n".join(f"<|image_{i}|>"
                              for i, _ in enumerate(image_urls, start=1))
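Similarly, a minimal multi-image sketch (not part of this commit) mirroring load_phi3v above, using the recommended num_crops=4 for multi-frame input; the image files, the question, and the chat-style prompt format are assumptions patterned on the Phi-3-vision examples.

```python
# Hedged multi-image sketch; image paths and question are placeholders.
from PIL import Image

from vllm import LLM, SamplingParams

image_paths = ["frame_1.jpg", "frame_2.jpg"]  # placeholder local images
images = [Image.open(p).convert("RGB") for p in image_paths]

llm = LLM(
    model="microsoft/Phi-3.5-vision-instruct",
    trust_remote_code=True,
    max_model_len=4096,
    limit_mm_per_prompt={"image": len(images)},
    # 4 is the recommended num_crops for multi-frame scenarios.
    mm_processor_kwargs={"num_crops": 4},
)

placeholders = "\n".join(f"<|image_{i}|>"
                         for i, _ in enumerate(images, start=1))
# Chat-style prompt format assumed from the Phi-3-vision examples.
prompt = (f"<|user|>\n{placeholders}\n"
          "What do these images have in common?<|end|>\n<|assistant|>\n")

outputs = llm.generate(
    {
        "prompt": prompt,
        "multi_modal_data": {"image": images},
    },
    sampling_params=SamplingParams(temperature=0.0, max_tokens=64),
)
print(outputs[0].outputs[0].text)
```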
