Update phi3v examples with num crops overrides
Signed-off-by: Alex-Brooks <Alex.Brooks@ibm.com>
alex-jw-brooks committed Sep 23, 2024
1 parent a3ab6cb commit 049dbe6
Showing 2 changed files with 17 additions and 1 deletion.
10 changes: 9 additions & 1 deletion examples/offline_inference_vision_language.py
@@ -83,11 +83,19 @@ def run_phi3v(question, modality):
 
     # In this example, we override max_num_seqs to 5 while
     # keeping the original context length of 128k.
+
+    # num_crops is an override kwarg to the multimodal image processor;
+    # For some models, e.g., Phi-3.5-vision-instruct, it is recommended
+    # to use 16 for single frame scenarios, and 4 for multi-frame.
+    #
+    # Generally speaking, a larger value for num_crops results in more
+    # tokens per image instance.
+    # https://huggingface.co/microsoft/Phi-3.5-vision-instruct#loading-the-model-locally
     llm = LLM(
         model="microsoft/Phi-3-vision-128k-instruct",
         trust_remote_code=True,
         max_num_seqs=5,
-        processor_kwargs={"num_crops": 16},
+        mm_processor_kwargs={"num_crops": 16},
     )
     stop_token_ids = None
     return llm, prompt, stop_token_ids
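For reference, a minimal usage sketch (not part of this commit) showing how the llm, prompt, and stop_token_ids returned by run_phi3v above could drive single-image generation; the image path and question are placeholder assumptions, and the generate call follows the pattern used elsewhere in this example script.

```python
# Hedged usage sketch; assumes run_phi3v from the example above is in scope.
from PIL import Image

from vllm import SamplingParams

image = Image.open("example.jpg").convert("RGB")  # placeholder image path
question = "What is shown in this image?"  # placeholder question

llm, prompt, stop_token_ids = run_phi3v(question, modality="image")

# Pass the prompt together with the image as multi-modal data.
outputs = llm.generate(
    {
        "prompt": prompt,
        "multi_modal_data": {"image": image},
    },
    sampling_params=SamplingParams(temperature=0.0,
                                   max_tokens=64,
                                   stop_token_ids=stop_token_ids),
)
print(outputs[0].outputs[0].text)
```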
8 changes: 8 additions & 0 deletions examples/offline_inference_vision_language_multi_image.py
@@ -67,11 +67,19 @@ def load_qwenvl_chat(question: str, image_urls: List[str]) -> ModelRequestData:
 
 
 def load_phi3v(question: str, image_urls: List[str]) -> ModelRequestData:
+    # num_crops is an override kwarg to the multimodal image processor;
+    # For some models, e.g., Phi-3.5-vision-instruct, it is recommended
+    # to use 16 for single frame scenarios, and 4 for multi-frame.
+    #
+    # Generally speaking, a larger value for num_crops results in more
+    # tokens per image instance.
+    # https://huggingface.co/microsoft/Phi-3.5-vision-instruct#loading-the-model-locally
     llm = LLM(
         model="microsoft/Phi-3.5-vision-instruct",
         trust_remote_code=True,
         max_model_len=4096,
         limit_mm_per_prompt={"image": len(image_urls)},
+        mm_processor_kwargs={"num_crops": 4},
     )
     placeholders = "\n".join(f"<|image_{i}|>"
                              for i, _ in enumerate(image_urls, start=1))
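Similarly, a minimal multi-image sketch (not part of this commit) mirroring load_phi3v above, using the recommended num_crops=4 for multi-frame input; the image files, the question, and the chat-style prompt format are assumptions patterned on the Phi-3-vision examples.

```python
# Hedged multi-image sketch; image paths and question are placeholders.
from PIL import Image

from vllm import LLM, SamplingParams

image_paths = ["frame_1.jpg", "frame_2.jpg"]  # placeholder local images
images = [Image.open(p).convert("RGB") for p in image_paths]

llm = LLM(
    model="microsoft/Phi-3.5-vision-instruct",
    trust_remote_code=True,
    max_model_len=4096,
    limit_mm_per_prompt={"image": len(images)},
    # 4 is the recommended num_crops for multi-frame scenarios.
    mm_processor_kwargs={"num_crops": 4},
)

placeholders = "\n".join(f"<|image_{i}|>"
                         for i, _ in enumerate(images, start=1))
# Chat-style prompt format assumed from the Phi-3-vision examples.
prompt = (f"<|user|>\n{placeholders}\n"
          "What do these images have in common?<|end|>\n<|assistant|>\n")

outputs = llm.generate(
    {
        "prompt": prompt,
        "multi_modal_data": {"image": images},
    },
    sampling_params=SamplingParams(temperature=0.0, max_tokens=64),
)
print(outputs[0].outputs[0].text)
```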
