Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Core] Support serving encoder/decoder models #7258

Merged
merged 35 commits into from
Aug 9, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
33c9e25
Introduce `is_list_of`
DarkLight1337 Aug 7, 2024
e6dd6f5
Avoid circular imports
DarkLight1337 Aug 7, 2024
f938c86
Refactor prompt parsing and extend this to async engine
DarkLight1337 Aug 7, 2024
6332d1e
Remove unnecessary comments
DarkLight1337 Aug 7, 2024
07b4d21
Enable full async
DarkLight1337 Aug 7, 2024
e29864c
grammar
DarkLight1337 Aug 7, 2024
c9dfb40
Add description
DarkLight1337 Aug 7, 2024
1233192
Fix wrong type annotations
DarkLight1337 Aug 7, 2024
f332275
Merge branch 'upstream' into inputs-parser
DarkLight1337 Aug 7, 2024
dcdebee
Remove redundant docs
DarkLight1337 Aug 7, 2024
65db3f1
Be more strict
DarkLight1337 Aug 7, 2024
9ffeb22
Fix docs
DarkLight1337 Aug 7, 2024
c9e0b08
Fix 2
DarkLight1337 Aug 7, 2024
14bca1f
Disallow multi-modal data for enc/dec models
DarkLight1337 Aug 7, 2024
8fc7099
Improve type narrowing behavior using `TypeIs`
DarkLight1337 Aug 7, 2024
3a8a072
Avoid sequential await
DarkLight1337 Aug 7, 2024
ef5327c
Fix type annotations based on test files
DarkLight1337 Aug 7, 2024
8a835cc
Properly handle `inputs["decoder_prompt"]=None`
DarkLight1337 Aug 7, 2024
e0024c2
Clean
DarkLight1337 Aug 7, 2024
76af172
Clean
DarkLight1337 Aug 7, 2024
5c16f2e
Fix incorrect decoder inputs in singleton case
DarkLight1337 Aug 7, 2024
e239ba9
Clean
DarkLight1337 Aug 7, 2024
4b0e3df
Move functions to a more appropriate place
DarkLight1337 Aug 7, 2024
53f7f50
Remove outdated comment
DarkLight1337 Aug 7, 2024
3afdbc5
Fix mismatch between hf and vllm output text
DarkLight1337 Aug 7, 2024
c61b01f
Factor out duplicate code
DarkLight1337 Aug 7, 2024
f8ed373
Factor out more duplicate code
DarkLight1337 Aug 7, 2024
a4df70a
Remove default values to avoid accidentally missing those arguments
DarkLight1337 Aug 7, 2024
5240bb3
Add test for serving encoder/decoder model with OpenAI server
DarkLight1337 Aug 7, 2024
d321c82
Use two type variables
DarkLight1337 Aug 7, 2024
931d1f6
Merge branch 'upstream' into inputs-parser
DarkLight1337 Aug 7, 2024
a06c67f
Merge branch 'upstream' into inputs-parser
DarkLight1337 Aug 7, 2024
9f64a05
Merge branch 'upstream' into inputs-parser
DarkLight1337 Aug 7, 2024
e4c5c21
Update error message
DarkLight1337 Aug 8, 2024
68fbf5a
Merge branch 'upstream' into inputs-parser
DarkLight1337 Aug 8, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Clean
  • Loading branch information
DarkLight1337 committed Aug 7, 2024
commit 76af1724f5f18aa4f3a31fb7c212b9158567163e
46 changes: 16 additions & 30 deletions vllm/engine/async_llm_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@
from vllm.core.scheduler import SchedulerOutputs
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_timeout import asyncio_timeout
from vllm.engine.llm_engine import LLMEngine
from vllm.engine.llm_engine import (DecoderPromptComponents, LLMEngine,
PromptComponents)
from vllm.engine.metrics import StatLoggerBase
from vllm.executor.executor_base import ExecutorAsyncBase
from vllm.executor.ray_utils import initialize_ray_cluster, ray
Expand All @@ -22,7 +23,6 @@
from vllm.inputs.parse import is_explicit_encoder_decoder_prompt
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.multimodal import MultiModalDataDict
from vllm.outputs import EmbeddingRequestOutput, RequestOutput
from vllm.pooling_params import PoolingParams
from vllm.prompt_adapter.request import PromptAdapterRequest
Expand Down Expand Up @@ -314,7 +314,7 @@ async def _extract_prompt_components_async(
inputs: SingletonPromptInputs,
request_id: str,
lora_request: Optional[LoRARequest] = None,
) -> Tuple[Optional[str], List[int], Optional[MultiModalDataDict]]:
) -> PromptComponents:
"""Async version of :meth:`_extract_prompt_components`."""
if isinstance(inputs, str):
prompt = inputs
Expand Down Expand Up @@ -349,50 +349,36 @@ async def _process_encoder_decoder_prompt_async(
request_id: str,
) -> EncoderDecoderLLMInputs:
"""Async version of :meth:`_process_encoder_decoder_prompt`."""
encoder_comps: PromptComponents
decoder_comps: DecoderPromptComponents

if is_explicit_encoder_decoder_prompt(inputs):
encoder_task = self._extract_prompt_components_async(
inputs["encoder_prompt"],
request_id=request_id,
)

decoder_input = inputs["decoder_prompt"]
if decoder_input is None:
(
encoder_prompt,
encoder_prompt_ids,
encoder_mm_data,
) = await encoder_task

(
decoder_prompt,
decoder_prompt_ids,
decoder_mm_data,
) = None, None, None
if (decoder_input := inputs["decoder_prompt"]) is None:
encoder_comps = await encoder_task
decoder_comps = None, None, None
else:
decoder_task = self._extract_prompt_components_async(
decoder_input,
request_id=request_id,
)

# NOTE: mypy crashes without the intermediate assignment to
# (a, b)
(
(encoder_prompt, encoder_prompt_ids, encoder_mm_data),
(decoder_prompt, decoder_prompt_ids, decoder_mm_data),
) = a, b = await asyncio.gather(encoder_task, decoder_task)
encoder_comps, decoder_comps = await asyncio.gather(
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does the order matter here?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

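Yes, that's just how asyncio.gather works: it returns the results in the same order as the awaitables passed to it, so the encoder result comes first and the decoder result second.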

encoder_task, decoder_task)
else:
(
encoder_prompt,
encoder_prompt_ids,
encoder_mm_data,
) = await self._extract_prompt_components_async(
encoder_comps = await self._extract_prompt_components_async(
inputs,
request_id=request_id,
)

decoder_prompt_ids = encoder_prompt_ids
decoder_prompt = encoder_prompt
decoder_mm_data = encoder_mm_data
decoder_comps = encoder_comps

encoder_prompt, encoder_prompt_ids, encoder_mm_data = encoder_comps
decoder_prompt, decoder_prompt_ids, decoder_mm_data = decoder_comps

if encoder_mm_data is not None or decoder_mm_data is not None:
raise ValueError("Multi-modal data is not supported for "
Expand Down
44 changes: 18 additions & 26 deletions vllm/engine/llm_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,11 @@ def _load_generation_config_dict(model_config: ModelConfig) -> Dict[str, Any]:

_O = TypeVar("_O", RequestOutput, EmbeddingRequestOutput)

# Components of a single prompt, unpacked by callers as
# (prompt text, prompt token IDs, multi-modal data).
PromptComponents = Tuple[Optional[str], List[int],
Optional[MultiModalDataDict]]
# Same layout as PromptComponents, except the token IDs may also be None;
# the encoder/decoder path assigns (None, None, None) when the explicit
# decoder prompt is None.
DecoderPromptComponents = Tuple[Optional[str], Optional[List[int]],
Optional[MultiModalDataDict]]


class LLMEngine:
"""An LLM engine that receives requests and generates texts.
Expand Down Expand Up @@ -690,7 +695,7 @@ def _extract_prompt_components(
inputs: SingletonPromptInputs,
request_id: str,
lora_request: Optional[LoRARequest] = None,
) -> Tuple[Optional[str], List[int], Optional[MultiModalDataDict]]:
) -> PromptComponents:
'''
Extract the components of any single encoder or decoder input prompt.

Expand Down Expand Up @@ -820,45 +825,32 @@ def _process_encoder_decoder_prompt(
* :class:`EncoderDecoderLLMInputs` instance
'''

encoder_comps: PromptComponents
decoder_comps: DecoderPromptComponents

if is_explicit_encoder_decoder_prompt(inputs):
(
encoder_prompt,
encoder_prompt_ids,
encoder_mm_data,
) = self._extract_prompt_components(
encoder_comps = self._extract_prompt_components(
inputs["encoder_prompt"],
request_id=request_id,
)

decoder_input = inputs["decoder_prompt"]
if decoder_input is None:
(
decoder_prompt,
decoder_prompt_ids,
decoder_mm_data,
) = None, None, None
if (decoder_input := inputs["decoder_prompt"]) is None:
decoder_comps = None, None, None
else:
(
decoder_prompt,
decoder_prompt_ids,
decoder_mm_data,
) = self._extract_prompt_components(
decoder_comps = self._extract_prompt_components(
decoder_input,
request_id=request_id,
)
else:
(
encoder_prompt,
encoder_prompt_ids,
encoder_mm_data,
) = self._extract_prompt_components(
encoder_comps = self._extract_prompt_components(
inputs,
request_id=request_id,
)

decoder_prompt_ids = encoder_prompt_ids
decoder_prompt = encoder_prompt
decoder_mm_data = encoder_mm_data
decoder_comps = encoder_comps

encoder_prompt, encoder_prompt_ids, encoder_mm_data = encoder_comps
decoder_prompt, decoder_prompt_ids, decoder_mm_data = decoder_comps

if encoder_mm_data is not None or decoder_mm_data is not None:
raise ValueError("Multi-modal data is not supported for "
Expand Down
Loading