-
-
Notifications
You must be signed in to change notification settings - Fork 3.8k
feat(vllm): add grammar and structured output support #8806
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from 7 commits
666f8c7
1fd670c
0fa07d3
ea89ee8
d65b35f
8511c50
bb08454
278e7e2
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -2,9 +2,11 @@ | |
| import asyncio | ||
| from concurrent import futures | ||
| import argparse | ||
| import json | ||
| import signal | ||
| import sys | ||
| import os | ||
| import time | ||
| from typing import List | ||
| from PIL import Image | ||
|
|
||
|
|
@@ -15,6 +17,21 @@ | |
| from vllm.engine.arg_utils import AsyncEngineArgs | ||
| from vllm.engine.async_llm_engine import AsyncLLMEngine | ||
| from vllm.sampling_params import SamplingParams | ||
|
|
||
| # vLLM renamed GuidedDecodingParams to StructuredOutputsParams in newer versions. | ||
| # The corresponding SamplingParams field also changed from guided_decoding to structured_outputs. | ||
| try: | ||
| from vllm.sampling_params import StructuredOutputsParams | ||
| _structured_output_cls = StructuredOutputsParams | ||
| _structured_output_field = "structured_outputs" | ||
| except ImportError: | ||
| try: | ||
| from vllm.sampling_params import GuidedDecodingParams | ||
| _structured_output_cls = GuidedDecodingParams | ||
| _structured_output_field = "guided_decoding" | ||
| except ImportError: | ||
| _structured_output_cls = None | ||
| _structured_output_field = None | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do we need a fallback? We usually pin the upstream version.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Good point. I checked and vLLM is actually not pinned to a specific version — That said, if the project plans to pin vLLM to a specific version, I'm happy to drop the fallback and target whichever API is current. Let me know which you'd prefer.
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. OK, when you say newer versions, how new? If it's a very recent change then maybe we need this, otherwise we probably don't
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The rename happened in vLLM v0.8.x and later. Since vLLM isn't pinned, the fallback covers both APIs. Also in the latest push: I've refactored to use the |
||
| from vllm.utils import random_uuid | ||
| from vllm.transformers_utils.tokenizer import get_tokenizer | ||
| from vllm.multimodal.utils import fetch_image | ||
|
|
@@ -218,7 +235,6 @@ async def _predict(self, request, context, streaming=False): | |
| "SkipSpecialTokens": "skip_special_tokens", | ||
| "SpacesBetweenSpecialTokens": "spaces_between_special_tokens", | ||
| "TruncatePromptTokens": "truncate_prompt_tokens", | ||
| "GuidedDecoding": "guided_decoding", | ||
| } | ||
|
|
||
| sampling_params = SamplingParams(top_p=0.9, max_tokens=200) | ||
|
|
@@ -229,6 +245,19 @@ async def _predict(self, request, context, streaming=False): | |
| if value not in (None, 0, [], False): | ||
| setattr(sampling_params, param_field, value) | ||
|
|
||
| # Handle structured output via guided decoding / structured outputs | ||
| if _structured_output_cls is not None: | ||
| constraint = None | ||
| if hasattr(request, 'JSONSchema') and request.JSONSchema: | ||
| constraint = _structured_output_cls(json=request.JSONSchema) | ||
| elif hasattr(request, 'ResponseFormat') and request.ResponseFormat == "json_object": | ||
| constraint = _structured_output_cls(json_object=True) | ||
| elif hasattr(request, 'Grammar') and request.Grammar: | ||
| constraint = _structured_output_cls(grammar=request.Grammar) | ||
|
|
||
| if constraint is not None: | ||
| setattr(sampling_params, _structured_output_field, constraint) | ||
|
|
||
| # Extract image paths and process images | ||
| prompt = request.Prompt | ||
|
|
||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.