2 changes: 2 additions & 0 deletions backend/backend.proto
@@ -163,6 +163,8 @@ message PredictOptions {
int32 Logprobs = 50; // Number of top logprobs to return (maps to OpenAI logprobs parameter)
int32 TopLogprobs = 51; // Number of top logprobs to return per token (maps to OpenAI top_logprobs parameter)
map<string, string> Metadata = 52; // Generic per-request metadata (e.g., enable_thinking)
string JSONSchema = 53; // Raw JSON schema string for backends that support native structured output (e.g. vLLM guided decoding)
string ResponseFormat = 54; // Response format type: "json_object", "json_schema", or empty
}

// The response message containing the result
31 changes: 30 additions & 1 deletion backend/python/vllm/backend.py
@@ -2,9 +2,11 @@
import asyncio
from concurrent import futures
import argparse
import json
import signal
import sys
import os
import time
from typing import List
from PIL import Image

@@ -15,6 +17,21 @@
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.sampling_params import SamplingParams

# vLLM renamed GuidedDecodingParams to StructuredOutputsParams in newer versions.
# The corresponding SamplingParams field also changed from guided_decoding to structured_outputs.
try:
from vllm.sampling_params import StructuredOutputsParams
_structured_output_cls = StructuredOutputsParams
_structured_output_field = "structured_outputs"
except ImportError:
try:
from vllm.sampling_params import GuidedDecodingParams
_structured_output_cls = GuidedDecodingParams
_structured_output_field = "guided_decoding"
except ImportError:
_structured_output_cls = None
_structured_output_field = None
**Collaborator:** Do we need a fallback? We usually pin the upstream version.

**Contributor Author:** Good point. I checked and vLLM is actually not pinned to a specific version: requirements-after.txt just lists vllm with no version constraint, and different platform builds (CPU/CUDA/ROCm) may end up with different vLLM versions.

That said, if the project plans to pin vLLM to a specific version, I'm happy to drop the fallback and target whichever API is current. Let me know which you'd prefer.

**Collaborator:** OK, when you say newer versions, how new? If it's a very recent change then maybe we need this; otherwise we probably don't.

**Contributor Author:** The rename happened between vLLM v0.8.x and the latest releases: GuidedDecodingParams was renamed to StructuredOutputsParams, and the corresponding SamplingParams field changed from guided_decoding to structured_outputs.

Since vLLM isn't pinned (requirements-after.txt just says vllm), builds can land on either version depending on when/how the image is built. If we pin to a specific version, I can drop the fallback and target that API directly — let me know which version to target.

Also in the latest push: I've refactored to use the Metadata map instead of new proto fields, as discussed.
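The version-fallback discussed in this thread can be sketched as a small resolver. This is a minimal illustration, not the backend's actual code: `types.SimpleNamespace` stands in for the `vllm.sampling_params` module so the snippet runs without vLLM installed.

```python
from types import SimpleNamespace

def resolve_structured_output_api(sampling_params_module):
    """Pick whichever structured-output API the installed vLLM exposes,
    preferring the newer StructuredOutputsParams name over the older
    GuidedDecodingParams, and returning (class, SamplingParams field name)."""
    for cls_name, field in (
        ("StructuredOutputsParams", "structured_outputs"),  # newer vLLM
        ("GuidedDecodingParams", "guided_decoding"),        # older vLLM
    ):
        cls = getattr(sampling_params_module, cls_name, None)
        if cls is not None:
            return cls, field
    return None, None  # neither API available

# Stand-in for vllm.sampling_params that only exposes the older class
legacy = SimpleNamespace(GuidedDecodingParams=object)
cls, field = resolve_structured_output_api(legacy)
```

With both names present, the resolver picks the newer API first, which matches the try/except order in the diff above.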

from vllm.utils import random_uuid
from vllm.transformers_utils.tokenizer import get_tokenizer
from vllm.multimodal.utils import fetch_image
@@ -218,7 +235,6 @@ async def _predict(self, request, context, streaming=False):
"SkipSpecialTokens": "skip_special_tokens",
"SpacesBetweenSpecialTokens": "spaces_between_special_tokens",
"TruncatePromptTokens": "truncate_prompt_tokens",
"GuidedDecoding": "guided_decoding",
}

sampling_params = SamplingParams(top_p=0.9, max_tokens=200)
@@ -229,6 +245,19 @@
if value not in (None, 0, [], False):
setattr(sampling_params, param_field, value)

# Handle structured output via guided decoding / structured outputs
if _structured_output_cls is not None:
constraint = None
if hasattr(request, 'JSONSchema') and request.JSONSchema:
constraint = _structured_output_cls(json=request.JSONSchema)
elif hasattr(request, 'ResponseFormat') and request.ResponseFormat == "json_object":
constraint = _structured_output_cls(json_object=True)
elif hasattr(request, 'Grammar') and request.Grammar:
constraint = _structured_output_cls(grammar=request.Grammar)

if constraint is not None:
setattr(sampling_params, _structured_output_field, constraint)
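The precedence implemented above (explicit JSON schema first, then `json_object`, then a grammar) can be exercised with a stand-in request object. This is a sketch: `SimpleNamespace` is a stub for the gRPC request, not the real proto class.

```python
from types import SimpleNamespace

def pick_constraint(request):
    """Mirror the precedence above: JSONSchema wins over
    ResponseFormat == "json_object", which wins over Grammar."""
    if getattr(request, "JSONSchema", ""):
        return ("json", request.JSONSchema)
    if getattr(request, "ResponseFormat", "") == "json_object":
        return ("json_object", True)
    if getattr(request, "Grammar", ""):
        return ("grammar", request.Grammar)
    return None

# A request carrying both a schema and json_object: the schema wins.
req = SimpleNamespace(JSONSchema='{"type": "object"}',
                      ResponseFormat="json_object",
                      Grammar="")
choice = pick_constraint(req)
```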

# Extract image paths and process images
prompt = request.Prompt

2 changes: 2 additions & 0 deletions core/backend/options.go
@@ -253,6 +253,8 @@ func gRPCPredictOpts(c config.ModelConfig, modelPath string) *pb.PredictOptions
TensorSplit: c.TensorSplit,
TailFreeSamplingZ: float32(*c.TFZ),
TypicalP: float32(*c.TypicalP),
JSONSchema: c.JSONSchema,
ResponseFormat: c.ResponseFormat,
}

metadata := map[string]string{}
1 change: 1 addition & 0 deletions core/config/model_config.go
@@ -51,6 +51,7 @@ type ModelConfig struct {
functionCallString, functionCallNameString string `yaml:"-" json:"-"`
ResponseFormat string `yaml:"-" json:"-"`
ResponseFormatMap map[string]interface{} `yaml:"-" json:"-"`
JSONSchema string `yaml:"-" json:"-"`

FunctionsConfig functions.FunctionsConfig `yaml:"function,omitempty" json:"function,omitempty"`
ReasoningConfig reasoning.Config `yaml:"reasoning,omitempty" json:"reasoning,omitempty"`
9 changes: 9 additions & 0 deletions core/http/endpoints/openai/chat.go
@@ -430,7 +430,9 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
switch d.Type {
case "json_object":
input.Grammar = functions.JSONBNF
config.ResponseFormat = "json_object"
case "json_schema":
config.ResponseFormat = "json_schema"
d := schema.JsonSchemaRequest{}
dat, err := json.Marshal(config.ResponseFormatMap)
if err != nil {
@@ -440,6 +442,13 @@
if err != nil {
return err
}

// Pass raw JSON schema to backends that support native structured output
schemaBytes, err := json.Marshal(d.JsonSchema.Schema)
if err == nil {
config.JSONSchema = string(schemaBytes)
}

fs := &functions.JSONFunctionStructure{
AnyOf: []functions.Item{d.JsonSchema.Schema},
}
25 changes: 24 additions & 1 deletion core/http/endpoints/openai/completion.go
@@ -87,8 +87,31 @@ func CompletionEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, eva
d := schema.ChatCompletionResponseFormat{}
dat, _ := json.Marshal(config.ResponseFormatMap)
_ = json.Unmarshal(dat, &d)
if d.Type == "json_object" {
switch d.Type {
case "json_object":
input.Grammar = functions.JSONBNF
config.ResponseFormat = "json_object"
case "json_schema":
config.ResponseFormat = "json_schema"
jsr := schema.JsonSchemaRequest{}
dat, err := json.Marshal(config.ResponseFormatMap)
if err == nil {
if err := json.Unmarshal(dat, &jsr); err == nil {
schemaBytes, err := json.Marshal(jsr.JsonSchema.Schema)
if err == nil {
config.JSONSchema = string(schemaBytes)
}
fs := &functions.JSONFunctionStructure{
AnyOf: []functions.Item{jsr.JsonSchema.Schema},
}
g, err := fs.Grammar(config.FunctionsConfig.GrammarOptions()...)
if err == nil {
input.Grammar = g
} else {
xlog.Error("Failed generating grammar", "error", err)
}
}
}
}
}

59 changes: 58 additions & 1 deletion docs/content/features/constrained_grammars.md
@@ -10,7 +10,11 @@ url = "/features/constrained_grammars/"
The `chat` endpoint supports the `grammar` parameter, which allows users to specify a grammar in Backus-Naur Form (BNF). This feature enables the Large Language Model (LLM) to generate outputs adhering to a user-defined schema, such as `JSON`, `YAML`, or any other format that can be defined using BNF. For more details about BNF, see [Backus-Naur Form on Wikipedia](https://en.wikipedia.org/wiki/Backus%E2%80%93Naur_form).

{{% notice note %}}
**Compatibility Notice:** This feature is only supported by models that use the [llama.cpp](https://github.com/ggerganov/llama.cpp) backend. For a complete list of compatible models, refer to the [Model Compatibility]({{%relref "reference/compatibility-table" %}}) page. For technical details, see the related pull requests: [PR #1773](https://github.com/ggerganov/llama.cpp/pull/1773) and [PR #1887](https://github.com/ggerganov/llama.cpp/pull/1887).
**Compatibility Notice:** Grammar and structured output support is available for the following backends:
- **llama.cpp** — supports the `grammar` parameter (GBNF syntax) and `response_format` with `json_schema`/`json_object`
- **vLLM** — supports the `grammar` parameter (via xgrammar), `response_format` with `json_schema` (native JSON schema enforcement), and `json_object`

For a complete list of compatible models, refer to the [Model Compatibility]({{%relref "reference/compatibility-table" %}}) page.
{{% /notice %}}

## Setup
@@ -66,6 +70,59 @@ For more complex grammars, you can define multi-line BNF rules. The grammar pars
- Character classes (`[a-z]`)
- String literals (`"text"`)
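As a sketch, a multi-line grammar combining these constructs can be assembled and placed in the request payload. The grammar rules, model name, and field layout here are illustrative, not taken from a tested configuration.

```python
import json

# Hypothetical multi-line grammar using the constructs listed above:
# alternation, repetition, a character class, and string literals.
grammar = "\n".join([
    'root   ::= answer ("," answer)*',
    'answer ::= "yes" | "no" | word',
    'word   ::= [a-z]+',
])

# The JSON body a client would POST to /v1/chat/completions.
payload = json.dumps({
    "model": "my-model",  # placeholder model name
    "messages": [{"role": "user", "content": "List some answers"}],
    "grammar": grammar,
})
```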

## vLLM Backend

The vLLM backend supports structured output via three methods:

### JSON Schema (recommended)

Use the OpenAI-compatible `response_format` parameter with `json_schema` to enforce a specific JSON structure:

```bash
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "my-vllm-model",
"messages": [{"role": "user", "content": "Generate a person object"}],
"response_format": {
"type": "json_schema",
"json_schema": {
"name": "person",
"schema": {
"type": "object",
"properties": {
"name": {"type": "string"},
"age": {"type": "integer"}
},
"required": ["name", "age"]
}
}
}
}'
```

### JSON Object

Force the model to output valid JSON (without a specific schema):

```bash
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "my-vllm-model",
"messages": [{"role": "user", "content": "Generate a person as JSON"}],
"response_format": {"type": "json_object"}
}'
```

### Grammar

The `grammar` parameter also works with vLLM via xgrammar:

```bash
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "my-vllm-model",
"messages": [{"role": "user", "content": "Do you like apples?"}],
"grammar": "root ::= (\"yes\" | \"no\")"
}'
```

## Related Features

- [OpenAI Functions]({{%relref "features/openai-functions" %}}) - Function calling with structured outputs
Expand Down