diff --git a/samples/README.md b/samples/README.md
index bcac6bf3..bed7e41c 100644
--- a/samples/README.md
+++ b/samples/README.md
@@ -10,5 +10,5 @@ Explore complete working examples that demonstrate how to use Foundry Local —
 |----------|---------|-------------|
 | [**C#**](cs/) | 13 | .NET SDK samples including native chat, embeddings, audio transcription, tool calling, model management, web server, and tutorials. Uses WinML on Windows for hardware acceleration. |
 | [**JavaScript**](js/) | 13 | Node.js SDK samples including native chat, embeddings, audio transcription, Electron desktop app, Copilot SDK integration, LangChain, tool calling, web server, and tutorials. |
-| [**Python**](python/) | 10 | Python samples using the OpenAI-compatible API, including chat, embeddings, audio transcription, LangChain integration, tool calling, web server, and tutorials. |
+| [**Python**](python/) | 11 | Python samples using the OpenAI-compatible API, including chat, embeddings, audio transcription, LangChain integration, tool calling, web server, Responses API, and tutorials. |
 | [**Rust**](rust/) | 9 | Rust SDK samples including native chat, embeddings, audio transcription, tool calling, web server, and tutorials. |
diff --git a/samples/python/README.md b/samples/python/README.md
index 7262f012..49e99c8a 100644
--- a/samples/python/README.md
+++ b/samples/python/README.md
@@ -14,6 +14,7 @@ These samples demonstrate how to use Foundry Local with Python.
 | [embeddings](embeddings/) | Generate single and batch text embeddings using the Foundry Local SDK. |
 | [audio-transcription](audio-transcription/) | Transcribe audio files using the Whisper model. |
 | [web-server](web-server/) | Start a local OpenAI-compatible web server and call it with the OpenAI Python SDK. |
+| [web-server-responses](web-server-responses/) | Call a running local OpenAI-compatible web server with the Responses API, including streaming and tool calling. |
 | [tool-calling](tool-calling/) | Tool calling with custom function definitions (get_weather, calculate). |
 | [langchain-integration](langchain-integration/) | LangChain integration for building translation and text generation chains. |
 | [tutorial-chat-assistant](tutorial-chat-assistant/) | Build an interactive multi-turn chat assistant (tutorial). |
diff --git a/samples/python/web-server-responses/README.md b/samples/python/web-server-responses/README.md
new file mode 100644
index 00000000..95666d91
--- /dev/null
+++ b/samples/python/web-server-responses/README.md
@@ -0,0 +1,44 @@
+# Foundry Local Python Responses Web-Service Sample
+
+This sample starts the Foundry Local OpenAI-compatible web service, then calls the Responses API with the official OpenAI Python client.
+
+It demonstrates:
+
+- A non-streaming `/v1/responses` call
+- A streaming `/v1/responses` call
+- A function/tool-calling round trip using `previous_response_id`
+
+## What gets installed
+
+Install the sample dependencies from `requirements.txt`:
+
+```bash
+pip install -r requirements.txt
+```
+
+That installs:
+
+- `foundry-local-sdk` on non-Windows platforms
+- `foundry-local-sdk-winml` on Windows
+- `openai`
+
+The sample downloads/registers Foundry Local execution providers and downloads the `qwen2.5-0.5b` model the first time it runs.
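+
+## What the Responses call looks like
+
+At its core, `src/app.py` points the OpenAI client at the local web service and calls `client.responses.create`. The sketch below is a simplified excerpt: `base_url` and `model_id` stand in for values the script obtains from `FoundryLocalManager` at runtime, and the full sample uses a small helper instead of `output_text` in case that field is empty.
+
+```python
+from openai import OpenAI
+
+# base_url comes from the running web service, e.g. "http://localhost:<port>/v1";
+# no real API key is needed for the local endpoint.
+client = OpenAI(base_url=base_url, api_key="notneeded")
+
+response = client.responses.create(
+    model=model_id,
+    input="Reply with one short sentence about local AI.",
+)
+print(response.output_text)
+```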
+
+## Run the sample
+
+From this directory (Windows commands shown):
+
+```bash
+python -m venv .venv
+.\.venv\Scripts\activate
+pip install -r requirements.txt
+python src\app.py
+```
+
+On macOS or Linux, activate the virtual environment and run the app with:
+
+```bash
+source .venv/bin/activate
+python src/app.py
+```
+
+The sample starts the local web service, sends Responses API requests to `http://localhost:<port>/v1` (the port is assigned when the service starts), prints the model output, and then stops the web service and unloads the model.
diff --git a/samples/python/web-server-responses/requirements.txt b/samples/python/web-server-responses/requirements.txt
new file mode 100644
index 00000000..db870f60
--- /dev/null
+++ b/samples/python/web-server-responses/requirements.txt
@@ -0,0 +1,3 @@
+foundry-local-sdk; sys_platform != "win32"
+foundry-local-sdk-winml; sys_platform == "win32"
+openai
diff --git a/samples/python/web-server-responses/src/app.py b/samples/python/web-server-responses/src/app.py
new file mode 100644
index 00000000..6f186a2a
--- /dev/null
+++ b/samples/python/web-server-responses/src/app.py
@@ -0,0 +1,152 @@
+#
+#
+import json
+from typing import Any
+
+from openai import OpenAI
+
+from foundry_local_sdk import Configuration, FoundryLocalManager
+#
+
+
+def get_response_text(response: Any) -> str:
+    if isinstance(getattr(response, "output_text", None), str):
+        return response.output_text
+    return "".join(
+        getattr(part, "text", "")
+        for item in getattr(response, "output", []) or []
+        for part in getattr(item, "content", []) or []
+        if getattr(part, "type", None) == "output_text"
+    )
+
+
+#
+# Initialize the Foundry Local SDK
+config = Configuration(app_name="foundry_local_samples")
+FoundryLocalManager.initialize(config)
+manager = FoundryLocalManager.instance
+
+# Download and register all execution providers.
+current_ep = ""
+
+
+def _ep_progress(ep_name: str, percent: float):
+    global current_ep
+    if ep_name != current_ep:
+        if current_ep:
+            print()
+        current_ep = ep_name
+    print(f"\r {ep_name:<30} {percent:5.1f}%", end="", flush=True)
+
+
+manager.download_and_register_eps(progress_callback=_ep_progress)
+if current_ep:
+    print()
+#
+
+#
+model_alias = "qwen2.5-0.5b"
+model = manager.catalog.get_model(model_alias)
+
+print(f"\nDownloading model {model_alias}...")
+model.download(
+    lambda progress: print(
+        f"\rDownloading model: {progress:.2f}%",
+        end="",
+        flush=True,
+    )
+)
+print("\nModel downloaded")
+
+print("\nLoading model...")
+model.load()
+print("Model loaded")
+#
+
+#
+print("\nStarting web service...")
+manager.start_web_service()
+base_url = manager.urls[0].rstrip("/") + "/v1"
+print("Web service started")
+
+# <<<<<< OPENAI SDK USAGE >>>>>>
+# Use the OpenAI SDK to call the local Foundry web service Responses API
+openai = OpenAI(
+    base_url=base_url,
+    api_key="notneeded",
+)
+#
+
+try:
+    print("\nTesting a non-streaming Responses call...")
+    response = openai.responses.create(
+        model=model.id,
+        input="Reply with one short sentence about local AI.",
+    )
+    print(f"[ASSISTANT]: {get_response_text(response)}")
+
+    print("\nTesting a streaming Responses call...")
+    stream = openai.responses.create(
+        model=model.id,
+        input="Count from one to three.",
+        stream=True,
+    )
+
+    print("[ASSISTANT STREAM]: ", end="", flush=True)
+    for event in stream:
+        if getattr(event, "type", None) == "response.output_text.delta":
+            print(getattr(event, "delta", ""), end="", flush=True)
+    print()
+
+    print("\nTesting Responses tool calling...")
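+    # Tool-calling round trip: expose a parameterless get_weather function,
+    # force the model to call it (tool_choice="required"), and keep the response
+    # server-side (store=True) so the follow-up request can reference it via
+    # previous_response_id and only needs to send the function_call_output.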
+    tools = [
+        {
+            "type": "function",
+            "name": "get_weather",
+            "description": "Get the current weather. This sample always returns Seattle weather.",
+            "parameters": {
+                "type": "object",
+                "properties": {},
+                "additionalProperties": False,
+            },
+        },
+    ]
+
+    tool_response = openai.responses.create(
+        model=model.id,
+        input="Use the get_weather tool and then answer with the weather.",
+        tools=tools,
+        tool_choice="required",
+        store=True,
+    )
+
+    function_call = next(
+        (item for item in getattr(tool_response, "output", []) or [] if getattr(item, "type", None) == "function_call"),
+        None,
+    )
+    if function_call is None:
+        raise RuntimeError("Expected the model to call get_weather.")
+
+    print(f"[TOOL CALL]: {function_call.name}({function_call.arguments})")
+
+    final_response = openai.responses.create(
+        model=model.id,
+        previous_response_id=tool_response.id,
+        input=[
+            {
+                "type": "function_call_output",
+                "call_id": function_call.call_id,
+                "output": json.dumps({"location": "Seattle", "weather": "72 degrees F and sunny"}),
+            }
+        ],
+        tools=tools,
+    )
+
+    print(f"[ASSISTANT FINAL]: {get_response_text(final_response)}")
+    # <<<<<< END OPENAI SDK USAGE >>>>>>
+finally:
+    # Tidy up
+    openai.close()
+    manager.stop_web_service()
+    model.unload()
+#
diff --git a/sdk/python/test/openai/test_responses_web_service.py b/sdk/python/test/openai/test_responses_web_service.py
new file mode 100644
index 00000000..e323a892
--- /dev/null
+++ b/sdk/python/test/openai/test_responses_web_service.py
@@ -0,0 +1,244 @@
+# --------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------
+"""Integration tests for /v1/responses through the local web service.
+
+These tests intentionally use FoundryLocalManager only for SDK setup, model
+lifecycle, and web-service lifecycle. Actual Responses API calls go through the
+OpenAI-compatible HTTP endpoint directly.
+"""
+
+from __future__ import annotations
+
+import json
+
+import pytest
+import requests
+
+from ..conftest import TEST_MODEL_ALIAS, skip_in_ci
+
+
+pytestmark = skip_in_ci
+
+
+def _response_text(response: dict) -> str:
+    text = response.get("output_text")
+    if isinstance(text, str) and text:
+        return text
+
+    return "".join(
+        part.get("text", "")
+        for item in response.get("output", []) or []
+        if item.get("type") == "message"
+        for part in item.get("content", []) or []
+        if part.get("type") == "output_text" and isinstance(part.get("text"), str)
+    )
+
+
+def _post_response(base_url: str, body: dict) -> dict:
+    response = requests.post(
+        f"{base_url}/v1/responses",
+        headers={"Content-Type": "application/json"},
+        json=body,
+        timeout=60,
+    )
+    assert response.ok, response.text
+    return response.json()
+
+
+def _post_streaming_response(base_url: str, body: dict) -> list[dict]:
+    response = requests.post(
+        f"{base_url}/v1/responses",
+        headers={"Content-Type": "application/json", "Accept": "text/event-stream"},
+        json={**body, "stream": True},
+        stream=True,
+        timeout=(60, None),
+    )
+    assert response.ok, response.text
+
+    events: list[dict] = []
+    buffer = ""
+    try:
+        for chunk in response.iter_content(chunk_size=None, decode_unicode=False):
+            if not chunk:
+                continue
+            text = chunk.decode("utf-8", errors="replace") if isinstance(chunk, bytes) else chunk
+            buffer += text.replace("\r\n", "\n")
+
+            while "\n\n" in buffer:
+                block, buffer = buffer.split("\n\n", 1)
+                data = _sse_data(block)
+                if not data:
+                    continue
+                if data == "[DONE]":
+                    return events
+                events.append(json.loads(data))
+    finally:
+        response.close()
+
+    tail = buffer.strip()
+    if tail:
+        data = _sse_data(tail)
+        if data and data != "[DONE]":
+            events.append(json.loads(data))
+    return events
+
+
+def _sse_data(block: str) -> str:
+    lines: list[str] = []
+    for line in block.strip().split("\n"):
+        if line.startswith("data: "):
+            lines.append(line[6:])
+        elif line == "data:":
+            lines.append("")
+    return "\n".join(lines).strip()
+
+
+def _get_function_call(response: dict) -> dict | None:
+    for item in response.get("output", []) or []:
+        if item.get("type") == "function_call":
+            return item
+    return None
+
+
+def _get_weather_tool() -> dict:
+    return {
+        "type": "function",
+        "name": "get_weather",
+        "description": "Get the current weather. This test always returns Seattle weather.",
+        "parameters": {
+            "type": "object",
+            "properties": {},
+            "additionalProperties": False,
+        },
+    }
+
+
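+# The fixture below only runs when the test model is already cached: it loads
+# the model, starts the local web service, yields (base_url, model_id) for the
+# HTTP tests, and always stops the service and unloads the model afterwards.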
+@pytest.fixture(scope="module")
+def responses_web_service(manager, catalog):
+    cached = catalog.get_cached_models()
+    cached_variant = next((m for m in cached if m.alias == TEST_MODEL_ALIAS), None)
+    if cached_variant is None:
+        pytest.skip(f"{TEST_MODEL_ALIAS} must be cached to run Responses web-service tests")
+
+    model = catalog.get_model(TEST_MODEL_ALIAS)
+    if model is None:
+        pytest.skip(f"{TEST_MODEL_ALIAS} was not found in the catalog")
+
+    model.select_variant(cached_variant)
+    service_started = False
+    model_loaded = False
+
+    try:
+        try:
+            model.load()
+            model_loaded = True
+            manager.start_web_service()
+            service_started = True
+        except Exception as exc:
+            message = str(exc)
+            if "execute_command_with_binary" in message:
+                pytest.skip(
+                    "Local Foundry Local Core/native runtime is stale: "
+                    "failed to resolve execute_command_with_binary"
+                )
+            pytest.skip(f"Failed to start Responses web-service test prerequisites: {exc}")
+
+        if not manager.urls:
+            pytest.skip("Web service started but did not return any URLs")
+
+        yield manager.urls[0].rstrip("/"), model.id
+    finally:
+        if service_started:
+            try:
+                manager.stop_web_service()
+            except Exception:
+                pass
+        if model_loaded:
+            try:
+                model.unload()
+            except Exception:
+                pass
+
+
+class TestResponsesWebService:
+    def test_should_create_non_streaming_response(self, responses_web_service):
+        base_url, model_id = responses_web_service
+
+        response = _post_response(
+            base_url,
+            {
+                "model": model_id,
+                "input": "What is 2 + 2? Answer with just the number.",
+                "temperature": 0,
+                "max_output_tokens": 64,
+                "store": False,
+            },
+        )
+
+        assert response["object"] == "response"
+        assert response["status"] == "completed"
+        assert _response_text(response).strip()
+
+    def test_should_stream_response_events(self, responses_web_service):
+        base_url, model_id = responses_web_service
+
+        events = _post_streaming_response(
+            base_url,
+            {
+                "model": model_id,
+                "input": "Count from 1 to 3.",
+                "temperature": 0,
+                "max_output_tokens": 64,
+                "store": False,
+            },
+        )
+
+        assert any(event.get("type") == "response.created" for event in events)
+        assert any(event.get("type") == "response.output_text.delta" for event in events)
+        assert any(event.get("type") == "response.completed" for event in events)
+
+    def test_should_round_trip_function_call_output(self, responses_web_service):
+        base_url, model_id = responses_web_service
+        weather_tool = _get_weather_tool()
+
+        tool_response = _post_response(
+            base_url,
+            {
+                "model": model_id,
+                "input": "Use the get_weather tool and then answer with the weather.",
+                "tools": [weather_tool],
+                "tool_choice": "required",
+                "temperature": 0,
+                "max_output_tokens": 64,
+                "store": True,
+            },
+        )
+        function_call = _get_function_call(tool_response)
+
+        assert function_call is not None, json.dumps(tool_response.get("output", []))
+        assert function_call["name"] == "get_weather"
+        assert isinstance(function_call["call_id"], str)
+
+        final_response = _post_response(
+            base_url,
+            {
+                "model": model_id,
+                "previous_response_id": tool_response["id"],
+                "input": [
+                    {
+                        "type": "function_call_output",
+                        "call_id": function_call["call_id"],
+                        "output": json.dumps({"location": "Seattle", "weather": "72 degrees F and sunny"}),
+                    }
+                ],
+                "tools": [weather_tool],
+                "temperature": 0,
+                "max_output_tokens": 64,
+                "store": False,
+            },
+        )
+
+        assert final_response["status"] == "completed"
+        assert _response_text(final_response).strip()