From 6a205c80ebfab237be15bb645946e6f0f69e8062 Mon Sep 17 00:00:00 2001 From: Wanbogang Date: Thu, 12 Mar 2026 12:24:12 +0700 Subject: [PATCH 1/2] feat: add local Ollama VLM input plugin for multimodal visual reasoning Add VLM_Ollama_Local plugin that captures webcam frames and sends them to a locally running Ollama instance (e.g., llava, moondream) for offline visual reasoning without cloud dependency. - Follows existing FuserInput pattern (vlm_local_yolo, vlm_coco_local) - Uses aiohttp to POST base64-encoded frames to Ollama /api/chat - Supports any Ollama multimodal model via config (default: llava) - Gracefully handles camera failures, API errors, and timeouts - 22 tests with 100% coverage Addresses the llava multimodal gap noted in config/ollama.json5 --- src/inputs/plugins/vlm_ollama_local.py | 270 ++++++++++ tests/inputs/plugins/test_vlm_ollama_local.py | 468 ++++++++++++++++++ 2 files changed, 738 insertions(+) create mode 100644 src/inputs/plugins/vlm_ollama_local.py create mode 100644 tests/inputs/plugins/test_vlm_ollama_local.py diff --git a/src/inputs/plugins/vlm_ollama_local.py b/src/inputs/plugins/vlm_ollama_local.py new file mode 100644 index 000000000..a5c0c8c65 --- /dev/null +++ b/src/inputs/plugins/vlm_ollama_local.py @@ -0,0 +1,270 @@ +import asyncio +import base64 +import logging +import time +from typing import Optional + +import aiohttp +import cv2 +import numpy as np +from pydantic import Field + +from inputs.base import Message, SensorConfig +from inputs.base.loop import FuserInput +from providers.io_provider import IOProvider + + +class VLM_Ollama_LocalConfig(SensorConfig): + """ + Configuration for Ollama-based local VLM sensor. + + Parameters + ---------- + camera_index : int + Index of the camera device to capture frames from. + base_url : str + Base URL for the Ollama API service. + model : str + Ollama multimodal model name (e.g., llava, llava-phi3, moondream). + prompt : str + Text prompt sent alongside the image to the VLM. + timeout : int + Request timeout in seconds for Ollama inference. + """ + + camera_index: int = Field(default=0, description="Index of the camera device") + base_url: str = Field( + default="http://localhost:11434", + description="Base URL for the Ollama API service", + ) + model: str = Field( + default="llava", + description="Ollama multimodal model name (e.g., llava, llava-phi3, moondream)", + ) + prompt: str = Field( + default="Briefly describe what you see in one or two sentences.", + description="Text prompt sent alongside the image to the VLM", + ) + timeout: int = Field( + default=30, + description="Request timeout in seconds for Ollama inference", + ) + + +def check_webcam(index_to_check: int) -> bool: + """ + Check if a webcam is available at the given index. + + Parameters + ---------- + index_to_check : int + The camera index to check. + + Returns + ------- + bool + True if the webcam is available, False otherwise. + """ + cap = cv2.VideoCapture(index_to_check) + if not cap.isOpened(): + logging.error(f"VLM Ollama Local: camera not found at index {index_to_check}") + cap.release() + return False + logging.info(f"VLM Ollama Local: camera found at index {index_to_check}") + cap.release() + return True + + +class VLM_Ollama_Local(FuserInput[VLM_Ollama_LocalConfig, Optional[np.ndarray]]): + """ + Vision Language Model input using a local Ollama multimodal model. + + Captures frames from a webcam, encodes them as base64, and sends + them to a locally running Ollama instance (e.g., llava, moondream) + for visual reasoning. 
The resulting text description is forwarded + to the fuser for use by the main LLM. + + Requires Ollama to be running locally with a multimodal model pulled: + + ollama pull llava + ollama serve + """ + + def __init__(self, config: VLM_Ollama_LocalConfig): + """ + Initialize the Ollama VLM input handler. + + Parameters + ---------- + config : VLM_Ollama_LocalConfig + Configuration settings for the Ollama VLM sensor. + """ + super().__init__(config) + + self.io_provider = IOProvider() + + self.messages: list[Message] = [] + + self.descriptor_for_LLM = "Vision" + + base_url = self.config.base_url.rstrip("/") + self._chat_url = f"{base_url}/api/chat" + + self.have_cam = check_webcam(self.config.camera_index) + self.cap: Optional[cv2.VideoCapture] = None + + if self.have_cam: + self.cap = cv2.VideoCapture(self.config.camera_index) + width = int(self.cap.get(cv2.CAP_PROP_FRAME_WIDTH)) + height = int(self.cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) + logging.info(f"VLM Ollama Local: camera resolution {width}x{height}") + + logging.info( + f"VLM Ollama Local: initialized with model='{self.config.model}' " + f"endpoint='{self._chat_url}'" + ) + + async def _poll(self) -> Optional[np.ndarray]: + """ + Poll for a new frame from the camera. + + Returns + ------- + Optional[np.ndarray] + Captured frame as a numpy array, or None if unavailable. + """ + await asyncio.sleep(0.5) + + if not self.have_cam or self.cap is None: + return None + + ret, frame = self.cap.read() + if not ret or frame is None: + logging.warning("VLM Ollama Local: failed to read frame from camera") + return None + + return frame + + async def _raw_to_text(self, raw_input: Optional[np.ndarray]) -> Optional[Message]: + """ + Send a camera frame to Ollama and return the text response. + + Encodes the frame as a JPEG base64 string, posts it to the + Ollama /api/chat endpoint with the configured prompt, and + wraps the response in a timestamped Message. + + Parameters + ---------- + raw_input : Optional[np.ndarray] + Camera frame to process. + + Returns + ------- + Optional[Message] + Timestamped message containing the VLM description, + or None if processing fails. + """ + if raw_input is None: + return None + + success, buffer = cv2.imencode(".jpg", raw_input) + if not success: + logging.error("VLM Ollama Local: failed to encode frame to JPEG") + return None + + image_b64 = base64.b64encode(buffer.tobytes()).decode("utf-8") + + payload = { + "model": self.config.model, + "messages": [ + { + "role": "user", + "content": self.config.prompt, + "images": [image_b64], + } + ], + "stream": False, + } + + try: + timeout = aiohttp.ClientTimeout(total=self.config.timeout) + async with aiohttp.ClientSession(timeout=timeout) as session: + async with session.post(self._chat_url, json=payload) as response: + if response.status != 200: + logging.error(f"VLM Ollama Local: API error {response.status}") + return None + + result = await response.json() + + description = result.get("message", {}).get("content", "").strip() + + if not description: + logging.warning("VLM Ollama Local: received empty response") + return None + + logging.info(f"VLM Ollama Local: {description}") + return Message(timestamp=time.time(), message=description) + + except aiohttp.ClientConnectorError: + logging.error( + "VLM Ollama Local: cannot connect to Ollama. " + "Is Ollama running? Start with: ollama serve" + ) + return None + except asyncio.TimeoutError: + logging.error( + f"VLM Ollama Local: request timed out after {self.config.timeout}s. 
" + "Try increasing timeout or using a smaller model." + ) + return None + except Exception as e: + logging.error(f"VLM Ollama Local: unexpected error: {e}") + return None + + async def raw_to_text(self, raw_input: Optional[np.ndarray]) -> None: + """ + Convert a camera frame to text and append to the message buffer. + + Parameters + ---------- + raw_input : Optional[np.ndarray] + Camera frame to process. + """ + pending_message = await self._raw_to_text(raw_input) + + if pending_message is not None: + self.messages.append(pending_message) + + def formatted_latest_buffer(self) -> Optional[str]: + """ + Format the latest buffered message for the fuser and clear the buffer. + + Retrieves the most recent VLM description, formats it with the + standard INPUT block structure, records it in the IO provider, + and clears the internal message buffer. + + Returns + ------- + Optional[str] + Formatted input string for the fuser, or None if buffer is empty. + """ + if len(self.messages) == 0: + return None + + latest_message = self.messages[-1] + + logging.info(f"VLM_Ollama_Local: {latest_message.message}") + + result = ( + f"\nINPUT: {self.descriptor_for_LLM}\n// START\n" + f"{latest_message.message}\n// END\n" + ) + + self.io_provider.add_input( + self.descriptor_for_LLM, + latest_message.message, + latest_message.timestamp, + ) + self.messages = [] + + return result diff --git a/tests/inputs/plugins/test_vlm_ollama_local.py b/tests/inputs/plugins/test_vlm_ollama_local.py new file mode 100644 index 000000000..1977d9065 --- /dev/null +++ b/tests/inputs/plugins/test_vlm_ollama_local.py @@ -0,0 +1,468 @@ +import asyncio +from unittest.mock import AsyncMock, MagicMock, patch + +import aiohttp +import numpy as np +import pytest + +from inputs.base import Message +from inputs.plugins.vlm_ollama_local import ( + VLM_Ollama_Local, + VLM_Ollama_LocalConfig, + check_webcam, +) + + +def test_check_webcam_found(): + """Test check_webcam returns True when camera is available.""" + mock_cap = MagicMock() + mock_cap.isOpened.return_value = True + + with patch( + "inputs.plugins.vlm_ollama_local.cv2.VideoCapture", return_value=mock_cap + ): + result = check_webcam(0) + assert result is True + + +def test_check_webcam_not_found(): + """Test check_webcam returns False when camera is unavailable.""" + mock_cap = MagicMock() + mock_cap.isOpened.return_value = False + + with patch( + "inputs.plugins.vlm_ollama_local.cv2.VideoCapture", return_value=mock_cap + ): + result = check_webcam(0) + assert result is False + + +def test_initialization_no_camera(): + """Test initialization without camera sets safe defaults.""" + with ( + patch("inputs.plugins.vlm_ollama_local.IOProvider"), + patch("inputs.plugins.vlm_ollama_local.check_webcam", return_value=False), + ): + config = VLM_Ollama_LocalConfig() + sensor = VLM_Ollama_Local(config=config) + + assert hasattr(sensor, "messages") + assert sensor.descriptor_for_LLM == "Vision" + assert sensor.have_cam is False + assert sensor.cap is None + + +def test_initialization_with_camera(): + """Test initialization opens VideoCapture when camera is available.""" + mock_cap = MagicMock() + mock_cap.get.return_value = 640 + + with ( + patch("inputs.plugins.vlm_ollama_local.IOProvider"), + patch("inputs.plugins.vlm_ollama_local.check_webcam", return_value=True), + patch( + "inputs.plugins.vlm_ollama_local.cv2.VideoCapture", + return_value=mock_cap, + ), + ): + config = VLM_Ollama_LocalConfig() + sensor = VLM_Ollama_Local(config=config) + + assert sensor.have_cam is True + assert 
sensor.cap is not None + + +def test_chat_url_built_correctly(): + """Test Ollama chat URL is correctly built from base_url config.""" + with ( + patch("inputs.plugins.vlm_ollama_local.IOProvider"), + patch("inputs.plugins.vlm_ollama_local.check_webcam", return_value=False), + ): + config = VLM_Ollama_LocalConfig(base_url="http://localhost:11434") + sensor = VLM_Ollama_Local(config=config) + + assert sensor._chat_url == "http://localhost:11434/api/chat" + + +def test_chat_url_strips_trailing_slash(): + """Test that trailing slash in base_url is stripped correctly.""" + with ( + patch("inputs.plugins.vlm_ollama_local.IOProvider"), + patch("inputs.plugins.vlm_ollama_local.check_webcam", return_value=False), + ): + config = VLM_Ollama_LocalConfig(base_url="http://localhost:11434/") + sensor = VLM_Ollama_Local(config=config) + + assert sensor._chat_url == "http://localhost:11434/api/chat" + + +@pytest.mark.asyncio +async def test_poll_returns_frame_when_camera_available(): + """Test _poll returns a numpy frame when camera read succeeds.""" + fake_frame = np.zeros((480, 640, 3), dtype=np.uint8) + mock_cap = MagicMock() + mock_cap.read.return_value = (True, fake_frame) + + with ( + patch("inputs.plugins.vlm_ollama_local.IOProvider"), + patch("inputs.plugins.vlm_ollama_local.check_webcam", return_value=True), + patch( + "inputs.plugins.vlm_ollama_local.cv2.VideoCapture", + return_value=mock_cap, + ), + patch("inputs.plugins.vlm_ollama_local.asyncio.sleep", new=AsyncMock()), + ): + config = VLM_Ollama_LocalConfig() + sensor = VLM_Ollama_Local(config=config) + + result = await sensor._poll() + assert result is not None + assert isinstance(result, np.ndarray) + + +@pytest.mark.asyncio +async def test_poll_returns_none_when_no_camera(): + """Test _poll returns None when no camera is available.""" + with ( + patch("inputs.plugins.vlm_ollama_local.IOProvider"), + patch("inputs.plugins.vlm_ollama_local.check_webcam", return_value=False), + patch("inputs.plugins.vlm_ollama_local.asyncio.sleep", new=AsyncMock()), + ): + config = VLM_Ollama_LocalConfig() + sensor = VLM_Ollama_Local(config=config) + + result = await sensor._poll() + assert result is None + + +@pytest.mark.asyncio +async def test_poll_returns_none_on_failed_read(): + """Test _poll returns None when cap.read() returns ret=False.""" + mock_cap = MagicMock() + mock_cap.read.return_value = (False, None) + + with ( + patch("inputs.plugins.vlm_ollama_local.IOProvider"), + patch("inputs.plugins.vlm_ollama_local.check_webcam", return_value=True), + patch( + "inputs.plugins.vlm_ollama_local.cv2.VideoCapture", + return_value=mock_cap, + ), + patch("inputs.plugins.vlm_ollama_local.asyncio.sleep", new=AsyncMock()), + ): + config = VLM_Ollama_LocalConfig() + sensor = VLM_Ollama_Local(config=config) + + result = await sensor._poll() + assert result is None + + +@pytest.mark.asyncio +async def test_poll_returns_none_on_none_frame(): + """Test _poll returns None when cap.read() returns True but frame is None.""" + mock_cap = MagicMock() + mock_cap.read.return_value = (True, None) + + with ( + patch("inputs.plugins.vlm_ollama_local.IOProvider"), + patch("inputs.plugins.vlm_ollama_local.check_webcam", return_value=True), + patch( + "inputs.plugins.vlm_ollama_local.cv2.VideoCapture", + return_value=mock_cap, + ), + patch("inputs.plugins.vlm_ollama_local.asyncio.sleep", new=AsyncMock()), + ): + config = VLM_Ollama_LocalConfig() + sensor = VLM_Ollama_Local(config=config) + + result = await sensor._poll() + assert result is None + + +@pytest.mark.asyncio +async 
def test_raw_to_text_returns_none_on_none_input(): + """Test _raw_to_text returns None when input is None.""" + with ( + patch("inputs.plugins.vlm_ollama_local.IOProvider"), + patch("inputs.plugins.vlm_ollama_local.check_webcam", return_value=False), + ): + config = VLM_Ollama_LocalConfig() + sensor = VLM_Ollama_Local(config=config) + + result = await sensor._raw_to_text(None) + assert result is None + + +@pytest.mark.asyncio +async def test_raw_to_text_returns_none_on_encode_failure(): + """Test _raw_to_text returns None when cv2.imencode fails.""" + fake_frame = np.zeros((480, 640, 3), dtype=np.uint8) + + with ( + patch("inputs.plugins.vlm_ollama_local.IOProvider"), + patch("inputs.plugins.vlm_ollama_local.check_webcam", return_value=False), + patch( + "inputs.plugins.vlm_ollama_local.cv2.imencode", + return_value=(False, None), + ), + ): + config = VLM_Ollama_LocalConfig() + sensor = VLM_Ollama_Local(config=config) + + result = await sensor._raw_to_text(fake_frame) + assert result is None + + +@pytest.mark.asyncio +async def test_raw_to_text_returns_message_on_success(): + """Test _raw_to_text returns a Message when Ollama responds successfully.""" + fake_frame = np.zeros((480, 640, 3), dtype=np.uint8) + + mock_response = MagicMock() + mock_response.status = 200 + mock_response.json = AsyncMock( + return_value={"message": {"content": "I see a room with a chair."}} + ) + mock_response.__aenter__ = AsyncMock(return_value=mock_response) + mock_response.__aexit__ = AsyncMock(return_value=False) + + mock_session = MagicMock() + mock_session.post.return_value = mock_response + mock_session.__aenter__ = AsyncMock(return_value=mock_session) + mock_session.__aexit__ = AsyncMock(return_value=False) + + with ( + patch("inputs.plugins.vlm_ollama_local.IOProvider"), + patch("inputs.plugins.vlm_ollama_local.check_webcam", return_value=False), + patch( + "inputs.plugins.vlm_ollama_local.aiohttp.ClientSession", + return_value=mock_session, + ), + ): + config = VLM_Ollama_LocalConfig() + sensor = VLM_Ollama_Local(config=config) + + result = await sensor._raw_to_text(fake_frame) + + assert result is not None + assert isinstance(result, Message) + assert "chair" in result.message + + +@pytest.mark.asyncio +async def test_raw_to_text_returns_none_on_empty_response(): + """Test _raw_to_text returns None when Ollama returns empty content.""" + fake_frame = np.zeros((480, 640, 3), dtype=np.uint8) + + mock_response = MagicMock() + mock_response.status = 200 + mock_response.json = AsyncMock(return_value={"message": {"content": ""}}) + mock_response.__aenter__ = AsyncMock(return_value=mock_response) + mock_response.__aexit__ = AsyncMock(return_value=False) + + mock_session = MagicMock() + mock_session.post.return_value = mock_response + mock_session.__aenter__ = AsyncMock(return_value=mock_session) + mock_session.__aexit__ = AsyncMock(return_value=False) + + with ( + patch("inputs.plugins.vlm_ollama_local.IOProvider"), + patch("inputs.plugins.vlm_ollama_local.check_webcam", return_value=False), + patch( + "inputs.plugins.vlm_ollama_local.aiohttp.ClientSession", + return_value=mock_session, + ), + ): + config = VLM_Ollama_LocalConfig() + sensor = VLM_Ollama_Local(config=config) + + result = await sensor._raw_to_text(fake_frame) + assert result is None + + +@pytest.mark.asyncio +async def test_raw_to_text_returns_none_on_api_error(): + """Test _raw_to_text returns None when Ollama returns non-200 status.""" + fake_frame = np.zeros((480, 640, 3), dtype=np.uint8) + + mock_response = MagicMock() + 
mock_response.status = 500 + mock_response.__aenter__ = AsyncMock(return_value=mock_response) + mock_response.__aexit__ = AsyncMock(return_value=False) + + mock_session = MagicMock() + mock_session.post.return_value = mock_response + mock_session.__aenter__ = AsyncMock(return_value=mock_session) + mock_session.__aexit__ = AsyncMock(return_value=False) + + with ( + patch("inputs.plugins.vlm_ollama_local.IOProvider"), + patch("inputs.plugins.vlm_ollama_local.check_webcam", return_value=False), + patch( + "inputs.plugins.vlm_ollama_local.aiohttp.ClientSession", + return_value=mock_session, + ), + ): + config = VLM_Ollama_LocalConfig() + sensor = VLM_Ollama_Local(config=config) + + result = await sensor._raw_to_text(fake_frame) + assert result is None + + +@pytest.mark.asyncio +async def test_raw_to_text_returns_none_on_connection_error(): + """Test _raw_to_text returns None when Ollama is unreachable.""" + fake_frame = np.zeros((480, 640, 3), dtype=np.uint8) + + mock_session = MagicMock() + mock_session.post.side_effect = aiohttp.ClientConnectorError( + MagicMock(), MagicMock() + ) + mock_session.__aenter__ = AsyncMock(return_value=mock_session) + mock_session.__aexit__ = AsyncMock(return_value=False) + + with ( + patch("inputs.plugins.vlm_ollama_local.IOProvider"), + patch("inputs.plugins.vlm_ollama_local.check_webcam", return_value=False), + patch( + "inputs.plugins.vlm_ollama_local.aiohttp.ClientSession", + return_value=mock_session, + ), + ): + config = VLM_Ollama_LocalConfig() + sensor = VLM_Ollama_Local(config=config) + + result = await sensor._raw_to_text(fake_frame) + assert result is None + + +@pytest.mark.asyncio +async def test_raw_to_text_returns_none_on_timeout(): + """Test _raw_to_text returns None when Ollama request times out.""" + fake_frame = np.zeros((480, 640, 3), dtype=np.uint8) + + mock_session = MagicMock() + mock_session.post.side_effect = asyncio.TimeoutError() + mock_session.__aenter__ = AsyncMock(return_value=mock_session) + mock_session.__aexit__ = AsyncMock(return_value=False) + + with ( + patch("inputs.plugins.vlm_ollama_local.IOProvider"), + patch("inputs.plugins.vlm_ollama_local.check_webcam", return_value=False), + patch( + "inputs.plugins.vlm_ollama_local.aiohttp.ClientSession", + return_value=mock_session, + ), + ): + config = VLM_Ollama_LocalConfig() + sensor = VLM_Ollama_Local(config=config) + + result = await sensor._raw_to_text(fake_frame) + assert result is None + + +@pytest.mark.asyncio +async def test_raw_to_text_returns_none_on_unexpected_exception(): + """Test _raw_to_text returns None on unexpected exceptions.""" + fake_frame = np.zeros((480, 640, 3), dtype=np.uint8) + + mock_session = MagicMock() + mock_session.post.side_effect = RuntimeError("unexpected error") + mock_session.__aenter__ = AsyncMock(return_value=mock_session) + mock_session.__aexit__ = AsyncMock(return_value=False) + + with ( + patch("inputs.plugins.vlm_ollama_local.IOProvider"), + patch("inputs.plugins.vlm_ollama_local.check_webcam", return_value=False), + patch( + "inputs.plugins.vlm_ollama_local.aiohttp.ClientSession", + return_value=mock_session, + ), + ): + config = VLM_Ollama_LocalConfig() + sensor = VLM_Ollama_Local(config=config) + + result = await sensor._raw_to_text(fake_frame) + assert result is None + + +@pytest.mark.asyncio +async def test_raw_to_text_appends_to_buffer(): + """Test raw_to_text appends message to buffer on success.""" + fake_frame = np.zeros((480, 640, 3), dtype=np.uint8) + fake_message = Message(timestamp=123.456, message="I see a table.") + + with ( 
+ patch("inputs.plugins.vlm_ollama_local.IOProvider"), + patch("inputs.plugins.vlm_ollama_local.check_webcam", return_value=False), + patch.object( + VLM_Ollama_Local, + "_raw_to_text", + new=AsyncMock(return_value=fake_message), + ), + ): + config = VLM_Ollama_LocalConfig() + sensor = VLM_Ollama_Local(config=config) + + await sensor.raw_to_text(fake_frame) + assert len(sensor.messages) == 1 + assert sensor.messages[0].message == "I see a table." + + +@pytest.mark.asyncio +async def test_raw_to_text_does_not_append_on_none(): + """Test raw_to_text does not append when _raw_to_text returns None.""" + with ( + patch("inputs.plugins.vlm_ollama_local.IOProvider"), + patch("inputs.plugins.vlm_ollama_local.check_webcam", return_value=False), + patch.object( + VLM_Ollama_Local, + "_raw_to_text", + new=AsyncMock(return_value=None), + ), + ): + config = VLM_Ollama_LocalConfig() + sensor = VLM_Ollama_Local(config=config) + + await sensor.raw_to_text(None) + assert len(sensor.messages) == 0 + + +def test_formatted_latest_buffer_empty(): + """Test formatted_latest_buffer returns None when buffer is empty.""" + with ( + patch("inputs.plugins.vlm_ollama_local.IOProvider"), + patch("inputs.plugins.vlm_ollama_local.check_webcam", return_value=False), + ): + config = VLM_Ollama_LocalConfig() + sensor = VLM_Ollama_Local(config=config) + + result = sensor.formatted_latest_buffer() + assert result is None + + +def test_formatted_latest_buffer_returns_formatted_string(): + """Test formatted_latest_buffer returns correct format and clears buffer.""" + with ( + patch("inputs.plugins.vlm_ollama_local.IOProvider"), + patch("inputs.plugins.vlm_ollama_local.check_webcam", return_value=False), + ): + config = VLM_Ollama_LocalConfig() + sensor = VLM_Ollama_Local(config=config) + + test_message = Message( + timestamp=123.456, message="I see a person sitting on a chair." + ) + sensor.messages.append(test_message) + + result = sensor.formatted_latest_buffer() + + assert result is not None + assert isinstance(result, str) + assert "INPUT:" in result + assert "Vision" in result + assert "I see a person sitting on a chair." in result + assert "// START" in result + assert "// END" in result + assert len(sensor.messages) == 0 From 4af6ca3498a7ef890a462a2974cb28e63211abfa Mon Sep 17 00:00:00 2001 From: Wanbogang Date: Thu, 12 Mar 2026 12:52:27 +0700 Subject: [PATCH 2/2] fix: remove return type annotation from raw_to_text to match base class signature --- src/inputs/plugins/vlm_ollama_local.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/inputs/plugins/vlm_ollama_local.py b/src/inputs/plugins/vlm_ollama_local.py index a5c0c8c65..5e0a050d9 100644 --- a/src/inputs/plugins/vlm_ollama_local.py +++ b/src/inputs/plugins/vlm_ollama_local.py @@ -221,7 +221,7 @@ async def _raw_to_text(self, raw_input: Optional[np.ndarray]) -> Optional[Messag logging.error(f"VLM Ollama Local: unexpected error: {e}") return None - async def raw_to_text(self, raw_input: Optional[np.ndarray]) -> None: + async def raw_to_text(self, raw_input: Optional[np.ndarray]): """ Convert a camera frame to text and append to the message buffer.
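
For reviewers: the request the plugin issues is a standard Ollama /api/chat
multimodal call, so a local setup can be sanity-checked independently of the
plugin before wiring it into an agent. The snippet below is an illustrative,
standalone sketch and is not part of the patch; the image path "test.jpg" and
the hard-coded endpoint, model, prompt, and timeout are placeholders that
mirror the plugin's defaults.

    # Standalone sketch: send one image to a local Ollama multimodal model.
    # Assumes "ollama serve" is running on localhost:11434 with llava pulled;
    # "test.jpg" is an illustrative placeholder, not a file added by this patch.
    import asyncio
    import base64

    import aiohttp


    async def describe_image(path: str = "test.jpg") -> None:
        with open(path, "rb") as f:
            image_b64 = base64.b64encode(f.read()).decode("utf-8")

        payload = {
            "model": "llava",  # any pulled multimodal model, e.g. moondream
            "messages": [
                {
                    "role": "user",
                    "content": "Briefly describe what you see in one or two sentences.",
                    "images": [image_b64],
                }
            ],
            "stream": False,
        }

        timeout = aiohttp.ClientTimeout(total=30)
        async with aiohttp.ClientSession(timeout=timeout) as session:
            async with session.post(
                "http://localhost:11434/api/chat", json=payload
            ) as resp:
                resp.raise_for_status()
                result = await resp.json()
                # Ollama replies with {"message": {"content": "<description>"}, ...}
                print(result.get("message", {}).get("content", ""))


    if __name__ == "__main__":
        asyncio.run(describe_image())

If this prints a sensible description, the plugin's _raw_to_text path should
behave the same against that endpoint; connection refusals and slow models map
to the ClientConnectorError and TimeoutError branches handled in the plugin.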