**README.md** — 28 additions, 0 deletions

@@ -211,6 +211,34 @@ If you're encountering a Lemonade connection error, it is likely due to the back
- Verify that Lemonade is configured to accept connections from all interfaces (`0.0.0.0`), not just localhost (`127.0.0.1`).
- Ensure that the port (default is 8000) is not blocked by your firewall.

### Image Upload Search with Local Ollama Vision

Vane can be extended to support **image upload → vision understanding → automatic search query generation** by pointing the `/api/vision` route to a local Ollama vision model.

A working setup looks like this:

```bash
docker run -d -p 3000:3000 \
-e OLLAMA_BASE_URL=http://host.docker.internal:11434/v1 \
-e OLLAMA_VISION_MODEL=qwen3-vl:latest \
-v vane-data:/home/vane/data \
--name vane vane-image-search:local
```

Configuration notes:

- `OLLAMA_BASE_URL` should point to an OpenAI-compatible Ollama endpoint.
- `OLLAMA_VISION_MODEL` must be a **vision-capable** model, for example `qwen3-vl:latest`.
- Plain text-only Ollama models will not work for image understanding.
- On Docker for Mac/Windows, `host.docker.internal` is usually the correct host.
- On Linux, replace it with your host machine IP if needed.
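The endpoint fallback described in these notes mirrors what the `/api/vision` route does internally. A minimal sketch of that resolution logic — the `resolveOllamaBaseURL` helper name is illustrative, not part of the PR:

```typescript
// Illustrative helper (not PR code): resolves the Ollama endpoint the same
// way the /api/vision route does, defaulting to the Docker Desktop host
// alias when OLLAMA_BASE_URL is unset.
function resolveOllamaBaseURL(env: { OLLAMA_BASE_URL?: string }): string {
  return env.OLLAMA_BASE_URL ?? 'http://host.docker.internal:11434/v1';
}

console.log(resolveOllamaBaseURL({}));
// On Linux you would typically pass an explicit host IP, e.g.
// resolveOllamaBaseURL({ OLLAMA_BASE_URL: 'http://192.168.1.10:11434/v1' })
```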

Once configured, uploading an image via the attach button will:

1. send the image to `/api/vision`
2. generate a concise search query from the image
3. automatically submit that query into Vane's existing text search flow
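The steps above can be sketched end-to-end with the network call abstracted away. All names here are illustrative, not the PR's actual component code; only the message prefix matches the one used in `Attach.tsx`:

```typescript
// Sketch of the upload → vision → search flow. VisionCall stands in for
// the POST to /api/vision, which responds with { query }.
type VisionCall = (image: Blob) => Promise<{ query?: string }>;

async function imageToSearchMessage(
  image: Blob,
  callVision: VisionCall,
): Promise<string | null> {
  const { query } = await callVision(image); // steps 1–2: vision model produces a query
  if (!query) return null; // no usable query → nothing to search
  return `Search for information based on this image: ${query}`; // step 3: submitted as text search
}
```

A stubbed `callVision` makes the flow easy to exercise without a running model.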

## Using as a Search Engine

If you wish to use Vane as an alternative to traditional search engines like Google or Bing, or if you want to add a shortcut for quick access from your browser's search bar, follow these steps:
**src/app/api/vision/route.ts** — new file, 50 additions

@@ -0,0 +1,50 @@
```ts
import { NextResponse } from 'next/server';
import OpenAI from 'openai';

export async function POST(req: Request) {
  try {
    const formData = await req.formData();
    const imageFile = formData.get('image') as File;

    if (!imageFile) {
      return NextResponse.json({ message: 'Missing image' }, { status: 400 });
    }

    const buffer = Buffer.from(await imageFile.arrayBuffer());
```
> **cubic-dev-ai (bot), Mar 20, 2026** — `src/app/api/vision/route.ts`, line 13
>
> P2: Uploaded image is read into memory and base64-encoded without any size/type validation, allowing oversized or invalid uploads to consume memory and be forwarded to the model.
```ts
    const base64Image = buffer.toString('base64');
    const mimeType = imageFile.type;

    const ollamaBaseURL = process.env.OLLAMA_BASE_URL || 'http://host.docker.internal:11434/v1';
    const model = process.env.OLLAMA_VISION_MODEL || 'qwen3-vl:latest';

    const openai = new OpenAI({
      apiKey: process.env.OPENAI_API_KEY || 'ollama',
      baseURL: ollamaBaseURL,
    });

    const response = await openai.chat.completions.create({
      model,
      messages: [
        {
          role: 'user',
          content: [
            {
              type: 'text',
              text: 'Analyze this image and output one concise search query suitable for a web search. Output only the query itself, with no explanation and no quotation marks.',
            },
            {
              type: 'image_url',
              image_url: { url: `data:${mimeType};base64,${base64Image}` },
            },
          ],
        },
      ],
    });

    const query = response.choices?.[0]?.message?.content?.toString().trim() || 'Describe the main subject of this image';
    return NextResponse.json({ query });
  } catch (error) {
    console.error('Vision Error:', error);
    return NextResponse.json({ message: 'Error processing image' }, { status: 500 });
  }
}
```
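One way to address the size/type concern raised in the review note above is a small guard before the image is buffered. This is a hedged sketch, not part of the PR; the 10 MB limit and the MIME allowlist are assumptions:

```typescript
// Hypothetical validation helper for /api/vision uploads. The limit and
// allowlist below are assumptions; adjust to taste.
const MAX_IMAGE_BYTES = 10 * 1024 * 1024; // 10 MB
const ALLOWED_TYPES = new Set(['image/png', 'image/jpeg', 'image/webp', 'image/gif']);

function isAcceptableImage(type: string, size: number): boolean {
  return ALLOWED_TYPES.has(type) && size > 0 && size <= MAX_IMAGE_BYTES;
}

console.log(isAcceptableImage('image/png', 1024));             // true
console.log(isAcceptableImage('text/plain', 1024));            // false
console.log(isAcceptableImage('image/png', 11 * 1024 * 1024)); // false
```

In the route, returning a 400 when `isAcceptableImage(imageFile.type, imageFile.size)` is false would keep oversized or non-image uploads out of the base64 encoding step entirely.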
**src/components/MessageInputActions/Attach.tsx** — 23 additions, 3 deletions

```diff
@@ -20,7 +20,7 @@
 import { AnimatePresence } from 'motion/react';
 import { motion } from 'framer-motion';
 
 const Attach = () => {
-  const { files, setFiles, setFileIds, fileIds } = useChat();
+  const { files, setFiles, setFileIds, fileIds, sendMessage } = useChat();
 
   const [loading, setLoading] = useState(false);
   const fileInputRef = useRef<any>();
@@ -29,10 +29,30 @@
     setLoading(true);
     const data = new FormData();
 
+    let hasImage = false;
+    let imageFile: File | null = null;
     for (let i = 0; i < e.target.files!.length; i++) {
+      if (e.target.files![i].type.startsWith('image/')) {
+        hasImage = true;
+        imageFile = e.target.files![i];
+      }
       data.append('files', e.target.files![i]);
     }
 
+    if (hasImage && imageFile) {
```
> **cubic-dev-ai (bot), Mar 20, 2026** — `src/components/MessageInputActions/Attach.tsx`, line 42
>
> P2: Early return on any selected image prevents uploading/attaching other selected files, causing mixed selections to be dropped.
>
> Suggested change:
>
> ```diff
> -    if (hasImage && imageFile) {
> +    if (hasImage && imageFile && e.target.files!.length === 1) {
> ```

```diff
+      const visionData = new FormData();
+      visionData.append('image', imageFile);
+      visionData.append('chat_model_provider_id', localStorage.getItem('chatModelProviderId') || '');
+      visionData.append('chat_model_key', localStorage.getItem('chatModelKey') || '');
+      const res = await fetch('/api/vision', { method: 'POST', body: visionData });
+      const resData = await res.json();
+      if (resData.query) {
+        sendMessage(`Search for information based on this image: ${resData.query}`);
+      }
+      setLoading(false);
+      return;
+    }
 
     const embeddingModelProvider = localStorage.getItem(
       'embeddingModelProviderId',
     );
```
```diff
@@ -94,7 +114,7 @@
         type="file"
         onChange={handleChange}
         ref={fileInputRef}
-        accept=".pdf,.docx,.txt"
+        accept=".pdf,.docx,.txt,image/*"
         multiple
         hidden
       />
@@ -157,7 +177,7 @@
         type="file"
         onChange={handleChange}
         ref={fileInputRef}
-        accept=".pdf,.docx,.txt"
+        accept=".pdf,.docx,.txt,image/*"
         multiple
         hidden
       />
```
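The mixed-selection issue flagged in the review comment could alternatively be handled by splitting the selection instead of early-returning. A sketch under that assumption — `partitionFiles` is a hypothetical helper, not PR code:

```typescript
// Hypothetical: split a selection so images can go to /api/vision while
// documents continue through the normal upload path, instead of being
// dropped by the early return.
function partitionFiles<T extends { type: string }>(files: T[]): { images: T[]; docs: T[] } {
  return {
    images: files.filter((f) => f.type.startsWith('image/')),
    docs: files.filter((f) => !f.type.startsWith('image/')),
  };
}

const { images, docs } = partitionFiles([
  { type: 'image/png' },
  { type: 'application/pdf' },
  { type: 'text/plain' },
]);
console.log(images.length, docs.length); // 1 2
```

The handler could then run the vision flow for `images[0]` and still submit `docs` as attachments.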