vllm-project · ricky-chaoju · Mar 11, 2026
diff --git a/docs/qwen35.md b/docs/qwen35.md
@@ -0,0 +1,41 @@
+# Qwen3.5 Model Support
+
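+Qwen3.5 models (dense and MoE variants) are supported via the `[qwen35]` optional extra together with vLLM 0.17.0 or newer, which is installed separately (see [Installation](#installation)).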
+
+## Requirements
+
+| Dependency | Minimum | Why |
+|---|---|---|
+| mlx-lm | 0.31.0 | Native `qwen3_5` / `qwen3_5_moe` model modules |
+| mlx-vlm | 0.3.12 | Qwen3.5 VLM support |
+| transformers | 5.0.0 | Qwen3.5 config compatibility |
+| vllm | 0.17.0 | `Qwen3_5MoeForConditionalGeneration` model registry |
+
+## Installation
+
+```bash
+# Step 1: install vllm 0.17.0, overriding install.sh's default of 0.14.1 (required for the Qwen3.5 model registry)
+VLLM_VERSION=0.17.0 ./install.sh
+
+# Step 2: install the Qwen3.5 dependencies (mlx-lm, mlx-vlm, transformers) into the same environment
+pip install 'vllm-metal[qwen35]'
+```
+
+## Verified models
+
+| Model | Type | Tested |
+|---|---|---|
+| Qwen3.5-35B-A3B | MoE, multimodal | Yes |
+
+## Usage
+
+```bash
+vllm serve Qwen/Qwen3.5-35B-A3B --max-model-len 4096 --dtype auto
+```
+
+## Architecture notes
+
+Qwen3.5 is a hybrid model that interleaves `linear_attention` (Gated DeltaNet,
+a recurrent layer) and `full_attention` layers. Its per-layer cache therefore
+mixes `ArraysCache` (for linear attention) and `KVCache` (for full attention)
+entries. As of mlx-lm 0.31.0, both sequential and batched decode work correctly with this layout.
diff --git a/install.sh b/install.sh
@@ -123,16 +123,16 @@ main() {
 
   ensure_venv "$venv"
 
-  local vllm_v="0.14.1"
+  local vllm_v="${VLLM_VERSION:-0.14.1}"
   local url_base="https://github.com/vllm-project/vllm/releases/download"
   local filename="vllm-$vllm_v.tar.gz"
-  curl -OL $url_base/v$vllm_v/$filename
-  tar xf $filename
-  cd vllm-$vllm_v
+  curl -OL "$url_base/v$vllm_v/$filename"
+  tar xf "$filename"
+  cd "vllm-$vllm_v"
   uv pip install -r requirements/cpu.txt --index-strategy unsafe-best-match
   uv pip install .
   cd -
-  rm -rf vllm-$vllm_v*
+  rm -rf "vllm-$vllm_v"*
 
   if [[ -n "$local_lib" && -f "$local_lib" ]]; then
     uv pip install .

diff --git a/pyproject.toml b/pyproject.toml
@@ -44,6 +44,11 @@ dependencies = [
 
 [project.optional-dependencies]
 vllm = ["vllm>=0.14.0"]
+qwen35 = [  # NOTE: Qwen3.5 also needs vllm>=0.17.0, which this extra does not pin; install it via `VLLM_VERSION=0.17.0 ./install.sh` (see docs/qwen35.md)
+    "mlx-lm>=0.31.0; platform_system == 'Darwin' and platform_machine == 'arm64'",
+    "mlx-vlm>=0.3.12; platform_system == 'Darwin' and platform_machine == 'arm64'",
+    "transformers>=5.0.0",
+]
 stt = [
     # Speech-to-text audio processing (Whisper models)
     "librosa>=0.10.2",
@@ -56,7 +61,7 @@ dev = [
     "mypy>=1.19.1",
 ]
 all = [
-    "vllm-metal[vllm,stt,dev]",
+    "vllm-metal[vllm,stt,qwen35,dev]",
 ]
 
 [project.urls]