diff --git a/docs/qwen35.md b/docs/qwen35.md new file mode 100644 index 00000000..fbc55214 --- /dev/null +++ b/docs/qwen35.md @@ -0,0 +1,41 @@ +# Qwen3.5 Model Support + +Qwen3.5 models (dense and MoE variants) are supported via the `[qwen35]` optional extra together with vLLM 0.17.0 or newer, which the extra does not install (see Installation). + +## Requirements + +| Dependency | Minimum | Why | +|---|---|---| +| mlx-lm | 0.31.0 | Native `qwen3_5` / `qwen3_5_moe` model modules | +| mlx-vlm | 0.3.12 | Qwen3.5 VLM support | +| transformers | 5.0.0 | Qwen3.5 config compatibility | +| vllm | 0.17.0 | `Qwen3_5MoeForConditionalGeneration` model registry | + +## Installation + +```bash +# Step 1: install vllm 0.17.0 (required for Qwen3.5 model registry) +VLLM_VERSION=0.17.0 ./install.sh + +# Step 2: install Qwen3.5 dependencies +pip install 'vllm-metal[qwen35]' +``` + +## Verified models + +| Model | Type | Tested | +|---|---|---| +| Qwen3.5-35B-A3B | MoE, multimodal | Yes | + +## Usage + +```bash +vllm serve Qwen/Qwen3.5-35B-A3B --max-model-len 4096 --dtype auto +``` + +## Architecture notes + +Qwen3.5 is a hybrid model that interleaves `linear_attention` layers (Gated DeltaNet, a recurrent Mamba/SSM-style layer) +with `full_attention` layers. Its cache therefore contains a mix of `ArraysCache` +(for linear attention) and `KVCache` (for full attention). As of mlx-lm +0.31.0, both sequential and batched decode work correctly with this layout. diff --git a/install.sh b/install.sh index a7344266..ef4415d5 100755 --- a/install.sh +++ b/install.sh @@ -123,16 +123,16 @@ main() { ensure_venv "$venv" - local vllm_v="0.14.1" + local vllm_v="${VLLM_VERSION:-0.14.1}" local url_base="https://github.com/vllm-project/vllm/releases/download" local filename="vllm-$vllm_v.tar.gz" - curl -OL $url_base/v$vllm_v/$filename - tar xf $filename - cd vllm-$vllm_v + curl -OL "$url_base/v$vllm_v/$filename" + tar xf "$filename" + cd "vllm-$vllm_v" uv pip install -r requirements/cpu.txt --index-strategy unsafe-best-match uv pip install .
cd - - rm -rf vllm-$vllm_v* + rm -rf "vllm-$vllm_v"* if [[ -n "$local_lib" && -f "$local_lib" ]]; then uv pip install . diff --git a/pyproject.toml b/pyproject.toml index 846e42f0..0dbebdfc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -44,6 +44,11 @@ dependencies = [ [project.optional-dependencies] vllm = ["vllm>=0.14.0"] +qwen35 = [ + "mlx-lm>=0.31.0; platform_system == 'Darwin' and platform_machine == 'arm64'", + "mlx-vlm>=0.3.12; platform_system == 'Darwin' and platform_machine == 'arm64'", + "transformers>=5.0.0", +] stt = [ # Speech-to-text audio processing (Whisper models) "librosa>=0.10.2", @@ -56,7 +61,7 @@ dev = [ "mypy>=1.19.1", ] all = [ - "vllm-metal[vllm,stt,dev]", + "vllm-metal[vllm,stt,qwen35,dev]", ] [project.urls]