From 1595e868db7c48d2e8fde0c923ef1086cfb41c3f Mon Sep 17 00:00:00 2001
From: Manuel Maly
Date: Mon, 2 Mar 2026 17:52:24 +0100
Subject: [PATCH] Add Qwen3.5 MoE compatibility path

---
 pyproject.toml                |  2 +-
 vllm_metal/v1/model_runner.py | 44 +++++++++++++++++++++++++++++++++++
 2 files changed, 45 insertions(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 763c53d..a054a18 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -29,7 +29,7 @@ classifiers = [
 dependencies = [
     # MLX - Required for Apple Silicon GPU acceleration
     "mlx>=0.29.2; platform_system == 'Darwin' and platform_machine == 'arm64'",
-    "mlx-lm>=0.28.4; platform_system == 'Darwin' and platform_machine == 'arm64'",
+    "mlx-lm>=0.30.8; platform_system == 'Darwin' and platform_machine == 'arm64'",
     "mlx-vlm>=0.3.0; platform_system == 'Darwin' and platform_machine == 'arm64'",  # Vision-language model support
     # Model loading and weights
     "transformers>=4.40.0",
diff --git a/vllm_metal/v1/model_runner.py b/vllm_metal/v1/model_runner.py
index 39118b8..0f158fd 100644
--- a/vllm_metal/v1/model_runner.py
+++ b/vllm_metal/v1/model_runner.py
@@ -11,8 +11,10 @@
 """
 
 import hashlib
+import importlib
 import math
 import os
+import sys
 import time
 from array import array
 from dataclasses import dataclass
@@ -61,6 +63,47 @@
 _model_cache: dict[str, tuple[Any, Any]] = {}  # model_name -> (model, tokenizer)
 _model_cache_lock = Lock()
 
+
+def _ensure_mlx_lm_model_aliases() -> None:
+    """Register temporary model-type aliases for mlx_lm compatibility.
+
+    Some newer model snapshots report a ``model_type`` that is not yet
+    available as a module in the currently bundled ``mlx_lm`` release.
+    In that case we register a best-effort import alias so ``mlx_lm`` can
+    still resolve the architecture.
+    """
+    alias_map = {
+        # Qwen3.5 MoE currently reuses the same implementation shape as qwen3_moe.
+        "qwen3_5_moe": "qwen3_moe",
+    }
+
+    for alias, target in alias_map.items():
+        alias_module = f"mlx_lm.models.{alias}"
+        target_module = f"mlx_lm.models.{target}"
+
+        if alias_module in sys.modules:
+            continue
+
+        # If mlx_lm already ships this architecture, do not override it.
+        try:
+            importlib.import_module(alias_module)
+            continue
+        except ModuleNotFoundError:
+            pass
+
+        try:
+            resolved = importlib.import_module(target_module)
+        except ModuleNotFoundError:
+            logger.debug(
+                "Skipping mlx_lm alias registration for %s (target %s missing)",
+                alias,
+                target,
+            )
+            continue
+
+        sys.modules[alias_module] = resolved
+        logger.info("Registered mlx_lm model alias: %s -> %s", alias, target)
+
 # Try to import Rust extension for high-performance token state management
 try:
     from vllm_metal._rs import RequestStateManager as RustRequestStateManager
@@ -645,6 +688,7 @@ def load_model(self) -> None:
             self._is_vlm = True
         else:
             # Load model and tokenizer using mlx_lm for text-only models
+            _ensure_mlx_lm_model_aliases()
             self.model, self.tokenizer = mlx_lm_load(
                 model_name,
                 tokenizer_config={