From 1595e868db7c48d2e8fde0c923ef1086cfb41c3f Mon Sep 17 00:00:00 2001
From: Manuel Maly
Date: Mon, 2 Mar 2026 17:52:24 +0100
Subject: [PATCH] Add Qwen3.5 MoE compatibility path

---
 pyproject.toml                |  2 +-
 vllm_metal/v1/model_runner.py | 44 +++++++++++++++++++++++++++++++++++
 2 files changed, 45 insertions(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 763c53d..a054a18 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -29,7 +29,7 @@ classifiers = [
 dependencies = [
     # MLX - Required for Apple Silicon GPU acceleration
     "mlx>=0.29.2; platform_system == 'Darwin' and platform_machine == 'arm64'",
-    "mlx-lm>=0.28.4; platform_system == 'Darwin' and platform_machine == 'arm64'",
+    "mlx-lm>=0.30.8; platform_system == 'Darwin' and platform_machine == 'arm64'",
     "mlx-vlm>=0.3.0; platform_system == 'Darwin' and platform_machine == 'arm64'",  # Vision-language model support
     # Model loading and weights
     "transformers>=4.40.0",
diff --git a/vllm_metal/v1/model_runner.py b/vllm_metal/v1/model_runner.py
index 39118b8..0f158fd 100644
--- a/vllm_metal/v1/model_runner.py
+++ b/vllm_metal/v1/model_runner.py
@@ -11,8 +11,10 @@
 """
 
 import hashlib
+import importlib
 import math
 import os
+import sys
 import time
 from array import array
 from dataclasses import dataclass
@@ -61,6 +63,47 @@
 _model_cache: dict[str, tuple[Any, Any]] = {}  # model_name -> (model, tokenizer)
 _model_cache_lock = Lock()
 
+
+def _ensure_mlx_lm_model_aliases() -> None:
+    """Register temporary model-type aliases for mlx_lm compatibility.
+
+    Some newer model snapshots report a ``model_type`` that is not yet
+    available as a module in the currently bundled ``mlx_lm`` release.
+    In that case we register a best-effort import alias so ``mlx_lm`` can
+    still resolve the architecture.
+    """
+    alias_map = {
+        # Qwen3.5 MoE currently reuses the same implementation shape as qwen3_moe.
+        "qwen3_5_moe": "qwen3_moe",
+    }
+
+    for alias, target in alias_map.items():
+        alias_module = f"mlx_lm.models.{alias}"
+        target_module = f"mlx_lm.models.{target}"
+
+        if alias_module in sys.modules:
+            continue
+
+        # If mlx_lm already ships this architecture, do not override it.
+        try:
+            importlib.import_module(alias_module)
+            continue
+        except ModuleNotFoundError:
+            pass
+
+        try:
+            resolved = importlib.import_module(target_module)
+        except ModuleNotFoundError:
+            logger.debug(
+                "Skipping mlx_lm alias registration for %s (target %s missing)",
+                alias,
+                target,
+            )
+            continue
+
+        sys.modules[alias_module] = resolved
+        logger.info("Registered mlx_lm model alias: %s -> %s", alias, target)
+
 # Try to import Rust extension for high-performance token state management
 try:
     from vllm_metal._rs import RequestStateManager as RustRequestStateManager
@@ -645,6 +688,7 @@ def load_model(self) -> None:
             self._is_vlm = True
         else:
             # Load model and tokenizer using mlx_lm for text-only models
+            _ensure_mlx_lm_model_aliases()
             self.model, self.tokenizer = mlx_lm_load(
                 model_name,
                 tokenizer_config={