diff --git a/docker/patch/latest/sglang.patch b/docker/patch/latest/sglang.patch index 5c6c7fa6d..741e1252d 100644 --- a/docker/patch/latest/sglang.patch +++ b/docker/patch/latest/sglang.patch @@ -3098,6 +3098,14 @@ index d641826e3..3abc39ef3 100644 hidden_states, residual = layer( positions, hidden_states, +@@ -1112,6 +1117,7 @@ class Qwen3VLForConditionalGeneration(nn.Module): + if "visual" in name: + # adapt to VisionAttention + name = name.replace(r"attn.qkv.", r"attn.qkv_proj.") ++ name = name.replace(r"model.visual.", r"visual.") + + try: + # Skip loading extra bias for GPTQ models. diff --git a/python/sglang/srt/multimodal/processors/glm4v.py b/python/sglang/srt/multimodal/processors/glm4v.py index 33cce6fe2..0970c4550 100644 --- a/python/sglang/srt/multimodal/processors/glm4v.py