diff --git a/README.md b/README.md index 7489881d8a..78bae91a75 100644 --- a/README.md +++ b/README.md @@ -107,7 +107,7 @@ MaxText aims to provide you with the best OSS models, whether as a reference imp * Gemma 2 (2B, 9B, 27B) * Gemma 1 (2B, 7B) * Alibaba - * Qwen 2.5 (7B, 14B) + * Qwen 2.5 (1.5B, 7B, 14B) * Qwen 3 MoE 2507 (235B, 480B) * Qwen 3 MoE (30B, 235B) * Qwen 3 Dense (0.6B, 1.7B, 4B, 8B, 14B, 32B) diff --git a/docs/guides/checkpointing_solutions/convert_checkpoint.md b/docs/guides/checkpointing_solutions/convert_checkpoint.md index eba5fc7261..ca37c9f11b 100644 --- a/docs/guides/checkpointing_solutions/convert_checkpoint.md +++ b/docs/guides/checkpointing_solutions/convert_checkpoint.md @@ -11,7 +11,7 @@ The following models are supported: | **Gemma2** | 2B, 9B, 27B | √ | √ | √ | √ | | **Gemma3** (Multimodal) | 4B, 12B, 27B | √ | √ | √ | √ | | **Llama3.1** | 8B, 70B, 450B | √ | √ | √ | √ | -| **Qwen2.5** | 7B, 14B | √ | √ | √ | √ | +| **Qwen2.5** | 1.5B, 7B, 14B | √ | √ | √ | √ | | **Qwen3** | 0.6B, 4B, 8B, 14B, 32B | √ | √ | √ | √ | | **Qwen3 MoE** | 30B, 235B, 480B | √ | √ | √ | √ | | **Mixtral** | 8x7B, 8x22B | √ | √ | √ | √ | diff --git a/src/maxtext/checkpoint_conversion/utils/hf_model_configs.py b/src/maxtext/checkpoint_conversion/utils/hf_model_configs.py index 6103476d03..a9b7a18d23 100644 --- a/src/maxtext/checkpoint_conversion/utils/hf_model_configs.py +++ b/src/maxtext/checkpoint_conversion/utils/hf_model_configs.py @@ -1,16 +1,16 @@ -# Copyright 2023–2025 Google LLC +# Copyright 2023–2026 Google LLC # -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at # -# https://www.apache.org/licenses/LICENSE-2.0 +# https://www.apache.org/licenses/LICENSE-2.0 # -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. """ This config defines the architectural configurations of the Hugging Face version of a model. @@ -210,6 +210,22 @@ query_pre_attn_scalar=144, ) +qwen25_1_5b_config = transformers.Qwen2Config( + vocab_size=151936, + hidden_size=1536, + intermediate_size=8960, + num_hidden_layers=28, + num_attention_heads=12, + num_key_value_heads=2, + hidden_act="silu", + max_position_embeddings=32768, + rms_norm_eps=1e-06, + rope_theta=1000000.0, + tie_word_embeddings=True, + torch_dtype="bfloat16", + attention_bias=True, +) + qwen25_7b_config = transformers.Qwen2Config( vocab_size=152064, hidden_size=3584, @@ -866,6 +882,7 @@ "gemma3-4b": gemma3_4b_config, "gemma3-12b": gemma3_12b_config, "gemma3-27b": gemma3_27b_config, + "qwen2.5-1.5b": qwen25_1_5b_config, "qwen2.5-7b": qwen25_7b_config, "qwen2.5-14b": qwen25_14b_config, "qwen3-0.6b": qwen3_0_6b_config, diff --git a/src/maxtext/checkpoint_conversion/utils/hf_shape.py b/src/maxtext/checkpoint_conversion/utils/hf_shape.py index d934178c8d..125876739a 100644 --- a/src/maxtext/checkpoint_conversion/utils/hf_shape.py +++ b/src/maxtext/checkpoint_conversion/utils/hf_shape.py @@ -766,6 +766,7 @@ def MIXTRAL_HF_WEIGHTS_TO_SHAPE(config): "gemma3-4b": GEMMA3_HF_WEIGHTS_TO_SHAPE, "gemma3-12b": GEMMA3_HF_WEIGHTS_TO_SHAPE, "gemma3-27b": GEMMA3_HF_WEIGHTS_TO_SHAPE, + "qwen2.5-1.5b": QWEN_HF_WEIGHTS_TO_SHAPE, "qwen2.5-7b": QWEN_HF_WEIGHTS_TO_SHAPE, "qwen2.5-14b": QWEN_HF_WEIGHTS_TO_SHAPE, "qwen3-0.6b": QWEN_HF_WEIGHTS_TO_SHAPE, diff --git a/src/maxtext/checkpoint_conversion/utils/param_mapping.py b/src/maxtext/checkpoint_conversion/utils/param_mapping.py index 7e318d7fe5..1d1469f49c 100644 --- a/src/maxtext/checkpoint_conversion/utils/param_mapping.py +++ b/src/maxtext/checkpoint_conversion/utils/param_mapping.py @@ -2359,6 +2359,7 @@ def pad_hf_embedding_layer(input_tensor, target_shape): "gemma3-4b": GEMMA3_MAXTEXT_TO_HF_PARAM_MAPPING, "gemma3-12b": GEMMA3_MAXTEXT_TO_HF_PARAM_MAPPING, "gemma3-27b": GEMMA3_MAXTEXT_TO_HF_PARAM_MAPPING, + "qwen2.5-1.5b": QWEN_MAXTEXT_TO_HF_PARAM_MAPPING, "qwen2.5-7b": QWEN_MAXTEXT_TO_HF_PARAM_MAPPING, "qwen2.5-14b": QWEN_MAXTEXT_TO_HF_PARAM_MAPPING, "qwen3-0.6b": QWEN_MAXTEXT_TO_HF_PARAM_MAPPING, @@ -2399,6 +2400,7 @@ def pad_hf_embedding_layer(input_tensor, target_shape): "gemma3-4b": GEMMA3_MAXTEXT_TO_HF_PARAM_HOOK_FN, "gemma3-12b": GEMMA3_MAXTEXT_TO_HF_PARAM_HOOK_FN, "gemma3-27b": GEMMA3_MAXTEXT_TO_HF_PARAM_HOOK_FN, + "qwen2.5-1.5b": QWEN_MAXTEXT_TO_HF_PARAM_HOOK_FN, "qwen2.5-7b": QWEN_MAXTEXT_TO_HF_PARAM_HOOK_FN, "qwen2.5-14b": QWEN_MAXTEXT_TO_HF_PARAM_HOOK_FN, "qwen3-0.6b": QWEN_MAXTEXT_TO_HF_PARAM_HOOK_FN, diff --git a/src/maxtext/configs/models/qwen2.5-1.5b.yml b/src/maxtext/configs/models/qwen2.5-1.5b.yml new file mode 100644 index 0000000000..1ce9a8924d --- /dev/null +++ b/src/maxtext/configs/models/qwen2.5-1.5b.yml @@ -0,0 +1,34 @@ +# Copyright 2023–2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Qwen 2.5 1.5B Instruct Configuration +# https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct + +base_emb_dim: 1536 +base_num_query_heads: 12 +base_num_kv_heads: 2 +base_mlp_dim: 8960 +base_num_decoder_layers: 28 +head_dim: 128 +mlp_activations: ["silu", "linear"] +vocab_size: 151936 +decoder_block: "qwen2" +normalization_layer_epsilon: 1e-06 +rope_max_timescale: 1000000.0 +use_qk_norm: False +# Bias for q, k, v proj. +attention_bias: True +logits_via_embedding: True +normalize_embedding_logits: False +tokenizer_type: "huggingface" diff --git a/src/maxtext/configs/pyconfig_deprecated.py b/src/maxtext/configs/pyconfig_deprecated.py index 888a23b199..cfe182a53f 100644 --- a/src/maxtext/configs/pyconfig_deprecated.py +++ b/src/maxtext/configs/pyconfig_deprecated.py @@ -460,6 +460,7 @@ def validate_model_name(s: str) -> bool: "gemma3-4b", "gemma3-12b", "gemma3-27b", + "qwen2.5-1.5b", "qwen2.5-7b", "qwen2.5-14b", "qwen3-0.6b", diff --git a/src/maxtext/configs/types.py b/src/maxtext/configs/types.py index c98e4b363f..c4fce230a7 100644 --- a/src/maxtext/configs/types.py +++ b/src/maxtext/configs/types.py @@ -233,6 +233,7 @@ class ProfilerType(str, Enum): "gemma3-4b", "gemma3-12b", "gemma3-27b", + "qwen2.5-1.5b", "qwen2.5-7b", "qwen2.5-14b", "qwen3-0.6b", diff --git a/src/maxtext/utils/globals.py b/src/maxtext/utils/globals.py index 203d7a6165..1a8403a965 100644 --- a/src/maxtext/utils/globals.py +++ b/src/maxtext/utils/globals.py @@ -50,6 +50,7 @@ "gemma3-4b": "google/gemma-3-4b-it", # hf multi-modal should also support the pure-text "gemma3-12b": "google/gemma-3-12b-it", "gemma3-27b": "google/gemma-3-27b-it", + "qwen2.5-1.5b": "Qwen/Qwen2.5-1.5B-Instruct", "qwen2.5-7b": "Qwen/Qwen2.5-7B-Instruct", "qwen2.5-14b": "Qwen/Qwen2.5-14B-Instruct", "qwen3-0.6b": "Qwen/Qwen3-0.6B",