Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions src/megatron/bridge/models/conversion/auto_bridge.py
Original file line number Diff line number Diff line change
Expand Up @@ -884,6 +884,8 @@ def save_megatron_model(
hf_tokenizer_path: Optional[str | Path] = None,
low_memory_save: bool = False,
hf_tokenizer_kwargs: Optional[dict] = None,
fully_parallel_save: bool = True,
validate_access_integrity: bool = True,
) -> None:
"""
Save a Megatron model in native Megatron checkpoint format without optimizer
Expand All @@ -907,6 +909,10 @@ def save_megatron_model(
hf_tokenizer_kwargs: Optional dictionary of kwargs to pass to the HuggingFace tokenizer.
Common options include trust_remote_code=True for models with custom tokenizers,
or use_fast=True for models that require the fast tokenizer.
fully_parallel_save: If True (default), uses fully parallel save strategy which
requires all DP ranks to participate in collective operations. Set to False
when saving from contexts where not all ranks will enter the save path
(e.g., mixed training/inference worlds with non-colocated vLLM).
validate_access_integrity: If True (default), validates access integrity of the
distributed checkpoint during save. Set to False to skip this validation
(e.g., to reduce collective overhead when the checkpoint structure is known
to be constant across saves).

Example:
>>> # Save model checkpoint after conversion
Expand Down Expand Up @@ -942,6 +948,8 @@ def save_megatron_model(
hf_tokenizer_path=hf_tokenizer_path,
low_memory_save=low_memory_save,
hf_tokenizer_kwargs=hf_tokenizer_kwargs,
fully_parallel_save=fully_parallel_save,
validate_access_integrity=validate_access_integrity,
)

def load_megatron_model(
Expand Down
9 changes: 9 additions & 0 deletions src/megatron/bridge/training/model_load_save.py
Original file line number Diff line number Diff line change
Expand Up @@ -446,6 +446,9 @@ def save_megatron_model(
hf_tokenizer_path: Optional[Union[str, Path]] = None,
low_memory_save: bool = False,
hf_tokenizer_kwargs: Optional[dict] = None,
fully_parallel_save: bool = True,
validate_access_integrity: bool = True,
distributed_timeout_minutes: int = 10,
) -> None:
"""Save a Megatron model in native Megatron checkpoint format without optimizer state.

Expand All @@ -472,6 +475,10 @@ def save_megatron_model(
Default is False, preserving the model for further use.
hf_tokenizer_kwargs: Optional dictionary of kwargs to pass to the HuggingFace tokenizer.
Common options include trust_remote_code=True for models with custom tokenizers.
fully_parallel_save: If True (default), uses fully parallel save strategy which
requires all DP ranks to participate in collective operations. Set to False
when saving from contexts where not all ranks will enter the save path
(e.g., mixed training/inference worlds with non-colocated vLLM).
validate_access_integrity: If True (default), validates access integrity of the
distributed checkpoint during save. When False, the checkpoint structure is
assumed constant (sets ckpt_assume_constant_structure=True), skipping the
validation pass for faster saves.
distributed_timeout_minutes: Timeout in minutes for distributed operations
performed during the save. Defaults to 10.

Example:
>>> # Save model checkpoint
Expand Down Expand Up @@ -538,6 +545,8 @@ def save_megatron_model(
save_rng=False,
ckpt_format=ckpt_format,
dist_ckpt_optim_fully_reshardable=True,
fully_parallel_save=fully_parallel_save,
ckpt_assume_constant_structure=not validate_access_integrity,
),
dist=None,
)
Expand Down
Loading