diff --git a/src/megatron/bridge/models/conversion/auto_bridge.py b/src/megatron/bridge/models/conversion/auto_bridge.py
index e1df1c5b97..8b215e40e8 100644
--- a/src/megatron/bridge/models/conversion/auto_bridge.py
+++ b/src/megatron/bridge/models/conversion/auto_bridge.py
@@ -884,6 +884,8 @@ def save_megatron_model(
         hf_tokenizer_path: Optional[str | Path] = None,
         low_memory_save: bool = False,
         hf_tokenizer_kwargs: Optional[dict] = None,
+        fully_parallel_save: bool = True,
+        validate_access_integrity: bool = True,
     ) -> None:
         """
         Save a Megatron model in native Megatron checkpoint format without optimizer
@@ -907,6 +909,12 @@ def save_megatron_model(
             hf_tokenizer_kwargs: Optional dictionary of kwargs to pass to the HuggingFace
                 tokenizer. Common options include trust_remote_code=True for models with custom
                 tokenizers, or use_fast=True for models that require the fast tokenizer.
+            fully_parallel_save: If True (default), uses fully parallel save strategy which
+                requires all DP ranks to participate in collective operations. Set to False
+                when saving from contexts where not all ranks will enter the save path
+                (e.g., mixed training/inference worlds with non-colocated vLLM).
+            validate_access_integrity: If True (default), keeps checkpoint structure
+                validation enabled during save; forwarded to the underlying save.
 
         Example:
             >>> # Save model checkpoint after conversion
@@ -942,6 +950,8 @@ def save_megatron_model(
             hf_tokenizer_path=hf_tokenizer_path,
             low_memory_save=low_memory_save,
             hf_tokenizer_kwargs=hf_tokenizer_kwargs,
+            fully_parallel_save=fully_parallel_save,
+            validate_access_integrity=validate_access_integrity,
         )
 
     def load_megatron_model(
diff --git a/src/megatron/bridge/training/model_load_save.py b/src/megatron/bridge/training/model_load_save.py
index 5265ac7792..9ff7629a18 100644
--- a/src/megatron/bridge/training/model_load_save.py
+++ b/src/megatron/bridge/training/model_load_save.py
@@ -446,6 +446,9 @@ def save_megatron_model(
     hf_tokenizer_path: Optional[Union[str, Path]] = None,
     low_memory_save: bool = False,
     hf_tokenizer_kwargs: Optional[dict] = None,
+    fully_parallel_save: bool = True,
+    validate_access_integrity: bool = True,
+    distributed_timeout_minutes: int = 10,
 ) -> None:
     """Save a Megatron model in native Megatron checkpoint format without optimizer
     state.
@@ -472,6 +475,14 @@ def save_megatron_model(
             Default is False, preserving the model for further use.
         hf_tokenizer_kwargs: Optional dictionary of kwargs to pass to the HuggingFace
             tokenizer. Common options include trust_remote_code=True for models with custom tokenizers.
+        fully_parallel_save: If True (default), uses fully parallel save strategy which
+            requires all DP ranks to participate in collective operations. Set to False
+            when saving from contexts where not all ranks will enter the save path
+            (e.g., mixed training/inference worlds with non-colocated vLLM).
+        validate_access_integrity: If True (default), keeps structure validation
+            enabled; False maps to ckpt_assume_constant_structure=True.
+        distributed_timeout_minutes: Timeout, in minutes, for distributed
+            operations during the save. Default is 10.
 
     Example:
         >>> # Save model checkpoint
@@ -538,6 +549,8 @@ def save_megatron_model(
             save_rng=False,
             ckpt_format=ckpt_format,
             dist_ckpt_optim_fully_reshardable=True,
+            fully_parallel_save=fully_parallel_save,
+            ckpt_assume_constant_structure=not validate_access_integrity,
         ),
         dist=None,
     )