diff --git a/training/a4x/qwen3-235b/README.md b/training/a4x/qwen3-235b/README.md new file mode 100644 index 00000000..e184f139 --- /dev/null +++ b/training/a4x/qwen3-235b/README.md @@ -0,0 +1,9 @@ +# To run this recipe: + +1. Insert your Hugging Face token into `launch_script.sh` by filling in the empty `export HF_TOKEN=` line. +2. SSH into the Slurm login node +3. Copy this recipe folder to `/home/$USER/recipe` +4. From your home directory (`/home/$USER`), run: + ``` + sbatch recipe/sbatch_script.sh + ``` diff --git a/training/a4x/qwen3-235b/custom_setup_experiment.py b/training/a4x/qwen3-235b/custom_setup_experiment.py new file mode 100644 index 00000000..32173cbc --- /dev/null +++ b/training/a4x/qwen3-235b/custom_setup_experiment.py @@ -0,0 +1,233 @@ +import glob +import logging +import os +from pathlib import Path +import sys +import time +from typing import Any, Dict, List, Optional + +import nemo_run as run +from nemo_run.config import get_nemorun_home + + +try: + from argument_parser import parse_cli_args + from utils.evaluate import calc_convergence_and_performance + from utils.executors import dgxc_executor, slurm_executor +except (ImportError, ModuleNotFoundError): + from .argument_parser import parse_cli_args + from .utils.evaluate import calc_convergence_and_performance + from .utils.executors import dgxc_executor, slurm_executor + +try: + import wandb + + HAVE_WANDB = True +except (ImportError, ModuleNotFoundError): + HAVE_WANDB = False + +try: + from perf_plugins import NsysPlugin, PerfEnvPlugin + from resiliency_plugins import FaultTolerancePlugin +except (ImportError, ModuleNotFoundError): + from .perf_plugins import NsysPlugin, PerfEnvPlugin + from .resiliency_plugins import FaultTolerancePlugin + +import logging + + +SCRIPT_DIR = Path(__file__).parent.resolve() +ENTRYPOINT_PEFORMANCE = "run_script.py" +ENTRYPOINT_RECIPE = "run_recipe.py" + +logging.basicConfig(level=logging.DEBUG) +logger = logging.getLogger(__name__) + + +def main( + use_recipes: bool, + model_family_name: 
str, + model_recipe_name: str, + task: str, + compute_dtype: str, + gpu: str, + hf_token: str, + detach: bool, + dryrun: bool, + enable_vboost: bool, + enable_nsys: bool, + moe_a2a_overlap: bool, + tp_size: Optional[int], + pp_size: Optional[int], + cp_size: Optional[int], + wandb_key: str, + wandb_project_name: str, + wandb_experiment_name: str, + wandb_entity_name: str, + profiling_start_step: int, + profiling_stop_step: int, + profiling_gpu_metrics: bool, + profiling_ranks: Optional[List[int]], + nemo_home: str, + account: str, + partition: str, + log_dir: str, + gpus_per_node: int, + time_limit: str, + container_image: str, + custom_mounts: List[str], + custom_env_vars: List[str], + custom_srun_args: List[str], + pretrained_checkpoint: Optional[str], + num_gpus: int, + is_long_convergence_run: bool, + additional_slurm_params: Optional[Dict[str, Any]], + golden_values_path: str, + convergence_params: Dict[str, Any], + performance_params: Dict[str, Any], + max_retries: int, + dgxc_base_url: str, + dgxc_cluster: str, + dgxc_kube_apiserver_url: str, + dgxc_app_id: str, + dgxc_app_secret: str, + dgxc_project_name: str, + dgxc_pvc_claim_name: str, + dgxc_pvc_mount_path: str, +): + logger.info("Hello World") + + rank = os.environ['RANK'] + + exp_name = f"{model_recipe_name}_{model_family_name}" + exp_name += f'_worker{rank}' + if use_recipes: + script_name = ENTRYPOINT_RECIPE + + else: + script_name = ENTRYPOINT_PEFORMANCE + + run_script_path = SCRIPT_DIR / script_name + logger.info(f"Run script path: {run_script_path}") + if not run_script_path.is_file(): + logger.error(f"Specified run script not found: {run_script_path}") + sys.exit(1) + + nemorun_script = run.Script( + path=str(run_script_path), + entrypoint="python", + env={"PYTHONPATH": f"{SCRIPT_DIR}:$PYTHONPATH"}, + args=list(sys.argv[1:]), + ) + + plugins = [] + + if not use_recipes: + plugins.append( + PerfEnvPlugin( + enable_vboost=enable_vboost, + moe_a2a_overlap=moe_a2a_overlap, + tp_size=tp_size, + 
pp_size=pp_size, + cp_size=cp_size, + model_family_name=model_family_name, + model_recipe_name=model_recipe_name, + gpu=gpu, + compute_dtype=compute_dtype, + train_task=task, + ) + ) + + if enable_nsys: + plugins.append( + NsysPlugin( + profile_step_start=profiling_start_step, + profile_step_end=profiling_stop_step, + nsys_gpu_metrics=profiling_gpu_metrics, + profile_ranks=profiling_ranks, + ) + ) + + executor = run.LocalExecutor() + run.run( + nemorun_script, + executor=executor, + plugins=plugins, + dryrun=False, + detach=False, + name=exp_name, + ) + + +if __name__ == "__main__": + parser = parse_cli_args() + args, unknown_args = parser.parse_known_args() + + # probably better to use parser.parse_args() and make unknowns an error, + # but for now we'll just issue a warning. + if unknown_args: + logger.warning(f"Ignoring unrecognized arguments: {' '.join(unknown_args)}") + + main( + use_recipes=args.use_recipes, + model_family_name=args.model_family_name, + model_recipe_name=args.model_recipe_name, + task=args.task, + compute_dtype=args.compute_dtype, + gpu=args.gpu, + hf_token=args.hf_token, + detach=args.detach, + dryrun=args.dryrun, + enable_vboost=args.enable_vboost, + enable_nsys=args.enable_nsys, + moe_a2a_overlap=args.moe_a2a_overlap, + tp_size=args.tensor_model_parallel_size, + pp_size=args.pipeline_model_parallel_size, + cp_size=args.context_parallel_size, + wandb_key=args.wandb_key, + wandb_project_name=args.wandb_project_name, + wandb_experiment_name=args.wandb_experiment_name, + wandb_entity_name=args.wandb_entity_name, + profiling_start_step=args.profiling_start_step, + profiling_stop_step=args.profiling_stop_step, + profiling_gpu_metrics=args.profiling_gpu_metrics, + profiling_ranks=args.profiling_ranks, + nemo_home=args.nemo_home, + account=args.account, + partition=args.partition, + log_dir=args.log_dir, + gpus_per_node=args.gpus_per_node, + time_limit=args.time_limit, + container_image=args.container_image, + custom_mounts=args.custom_mounts, + 
custom_env_vars=args.custom_env_vars, + custom_srun_args=args.custom_srun_args, + pretrained_checkpoint=args.pretrained_checkpoint, + num_gpus=args.num_gpus, + is_long_convergence_run=args.is_long_convergence_run, + additional_slurm_params=args.additional_slurm_params, + golden_values_path=args.golden_values_path, + convergence_params={ + "correlation_threshold": args.correlation_threshold, + "high_loss_tolerance": args.high_loss_tolerance, + "medium_loss_tolerance": args.medium_loss_tolerance, + "low_loss_tolerance": args.low_loss_tolerance, + "final_loss_tolerance": args.final_loss_tolerance, + "max_outlier_ratio": args.max_outlier_ratio, + "outlier_threshold": args.outlier_threshold, + "skip_first_percent_loss": args.skip_first_percent_loss, + }, + performance_params={ + "timing_threshold": args.timing_threshold, + "skip_first_percent_time": args.skip_first_percent_time, + }, + max_retries=args.max_retries, + dgxc_base_url=args.dgxc_base_url, + dgxc_cluster=args.dgxc_cluster, + dgxc_kube_apiserver_url=args.dgxc_kube_apiserver_url, + dgxc_app_id=args.dgxc_app_id, + dgxc_app_secret=args.dgxc_app_secret, + dgxc_project_name=args.dgxc_project_name, + dgxc_pvc_claim_name=args.dgxc_pvc_claim_name, + dgxc_pvc_mount_path=args.dgxc_pvc_mount_path, + ) diff --git a/training/a4x/qwen3-235b/launch_script.sh b/training/a4x/qwen3-235b/launch_script.sh new file mode 100644 index 00000000..dfc0f282 --- /dev/null +++ b/training/a4x/qwen3-235b/launch_script.sh @@ -0,0 +1,151 @@ +usage() +{ +cat << EOF +usage: bash ./launcher.sh [config-override [config-override ...]] +config-override (Optional) A NeMo configuration override. E.g. trainer.max_steps=10000. 
+EOF +} + +parse_args() { + while [[ "$1" != "" ]]; do + case $(grep -o "=" <<< "$1" | wc -l) in + 1 ) + config_overrides+=("$1") + ;; + * ) + echo "Invalid config override: $1" + usage + exit 1 + esac + shift + done + config_overrides="${config_overrides[*]}" +} + +config_overrides=() +parse_args "$@" + +if [[ -z "${config_overrides[*]}" ]]; then + echo "No NeMo config overrides specified" +else + echo "NeMo config overrides:" + echo " ${config_overrides}" +fi + +export LD_LIBRARY_PATH="/usr/local/cuda/compat/lib:$NCCL_PLUGIN_PATH:$LD_LIBRARY_PATH" +ldconfig "$LD_LIBRARY_PATH" +echo "Added $LD_LIBRARY_PATH to ldconfig:" +ldconfig -p | grep libcuda | sed 's/^/ /' +echo "" + +if [[ -n "${EXPLICIT_LOG_DIR}" ]]; then + explicit_log_dir=${EXPLICIT_LOG_DIR} +else + explicit_log_dir=workload_logs +fi +echo "Logging to ${explicit_log_dir}" + +if [[ -n "${TOKENIZER_PATH}" ]]; then + echo "Getting tokenizer files" + cp "${TOKENIZER_PATH}"/* . + echo "" +fi + +echo "Launching Torch distributed on the node rank $JOB_COMPLETION_INDEX out of $NNODES nodes" + +pip install git+https://github.com/NVIDIA/dllogger#egg=dllogger + +# Create the nsys directory. 
+mkdir -p "${explicit_log_dir}/nsys" + +# Collect diagnostics to a single line +kv="\"kernel_version\": \"$(uname --kernel-release)\"" +if command -v nvidia-smi &> /dev/null; then + cuda_v=$(nvidia-smi -q -x | grep -Po '(?<=<cuda_version>).*(?=</cuda_version>)' || true) + driver_v=$(nvidia-smi -q -x | grep -Po '(?<=<driver_version>).*(?=</driver_version>)' || true) + vbios_v=$(nvidia-smi -q -x | grep -Po '(?<=<vbios_version>).*(?=</vbios_version>)' | head -n1 || true) + kv="${kv}, \"cuda_version\": \"${cuda_v}\"" + kv="${kv}, \"driver_version\": \"${driver_v}\"" + kv="${kv}, \"vbios_version\": \"${vbios_v}\"" +fi +echo "VERSION_DIAGNOSTICS: {${kv}}" + + +export HF_TOKEN= + +cd /opt +rm -rf Megatron-Bridge +git clone https://github.com/NVIDIA-NeMo/Megatron-Bridge.git +cd Megatron-Bridge +git checkout 7695d4acbfac19353d20e456509117efe4733d6b +sed -i -e '/pretrain(config=recipe/i \ recipe.dist.distributed_timeout_minutes = 10' scripts/performance/run_script.py +ls + +cp $CUSTOM_SETUP_EXPERIMENT_SCRIPT_PATH scripts/performance/ + +worker_command=$(cat <<- EOM + if [ "\$RANK" -eq "0" ]; then + echo "Worker 0 is stalling for a few seconds.." 
; + sleep 3 ; + echo "The detected environment within worker rank 0 is:" ; + env | sed 's/^/ /' ; + fi ; + + cd /opt/Megatron-Bridge ; + + numactl \ + --cpunodebind=\$((LOCAL_RANK/2)) \ + --membind=\$((LOCAL_RANK/2)) nsys profile \ + -t nvtx,cuda \ + --cuda-event-trace=false \ + --sample=none \ + --capture-range=cudaProfilerApi \ + --capture-range-end=stop \ + --kill none \ + -o "/${explicit_log_dir}/$JOB_IDENTIFIER/rank-\$RANK" \ + --force-overwrite true \ + --session-new "nsys-\$RANDOM-\$RANK" \ + nice -10 \ + python scripts/performance/custom_setup_experiment.py \ + --gpu gb200 \ + --model_family_name qwen \ + --model_recipe_name qwen3_235b_a22b \ + --gpus_per_node 4 \ + --num_gpus 64 \ + --seq_length 4096 \ + --compute_dtype bf16 \ + --global_batch_size 1024 \ + --tensor_model_parallel_size 1 \ + --pipeline_model_parallel_size 8 \ + --context_parallel_size 1 \ + --expert_model_parallel_size 8 \ + --virtual_pipeline_model_parallel_size 3 \ + --micro_batch_size 1 \ + --cuda_graph_impl transformer_engine \ + --cuda_graph_scope moe_router,moe_preprocess,attn \ + --max_steps 30 + +EOM +) + +echo "$worker_command" > worker_command.sh +chmod 777 worker_command.sh + +torchrun \ +--nproc-per-node="4" \ +--nnodes="16" \ +--node_rank="${JOB_COMPLETION_INDEX}" \ +--rdzv_id="${JOB_IDENTIFIER}" \ +--master_addr="${MASTER_ADDR}" \ +--master_port="${MASTER_PORT}" \ +--no-python bash worker_command.sh + + +if [[ "$JOB_COMPLETION_INDEX" == "0" ]]; then + mkdir -p "${ARTIFACT_DIR}" + cp -r "${explicit_log_dir}"/* "${ARTIFACT_DIR}/" + env > "${ARTIFACT_DIR}/environ.txt" + ls "${ARTIFACT_DIR}" +fi +echo "Training completed" +echo "Pod on $(hostname --fqdn) is exiting" \ No newline at end of file diff --git a/training/a4x/qwen3-235b/sbatch_script.sh b/training/a4x/qwen3-235b/sbatch_script.sh new file mode 100644 index 00000000..6fa2dadc --- /dev/null +++ b/training/a4x/qwen3-235b/sbatch_script.sh @@ -0,0 +1,54 @@ +#!/bin/bash +#SBATCH --job-name=rishabhbaghel-ubench-6uxs +#SBATCH 
--nodes=16 +#SBATCH --ntasks-per-node=1 +#SBATCH --gres=gpu:4 +#SBATCH --mem=0 + +# Exit early on failures +set -e + +# Validate that the recipe location is setup correctly. +# Recipe is expected to be in "recipe" folder inside current working directory +RECIPE_DIR="$(pwd)/recipe" +LAUNCH_SCRIPT="${RECIPE_DIR}/launch_script.sh" +if [[ ! -f "${LAUNCH_SCRIPT}" ]]; then + echo "Error: Recipe is not located correctly. The recipe is expected to be in "recipe" folder inside current working directory. We could not find the launch script there." >&2 + exit 1 +fi +chmod +x "${LAUNCH_SCRIPT}" + +# Enroot the image if it is not already enrooted. +export ENROOT_CONFIG_PATH=${HOME}/.config/enroot +ORIG_IMAGE=nvcr.io#nvidia/nemo:25.11 +SQSH_IMAGE_PATH=${RECIPE_DIR}/sqsh/nvcr.io_nvidia_nemo:25.11 +if [[ ! -f "${SQSH_IMAGE_PATH}" ]]; then + mkdir -p "$(dirname "${SQSH_IMAGE_PATH}")" + echo "enrooting $ORIG_IMAGE to ${SQSH_IMAGE_PATH}" + enroot import --output "${SQSH_IMAGE_PATH}" -- "docker://${ORIG_IMAGE}" +fi + +# get the master node +master_addr=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) +master_port=29500 + +ARTIFACT_DIR_HOME="/home/$USER/job_artifacts/${SLURM_JOB_ID}" +mkdir -p "$ARTIFACT_DIR_HOME" + +export NNODES=$SLURM_NNODES +export MASTER_ADDR=$master_addr +export MASTER_PORT=$master_port +export ARTIFACT_DIR=/artifacts +export JOB_NAME=rishabhbaghel-ubench-6uxs +export JOB_IDENTIFIER=rishabhbaghel-ubench-6uxs +export CUSTOM_SETUP_EXPERIMENT_SCRIPT_PATH=/recipe/custom_setup_experiment.py + + +export PMIX_MCA_gds="^ds12" +export GLOO_SOCKET_IFNAME=enp0s3 + +srun --container-image="$SQSH_IMAGE_PATH" \ + --container-mounts="${RECIPE_DIR}:/recipe:mkdir,${ARTIFACT_DIR_HOME}:${ARTIFACT_DIR}:mkdir,/usr/local/gib:/usr/local/gib" \ + --container-workdir=/recipe \ + --container-writable \ + bash -c 'export JOB_COMPLETION_INDEX=$SLURM_NODEID; ./launch_script.sh'