From 7b455ffef3b53fcc7f3bc3d890f74c96092d2454 Mon Sep 17 00:00:00 2001
From: ShriyaRishab
Date: Wed, 25 Feb 2026 14:24:45 -0800
Subject: [PATCH] Freeze torch with PIP_CONSTRAINT for reproducibility

Signed-off-by: ShriyaRishab
---
Notes:
  The inline Python that writes /pip_torch_constraint.txt imports
  importlib.util explicitly. A bare `import importlib` does not guarantee
  that the `util` submodule is loaded, so importlib.util.find_spec() could
  raise AttributeError depending on what the interpreter imported at
  startup. The unused `sys` import is dropped, and the constraint lines are
  built with plain list comprehensions instead of append-by-side-effect.
  The number of added lines per hunk is unchanged.

  Blank context lines and continuation-line indentation in this patch were
  restored from the hunk line counts; if a hunk fails to apply, retry with
  `git apply --ignore-whitespace`.

 small_llm_pretraining/nemo/Dockerfile.b200 | 16 ++++++++++++++--
 small_llm_pretraining/nemo/Dockerfile.h200 | 14 ++++++++++++++
 2 files changed, 28 insertions(+), 2 deletions(-)

diff --git a/small_llm_pretraining/nemo/Dockerfile.b200 b/small_llm_pretraining/nemo/Dockerfile.b200
index 88153ac9d..9a14881ff 100644
--- a/small_llm_pretraining/nemo/Dockerfile.b200
+++ b/small_llm_pretraining/nemo/Dockerfile.b200
@@ -29,13 +29,23 @@ ENV CUSTOM_FROM_IMAGE_NAME ${FROM_IMAGE_NAME}
 # Custom libraries version
 WORKDIR /workspace/
 
-ENV PIP_CONSTRAINT=""
-
 RUN git config --global user.name "a" && \
     git config --global user.email "a"
 
 WORKDIR /workspace/
 
+# Freeze the NGC base image's torch stack so later pip installs can't downgrade it.
+# The base image ships custom NGC builds of torch/torchvision/torchaudio with
+# matching transformer_engine .so files; any version change breaks the ABI.
+RUN python3 -c "\
+import importlib.util; \
+pkgs = ['torch', 'torchvision', 'torchaudio']; \
+found = [p for p in pkgs if importlib.util.find_spec(p)]; \
+lines = [f'{p}=={importlib.import_module(p).__version__}' for p in found]; \
+open('/pip_torch_constraint.txt','w').write('\n'.join(lines)+'\n'); \
+print('--- Pinned torch packages ---'); [print(l) for l in lines]"
+ENV PIP_CONSTRAINT=/pip_torch_constraint.txt
+
 RUN pip install numcodecs==0.13.1
 
 ## 3. NeMo
@@ -95,6 +105,8 @@ RUN if [ "${MCORE_REVISION}" != SKIP ]; then \
 
 ENV PYTHONPATH "${PYTHONPATH}:/workspace/Megatron-LM"
 
+# Clear the torch constraint now that all installs are done.
+ENV PIP_CONSTRAINT=""
 
 WORKDIR /workspace/code
 
diff --git a/small_llm_pretraining/nemo/Dockerfile.h200 b/small_llm_pretraining/nemo/Dockerfile.h200
index 5cf32b704..c8301cc9b 100644
--- a/small_llm_pretraining/nemo/Dockerfile.h200
+++ b/small_llm_pretraining/nemo/Dockerfile.h200
@@ -37,6 +37,18 @@ RUN git config --global user.name "a" && \
 
 WORKDIR /workspace/
 
+# Freeze the NGC base image's torch stack so later pip installs can't downgrade it.
+# The base image ships custom NGC builds of torch/torchvision/torchaudio with
+# matching transformer_engine .so files; any version change breaks the ABI.
+RUN python3 -c "\
+import importlib.util; \
+pkgs = ['torch', 'torchvision', 'torchaudio']; \
+found = [p for p in pkgs if importlib.util.find_spec(p)]; \
+lines = [f'{p}=={importlib.import_module(p).__version__}' for p in found]; \
+open('/pip_torch_constraint.txt','w').write('\n'.join(lines)+'\n'); \
+print('--- Pinned torch packages ---'); [print(l) for l in lines]"
+ENV PIP_CONSTRAINT=/pip_torch_constraint.txt
+
 RUN pip install numcodecs==0.13.1
 
 ## 1. Apex
@@ -127,6 +139,8 @@ RUN if [ "${MCORE_REVISION}" != SKIP ]; then \
 
 ENV PYTHONPATH "${PYTHONPATH}:/workspace/Megatron-LM"
 
+# Clear the torch constraint now that all installs are done.
+ENV PIP_CONSTRAINT=""
 
 WORKDIR /workspace/code
 