Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 5 additions & 28 deletions .azure-pipelines/integration-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,45 +26,22 @@ pr:
- '**/*.md'

jobs:
- job: IntegrationTestA100
displayName: Integration test A100
- job: sglangtest
displayName: SGLANG Test
strategy:
matrix:
cuda11:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda11.8
cuda12:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
sglang:
containerImage: lmsysorg/sglang:latest

pool:
name: msccl-ci
container:
image: $(containerImage)

steps:
- template: templates/integration-test.yaml
- template: templates/sglang-test.yaml
parameters:
subscription: mscclpp-ci
vmssName: mscclpp-ci
sshKeySecureFile: mscclpp.pem
gpuArch: '80'

- job: IntegrationTestH100
displayName: Integration test H100
strategy:
matrix:
cuda12:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9

pool:
name: msccl-ci-h100
container:
image: $(containerImage)

steps:
- template: templates/integration-test.yaml
parameters:
subscription: mscclpp-ci-h100
vmssName: mscclpp-h100-ci
sshKeySecureFile: mscclpp.pem
perfBaselineFile: test/deploy/perf_ndmv5.jsonl
gpuArch: '90'
70 changes: 70 additions & 0 deletions .azure-pipelines/sglang-test.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
# CI trigger: pushes to main and release/* branches start this pipeline,
# except for changes under the excluded paths listed below.
trigger:
  branches:
    include:
    - main
    - 'release/*'
  paths:
    exclude:
    - '.devcontainer/**'
    - '.github/**'
    - 'docker/**'
    - 'docs/**'
    - '**/*.md'

# PR trigger: pull requests targeting main and release/* branches start this
# pipeline; draft PRs are skipped (drafts: false), as are the excluded paths.
pr:
  branches:
    include:
    - main
    - 'release/*'
  drafts: false
  paths:
    exclude:
    - '.devcontainer/**'
    - '.github/**'
    - 'docker/**'
    - 'docs/**'
    - '**/*.md'

# NOTE(review): this file is named sglang-test.yml but its jobs run the
# integration-test template -- confirm the file/pipeline naming is intended.
jobs:
# Integration test on the msccl-ci pool. The matrix runs the job once per
# container image (CUDA 11.8 and CUDA 12.9 dev images), built for GPU arch 80.
- job: IntegrationTestA100
  displayName: Integration test A100
  strategy:
    matrix:
      cuda11:
        containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda11.8
      cuda12:
        containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9

  pool:
    name: msccl-ci
  # The image is selected by the matrix entry above.
  container:
    image: $(containerImage)

  steps:
  - template: templates/integration-test.yaml
    parameters:
      subscription: mscclpp-ci
      vmssName: mscclpp-ci
      sshKeySecureFile: mscclpp.pem
      gpuArch: '80'

# Integration test on the msccl-ci-h100 pool. Only the CUDA 12.9 dev image is
# used; the build targets GPU arch 90 and uses a dedicated perf baseline file.
- job: IntegrationTestH100
  displayName: Integration test H100
  strategy:
    matrix:
      cuda12:
        containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9

  pool:
    name: msccl-ci-h100
  # The image is selected by the matrix entry above.
  container:
    image: $(containerImage)

  steps:
  - template: templates/integration-test.yaml
    parameters:
      subscription: mscclpp-ci-h100
      vmssName: mscclpp-h100-ci
      sshKeySecureFile: mscclpp.pem
      perfBaselineFile: test/deploy/perf_ndmv5.jsonl
      gpuArch: '90'
110 changes: 110 additions & 0 deletions .azure-pipelines/templates/sglang-test.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
# Template parameters supplied by the calling pipeline.
parameters:
# Azure service connection used by the AzureCLI tasks.
- name: subscription
  type: string
# Name of the VM scale set that is started before and deallocated after the test.
- name: vmssName
  type: string
# Name of the secure file holding the SSH private key.
- name: sshKeySecureFile
  type: string
# NOTE(review): not referenced by any step visible in this template; kept so
# callers that pass it continue to work.
- name: perfBaselineFile
  type: string
  default: 'test/deploy/perf_ndmv4.jsonl'
# Value passed to CMake as MSCCLPP_GPU_ARCHS.
- name: gpuArch
  type: string

steps:
# Build MSCCL++ (Release, CUDA, tests enabled) for the requested GPU arch, then
# install the Python package and its CUDA 12 Python requirements.
- task: Bash@3
  name: BuildMSCCLPP
  displayName: Build MSCCL++
  inputs:
    targetType: inline
    script: |
      # Stop at the first failing command; otherwise only the exit status of the
      # last command decides whether the step fails.
      set -e
      # -p keeps the step working when the workspace already has a build dir.
      mkdir -p build && cd build
      cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_BUILD_TESTS=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} ..
      make -j
      cd ..
      pip install .
      # A requirements file must be passed with -r; without it pip tries to
      # install the .txt file itself as a package and fails.
      pip install -r ./python/requirements_cuda12.txt
    workingDirectory: '$(System.DefaultWorkingDirectory)'

# Clone the SGLang fork at branch release/v0.5.7 and install its Python package
# in editable mode.
# NOTE(review): the clone URL points at a personal fork -- confirm this is the
# intended source for CI.
- task: Bash@3
  name: SGLangSetup
  displayName: SGLang Setup
  inputs:
    targetType: inline
    script: |
      # Stop at the first failing command so a failed clone or install fails the step.
      set -e
      git clone -b release/v0.5.7 https://github.com/caiomcbr/sglang.git
      cd sglang
      # Fixed typo: the option is --upgrade (pip rejects the unknown --uprade).
      pip install --upgrade pip
      pip install -e "python"

# Install the agent-side tools used by later steps: the pssh package (provides
# parallel-ssh) and the Azure CLI via Microsoft's Debian install script.
- task: Bash@3
  name: InstallPackages
  displayName: Install Packages
  inputs:
    targetType: inline
    script: |
      sudo apt-get update -y
      sudo apt-get install pssh -y
      curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash

# Download the SSH private key from the secure-files library. The step name
# (SshKeyFile) is what later scripts use to locate the downloaded file.
- task: DownloadSecureFile@1
  name: SshKeyFile
  displayName: Download key file
  inputs:
    secureFile: ${{ parameters.sshKeySecureFile }}

# Start the VM scale set named by the vmssName parameter in the mscclpp
# resource group.
- task: AzureCLI@2
  name: StartVMSS
  displayName: Start VMSS
  inputs:
    azureSubscription: ${{ parameters.subscription }}
    scriptType: bash
    scriptLocation: inlineScript
    inlineScript: |
      az vmss start --name ${{ parameters.vmssName }} --resource-group mscclpp

# Run the repository's deploy script in single-node-test mode from the
# checkout root.
- task: Bash@3
  name: DeployTestEnv
  displayName: Deploy Test Env
  inputs:
    targetType: filePath
    filePath: test/deploy/deploy.sh
    arguments: 'single-node-test'
    workingDirectory: '$(System.DefaultWorkingDirectory)'

# Run the mscclpp allgather perf test on the hosts listed in hostfile_ci.
# parallel-ssh executes the commands inside the mscclpp-test container on each
# host; the test binary is run four times (default kernel, then -k 1, 2, 3).
- task: Bash@3
  name: AllGatherTest
  displayName: Run mscclpp AllGather test
  inputs:
    targetType: inline
    script: |
      set -e
      HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
      SSH_OPTION="StrictHostKeyChecking=no"
      # Presumably the path exported by the DownloadSecureFile step named
      # SshKeyFile -- confirm against that step.
      KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
      # parallel-ssh (-o .) writes each host's output to a file named after the
      # host entry; create it empty and follow it so the output shows up live.
      : > azureuser@10.0.0.4
      tail -f azureuser@10.0.0.4 &
      CHILD_PID=$!
      # Stop the log follower on every exit path. With set -e a failing
      # parallel-ssh would otherwise exit before a trailing kill is reached,
      # leaving the background tail running.
      trap 'kill "$CHILD_PID" 2>/dev/null || true' EXIT
      parallel-ssh -o . -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
      -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
      export PATH=/usr/local/mpi/bin:\$PATH; \
      export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
      cd /root/mscclpp; \
      set -e; \
      mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -o output.jsonl; \
      mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl; \
      mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl; \
      mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl"'
    workingDirectory: '$(System.DefaultWorkingDirectory)'

# Deallocate the VM scale set. condition: always() makes this step run even
# when an earlier step failed, so the VMs are not left running.
- task: AzureCLI@2
  name: StopVMSS
  displayName: Deallocate VMSS
  condition: always()
  inputs:
    azureSubscription: ${{ parameters.subscription }}
    scriptType: bash
    scriptLocation: inlineScript
    inlineScript: |
      az vmss deallocate --name ${{ parameters.vmssName }} --resource-group mscclpp
Loading