diff --git a/.azure-pipelines/integration-test.yml b/.azure-pipelines/integration-test.yml
index f6fe3a47f..d3e0ab8bf 100644
--- a/.azure-pipelines/integration-test.yml
+++ b/.azure-pipelines/integration-test.yml
@@ -26,14 +26,12 @@ pr:
       - '**/*.md'
 
 jobs:
-- job: IntegrationTestA100
-  displayName: Integration test A100
+- job: sglangtest
+  displayName: SGLANG Test
   strategy:
     matrix:
-      cuda11:
-        containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda11.8
-      cuda12:
-        containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
+      sglang:
+        containerImage: lmsysorg/sglang:latest
 
   pool:
     name: msccl-ci
@@ -41,30 +39,9 @@ jobs:
     image: $(containerImage)
 
   steps:
-  - template: templates/integration-test.yaml
+  - template: templates/sglang-test.yaml
     parameters:
       subscription: mscclpp-ci
       vmssName: mscclpp-ci
       sshKeySecureFile: mscclpp.pem
       gpuArch: '80'
-
-- job: IntegrationTestH100
-  displayName: Integration test H100
-  strategy:
-    matrix:
-      cuda12:
-        containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
-
-  pool:
-    name: msccl-ci-h100
-  container:
-    image: $(containerImage)
-
-  steps:
-  - template: templates/integration-test.yaml
-    parameters:
-      subscription: mscclpp-ci-h100
-      vmssName: mscclpp-h100-ci
-      sshKeySecureFile: mscclpp.pem
-      perfBaselineFile: test/deploy/perf_ndmv5.jsonl
-      gpuArch: '90'
diff --git a/.azure-pipelines/sglang-test.yml b/.azure-pipelines/sglang-test.yml
new file mode 100644
index 000000000..f6fe3a47f
--- /dev/null
+++ b/.azure-pipelines/sglang-test.yml
@@ -0,0 +1,70 @@
+trigger:
+  branches:
+    include:
+    - main
+    - release/*
+  paths:
+    exclude:
+      - .devcontainer/**
+      - .github/**
+      - docker/**
+      - docs/**
+      - '**/*.md'
+
+pr:
+  branches:
+    include:
+    - main
+    - release/*
+  drafts: false
+  paths:
+    exclude:
+      - .devcontainer/**
+      - .github/**
+      - docker/**
+      - docs/**
+      - '**/*.md'
+
+jobs:
+- job: IntegrationTestA100
+  displayName: Integration test A100
+  strategy:
+    matrix:
+      cuda11:
+        containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda11.8
+      cuda12:
+        containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
+
+  pool:
+    name: msccl-ci
+  container:
+    image: $(containerImage)
+
+  steps:
+  - template: templates/integration-test.yaml
+    parameters:
+      subscription: mscclpp-ci
+      vmssName: mscclpp-ci
+      sshKeySecureFile: mscclpp.pem
+      gpuArch: '80'
+
+- job: IntegrationTestH100
+  displayName: Integration test H100
+  strategy:
+    matrix:
+      cuda12:
+        containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
+
+  pool:
+    name: msccl-ci-h100
+  container:
+    image: $(containerImage)
+
+  steps:
+  - template: templates/integration-test.yaml
+    parameters:
+      subscription: mscclpp-ci-h100
+      vmssName: mscclpp-h100-ci
+      sshKeySecureFile: mscclpp.pem
+      perfBaselineFile: test/deploy/perf_ndmv5.jsonl
+      gpuArch: '90'
diff --git a/.azure-pipelines/templates/sglang-test.yaml b/.azure-pipelines/templates/sglang-test.yaml
new file mode 100644
index 000000000..ae28c246c
--- /dev/null
+++ b/.azure-pipelines/templates/sglang-test.yaml
@@ -0,0 +1,122 @@
+# Step template for the SGLang CI job: build and install MSCCL++, install
+# SGLang from source, then run the mscclpp allgather perf test over parallel-ssh.
+# NOTE(review): perfBaselineFile is declared but not referenced by any step below.
+parameters:
+- name: subscription
+  type: string
+- name: vmssName
+  type: string
+- name: sshKeySecureFile
+  type: string
+- name: perfBaselineFile
+  type: string
+  default: 'test/deploy/perf_ndmv4.jsonl'
+- name: gpuArch
+  type: string
+
+steps:
+# Configure and build MSCCL++ with CMake, then pip-install the package and its CUDA 12 requirements file.
+- task: Bash@3
+  name: BuildMSCCLPP
+  displayName: Build MSCCL++
+  inputs:
+    targetType: inline
+    script: |
+      mkdir build && cd build
+      cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_BUILD_TESTS=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} ..
+      make -j
+      cd ..
+      pip install .
+      pip install -r ./python/requirements_cuda12.txt
+    workingDirectory: '$(System.DefaultWorkingDirectory)'
+
+# Clone the release/v0.5.7 branch of the SGLang fork and install it in editable mode.
+- task: Bash@3
+  name: SGLangSetup
+  displayName: SGLang Setup
+  inputs:
+    targetType: inline
+    script: |
+      git clone -b release/v0.5.7 https://github.com/caiomcbr/sglang.git
+      cd sglang
+      pip install --upgrade pip
+      pip install -e "python"
+
+# Install pssh (used below as parallel-ssh) and the Azure CLI.
+- task: Bash@3
+  name: InstallPackages
+  displayName: Install Packages
+  inputs:
+    targetType: inline
+    script: |
+      sudo apt-get update -y
+      sudo apt-get install pssh -y
+      curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
+
+# Download the SSH private key; AllGatherTest reads its path from SSHKEYFILE_SECUREFILEPATH.
+- task: DownloadSecureFile@1
+  name: SshKeyFile
+  displayName: Download key file
+  inputs:
+    secureFile: ${{ parameters.sshKeySecureFile }}
+
+# Start the VM scale set named by vmssName in the mscclpp resource group.
+- task: AzureCLI@2
+  name: StartVMSS
+  displayName: Start VMSS
+  inputs:
+    azureSubscription: ${{ parameters.subscription }}
+    scriptType: bash
+    scriptLocation: inlineScript
+    inlineScript: |
+      az vmss start --name ${{ parameters.vmssName }} --resource-group mscclpp
+
+# Run test/deploy/deploy.sh with the single-node-test argument.
+- task: Bash@3
+  name: DeployTestEnv
+  displayName: Deploy Test Env
+  inputs:
+    targetType: filePath
+    filePath: test/deploy/deploy.sh
+    arguments: "single-node-test"
+    workingDirectory: '$(System.DefaultWorkingDirectory)'
+
+# Run allgather_test_perf (once without -k, then with -k 1, 2 and 3) using 8 MPI ranks
+# inside the mscclpp-test container on every host listed in test/deploy/hostfile_ci.
+- task: Bash@3
+  name: AllGatherTest
+  displayName: Run mscclpp AllGather test
+  inputs:
+    targetType: inline
+    script: |
+      set -e
+      HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci
+      SSH_OPTION="StrictHostKeyChecking=no"
+      KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
+      : > azureuser@10.0.0.4
+      tail -f azureuser@10.0.0.4 &
+      CHILD_PID=$!
+      parallel-ssh -o . -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
+        -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
+        export PATH=/usr/local/mpi/bin:\$PATH; \
+        export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \
+        cd /root/mscclpp; \
+        set -e; \
+        mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -o output.jsonl; \
+        mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl; \
+        mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl; \
+        mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl"'
+      kill $CHILD_PID
+    workingDirectory: '$(System.DefaultWorkingDirectory)'
+
+# Deallocate the VMSS regardless of earlier step results (condition: always()).
+- task: AzureCLI@2
+  name: StopVMSS
+  displayName: Deallocate VMSS
+  condition: always()
+  inputs:
+    azureSubscription: ${{ parameters.subscription }}
+    scriptType: bash
+    scriptLocation: inlineScript
+    inlineScript: |
+      az vmss deallocate --name ${{ parameters.vmssName }} --resource-group mscclpp