diff --git a/inference/a3mega/deepseek-r1-671b/vllm-serving-gke/values.yaml b/inference/a3mega/deepseek-r1-671b/vllm-serving-gke/values.yaml index dd8f5a56..992b436d 100644 --- a/inference/a3mega/deepseek-r1-671b/vllm-serving-gke/values.yaml +++ b/inference/a3mega/deepseek-r1-671b/vllm-serving-gke/values.yaml @@ -48,7 +48,7 @@ gpuPlatformSettings: network: ncclSettings: - name: NCCL_DEBUG - value: "VERSION" + value: "TRACE" subnetworks[]: vllm: diff --git a/inference/a3mega/llama-4/vllm-serving-gke/response.json b/inference/a3mega/llama-4/vllm-serving-gke/response.json new file mode 100644 index 00000000..eacfe635 --- /dev/null +++ b/inference/a3mega/llama-4/vllm-serving-gke/response.json @@ -0,0 +1 @@ +{"object":"error","message":"The model `meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8` does not exist.","type":"NotFoundError","param":null,"code":404} \ No newline at end of file diff --git a/inference/a3mega/llama-4/vllm-serving-gke/values.yaml b/inference/a3mega/llama-4/vllm-serving-gke/values.yaml index 579c0992..3ed100ea 100644 --- a/inference/a3mega/llama-4/vllm-serving-gke/values.yaml +++ b/inference/a3mega/llama-4/vllm-serving-gke/values.yaml @@ -41,14 +41,14 @@ volumes: gpuPlatformSettings: useHostPlugin: false - ncclPluginImage: "us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpxo/nccl-plugin-gpudirecttcpx-dev:v1.0.8-1" - rxdmImage: "us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpxo/tcpgpudmarxd-dev:v1.0.14" - ncclBuildType: 223 + ncclPluginImage: "us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpxo/nccl-plugin-gpudirecttcpx-dev:v1.0.15" + rxdmImage: "us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpxo/tcpgpudmarxd-dev:v1.0.21" + ncclBuildType: 228 network: ncclSettings: - name: NCCL_DEBUG - value: "VERSION" + value: "TRACE" subnetworks[]: vllm: @@ -58,4 +58,4 @@ vllm: ports: http: 8000 serverArgs: - max-model-len: 32768 \ No newline at end of file + max-model-len: 32768 diff --git a/src/helm-charts/a3mega/vllm-inference/multi-host/templates/lws-deployment.yaml b/src/helm-charts/a3mega/vllm-inference/multi-host/templates/lws-deployment.yaml index 12c7f628..50da2fb3 100644 --- a/src/helm-charts/a3mega/vllm-inference/multi-host/templates/lws-deployment.yaml +++ b/src/helm-charts/a3mega/vllm-inference/multi-host/templates/lws-deployment.yaml @@ -115,7 +115,7 @@ spec: imagePullPolicy: Always volumeMounts: - name: nccl-plugin-volume - mountPath: /usr/local/nccl-plugin + mountPath: /usr/local/tcpxo env: - name: BUILD_TYPE value: "{{ $root.Values.gpuPlatformSettings.ncclBuildType }}" @@ -126,7 +126,7 @@ spec: set -ex chmod 755 /scripts/container_entry.sh /scripts/container_entry.sh install --install-nccl --nccl-buildtype ${BUILD_TYPE} - cp -r /var/lib/tcpxo/* /usr/local/nccl-plugin/ + cp -r /var/lib/tcpxo/* /usr/local/tcpxo/ {{- end }} @@ -206,13 +206,13 @@ spec: value: /usr/local/nvidia/lib64 {{- else }} - name: LD_LIBRARY_PATH - value: /usr/local/nccl-plugin/lib64:/usr/local/nvidia/lib64 + value: /usr/local/tcpxo/lib64:/usr/local/nvidia/lib64 - name: NCCL_LIB_DIR - value: /usr/local/nccl-plugin/lib64 + value: /usr/local/tcpxo/lib64 {{- end }} - name: NCCL_FASTRAK_LLCM_DEVICE_DIRECTORY value: /dev/aperture_devices - + # NCCL settings from A3Mega configuration - name: NCCL_FASTRAK_CTRL_DEV value: "eth0" @@ -223,7 +223,7 @@ spec: - name: NCCL_ALGO value: "Ring,Tree" - name: NCCL_PROTO - value: "Simple" + value: "Simple,LL128" - name: NCCL_MIN_NCHANNELS value: "4" - name: NCCL_DYNAMIC_CHUNK_SIZE @@ -253,17 +253,19 @@ spec: - name: NCCL_TUNER_PLUGIN value: "libnccl-tuner.so" - name: NCCL_TUNER_CONFIG_PATH - value: "/usr/local/nccl-plugin/lib64/a3plus_tuner_config.textproto" + value: "/usr/local/tcpxo/lib64/a3plus_tuner_config.textproto" - name: NCCL_SHIMNET_GUEST_CONFIG_CHECKER_CONFIG_FILE - value: "/usr/local/nccl-plugin/lib64/a3plus_guest_config.textproto" + value: "/usr/local/tcpxo/lib64/a3plus_guest_config.textproto" - name: NCCL_NVLS_ENABLE - value: "0" + value: "1" - name: NCCL_FASTRAK_PLUGIN_ACCEPT_TIMEOUT_MS value: "600000" - name: CUDA_VISIBLE_DEVICES value: "0,1,2,3,4,5,6,7" - name: NCCL_FASTRAK_IFNAME value: "eth1,eth2,eth3,eth4,eth5,eth6,eth7,eth8" + - name: NCCL_NVLSTREE_MAX_CHUNKSIZE + value: "131072" # The following is needed to prevent send-receive stalling execution - name: NVTE_FWD_LAYERNORM_SM_MARGIN @@ -275,6 +277,14 @@ spec: - name: NCCL_P2P_PXN_LEVEL value: "0" + # GPUViz + - name: NCCL_NET_PLUGIN_TELEMETRY_MODE + value: "1" + - name: NCCL_GPUVIZ_ENABLE_MILLISECOND_BANDWIDTH_OUTPUT + value: "1" + - name: NCCL_GPUVIZ_FILE_ROTATION_INTERVAL_IN_SECONDS + value: "300" + {{- range $environment_variable := $root.Values.network.ncclSettings }} - name: {{ $environment_variable.name }} value: "{{ $environment_variable.value }}" @@ -321,7 +331,7 @@ spec: mountPath: /dev/aperture_devices {{- if not $root.Values.gpuPlatformSettings.useHostPlugin }} - name: nccl-plugin-volume - mountPath: /usr/local/nccl-plugin + mountPath: /usr/local/tcpxo {{- end }} - name: sys mountPath: /hostsysfs @@ -430,7 +440,7 @@ spec: imagePullPolicy: Always volumeMounts: - name: nccl-plugin-volume - mountPath: /usr/local/nccl-plugin + mountPath: /usr/local/tcpxo env: - name: BUILD_TYPE value: "{{ $root.Values.gpuPlatformSettings.ncclBuildType }}" @@ -441,7 +451,7 @@ spec: set -ex chmod 755 /scripts/container_entry.sh /scripts/container_entry.sh install --install-nccl --nccl-buildtype ${BUILD_TYPE} - cp -r /var/lib/tcpxo/* /usr/local/nccl-plugin/ + cp -r /var/lib/tcpxo/* /usr/local/tcpxo/ {{- end }} @@ -522,9 +532,9 @@ spec: value: /usr/local/nvidia/lib64 {{- else }} - name: LD_LIBRARY_PATH - value: /usr/local/nccl-plugin/lib64:/usr/local/nvidia/lib64 + value: /usr/local/tcpxo/lib64:/usr/local/nvidia/lib64 - name: NCCL_LIB_DIR - value: /usr/local/nccl-plugin/lib64 + value: /usr/local/tcpxo/lib64 {{- end }} - name: NCCL_FASTRAK_LLCM_DEVICE_DIRECTORY value: /dev/aperture_devices @@ -569,9 +579,9 @@ spec: - name: NCCL_TUNER_PLUGIN value: "libnccl-tuner.so" - name: NCCL_TUNER_CONFIG_PATH - value: "/usr/local/nccl-plugin/lib64/a3plus_tuner_config.textproto" + value: "/usr/local/tcpxo/lib64/a3plus_tuner_config.textproto" - name: NCCL_SHIMNET_GUEST_CONFIG_CHECKER_CONFIG_FILE - value: "/usr/local/nccl-plugin/lib64/a3plus_guest_config.textproto" + value: "/usr/local/tcpxo/lib64/a3plus_guest_config.textproto" - name: NCCL_NVLS_ENABLE value: "0" - name: NCCL_FASTRAK_PLUGIN_ACCEPT_TIMEOUT_MS @@ -611,7 +621,7 @@ spec: mountPath: /dev/aperture_devices {{- if not $root.Values.gpuPlatformSettings.useHostPlugin }} - name: nccl-plugin-volume - mountPath: /usr/local/nccl-plugin + mountPath: /usr/local/tcpxo {{- end }} - name: sys mountPath: /hostsysfs diff --git a/src/helm-charts/a3mega/vllm-inference/single-host/templates/model-serve-launcher.yaml b/src/helm-charts/a3mega/vllm-inference/single-host/templates/model-serve-launcher.yaml index 18ccbc33..ff411250 100644 --- a/src/helm-charts/a3mega/vllm-inference/single-host/templates/model-serve-launcher.yaml +++ b/src/helm-charts/a3mega/vllm-inference/single-host/templates/model-serve-launcher.yaml @@ -119,7 +119,7 @@ spec: imagePullPolicy: Always volumeMounts: - name: nccl-plugin-volume - mountPath: /usr/local/nccl-plugin + mountPath: /usr/local/tcpxo env: - name: BUILD_TYPE value: "{{ $root.Values.gpuPlatformSettings.ncclBuildType }}" @@ -130,7 +130,7 @@ spec: set -ex chmod 755 /scripts/container_entry.sh /scripts/container_entry.sh install --install-nccl --nccl-buildtype ${BUILD_TYPE} - cp -r /var/lib/tcpxo/* /usr/local/nccl-plugin/ + cp -r /var/lib/tcpxo/* /usr/local/tcpxo/ {{- end }} @@ -203,9 +203,9 @@ spec: value: /usr/local/nvidia/lib64 {{- else }} - name: LD_LIBRARY_PATH - value: /usr/local/nccl-plugin/lib64:/usr/local/nvidia/lib64 + value: /usr/local/tcpxo/lib64:/usr/local/nvidia/lib64 - name: NCCL_LIB_DIR - value: /usr/local/nccl-plugin/lib64 + value: /usr/local/tcpxo/lib64 {{- end }} - name: NCCL_FASTRAK_LLCM_DEVICE_DIRECTORY value: /dev/aperture_devices @@ -220,7 +220,7 @@ spec: - name: NCCL_ALGO value: "Ring,Tree" - name: NCCL_PROTO - value: "Simple" + value: "Simple,LL128" - name: NCCL_MIN_NCHANNELS value: "4" - name: NCCL_DYNAMIC_CHUNK_SIZE @@ -250,17 +250,19 @@ spec: - name: NCCL_TUNER_PLUGIN value: "libnccl-tuner.so" - name: NCCL_TUNER_CONFIG_PATH - value: "/usr/local/nccl-plugin/lib64/a3plus_tuner_config.textproto" + value: "/usr/local/tcpxo/lib64/a3plus_tuner_config.textproto" - name: NCCL_SHIMNET_GUEST_CONFIG_CHECKER_CONFIG_FILE - value: "/usr/local/nccl-plugin/lib64/a3plus_guest_config.textproto" + value: "/usr/local/tcpxo/lib64/a3plus_guest_config.textproto" - name: NCCL_NVLS_ENABLE - value: "0" + value: "1" - name: NCCL_FASTRAK_PLUGIN_ACCEPT_TIMEOUT_MS value: "600000" - name: CUDA_VISIBLE_DEVICES value: "0,1,2,3,4,5,6,7" - name: NCCL_FASTRAK_IFNAME value: "eth1,eth2,eth3,eth4,eth5,eth6,eth7,eth8" + - name: NCCL_NVLSTREE_MAX_CHUNKSIZE + value: "131072" # The following is needed to prevent send-receive stalling execution - name: NVTE_FWD_LAYERNORM_SM_MARGIN @@ -272,6 +274,14 @@ spec: - name: NCCL_P2P_PXN_LEVEL value: "0" + # GPUViz + - name: NCCL_NET_PLUGIN_TELEMETRY_MODE + value: "1" + - name: NCCL_GPUVIZ_ENABLE_MILLISECOND_BANDWIDTH_OUTPUT + value: "1" + - name: NCCL_GPUVIZ_FILE_ROTATION_INTERVAL_IN_SECONDS + value: "300" + {{- range $environment_variable := $root.Values.network.ncclSettings }} - name: {{ $environment_variable.name }} value: "{{ $environment_variable.value }}" @@ -311,7 +321,7 @@ spec: mountPath: /dev/aperture_devices {{- if not $root.Values.gpuPlatformSettings.useHostPlugin }} - name: nccl-plugin-volume - mountPath: /usr/local/nccl-plugin + mountPath: /usr/local/tcpxo {{- end }} - name: sys mountPath: /hostsysfs