Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ gpuPlatformSettings:
network:
ncclSettings:
- name: NCCL_DEBUG
value: "VERSION"
value: "TRACE"
subnetworks[]:

vllm:
Expand Down
1 change: 1 addition & 0 deletions inference/a3mega/llama-4/vllm-serving-gke/response.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"object":"error","message":"The model `meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8` does not exist.","type":"NotFoundError","param":null,"code":404}
10 changes: 5 additions & 5 deletions inference/a3mega/llama-4/vllm-serving-gke/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -41,14 +41,14 @@ volumes:

gpuPlatformSettings:
useHostPlugin: false
ncclPluginImage: "us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpxo/nccl-plugin-gpudirecttcpx-dev:v1.0.8-1"
rxdmImage: "us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpxo/tcpgpudmarxd-dev:v1.0.14"
ncclBuildType: 223
ncclPluginImage: "us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpxo/nccl-plugin-gpudirecttcpx-dev:v1.0.15"
rxdmImage: "us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpxo/tcpgpudmarxd-dev:v1.0.21"
ncclBuildType: 228

network:
ncclSettings:
- name: NCCL_DEBUG
value: "VERSION"
value: "TRACE"
subnetworks[]:

vllm:
Expand All @@ -58,4 +58,4 @@ vllm:
ports:
http: 8000
serverArgs:
max-model-len: 32768
max-model-len: 32768
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ spec:
imagePullPolicy: Always
volumeMounts:
- name: nccl-plugin-volume
mountPath: /usr/local/nccl-plugin
mountPath: /usr/local/tcpxo
env:
- name: BUILD_TYPE
value: "{{ $root.Values.gpuPlatformSettings.ncclBuildType }}"
Expand All @@ -126,7 +126,7 @@ spec:
set -ex
chmod 755 /scripts/container_entry.sh
/scripts/container_entry.sh install --install-nccl --nccl-buildtype ${BUILD_TYPE}
cp -r /var/lib/tcpxo/* /usr/local/nccl-plugin/
cp -r /var/lib/tcpxo/* /usr/local/tcpxo/

{{- end }}

Expand Down Expand Up @@ -206,13 +206,13 @@ spec:
value: /usr/local/nvidia/lib64
{{- else }}
- name: LD_LIBRARY_PATH
value: /usr/local/nccl-plugin/lib64:/usr/local/nvidia/lib64
value: /usr/local/tcpxo/lib64:/usr/local/nvidia/lib64
- name: NCCL_LIB_DIR
value: /usr/local/nccl-plugin/lib64
value: /usr/local/tcpxo/lib64
{{- end }}
- name: NCCL_FASTRAK_LLCM_DEVICE_DIRECTORY
value: /dev/aperture_devices

# NCCL settings from A3Mega configuration
- name: NCCL_FASTRAK_CTRL_DEV
value: "eth0"
Expand All @@ -223,7 +223,7 @@ spec:
- name: NCCL_ALGO
value: "Ring,Tree"
- name: NCCL_PROTO
value: "Simple"
value: "Simple,LL128"
- name: NCCL_MIN_NCHANNELS
value: "4"
- name: NCCL_DYNAMIC_CHUNK_SIZE
Expand Down Expand Up @@ -253,17 +253,19 @@ spec:
- name: NCCL_TUNER_PLUGIN
value: "libnccl-tuner.so"
- name: NCCL_TUNER_CONFIG_PATH
value: "/usr/local/nccl-plugin/lib64/a3plus_tuner_config.textproto"
value: "/usr/local/tcpxo/lib64/a3plus_tuner_config.textproto"
- name: NCCL_SHIMNET_GUEST_CONFIG_CHECKER_CONFIG_FILE
value: "/usr/local/nccl-plugin/lib64/a3plus_guest_config.textproto"
value: "/usr/local/tcpxo/lib64/a3plus_guest_config.textproto"
- name: NCCL_NVLS_ENABLE
value: "0"
value: "1"
- name: NCCL_FASTRAK_PLUGIN_ACCEPT_TIMEOUT_MS
value: "600000"
- name: CUDA_VISIBLE_DEVICES
value: "0,1,2,3,4,5,6,7"
- name: NCCL_FASTRAK_IFNAME
value: "eth1,eth2,eth3,eth4,eth5,eth6,eth7,eth8"
- name: NCCL_NVLSTREE_MAX_CHUNKSIZE
value: "131072"

# The following is needed to prevent send-receive stalling execution
- name: NVTE_FWD_LAYERNORM_SM_MARGIN
Expand All @@ -275,6 +277,14 @@ spec:
- name: NCCL_P2P_PXN_LEVEL
value: "0"

# GPUViz
- name: NCCL_NET_PLUGIN_TELEMETRY_MODE
value: "1"
- name: NCCL_GPUVIZ_ENABLE_MILLISECOND_BANDWIDTH_OUTPUT
value: "1"
- name: NCCL_GPUVIZ_FILE_ROTATION_INTERVAL_IN_SECONDS
value: "300"

{{- range $environment_variable := $root.Values.network.ncclSettings }}
- name: {{ $environment_variable.name }}
value: "{{ $environment_variable.value }}"
Expand Down Expand Up @@ -321,7 +331,7 @@ spec:
mountPath: /dev/aperture_devices
{{- if not $root.Values.gpuPlatformSettings.useHostPlugin }}
- name: nccl-plugin-volume
mountPath: /usr/local/nccl-plugin
mountPath: /usr/local/tcpxo
{{- end }}
- name: sys
mountPath: /hostsysfs
Expand Down Expand Up @@ -430,7 +440,7 @@ spec:
imagePullPolicy: Always
volumeMounts:
- name: nccl-plugin-volume
mountPath: /usr/local/nccl-plugin
mountPath: /usr/local/tcpxo
env:
- name: BUILD_TYPE
value: "{{ $root.Values.gpuPlatformSettings.ncclBuildType }}"
Expand All @@ -441,7 +451,7 @@ spec:
set -ex
chmod 755 /scripts/container_entry.sh
/scripts/container_entry.sh install --install-nccl --nccl-buildtype ${BUILD_TYPE}
cp -r /var/lib/tcpxo/* /usr/local/nccl-plugin/
cp -r /var/lib/tcpxo/* /usr/local/tcpxo/

{{- end }}

Expand Down Expand Up @@ -522,9 +532,9 @@ spec:
value: /usr/local/nvidia/lib64
{{- else }}
- name: LD_LIBRARY_PATH
value: /usr/local/nccl-plugin/lib64:/usr/local/nvidia/lib64
value: /usr/local/tcpxo/lib64:/usr/local/nvidia/lib64
- name: NCCL_LIB_DIR
value: /usr/local/nccl-plugin/lib64
value: /usr/local/tcpxo/lib64
{{- end }}
- name: NCCL_FASTRAK_LLCM_DEVICE_DIRECTORY
value: /dev/aperture_devices
Expand Down Expand Up @@ -569,9 +579,9 @@ spec:
- name: NCCL_TUNER_PLUGIN
value: "libnccl-tuner.so"
- name: NCCL_TUNER_CONFIG_PATH
value: "/usr/local/nccl-plugin/lib64/a3plus_tuner_config.textproto"
value: "/usr/local/tcpxo/lib64/a3plus_tuner_config.textproto"
- name: NCCL_SHIMNET_GUEST_CONFIG_CHECKER_CONFIG_FILE
value: "/usr/local/nccl-plugin/lib64/a3plus_guest_config.textproto"
value: "/usr/local/tcpxo/lib64/a3plus_guest_config.textproto"
- name: NCCL_NVLS_ENABLE
value: "0"
- name: NCCL_FASTRAK_PLUGIN_ACCEPT_TIMEOUT_MS
Expand Down Expand Up @@ -611,7 +621,7 @@ spec:
mountPath: /dev/aperture_devices
{{- if not $root.Values.gpuPlatformSettings.useHostPlugin }}
- name: nccl-plugin-volume
mountPath: /usr/local/nccl-plugin
mountPath: /usr/local/tcpxo
{{- end }}
- name: sys
mountPath: /hostsysfs
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,7 @@ spec:
imagePullPolicy: Always
volumeMounts:
- name: nccl-plugin-volume
mountPath: /usr/local/nccl-plugin
mountPath: /usr/local/tcpxo
env:
- name: BUILD_TYPE
value: "{{ $root.Values.gpuPlatformSettings.ncclBuildType }}"
Expand All @@ -130,7 +130,7 @@ spec:
set -ex
chmod 755 /scripts/container_entry.sh
/scripts/container_entry.sh install --install-nccl --nccl-buildtype ${BUILD_TYPE}
cp -r /var/lib/tcpxo/* /usr/local/nccl-plugin/
cp -r /var/lib/tcpxo/* /usr/local/tcpxo/

{{- end }}

Expand Down Expand Up @@ -203,9 +203,9 @@ spec:
value: /usr/local/nvidia/lib64
{{- else }}
- name: LD_LIBRARY_PATH
value: /usr/local/nccl-plugin/lib64:/usr/local/nvidia/lib64
value: /usr/local/tcpxo/lib64:/usr/local/nvidia/lib64
- name: NCCL_LIB_DIR
value: /usr/local/nccl-plugin/lib64
value: /usr/local/tcpxo/lib64
{{- end }}
- name: NCCL_FASTRAK_LLCM_DEVICE_DIRECTORY
value: /dev/aperture_devices
Expand All @@ -220,7 +220,7 @@ spec:
- name: NCCL_ALGO
value: "Ring,Tree"
- name: NCCL_PROTO
value: "Simple"
value: "Simple,LL128"
- name: NCCL_MIN_NCHANNELS
value: "4"
- name: NCCL_DYNAMIC_CHUNK_SIZE
Expand Down Expand Up @@ -250,17 +250,19 @@ spec:
- name: NCCL_TUNER_PLUGIN
value: "libnccl-tuner.so"
- name: NCCL_TUNER_CONFIG_PATH
value: "/usr/local/nccl-plugin/lib64/a3plus_tuner_config.textproto"
value: "/usr/local/tcpxo/lib64/a3plus_tuner_config.textproto"
- name: NCCL_SHIMNET_GUEST_CONFIG_CHECKER_CONFIG_FILE
value: "/usr/local/nccl-plugin/lib64/a3plus_guest_config.textproto"
value: "/usr/local/tcpxo/lib64/a3plus_guest_config.textproto"
- name: NCCL_NVLS_ENABLE
value: "0"
value: "1"
- name: NCCL_FASTRAK_PLUGIN_ACCEPT_TIMEOUT_MS
value: "600000"
- name: CUDA_VISIBLE_DEVICES
value: "0,1,2,3,4,5,6,7"
- name: NCCL_FASTRAK_IFNAME
value: "eth1,eth2,eth3,eth4,eth5,eth6,eth7,eth8"
- name: NCCL_NVLSTREE_MAX_CHUNKSIZE
value: "131072"

# The following is needed to prevent send-receive stalling execution
- name: NVTE_FWD_LAYERNORM_SM_MARGIN
Expand All @@ -272,6 +274,14 @@ spec:
- name: NCCL_P2P_PXN_LEVEL
value: "0"

# GPUViz
- name: NCCL_NET_PLUGIN_TELEMETRY_MODE
value: "1"
- name: NCCL_GPUVIZ_ENABLE_MILLISECOND_BANDWIDTH_OUTPUT
value: "1"
- name: NCCL_GPUVIZ_FILE_ROTATION_INTERVAL_IN_SECONDS
value: "300"

{{- range $environment_variable := $root.Values.network.ncclSettings }}
- name: {{ $environment_variable.name }}
value: "{{ $environment_variable.value }}"
Expand Down Expand Up @@ -311,7 +321,7 @@ spec:
mountPath: /dev/aperture_devices
{{- if not $root.Values.gpuPlatformSettings.useHostPlugin }}
- name: nccl-plugin-volume
mountPath: /usr/local/nccl-plugin
mountPath: /usr/local/tcpxo
{{- end }}
- name: sys
mountPath: /hostsysfs
Expand Down