9 changes: 9 additions & 0 deletions tools/launcher/common/specdec/read_vllm_files.sh
@@ -0,0 +1,9 @@
#!/bin/bash
set -euo pipefail
echo "=== pattern_matcher.py lines 305-325 ==="
sed -n '305,325p' /usr/local/lib/python3.12/dist-packages/torch/_inductor/pattern_matcher.py 2>/dev/null || echo "NOT FOUND"
echo "=== post_grad.py lines 345-375 ==="
sed -n '345,375p' /usr/local/lib/python3.12/dist-packages/torch/_inductor/fx_passes/post_grad.py 2>/dev/null || echo "NOT FOUND"
echo "=== post_grad.py lines 1240-1260 ==="
sed -n '1240,1260p' /usr/local/lib/python3.12/dist-packages/torch/_inductor/fx_passes/post_grad.py 2>/dev/null || echo "NOT FOUND"
echo "=== DONE ==="
244 changes: 244 additions & 0 deletions tools/launcher/common/specdec/sglang_smoke_test.sh
@@ -0,0 +1,244 @@
#!/bin/bash
# SGLang Speculative Decoding Smoke Test
#
# Starts python -m sglang.launch_server with MTP enabled (EAGLE algorithm +
# SGLANG_ENABLE_SPEC_V2=1), sends 8 test prompts via the OpenAI-compatible
# API, and validates that every prompt returns a non-empty response.
#
# Environment variables (all optional with defaults):
# HF_MODEL_CKPT — model path (default: /hf-local/deepseek-ai/DeepSeek-V4-Flash)
# NUM_SPEC_TOKENS — speculative draft tokens (default: 1)
# DATA_PARALLEL_SIZE — DP size (default: 8)
# TP_SIZE — TP size (default: 1)
# KV_CACHE_DTYPE — e.g. "fp8_e5m2" or "fp8" (default: unset = auto)
# TRUST_REMOTE_CODE — "1" to pass --trust-remote-code
# COPY_MODEL_TO_TMPFS — "1" to rsync model to /dev/shm before loading
# EXPERT_PARALLEL_SIZE — expert parallelism degree (default: unset = no EP)
# ATTENTION_BACKEND — e.g. "trtllm_mha" for Blackwell (default: unset = auto)
# MOE_BACKEND — e.g. "flashinfer_trtllm" for Blackwell (default: unset = auto)
# SGLANG_PORT — server port (default: 8000)
# SERVER_TIMEOUT — seconds to wait for server ready (default: 900)
# MAX_OUTPUT_TOKENS — max tokens per query (default: 1024)
# MIN_ACCEPTANCE_LENGTH — optional regression threshold for mean acceptance length
# SGLANG_EXTRA_ARGS — any extra flags appended verbatim to launch_server
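#
# Example invocation (illustrative values only; the model path, parallelism
# sizes, and threshold below are assumptions, not defaults):
#   HF_MODEL_CKPT=/models/my-ckpt TP_SIZE=4 DATA_PARALLEL_SIZE=2 NUM_SPEC_TOKENS=2 \
#     MIN_ACCEPTANCE_LENGTH=1.5 bash sglang_smoke_test.sh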

set -euo pipefail

MODEL=${HF_MODEL_CKPT:-/hf-local/deepseek-ai/DeepSeek-V4-Flash}
NUM_SPEC=${NUM_SPEC_TOKENS:-1}
PORT=${SGLANG_PORT:-8000}
DP=${DATA_PARALLEL_SIZE:-8}
TP=${TP_SIZE:-1}

# ── tmpfs copy ────────────────────────────────────────────────────────────────
TMPFS_MODEL=""
cleanup() {
    kill "$SERVER_PID" 2>/dev/null || true
    sleep 2
    kill -9 "$SERVER_PID" 2>/dev/null || true
    if [ -n "$TMPFS_MODEL" ] && [ -d "$TMPFS_MODEL" ]; then
        echo "Removing tmpfs model copy: $TMPFS_MODEL"
        rm -rf "$TMPFS_MODEL"
    fi
⚠️ Potential issue | 🟠 Major | ⚡ Quick win

Guard cleanup for unset PID and avoid deleting shared tmpfs copies.

With set -u, line 36 can fail when the script exits before line 141 assigns SERVER_PID. Also, tmpfs copies discovered at lines 49-51 are reused but still removed at exit, which can break concurrent jobs using the same path.

Suggested fix
 TMPFS_MODEL=""
+SERVER_PID=""
+TMPFS_MODEL_OWNED=0
 cleanup() {
-    kill "$SERVER_PID" 2>/dev/null || true
-    sleep 2
-    kill -9 "$SERVER_PID" 2>/dev/null || true
-    if [ -n "$TMPFS_MODEL" ] && [ -d "$TMPFS_MODEL" ]; then
+    if [ -n "${SERVER_PID:-}" ] && kill -0 "${SERVER_PID}" 2>/dev/null; then
+        kill "${SERVER_PID}" 2>/dev/null || true
+        sleep 2
+        kill -9 "${SERVER_PID}" 2>/dev/null || true
+    fi
+    if [ "${TMPFS_MODEL_OWNED:-0}" = "1" ] && [ -n "${TMPFS_MODEL:-}" ] && [ -d "${TMPFS_MODEL}" ]; then
         echo "Removing tmpfs model copy: $TMPFS_MODEL"
         rm -rf "$TMPFS_MODEL"
     fi
 }
@@
     if [ -d "$TMPFS_MODEL" ] && [ -f "$TMPFS_MODEL/config.json" ]; then
         echo "Using existing tmpfs model copy: $TMPFS_MODEL"
+        TMPFS_MODEL_OWNED=0
     else
@@
         cp -r "$MODEL" "$TMPFS_MODEL"
         echo "Model copy done: $TMPFS_MODEL"
+        TMPFS_MODEL_OWNED=1
     fi

Also applies to: 46-57, 141-141

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@tools/launcher/common/specdec/sglang_smoke_test.sh` around lines 35-42, the
cleanup() function should guard use of SERVER_PID and only remove tmpfs copies
this script created: check that SERVER_PID is set and non-empty using parameter
expansion (e.g., test -n "${SERVER_PID-}" or similar) before running kill/kill -9
to avoid failures under set -u, and introduce a TMPFS_MODEL_OWNED (or
TMPFS_MODEL_CREATED) flag when you create or copy the tmpfs model in the code
that discovers TMPFS_MODEL (the logic around the block that sets TMPFS_MODEL
near lines 49-51); then in cleanup() only rm -rf "$TMPFS_MODEL" when
TMPFS_MODEL_OWNED is true so shared tmpfs paths are not deleted by concurrent
jobs.

}
trap cleanup EXIT

if [ "${COPY_MODEL_TO_TMPFS:-0}" = "1" ]; then
MODEL_NAME=$(basename "$MODEL")
TMPFS_MODEL="/dev/shm/${MODEL_NAME}"
if [ -d "$TMPFS_MODEL" ] && [ -f "$TMPFS_MODEL/config.json" ]; then
echo "Using existing tmpfs model copy: $TMPFS_MODEL"
else
MODEL_SIZE=$(du -sh "$MODEL" 2>/dev/null | cut -f1 || echo "?")
AVAIL_SHM=$(df -h /dev/shm 2>/dev/null | tail -1 | awk '{print $4}' || echo "?")
echo "Copying model to /dev/shm (${MODEL_SIZE}, available: ${AVAIL_SHM})..."
cp -r "$MODEL" "$TMPFS_MODEL"
echo "Model copy done: $TMPFS_MODEL"
fi
MODEL="$TMPFS_MODEL"
echo "Loading from tmpfs: $MODEL"
fi

# ── container patches ─────────────────────────────────────────────────────────
# Upgrade transformers so newly-registered model types (e.g. deepseek_v4) are
# available without requiring trust_remote_code in the AutoConfig pre-check path.
echo "Upgrading transformers (--pre for deepseek_v4 support)..."
pip install --upgrade --pre transformers -q || echo "WARNING: transformers upgrade failed, continuing"

# Register deepseek_v4 in HF Transformers via a site-packages .pth startup file.
# deepseek_v4 is not in the stable transformers release; the stub class preserves
# all config.json fields (including `architectures`) so SGLang's model registry works.
# The .pth propagates to every spawned worker process automatically.
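# (Background: at interpreter startup, Python's site module executes any line
# beginning with "import" found in a site-packages .pth file; that is the
# mechanism carrying this patch into worker subprocesses without env plumbing.)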
python3 << 'PYEOF'
import os, site

STUB = r'''
try:
    from transformers import AutoConfig, PretrainedConfig
    class DeepseekV4Config(PretrainedConfig):
        model_type = "deepseek_v4"
        def __init__(self, **kwargs):
            for k, v in kwargs.items():
                object.__setattr__(self, k, v)
            super().__init__(**kwargs)
    AutoConfig.register("deepseek_v4", DeepseekV4Config, exist_ok=True)
    print("[patch] deepseek_v4 registered in AutoConfig")
except Exception as e:
    print(f"[patch] deepseek_v4 registration failed: {e}")
'''

for sp in site.getsitepackages() + [site.getusersitepackages()]:
    if not os.path.isdir(sp):
        continue
    try:
        with open(os.path.join(sp, '_deepseek_v4_patch.py'), 'w') as f:
            f.write(STUB)
        with open(os.path.join(sp, 'deepseek_v4.pth'), 'w') as f:
            f.write('import _deepseek_v4_patch\n')
        print(f"[patch] Wrote deepseek_v4.pth to {sp}")
        break
    except Exception as e:
        print(f"[patch] Could not write to {sp}: {e}")

exec(STUB)
PYEOF
⚠️ Potential issue | 🟠 Major


Fail fast when the .pth patch cannot be installed for spawned processes.

If every attempt to write to a site-packages directory fails (lines 90–102), the script continues silently. exec(STUB) at line 103 only patches the current interpreter; the spawned python -m sglang.launch_server process (line 131) depends on the .pth file for automatic patch application at startup. Without the file, the server starts without deepseek_v4 registered, causing opaque failures downstream.

Track whether .pth was successfully written and exit with an error if all site-packages locations are non-writable.

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@tools/launcher/common/specdec/sglang_smoke_test.sh` around lines 90-104, the
loop that tries to write _deepseek_v4_patch.py and deepseek_v4.pth may fail
silently and only exec(STUB) patches the current interpreter, so the spawned
"python -m sglang.launch_server" process won't get the patch; add a boolean flag
(e.g., wrote_pth = False) before iterating site.getsitepackages() +
[site.getusersitepackages()], set it to True when writing deepseek_v4.pth
succeeds, and after the loop check the flag and call sys.exit(1) (and print an
error) if no writable site-packages were found so the script fails fast instead
of continuing to exec(STUB) and spawning the server without the .pth patch.
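
A minimal sketch of that guard, assuming the loop from the script above (wrote_pth is an illustrative name; STUB is the stub source already defined in the heredoc):

import os, site, sys

wrote_pth = False
for sp in site.getsitepackages() + [site.getusersitepackages()]:
    if not os.path.isdir(sp):
        continue
    try:
        with open(os.path.join(sp, '_deepseek_v4_patch.py'), 'w') as f:
            f.write(STUB)
        with open(os.path.join(sp, 'deepseek_v4.pth'), 'w') as f:
            f.write('import _deepseek_v4_patch\n')
        print(f"[patch] Wrote deepseek_v4.pth to {sp}")
        wrote_pth = True
        break
    except Exception as e:
        print(f"[patch] Could not write to {sp}: {e}")

# Fail fast: a non-zero exit from this heredoc aborts the parent script under
# set -e, before the unpatched server would be spawned.
if not wrote_pth:
    print("[patch] ERROR: no writable site-packages for deepseek_v4.pth", file=sys.stderr)
    sys.exit(1)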


GPU_CC=$(python3 -c "import torch; cc=torch.cuda.get_device_capability(); print(f'{cc[0]}.{cc[1]}')" 2>/dev/null || echo "unknown")
echo "GPU compute capability: ${GPU_CC}"

# ── build args ────────────────────────────────────────────────────────────────
EXTRA_ARGS=""
[ -n "${KV_CACHE_DTYPE:-}" ] && EXTRA_ARGS="$EXTRA_ARGS --kv-cache-dtype ${KV_CACHE_DTYPE}"
[ "${TRUST_REMOTE_CODE:-}" = "1" ] && EXTRA_ARGS="$EXTRA_ARGS --trust-remote-code"
[ -n "${EXPERT_PARALLEL_SIZE:-}" ] && EXTRA_ARGS="$EXTRA_ARGS --expert-parallel-size ${EXPERT_PARALLEL_SIZE}"
[ -n "${ATTENTION_BACKEND:-}" ] && EXTRA_ARGS="$EXTRA_ARGS --attention-backend ${ATTENTION_BACKEND}"
[ -n "${MOE_BACKEND:-}" ] && EXTRA_ARGS="$EXTRA_ARGS --moe-runner-backend ${MOE_BACKEND}"
[ -n "${SGLANG_EXTRA_ARGS:-}" ] && EXTRA_ARGS="$EXTRA_ARGS ${SGLANG_EXTRA_ARGS}"

# ── start server ──────────────────────────────────────────────────────────────
echo "=== SGLang Speculative Decoding Smoke Test ==="
echo "Model: ${MODEL}"
echo "DP: ${DP}, TP: ${TP}, Spec tokens: ${NUM_SPEC}"

# Speculative decoding (EAGLE MTP) — skip when NUM_SPEC_TOKENS=0
SPEC_ARGS=""
if [ "${NUM_SPEC}" -gt 0 ]; then
export SGLANG_ENABLE_SPEC_V2=1
SPEC_ARGS="--speculative-num-draft-tokens ${NUM_SPEC}"
fi

# shellcheck disable=SC2086
python -m sglang.launch_server \
    --model-path "${MODEL}" \
    --tp "${TP}" \
    --dp "${DP}" \
    --enable-dp-attention \
    --host 0.0.0.0 \
    --port "${PORT}" \
    ${SPEC_ARGS} \
    ${EXTRA_ARGS} \
    &
SERVER_PID=$!

# ── wait for ready ────────────────────────────────────────────────────────────
SERVER_TIMEOUT=${SERVER_TIMEOUT:-900}
echo "Waiting for SGLang server (timeout: ${SERVER_TIMEOUT}s)..."
for i in $(seq 1 "${SERVER_TIMEOUT}"); do
    if curl -s "http://localhost:${PORT}/health" > /dev/null 2>&1; then
        echo "Server ready after ${i}s"
        break
    fi
    if ! kill -0 "$SERVER_PID" 2>/dev/null; then
        echo "ERROR: Server died"
        wait "$SERVER_PID" || true
        exit 1
    fi
    sleep 1
done

if ! curl -s "http://localhost:${PORT}/health" > /dev/null 2>&1; then
    echo "ERROR: Server did not become ready within ${SERVER_TIMEOUT}s"
    exit 1
fi

# ── test prompts ──────────────────────────────────────────────────────────────
MAX_TOKENS=${MAX_OUTPUT_TOKENS:-1024}
echo ""
echo "=== Test Prompts (max_tokens=${MAX_TOKENS}) ==="
PASS=0
FAIL=0
TOTAL_TOKENS=0
TOTAL_TIME=0

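# Timing note: date +%s%N reports nanoseconds, so the bc expression below
# divides the delta by 1000000000 to get seconds (falling back to 0 if bc is missing).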
for PROMPT in \
    "Write a persuasive email to your manager requesting a four-day work week. Include at least three supporting arguments." \
    "You are a medieval blacksmith. A traveler asks you to forge a sword. Describe your process and the qualities of your finest work." \
    "A farmer has 17 sheep. All but 9 run away. How many sheep does the farmer have left? Explain your reasoning carefully." \
    "Solve the equation 3x + 7 = 22. Show each step of your solution." \
    "Write a Python function that takes a list of integers and returns the second largest unique value. Include error handling." \
    "Extract all the dates, names, and locations from: On March 15 2024 Dr. Alice Chen presented her findings at the Berlin Conference on Climate Science." \
    "Explain the process of photosynthesis. What role does chlorophyll play and why are plants green?" \
    "Discuss the main themes in George Orwell's 1984. How do they relate to modern society?"; do
    START=$(date +%s%N)
    RESULT=$(curl -s "http://localhost:${PORT}/v1/chat/completions" \
        -H "Content-Type: application/json" \
        -d "{\"model\": \"${MODEL}\", \"messages\": [{\"role\": \"user\", \"content\": \"${PROMPT}\"}], \"max_tokens\": ${MAX_TOKENS}, \"temperature\": 0}" \
        2>/dev/null)
    END=$(date +%s%N)
    ELAPSED=$(echo "scale=2; ($END - $START) / 1000000000" | bc 2>/dev/null || echo "0")
    TOKENS=$(echo "$RESULT" | python3 -c "import json,sys; r=json.load(sys.stdin); print(r.get('usage',{}).get('completion_tokens',0))" 2>/dev/null || echo "0")
    if [ -n "$TOKENS" ] && [ "$TOKENS" -gt 0 ] 2>/dev/null; then
        TPS=$(echo "scale=1; $TOKENS / $ELAPSED" | bc 2>/dev/null || echo "?")
        echo "  PASS: ${TOKENS} tokens in ${ELAPSED}s (${TPS} tok/s) — \"${PROMPT:0:50}...\""
        PASS=$((PASS + 1))
        TOTAL_TOKENS=$((TOTAL_TOKENS + TOKENS))
        TOTAL_TIME=$(echo "$TOTAL_TIME + $ELAPSED" | bc 2>/dev/null || echo "0")
    else
        echo "  FAIL: \"${PROMPT}\""
        echo "  Response: $(echo "$RESULT" | head -c 200)"
        FAIL=$((FAIL + 1))
    fi
done

echo ""
echo "Results: ${PASS} passed, ${FAIL} failed"
if [ "$TOTAL_TOKENS" -gt 0 ] 2>/dev/null; then
AVG_TPS=$(echo "scale=1; $TOTAL_TOKENS / $TOTAL_TIME" | bc 2>/dev/null || echo "?")
echo "Total: ${TOTAL_TOKENS} tokens in ${TOTAL_TIME}s (${AVG_TPS} tok/s avg)"
fi

# ── speculative metrics ───────────────────────────────────────────────────────
echo ""
METRICS=$(curl -s "http://localhost:${PORT}/metrics" 2>/dev/null | grep -i "spec\|accept\|draft\|mtp" | head -10 || true)
if [ -n "$METRICS" ]; then
echo "=== Speculative Decoding Metrics ==="
echo "$METRICS"
fi

if [ "$FAIL" -gt 0 ]; then
echo "ERROR: ${FAIL} prompt(s) failed"
exit 1
fi

# ── optional acceptance-length regression check ───────────────────────────────
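# Note: \K in grep -P discards everything matched before it, so only the
# trailing numeric value is captured; the loose pattern tolerates metric-name
# differences across SGLang versions.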
if [ -n "${MIN_ACCEPTANCE_LENGTH:-}" ]; then
AVG_ACCEPT=$(curl -s "http://localhost:${PORT}/metrics" 2>/dev/null \
| grep -oP 'sglang.*acceptance.*\K[0-9.]+' | tail -1 || true)
if [ -n "$AVG_ACCEPT" ]; then
echo ""
echo "=== Acceptance Length Regression Check ==="
echo " Mean acceptance length: ${AVG_ACCEPT}"
echo " Threshold: ${MIN_ACCEPTANCE_LENGTH}"
PASS_CHECK=$(python3 -c "print('yes' if float('${AVG_ACCEPT}') >= float('${MIN_ACCEPTANCE_LENGTH}') else 'no')")
if [ "$PASS_CHECK" = "yes" ]; then
echo " PASS: ${AVG_ACCEPT} >= ${MIN_ACCEPTANCE_LENGTH}"
else
echo " REGRESSION: ${AVG_ACCEPT} < ${MIN_ACCEPTANCE_LENGTH}"
exit 1
fi
else
echo "WARNING: Could not parse acceptance length from SGLang metrics, skipping regression check"
fi
fi

echo "=== PASS ==="