Skip to content

Commit cfca9a1

Browse files
committed
“docs”
1 parent 4b4c6fb commit cfca9a1

File tree

1 file changed

+246
-11
lines changed

1 file changed

+246
-11
lines changed

scripts/run_ci_xpu.sh

Lines changed: 246 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -319,17 +319,252 @@ export XSHMEM_MODE=1
319319
export XSHMEM_QP_NUM_PER_RANK=32
320320
export BKCL_RDMA_VERBS=1
321321

322+
# Download, unpack and build the xDeepEP package.
# NOTE(review): presumably required by the expert-parallel service tests that
# follow -- confirm. Every step aborts the CI run on failure so that a broken
# download or extraction cannot let build.sh run from the wrong directory.
wget -q https://paddle-qa.bj.bcebos.com/xpu_third_party/xDeepEP.tar.gz || {
  echo "failed to download xDeepEP.tar.gz" >&2
  exit 1
}
tar -xzf xDeepEP.tar.gz || {
  echo "failed to extract xDeepEP.tar.gz" >&2
  exit 1
}
# Build inside a subshell so the script's own working directory is untouched
# (replaces the former "cd xDeepEP ... cd -" pair).
(cd xDeepEP && bash build.sh) || {
  echo "failed to build xDeepEP" >&2
  exit 1
}
327+
328+
# Base port of this job: 8188 shifted by 100 per XPU_ID. The other service
# ports below are derived from it.
export port_num=$((8188 + XPU_ID * 100))
# Start the service in the background (tensor-parallel 4, data-parallel 1,
# expert parallel enabled). stdout and stderr are captured in server.log.
# All expansions are quoted so a MODEL_PATH containing spaces or glob
# characters is passed as a single argument.
python -m fastdeploy.entrypoints.openai.api_server \
  --model "${MODEL_PATH}/ERNIE-4.5-300B-A47B-Paddle" \
  --port "${port_num}" \
  --tensor-parallel-size 4 \
  --enable-expert-parallel \
  --data-parallel-size 1 \
  --max-model-len 32768 \
  --max-num-seqs 64 \
  --quantization "wint4" \
  --engine-worker-queue-port "$((port_num + 10))" \
  --metrics-port "$((port_num + 2))" \
  --cache-queue-port "$((port_num + 47873))" \
  --disable-sequence-parallel-moe \
  --gpu-memory-utilization 0.9 \
  --load-choices "default" > server.log 2>&1 &
345+
346+
# Give the freshly started service a head start before polling.
sleep 60
# Liveness probe: poll the /health endpoint every INTERVAL seconds until it
# answers HTTP 200, giving up after TIMEOUT seconds.
TIMEOUT=$((15 * 60))
INTERVAL=10
ENDPOINT="http://0.0.0.0:${port_num}/health"
START_TIME=$(date +%s)
echo "开始服务健康检查,最长等待时间:${TIMEOUT} 秒"
while true; do
  CURRENT_TIME=$(date +%s)
  ELAPSED=$((CURRENT_TIME - START_TIME))
  if [ "${ELAPSED}" -ge "${TIMEOUT}" ]; then
    echo -e "\n服务启动超时:经过 $((TIMEOUT/60)) 分钟服务仍未启动!"
    # Stop whatever did start, then dump the logs for diagnosis.
    stop_processes
    cat server.log
    echo "log/workerlog.0"
    cat log/workerlog.0
    exit 1
  fi
  # "|| true" makes the command substitution succeed even when curl cannot
  # connect; HTTP_CODE then simply is not "200" and polling continues.
  HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" -m 2 "$ENDPOINT" || true)
  echo -e "\r服务健康检查中... 已等待 ${ELAPSED} 秒,当前状态码:${HTTP_CODE}"
  if [ "$HTTP_CODE" = "200" ]; then
    echo -e "\n服务启动成功!耗时 ${ELAPSED} 秒"
    break
  fi
  sleep "${INTERVAL}"
done
373+
374+
375+
# 执行在线推理验证脚本
376+
# Run the online-inference validation suite against the running service and
# keep pytest's exit status for the verdict below.
python -m pytest -s tests/ci_use/XPU_45T/run_ep_online.py
ep_online_exit_code=$?
echo "ep_online_exit_code is ${ep_online_exit_code}"

# Clear the BKCL_* / XSHMEM_* variables exported for this scenario, then stop
# the service; the output of stop_processes is kept in kill.log.
unset BKCL_ENABLE_XDR BKCL_RDMA_NICS BKCL_TRACE_TOPO BKCL_PCIE_RING \
  XSHMEM_MODE XSHMEM_QP_NUM_PER_RANK BKCL_RDMA_VERBS
stop_processes > kill.log 2>&1

# A non-zero pytest status fails the job after dumping the service logs.
if (( ep_online_exit_code != 0 )); then
  echo "server.log"
  cat server.log
  cat log/workerlog.0
  echo "EP4TP4 在线服务相关测试失败,请检查pr代码"
  exit 1
fi
396+
397+
echo "============================开始 EP4TP1 在线服务测试!============================"
# Short pause, then remove logs and core files left by the previous scenario.
sleep 5
rm -rf log/*
rm -f core*
# pkill -9 python  # not executed in the CI pipeline
# Remove message queues (ipcrm --all=msg) and print the device status.
ipcrm --all=msg
xpu-smi
# XPU_ID 0 gets devices 0-3; any other value gets devices 4-7.
case "$XPU_ID" in
  0) export XPU_VISIBLE_DEVICES="0,1,2,3" ;;
  *) export XPU_VISIBLE_DEVICES="4,5,6,7" ;;
esac
# BKCL / XSHMEM settings for this scenario.
export BKCL_ENABLE_XDR=1 \
  BKCL_RDMA_NICS=xgbe1,xgbe2,xgbe3,xgbe4 \
  BKCL_TRACE_TOPO=1 \
  BKCL_PCIE_RING=1 \
  XSHMEM_MODE=1 \
  XSHMEM_QP_NUM_PER_RANK=32 \
  BKCL_RDMA_VERBS=1
416+
417+
# Base port of this job: 8188 shifted by 100 per XPU_ID. The other service
# ports below are derived from it.
export port_num=$((8188 + XPU_ID * 100))
# Start the service in the background (tensor-parallel 1, data-parallel 4,
# expert parallel enabled). Four engine worker queue ports are passed as a
# comma-separated list. stdout and stderr are captured in server.log.
# All expansions are quoted so a MODEL_PATH containing spaces or glob
# characters is passed as a single argument.
python -m fastdeploy.entrypoints.openai.api_server \
  --model "${MODEL_PATH}/ERNIE-4.5-300B-A47B-Paddle" \
  --port "${port_num}" \
  --tensor-parallel-size 1 \
  --enable-expert-parallel \
  --data-parallel-size 4 \
  --max-model-len 32768 \
  --max-num-seqs 64 \
  --quantization "wint4" \
  --engine-worker-queue-port "$((port_num + 10)),$((port_num + 20)),$((port_num + 30)),$((port_num + 40))" \
  --metrics-port "$((port_num + 2))" \
  --cache-queue-port "$((port_num + 47873))" \
  --gpu-memory-utilization 0.9 \
  --load-choices "default" > server.log 2>&1 &
433+
434+
# Give the freshly started service a head start before polling.
sleep 60
# Liveness probe: poll the /health endpoint every INTERVAL seconds until it
# answers HTTP 200, giving up after TIMEOUT seconds.
TIMEOUT=$((15 * 60))
INTERVAL=10
ENDPOINT="http://0.0.0.0:${port_num}/health"
START_TIME=$(date +%s)
echo "开始服务健康检查,最长等待时间:${TIMEOUT} 秒"
while true; do
  CURRENT_TIME=$(date +%s)
  ELAPSED=$((CURRENT_TIME - START_TIME))
  if [ "${ELAPSED}" -ge "${TIMEOUT}" ]; then
    echo -e "\n服务启动超时:经过 $((TIMEOUT/60)) 分钟服务仍未启动!"
    # Stop whatever did start, then dump the logs for diagnosis.
    stop_processes
    cat server.log
    echo "log/workerlog.0"
    cat log/workerlog.0
    exit 1
  fi
  # "|| true" makes the command substitution succeed even when curl cannot
  # connect; HTTP_CODE then simply is not "200" and polling continues.
  HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" -m 2 "$ENDPOINT" || true)
  echo -e "\r服务健康检查中... 已等待 ${ELAPSED} 秒,当前状态码:${HTTP_CODE}"
  if [ "$HTTP_CODE" = "200" ]; then
    echo -e "\n服务启动成功!耗时 ${ELAPSED} 秒"
    break
  fi
  sleep "${INTERVAL}"
done
459+
460+
461+
# 执行在线推理验证脚本
462+
# Run the online-inference validation suite against the running service and
# keep pytest's exit status for the verdict that follows.
python -m pytest -s tests/ci_use/XPU_45T/run_ep_online.py
ep_online_exit_code=$?
echo "ep_online_exit_code is ${ep_online_exit_code}"

# Clear the BKCL_* / XSHMEM_* variables exported for this scenario.
unset BKCL_ENABLE_XDR BKCL_RDMA_NICS BKCL_TRACE_TOPO BKCL_PCIE_RING \
  XSHMEM_MODE XSHMEM_QP_NUM_PER_RANK BKCL_RDMA_VERBS
322473
stop_processes >kill.log 2>&1
323474

324-
export PYTHONPATH=/work/wq/qq/FastDeploy
325-
export XPU_VISIBLE_DEVICES="0"
475+
# A non-zero pytest status fails the job after dumping the service logs.
if (( ep_online_exit_code != 0 )); then
  echo "server.log"
  cat server.log
  cat log/workerlog.0
  echo "EP4TP1 在线服务相关测试失败,请检查pr代码"
  exit 1
fi
482+
483+
echo "============================开始 EP4TP4 all2all 测试!============================"
# Short pause, then remove logs and core files left by the previous scenario.
sleep 5
rm -rf log/*
rm -f core*
# pkill -9 python  # not executed in the CI pipeline
# Remove message queues (ipcrm --all=msg) and print the device status.
ipcrm --all=msg
xpu-smi
# XPU_ID 0 gets devices 0-3; any other value gets devices 4-7.
case "$XPU_ID" in
  0) export XPU_VISIBLE_DEVICES="0,1,2,3" ;;
  *) export XPU_VISIBLE_DEVICES="4,5,6,7" ;;
esac

# BKCL / XSHMEM settings for this scenario.
export BKCL_ENABLE_XDR=1 \
  BKCL_RDMA_NICS=xgbe1,xgbe2,xgbe3,xgbe4 \
  BKCL_TRACE_TOPO=1 \
  BKCL_PCIE_RING=1 \
  XSHMEM_MODE=1 \
  XSHMEM_QP_NUM_PER_RANK=32 \
  BKCL_RDMA_VERBS=1
503+
504+
export port_num=$((8188 + XPU_ID * 100))
505+
# 启动服务
326506
python -m fastdeploy.entrypoints.openai.api_server \
327-
--model ../../../models/ERNIE-4.5-0.3B-Paddle \
328-
--port 8188 \
329-
--tensor-parallel-size 1 \
330-
--max-model-len 32768 \
331-
--max-num-seqs 128 \
332-
--quantization "wint8" \
333-
--gpu-memory-utilization 0.9 \
334-
--enable-logprob \
335-
--max-logprobs 5
507+
--model ${MODEL_PATH}/ERNIE-4.5-300B-A47B-Paddle \
508+
--port $port_num \
509+
--tensor-parallel-size 4 \
510+
--enable-expert-parallel \
511+
--data-parallel-size 1 \
512+
--max-model-len 32768 \
513+
--max-num-seqs 64 \
514+
--quantization "wint4" \
515+
--engine-worker-queue-port $((port_num + 10)) \
516+
--metrics-port $((port_num + 2)) \
517+
--cache-queue-port $((port_num + 47873)) \
518+
--gpu-memory-utilization 0.9 \
519+
--load-choices "default" > server.log 2>&1 &
520+
521+
# Give the freshly started service a head start before polling.
sleep 60
# Liveness probe: poll the /health endpoint every INTERVAL seconds until it
# answers HTTP 200, giving up after TIMEOUT seconds.
TIMEOUT=$((15 * 60))
INTERVAL=10
ENDPOINT="http://0.0.0.0:${port_num}/health"
START_TIME=$(date +%s)
echo "开始服务健康检查,最长等待时间:${TIMEOUT} 秒"
while true; do
  CURRENT_TIME=$(date +%s)
  ELAPSED=$((CURRENT_TIME - START_TIME))
  if [ "${ELAPSED}" -ge "${TIMEOUT}" ]; then
    echo -e "\n服务启动超时:经过 $((TIMEOUT/60)) 分钟服务仍未启动!"
    # Stop whatever did start, then dump the logs for diagnosis.
    stop_processes
    cat server.log
    echo "log/workerlog.0"
    cat log/workerlog.0
    exit 1
  fi
  # "|| true" makes the command substitution succeed even when curl cannot
  # connect; HTTP_CODE then simply is not "200" and polling continues.
  HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" -m 2 "$ENDPOINT" || true)
  echo -e "\r服务健康检查中... 已等待 ${ELAPSED} 秒,当前状态码:${HTTP_CODE}"
  if [ "$HTTP_CODE" = "200" ]; then
    echo -e "\n服务启动成功!耗时 ${ELAPSED} 秒"
    break
  fi
  sleep "${INTERVAL}"
done
548+
549+
550+
# 执行在线推理验证脚本
551+
# Run the online-inference validation suite against the running service and
# keep pytest's exit status for the verdict below.
python -m pytest -s tests/ci_use/XPU_45T/run_ep_online.py
ep_online_exit_code=$?
echo "ep_online_exit_code is ${ep_online_exit_code}"

# Clear the BKCL_* / XSHMEM_* variables exported for this scenario, then stop
# the service; the output of stop_processes is kept in kill.log.
unset BKCL_ENABLE_XDR BKCL_RDMA_NICS BKCL_TRACE_TOPO BKCL_PCIE_RING \
  XSHMEM_MODE XSHMEM_QP_NUM_PER_RANK BKCL_RDMA_VERBS
stop_processes > kill.log 2>&1

# A non-zero pytest status fails the job after dumping the service logs.
if (( ep_online_exit_code != 0 )); then
  echo "server.log"
  cat server.log
  cat log/workerlog.0
  echo "EP4TP4 all2all 在线服务相关测试失败,请检查pr代码"
  exit 1
fi

0 commit comments

Comments
 (0)