diff --git a/examples/intel_hpu/bench_gsm8k.py b/examples/intel_hpu/bench_gsm8k.py new file mode 100644 index 00000000000..fe6f07c3ed5 --- /dev/null +++ b/examples/intel_hpu/bench_gsm8k.py @@ -0,0 +1,246 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Metric evaluation for Fastdeploy + ERNIE-4.5-Turbo""" +# adapted from https://github.com/sgl-project/sglang/blob/main/benchmark/gsm8k/bench_other.py +import argparse +import ast +import json +import re +import time +from concurrent.futures import ThreadPoolExecutor + +import numpy as np +import requests +from tqdm import tqdm + +INVALID = -9999999 + + +def call_generate(prompt, **kwargs): + """ + Generates response based on the input prompt. + + Args: + prompt (str): The input prompt text. + **kwargs: Keyword arguments, including server IP address and port number. + + Returns: + str: The response generated based on the prompt. + + """ + url = f"http://{kwargs['ip']}:{kwargs['port']}/v1/chat/completions" + headers = {"Content-Type": "application/json"} + data = { + "messages": [ + { + "role": "user", + "content": prompt, + } + ], + "temperature": 0.6, + "max_tokens": 2047, + "top_p": 0.95, + "do_sample": True, + } + + response = requests.post(url, headers=headers, data=json.dumps(data)) + out = response.json() + return out["choices"][0]["message"]["content"] + + +def get_one_example(lines, i, include_answer): + """ + Retrieves a question-answer example from the given list of text lines. + + Args: + lines (list of dict): A list of question-answer pairs. + i (int): The index of the question-answer pair to retrieve from lines. + include_answer (bool): Whether to include the answer in the returned string. + + Returns: + str: A formatted question-answer string in the format "Question: \nAnswer: ". + + """ + ret = "Question: " + lines[i]["question"] + "\nAnswer:" + if include_answer: + ret += " " + lines[i]["answer"] + return ret + + +def get_few_shot_examples(lines, k): + """ + Selects k examples from the given list of text lines and concatenates them into a single string. + + Args: + lines (list): A list containing text lines. + k (int): The number of examples to select. + + Returns: + str: A string composed of k examples, separated by two newline characters. + """ + ret = "" + for i in range(k): + ret += get_one_example(lines, i, True) + "\n\n" + return ret + + +def get_answer_value(answer_str): + """ + Extracts numerical values from an answer string and returns them. + + Args: + answer_str (str): The string containing the answer. + + Returns: + The extracted numerical value; returns "INVALID" if extraction fails. + """ + answer_str = answer_str.replace(",", "") + numbers = re.findall(r"\d+", answer_str) + if len(numbers) < 1: + return INVALID + try: + return ast.literal_eval(numbers[-1]) + except SyntaxError: + return INVALID + + +def read_jsonl(filename: str): + """ + Reads a JSONL file. + + Args: + filename (str): Path to the JSONL file. 
+ + Yields: + dict: A dictionary object corresponding to each line in the JSONL file. + """ + with open(filename) as fin: + for line in fin: + if line.startswith("#"): + continue + yield json.loads(line) + + +def main(args): + """ + Process inputs and generate answers by calling the model in parallel using a thread pool. + + Args: + args (argparse.Namespace): + - num_questions (int): Number of questions to process. + - num_shots (int): Number of few-shot learning examples. + - ip (str): IP address of the model service. + - port (int): Port number of the model service. + - parallel (int): Number of questions to process in parallel. + - result_file (str): File path to store the results. + + Returns: + None + + """ + # Read data + filename = "test.jsonl" + + lines = list(read_jsonl(filename)) + + # Construct prompts + num_questions = args.num_questions + num_shots = args.num_shots + few_shot_examples = get_few_shot_examples(lines, num_shots) + + questions = [] + labels = [] + for i in range(len(lines[:num_questions])): + questions.append(get_one_example(lines, i, False)) + labels.append(get_answer_value(lines[i]["answer"])) + assert all(l != INVALID for l in labels) + + states = [None] * len(labels) + + # Use thread pool + def get_one_answer(i): + answer = call_generate( + prompt=few_shot_examples + questions[i], + # stop=["Question", "Assistant:", "<|separator|>"], + ip=args.ip, + port=args.port, + ) + states[i] = answer + + tic = time.time() + if args.parallel == 1: + for i in tqdm(range(len(questions))): + get_one_answer(i) + else: + with ThreadPoolExecutor(args.parallel) as executor: + list( + tqdm( + executor.map(get_one_answer, list(range(len(questions)))), + total=len(questions), + ) + ) + + latency = time.time() - tic + preds = [] + + with open(args.acc_log, "w") as fout: + for i in range(len(states)): + preds.append(get_answer_value(states[i])) + answer = get_answer_value(states[i]) + fout.write("\n################################################################\n") + fout.write("-----------prompt--------------\n") + fout.write(f"{few_shot_examples + questions[i]}\n") + fout.write("-----------answer--------------\n") + fout.write(f"answer= {states[i]}\n") + fout.write("-----------accuracy--------------\n") + fout.write(f"Correct={answer==labels[i]}, pred={answer}, label={labels[i]} \n") + + # Compute accuracy + acc = np.mean(np.array(preds) == np.array(labels)) + invalid = np.mean(np.array(preds) == INVALID) + + # Print results + print(f"Accuracy: {acc:.3f}") + print(f"Invalid: {invalid:.3f}") + print(f"Latency: {latency:.3f} s") + + with open(args.result_file, "a") as fout: + value = { + "task": "gsm8k", + "backend": "paddlepaddle", + "num_gpus": 1, + "latency": round(latency, 3), + "accuracy": round(acc, 3), + "num_requests": args.num_questions, + "other": { + "num_questions": args.num_questions, + "parallel": args.parallel, + }, + } + fout.write(json.dumps(value) + "\n") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--ip", type=str, default="127.0.0.1") + parser.add_argument("--port", type=str, default="8188") + parser.add_argument("--num-shots", type=int, default=10) + parser.add_argument("--data-path", type=str, default="test.jsonl") + parser.add_argument("--num-questions", type=int, default=1319) + parser.add_argument("--result-file", type=str, default="result.jsonl") + parser.add_argument("--parallel", type=int, default=1) + parser.add_argument("--acc-log", type=str, default="accuracy.log") + args = parser.parse_args() + main(args) 
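For reference, a minimal sketch (with hypothetical completion strings) of how `get_answer_value` above resolves the final numeric answer that is used both for the labels and for scoring model outputs:

```python
# Illustrative only: hypothetical completions passed through get_answer_value().
sample = "She sold 48 clips in April and 24 in May, so 48 + 24 = 72 clips.\n#### 72"
assert get_answer_value(sample) == 72  # the last number in the string wins
assert get_answer_value("Total cost: $1,250") == 1250  # commas are stripped before matching
assert get_answer_value("No numeric answer here.") == INVALID  # sentinel on failure
```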
diff --git a/examples/intel_hpu/benchmark_paddle_hpu_cli.sh b/examples/intel_hpu/benchmark_paddle_hpu_cli.sh new file mode 100755 index 00000000000..929a83f71d0 --- /dev/null +++ b/examples/intel_hpu/benchmark_paddle_hpu_cli.sh @@ -0,0 +1,72 @@ +#!/bin/bash + +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# set -x + +model="ERNIE-4.5-21B-A3B-Paddle" +model_log_name="ERNIE-4.5-21B-A3B-Paddle" +model_yaml="yaml/eb45-21b-a3b-32k-bf16.yaml" +# model="ERNIE-4.5-300B-A47B-Paddle" +# model_log_name="ERNIE-4.5-300B-A47B-Paddle" +# model_yaml="yaml/eb45-300b-a47b-32k-bf16.yaml" + +export SERVER_PORT=8188 +export no_proxy=localhost,127.0.0.1,0.0.0.0,10.0.0.0/8,192.168.1.0/24 + +input_lengths=(1024 2048) +output_lengths=(1024) +batch_sizes=(1 2 4 8 16 32 64 128) + +workspace=$(pwd) +cd $workspace +log_home=$workspace/benchmark_fastdeploy_logs/$(TZ='Asia/Shanghai' date '+WW%V')_$(TZ='Asia/Shanghai' date +%F-%H-%M-%S)_${model_log_name}_FixedLen + +mkdir -p ${log_home} + +for input_length in "${input_lengths[@]}" +do + for output_length in "${output_lengths[@]}" + do + for batch_size in "${batch_sizes[@]}" + do + > log/hpu_model_runner_profile.log + num_prompts=$(( batch_size * 3)) + log_name_prefix="benchmarkdata_${model_log_name}_inputlength_${input_length}_outputlength_${output_length}_batchsize_${batch_size}_numprompts_${num_prompts}" + log_name=${log_name_prefix}_$(TZ='Asia/Shanghai' date +%F-%H-%M-%S) + echo "running benchmark with input length ${input_length}, output length ${output_length}, batch size ${batch_size}, log name ${log_name}" + cmd="python ../../benchmarks/benchmark_serving.py \ + --backend openai-chat \ + --model $model \ + --endpoint /v1/chat/completions \ + --host 0.0.0.0 \ + --port ${SERVER_PORT} \ + --dataset-name random \ + --random-input-len ${input_length} \ + --random-output-len ${output_length} \ + --random-range-ratio 0 \ + --hyperparameter-path ../../benchmarks/${model_yaml} \ + --percentile-metrics ttft,tpot,itl,e2el,s_ttft,s_itl,s_e2el,s_decode,input_len,s_input_len,output_len \ + --metric-percentiles 80,95,99,99.9,99.95,99.99 \ + --num-prompts ${num_prompts} \ + --max-concurrency ${batch_size} \ + --ignore-eos" + echo $cmd | tee -a ${log_home}/${log_name}.log + eval $cmd >> ${log_home}/${log_name}.log 2>&1 + + cp log/hpu_model_runner_profile.log ${log_home}/${log_name}_profile.log + done + done +done diff --git a/examples/intel_hpu/benchmark_paddle_hpu_cli_sharegpt.sh b/examples/intel_hpu/benchmark_paddle_hpu_cli_sharegpt.sh new file mode 100755 index 00000000000..cc3e1737a0c --- /dev/null +++ b/examples/intel_hpu/benchmark_paddle_hpu_cli_sharegpt.sh @@ -0,0 +1,64 @@ +#!/bin/bash + +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# set -x + +model="ERNIE-4.5-21B-A3B-Paddle" +model_log_name="ERNIE-4.5-21B-A3B-Paddle" +model_yaml="yaml/eb45-21b-a3b-32k-bf16.yaml" +# model="ERNIE-4.5-300B-A47B-Paddle" +# model_log_name="ERNIE-4.5-300B-A47B-Paddle" +# model_yaml="yaml/eb45-300b-a47b-32k-bf16.yaml" +export SERVER_PORT=8188 +export no_proxy=.intel.com,intel.com,localhost,127.0.0.1,0.0.0.0,10.0.0.0/8,192.168.1.0/24 + +CARD_NUM=$1 + +if [[ "$CARD_NUM" == "1" ]]; then + batch_size=128 +else + batch_size=64 +fi + +num_prompts=2000 + +workspace=$(pwd) +cd $workspace +log_home=$workspace/benchmark_fastdeploy_logs/$(TZ='Asia/Shanghai' date '+WW%V')_$(TZ='Asia/Shanghai' date +%F-%H-%M-%S)_${model_log_name} + +mkdir -p ${log_home} + +log_name_prefix="benchmarkdata_${model_log_name}_sharegpt" +log_name=${log_name_prefix}_$(TZ='Asia/Shanghai' date +%F-%H-%M-%S) +echo "running benchmark with sharegpt log name ${log_name}" +cmd="python ../../benchmarks/benchmark_serving.py \ + --backend openai-chat \ + --model $model \ + --endpoint /v1/chat/completions \ + --host 0.0.0.0 \ + --port ${SERVER_PORT} \ + --dataset-name EBChat \ + --dataset-path ./filtered_sharedgpt_2000_input_1136_output_200_fd.json \ + --hyperparameter-path ../../benchmarks/${model_yaml} \ + --percentile-metrics ttft,tpot,itl,e2el,s_ttft,s_itl,s_e2el,s_decode,input_len,s_input_len,output_len \ + --metric-percentiles 80,95,99,99.9,99.95,99.99 \ + --max-concurrency ${batch_size} \ + --num-prompts ${num_prompts} \ + --sharegpt-output-len 4096 \ + --save-result " +echo $cmd | tee -a ${log_home}/${log_name}.log +eval $cmd >> ${log_home}/${log_name}.log 2>&1 +cp log/hpu_model_runner_profile.log ${log_home}/${log_name}_profile.log diff --git a/examples/intel_hpu/benchmark_paddle_hpu_server.sh b/examples/intel_hpu/benchmark_paddle_hpu_server.sh new file mode 100755 index 00000000000..9078bdfc9b8 --- /dev/null +++ b/examples/intel_hpu/benchmark_paddle_hpu_server.sh @@ -0,0 +1,49 @@ +#!/bin/bash +export GC_KERNEL_PATH=/usr/lib/habanalabs/libtpc_kernels.so +export GC_KERNEL_PATH=/usr/local/lib/python3.10/dist-packages/paddle_custom_device/intel_hpu/libcustom_tpc_perf_lib.so:$GC_KERNEL_PATH +export INTEL_HPU_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +export PADDLE_DISTRI_BACKEND=xccl +export PADDLE_XCCL_BACKEND=intel_hpu +# export FLAGS_intel_hpu_recipe_cache_config=/tmp/recipe,false,10240 +export FLAGS_intel_hpu_recipe_cache_num=20480 +export SERVER_PORT=8188 +export ENGINE_WORKER_QUEUE_PORT=8002 +export METRICS_PORT=8001 +export CACHE_QUEUE_PORT=8003 +export HABANA_PROFILE=0 +export HPU_VISIBLE_DEVICES=0 +rm -rf log 2>/dev/null +FD_ENC_DEC_BLOCK_NUM=8 HPU_PERF_BREAKDOWN_SYNC_MODE=1 HPU_WARMUP_BUCKET=1 HPU_WARMUP_MODEL_LEN=4096 FD_ATTENTION_BACKEND=HPU_ATTN \ + python -m fastdeploy.entrypoints.openai.api_server \ + --model ERNIE-4.5-21B-A3B-Paddle \ + --port ${SERVER_PORT} \ + --engine-worker-queue-port ${ENGINE_WORKER_QUEUE_PORT} \ + --metrics-port ${METRICS_PORT} \ + --cache-queue-port ${CACHE_QUEUE_PORT} \ + --tensor-parallel-size 1 \ + --max-model-len 32768 \ + --max-num-seqs 128 \ + --block-size 128 \ + --num-gpu-blocks-override 3100 \ + --kv-cache-ratio 0.991 \ + 
--no-enable-prefix-caching \ + --graph-optimization-config '{"use_cudagraph":false}' + +# (2k + 1k) / 128(block_size) * 128(batch) = 3072 +# export HPU_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +# rm -rf log 2>/dev/null +# FD_ENC_DEC_BLOCK_NUM=8 HPU_PERF_BREAKDOWN_SYNC_MODE=1 HPU_WARMUP_BUCKET=1 HPU_WARMUP_MODEL_LEN=3072 FD_ATTENTION_BACKEND=HPU_ATTN \ +# python -m fastdeploy.entrypoints.openai.api_server \ +# --model ERNIE-4.5-300B-A47B-Paddle \ +# --port ${SERVER_PORT} \ +# --engine-worker-queue-port ${ENGINE_WORKER_QUEUE_PORT} \ +# --metrics-port ${METRICS_PORT} \ +# --cache-queue-port ${CACHE_QUEUE_PORT} \ +# --tensor-parallel-size 8 \ +# --max-model-len 32768 \ +# --max-num-seqs 128 \ +# --block-size 128 \ +# --num-gpu-blocks-override 3100 \ +# --kv-cache-ratio 0.991 \ +# --no-enable-prefix-caching \ +# --graph-optimization-config '{"use_cudagraph":false}' diff --git a/examples/intel_hpu/benchmark_paddle_hpu_server_sharegpt.sh b/examples/intel_hpu/benchmark_paddle_hpu_server_sharegpt.sh new file mode 100755 index 00000000000..9a160d29911 --- /dev/null +++ b/examples/intel_hpu/benchmark_paddle_hpu_server_sharegpt.sh @@ -0,0 +1,35 @@ +#!/bin/bash +export GC_KERNEL_PATH=/usr/lib/habanalabs/libtpc_kernels.so +export GC_KERNEL_PATH=/usr/local/lib/python3.10/dist-packages/paddle_custom_device/intel_hpu/libcustom_tpc_perf_lib.so:$GC_KERNEL_PATH +export INTEL_HPU_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +export PADDLE_DISTRI_BACKEND=xccl +export PADDLE_XCCL_BACKEND=intel_hpu +# export FLAGS_intel_hpu_recipe_cache_config=/tmp/recipe,false,10240 +export FLAGS_intel_hpu_recipe_cache_num=20480 +export SERVER_PORT=8188 +export ENGINE_WORKER_QUEUE_PORT=8002 +export METRICS_PORT=8001 +export CACHE_QUEUE_PORT=8003 +export HABANA_PROFILE=0 + +CARD_NUM=$1 + +if [[ "$CARD_NUM" == "1" ]]; then + export HPU_VISIBLE_DEVICES=0 + export MODEL="ERNIE-4.5-21B-A3B-Paddle" + export GPU_BLOCKS=5000 +elif [[ "$CARD_NUM" == "8" ]]; then + export HPU_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 + export MODEL="ERNIE-4.5-300B-A47B-Paddle" + export GPU_BLOCKS=3000 +else + exit 0 +fi + +rm -rf log 2>/dev/null +FD_ENC_DEC_BLOCK_NUM=8 HPU_PERF_BREAKDOWN_SYNC_MODE=1 HPU_WARMUP_BUCKET=0 FD_ATTENTION_BACKEND=HPU_ATTN ENABLE_V1_KVCACHE_SCHEDULER=0 \ + python -m fastdeploy.entrypoints.openai.api_server --model ${MODEL} --port ${SERVER_PORT} \ + --engine-worker-queue-port ${ENGINE_WORKER_QUEUE_PORT} --metrics-port ${METRICS_PORT} \ + --cache-queue-port ${CACHE_QUEUE_PORT} --tensor-parallel-size ${CARD_NUM} --max-model-len 16384 \ + --max-num-seqs 128 --block-size 128 --kv-cache-ratio 0.5 --num-gpu-blocks-override ${GPU_BLOCKS} \ + --graph-optimization-config '{"use_cudagraph":false}' diff --git a/examples/intel_hpu/draw_benchmark_data.py b/examples/intel_hpu/draw_benchmark_data.py new file mode 100644 index 00000000000..978967780f9 --- /dev/null +++ b/examples/intel_hpu/draw_benchmark_data.py @@ -0,0 +1,173 @@ +import csv +import os +import re +import sys +from datetime import datetime + +import matplotlib.dates as mdates +import matplotlib.pyplot as plt + +log_patterns = [ + re.compile( + r"benchmarkdata_(.+?)_inputlength_(\d+)_outputlength_(\d+)_batchsize_(\d+)_numprompts_(\d+)_.*_profile\.log$" + ), +] + + +def draw_time_graph(log_dir, log_filename, max_num_seqs, mode): + # Store extracted time and BT values + timestamps_model = [] + times_model = [] + bt_values_model = [] + block_list_shapes_model = [] + block_indices_shapes_model = [] + timestamps_pp = [] + times_pp = [] + bt_values_pp = [] + + # Use regex to extract Model execution time and BT 
information + pattern_model = re.compile( + r"(\d+-\d+-\d+ \d+:\d+:\d+,\d+) .* Model execution time\(ms\): ([\d\.]+), BT=(\d+), block_list_shape=\[(\d+)\], block_indices_shape=\[(\d+)\]" + ) + pattern_pp = re.compile( + r"(\d+-\d+-\d+ \d+:\d+:\d+,\d+) .* PostProcessing execution time\(ms\): ([\d\.]+), BT=(\d+)" + ) + # Read log file + with open(os.path.join(log_dir, log_filename), "r") as file: + for line in file: + match_model = pattern_model.search(line) + if match_model: + bt_value = int(match_model.group(3)) + timestamps_model.append(datetime.strptime(match_model.group(1), "%Y-%m-%d %H:%M:%S,%f")) + if mode == "prefill" and bt_value <= max_num_seqs: + times_model.append(None) + bt_values_model.append(None) + continue + if mode == "decode" and bt_value > max_num_seqs: + times_model.append(None) + bt_values_model.append(None) + continue + times_model.append(float(match_model.group(2))) + bt_values_model.append(bt_value) + block_list_shapes_model.append(int(match_model.group(4))) + block_indices_shapes_model.append(int(match_model.group(5))) + else: + match_pp = pattern_pp.search(line) + if match_pp: + bt_value = int(match_pp.group(3)) + timestamps_pp.append(datetime.strptime(match_pp.group(1), "%Y-%m-%d %H:%M:%S,%f")) + if mode == "prefill" and bt_value <= max_num_seqs: + times_pp.append(None) + bt_values_pp.append(None) + continue + if mode == "decode" and bt_value > max_num_seqs: + times_pp.append(None) + bt_values_pp.append(None) + continue + times_pp.append(float(match_pp.group(2))) + bt_values_pp.append(bt_value) + + # Plot graphs + plt.figure(figsize=(15, 7)) + + date_format = mdates.DateFormatter("%m-%d %H:%M:%S") + # Plot time graph + plt.subplot(2, 1, 1) + ax1 = plt.gca() + ax2 = ax1.twinx() + ax1.plot(timestamps_model, times_model, label="Model Execution Time (ms)", color="blue") + ax2.plot(timestamps_pp, times_pp, label="PostProcessing Time (ms)", color="red") + ax1.set_ylabel("Model Execution Time (ms)") + ax2.set_ylabel("PostProcessing Time (ms)") + ax1.xaxis.set_major_formatter(date_format) + # Merge legends + lines_1, labels_1 = ax1.get_legend_handles_labels() + lines_2, labels_2 = ax2.get_legend_handles_labels() + ax1.legend(lines_1 + lines_2, labels_1 + labels_2) + + # Plot BT value graph + plt.subplot(2, 1, 2) + plt.plot(timestamps_model, bt_values_model, label="BT [" + mode + "]", color="orange") + plt.ylabel("BT Value") + plt.xlabel(log_filename, fontsize=8) + + plt.gca().xaxis.set_major_formatter(date_format) + plt.legend() + + plt.tight_layout() + output_filename = log_filename[:-4] + "_analysis_" + mode + ".png" + plt.savefig(os.path.join(log_dir, output_filename), dpi=300) + plt.close() + + # Write to CSV file + if mode == "all": + csv_filename = log_filename[:-4] + "_analysis.csv" + with open(os.path.join(log_dir, csv_filename), "w", newline="") as csvfile: + writer = csv.writer(csvfile) + writer.writerow( + [ + "Timestamp", + "ModelTime(ms)", + "BT", + "block_list_shape", + "block_indices_shape", + "Timestamp", + "PostProcessing(ms)", + "BT", + ] + ) + for i in range(len(times_model)): + writer.writerow( + [ + timestamps_model[i], + times_model[i], + bt_values_model[i], + block_list_shapes_model[i], + block_indices_shapes_model[i], + timestamps_pp[i], + times_pp[i], + bt_values_pp[i], + ] + ) + + +def main(): + if len(sys.argv) > 1: + log_dir = sys.argv[1] + else: + log_dir = "." 
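+    # Prefer natsort for natural (numeric-aware) ordering of the matched profile logs;
+    # fall back to the regex-based key defined below if the package is not installed.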
+    try:
+        from natsort import natsorted
+
+        natsort_available = True
+    except ImportError:
+        natsort_available = False
+
+    files = []
+    for f in os.listdir(log_dir):
+        for pat in log_patterns:
+            if pat.match(f):
+                files.append(f)
+                break
+    if natsort_available:
+        files = natsorted(files)
+    else:
+        import re as _re
+
+        def natural_key(s):
+            return [int(text) if text.isdigit() else text.lower() for text in _re.split("([0-9]+)", s)]
+
+        files.sort(key=natural_key)
+
+    for file in files:
+        for idx, pat in enumerate(log_patterns):
+            m = pat.match(file)
+            if m:
+                draw_time_graph(log_dir, file, 128, "prefill")
+                draw_time_graph(log_dir, file, 128, "decode")
+                draw_time_graph(log_dir, file, 128, "all")
+
+
+if __name__ == "__main__":
+    print("Starting to draw logs...")
+    main()
diff --git a/examples/intel_hpu/eb45-300b-a47b-32k-bf16.yaml b/examples/intel_hpu/eb45-300b-a47b-32k-bf16.yaml
new file mode 100644
index 00000000000..6b817941092
--- /dev/null
+++ b/examples/intel_hpu/eb45-300b-a47b-32k-bf16.yaml
@@ -0,0 +1,5 @@
+max_model_len: 32768
+max_num_seqs: 128
+kv_cache_ratio: 0.75
+tensor_parallel_size: 8
+max_num_batched_tokens: 32768
diff --git a/examples/intel_hpu/intel_hpu_serving_benchmark.md b/examples/intel_hpu/intel_hpu_serving_benchmark.md
new file mode 100644
index 00000000000..05b7e45698a
--- /dev/null
+++ b/examples/intel_hpu/intel_hpu_serving_benchmark.md
@@ -0,0 +1,70 @@
+# Intel HPU serving benchmark
+These scripts launch the FastDeploy Paddle large-model inference service for performance and stress testing.
+
+## Main HPU-Specific Parameters
+- `HPU_WARMUP_BUCKET`: Whether to enable warmup (1 means enabled)
+- `HPU_WARMUP_MODEL_LEN`: Model length used for warmup (input plus output)
+- `MAX_PREFILL_NUM`: Maximum batch size in the prefill stage, default 3
+- `BATCH_STEP_PREFILL`: Batch step in the prefill stage, default 1
+- `SEQUENCE_STEP_PREFILL`: Sequence step in the prefill stage, default 128, same as the block size
+- `CONTEXT_BLOCK_STEP_PREFILL`: Step size for block hits when prefix caching is enabled, default 1
+- `BATCH_STEP_DECODE`: Batch step in the decode stage, default 4
+- `BLOCK_STEP_DECODE`: Block step in the decode stage, default 16
+- `FLAGS_intel_hpu_recipe_cache_num`: Limit on the number of cached HPU recipes
+- `FLAGS_intel_hpu_recipe_cache_config`: HPU recipe cache config, can be used for warmup optimization
+- `GC_KERNEL_PATH`: Path of the HPU TPC kernels library
+- `HABANA_PROFILE`: Whether to enable the profiler (1 means enabled)
+- `PROFILE_START`: Profiler start step
+- `PROFILE_END`: Profiler end step
+
+## Usage
+### 1. Start server
+Two setup scripts are provided to start the FastDeploy server: one for the random (fixed-length) dataset and one for ShareGPT.
+
+Before running, make sure the model path and port number are set correctly in the script.
+```bash
+./benchmark_paddle_hpu_server.sh
+./benchmark_paddle_hpu_server_sharegpt.sh
+```
+You can set `HPU_VISIBLE_DEVICES` in the script to select the HPU card(s).
+
+### 2. Run client
+Correspondingly, there are matching client test scripts. `benchmark_paddle_hpu_cli.sh` supports both variable- and fixed-length tests.
+
+Before running, make sure the model path, port number, and input/output settings are set correctly in the script.
+```bash
+./benchmark_paddle_hpu_cli.sh
+./benchmark_paddle_hpu_cli_sharegpt.sh
+```
+
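+Before launching a full benchmark sweep, you can optionally send a single request to sanity-check the running server. The snippet below is a minimal sketch mirroring the request format used in `bench_gsm8k.py`; the prompt and `max_tokens` value are arbitrary, and the default port 8188 is assumed.
+```python
+import json
+
+import requests
+
+# One smoke-test request against the server started in step 1 (default port 8188).
+url = "http://127.0.0.1:8188/v1/chat/completions"
+payload = {
+    "messages": [{"role": "user", "content": "Say hello in one sentence."}],
+    "max_tokens": 32,
+}
+response = requests.post(url, headers={"Content-Type": "application/json"}, data=json.dumps(payload))
+print(response.json()["choices"][0]["message"]["content"])
+```
+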
+### 3. Parse logs
+After batch testing, run the following script to automatically parse the logs and generate a CSV file.
+```bash
+python parse_benchmark_logs.py benchmark_fastdeploy_logs/[the targeted folder]
+```
+The performance data will be saved as a CSV file.
+
+### 4. Analyse logs
+During HPU model runner execution, performance logs are generated. The following script parses these logs and produces performance graphs to help identify bottlenecks.
+```bash
+python draw_benchmark_data.py benchmark_fastdeploy_logs/[the targeted folder]
+```
+The script saves the model execution times and batch tokens to a CSV file and plots them in a graph.
+
+### 5. Accuracy test
+Accuracy testing uses GSM8K. Use the following conversion to generate the test file.
+```python
+>>> import pandas as pd
+>>> df = pd.read_parquet('tests/ce/accuracy_cases/gsm8k.parquet', engine='pyarrow')
+>>> df.to_json('test.jsonl', orient='records', lines=True)
+```
+Run the following command to perform the accuracy test.
+```bash
+python -u bench_gsm8k.py --port 8188 --num-questions 1319 --num-shots 5 --parallel 64
+```
+
+### 6. Offline demo
+To quickly run an offline demo on HPU, set `model_name_or_path` in `offline_demo.py` and then run the start script directly.
+```bash
+./run_offline_demo.sh
+```
diff --git a/examples/intel_hpu/offline_demo.py b/examples/intel_hpu/offline_demo.py
new file mode 100644
index 00000000000..ee62921911f
--- /dev/null
+++ b/examples/intel_hpu/offline_demo.py
@@ -0,0 +1,53 @@
+"""
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+
+from fastdeploy.engine.sampling_params import SamplingParams
+from fastdeploy.entrypoints.llm import LLM
+
+model_name_or_path = "ERNIE-4.5-21B-A3B-Paddle"
+# model_name_or_path = "ERNIE-4.5-300B-A47B-Paddle"
+
+# Hyperparameter settings
+input_bs = 1
+input_seq = None  # 1000
+max_out_tokens = 128
+server_max_bs = 128
+TP = 1
+
+# num_gpu_blocks_override = ceil((input_seq + max_out_tokens) / 128) * server_max_bs
+num_gpu_blocks_override = 2000
+sampling_params = SamplingParams(max_tokens=max_out_tokens)
+graph_optimization_config = {"use_cudagraph": False}
+llm = LLM(
+    model=model_name_or_path,
+    tensor_parallel_size=TP,
+    engine_worker_queue_port=8602,
+    num_gpu_blocks_override=num_gpu_blocks_override,
+    block_size=128,
+    max_model_len=32768,
+    max_num_seqs=server_max_bs,
+    graph_optimization_config=graph_optimization_config,
+)
+
+if input_seq is None:
+    prompt = "user: who are you?"
+else:
+    prompt = "hi " * input_seq
+prompts = [prompt] * input_bs
+# Run generation twice; only the output of the final run is printed below.
+for i in range(2):
+    output = llm.generate(prompts=prompts, use_tqdm=True, sampling_params=sampling_params)
+
+print(output)
diff --git a/examples/intel_hpu/parse_benchmark_logs.py b/examples/intel_hpu/parse_benchmark_logs.py
new file mode 100644
index 00000000000..ee58011e3c0
--- /dev/null
+++ b/examples/intel_hpu/parse_benchmark_logs.py
@@ -0,0 +1,195 @@
+import csv
+import os
+import re
+import sys
+
+log_patterns = [
+    re.compile(
+        r"benchmarkdata_(.+?)_inputlength_(\d+)_outputlength_(\d+)_batchsize_(\d+)_numprompts_(\d+)_.*(?
1: + log_dir = sys.argv[1] + else: + log_dir = "." + try: + from natsort import natsorted + + natsort_available = True + except ImportError: + natsort_available = False + all_files = set(os.listdir(log_dir)) + files = [] + for f in os.listdir(log_dir): + for pat in log_patterns: + if pat.match(f): + files.append(f) + break + if natsort_available: + files = natsorted(files) + else: + import re as _re + + def natural_key(s): + return [int(text) if text.isdigit() else text.lower() for text in _re.split("([0-9]+)", s)] + + files.sort(key=natural_key) + rows = [] + + for file in files: + m = None + matched_idx = -1 + for idx, pat in enumerate(log_patterns): + m = pat.match(file) + if m: + matched_idx = idx + break + if not m: + continue + # model_name, input_len, output_len, batch_size, num_prompts + # model_name, num_prompts, max_concurrency + if matched_idx == 0: + model_name, input_len, output_len, batch_size, num_prompts = m.groups() + elif matched_idx == 1: + model_name, num_prompts, max_concurrency = m.groups() + input_len = "-" + output_len = "-" + if file.endswith(".log"): + profile_file = file[:-4] + "_profile.log" + else: + profile_file = "" + model_first = model_average = postprocessing_average = steppaddle_average = "" + if profile_file in all_files: + prepare_input_times, model_times, postprocessing_times, steppaddle_times = parse_profile_log_file( + os.path.join(log_dir, profile_file) + ) + _, pia = calculate_times(prepare_input_times, False) + mf, ma = calculate_times(model_times, True) + _, pa = calculate_times(postprocessing_times, False) + _, sa = calculate_times(steppaddle_times, False) + prepare_input_average = pia if pia is not None else "" + model_first = mf if mf is not None else "" + model_average = ma if ma is not None else "" + postprocessing_average = pa if pa is not None else "" + steppaddle_average = sa if sa is not None else "" + data = parse_benchmark_log_file(os.path.join(log_dir, file)) + data["dataset"] = "Fixed-Length" if matched_idx == 0 else "ShareGPT" + data["model_name"] = model_name + data["input_length"] = input_len + data["output_length"] = output_len + data["batch_size"] = batch_size if matched_idx == 0 else max_concurrency + data["num_prompts"] = num_prompts + data["prepare_input_average"] = prepare_input_average + data["model_execute_first"] = model_first + data["model_execute_average"] = model_average + data["postprocessing_execute_average"] = postprocessing_average + data["steppaddle_execute_average"] = steppaddle_average + rows.append(data) + + import datetime + + import pytz + + shanghai_tz = pytz.timezone("Asia/Shanghai") + now = datetime.datetime.now(shanghai_tz) + ts = now.strftime("%Y%m%d_%H%M%S") + log_dir_name = os.path.basename(os.path.abspath(log_dir)) + if log_dir_name == "" or log_dir == "." 
or log_dir == "/": + csv_filename = f"benchmark_summary_{ts}.csv" + else: + csv_filename = f"benchmark_summary_{log_dir_name}_{ts}.csv" + fieldnames = ( + [ + "model_name", + "dataset", + "input_length", + "output_length", + "batch_size", + "num_prompts", + ] + + [name for name, _ in metrics] + + [ + "prepare_input_average", + "model_execute_first", + "model_execute_average", + "postprocessing_execute_average", + "steppaddle_execute_average", + ] + ) + with open(csv_filename, "w", newline="", encoding="utf-8") as csvfile: + writer = csv.DictWriter(csvfile, fieldnames=fieldnames) + writer.writeheader() + for row in rows: + writer.writerow(row) + print(f"CSV saved as: {csv_filename}") + + +if __name__ == "__main__": + print("Starting to parse logs...") + main() diff --git a/examples/intel_hpu/run_offline_demo.sh b/examples/intel_hpu/run_offline_demo.sh new file mode 100755 index 00000000000..0bc8c1ce8dc --- /dev/null +++ b/examples/intel_hpu/run_offline_demo.sh @@ -0,0 +1,19 @@ +#!/bin/bash + +export GC_KERNEL_PATH=/usr/lib/habanalabs/libtpc_kernels.so +export GC_KERNEL_PATH=/usr/local/lib/python3.10/dist-packages/paddle_custom_device/intel_hpu/libcustom_tpc_perf_lib.so:$GC_KERNEL_PATH +export INTEL_HPU_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +export PADDLE_DISTRI_BACKEND=xccl +export PADDLE_XCCL_BACKEND=intel_hpu +# export HPU_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +export HPU_VISIBLE_DEVICES=0 +export HABANA_PROFILE=0 +export PROFILE_START=1 +export PROFILE_END=3 +# export HABANA_LOGS=hpu_logs +# export LOG_LEVEL_ALL=0 +# export FLAGS_intel_hpu_runtime_debug=1 +# export FLAGS_intel_hpu_reciperunner_debug=1 + +rm -rf log +FD_ATTENTION_BACKEND=HPU_ATTN python offline_demo.py diff --git a/scripts/run_ci_hpu.sh b/scripts/run_ci_hpu.sh index 112e1046112..cc8e8d554d5 100755 --- a/scripts/run_ci_hpu.sh +++ b/scripts/run_ci_hpu.sh @@ -26,8 +26,8 @@ python -m pip uninstall fastdeploy_intel_hpu -y #to install paddlepaddle pip install paddlepaddle==3.1.1 -i https://www.paddlepaddle.org.cn/packages/stable/cpu/ #to install paddlecustomdevice? (paddle-intel-hpu) -pip install https://paddle-qa.bj.bcebos.com/suijiaxin/HPU/paddle_intel_hpu-0.0.1-cp310-cp310-linux_x86_64.whl -pip install https://paddle-qa.bj.bcebos.com/suijiaxin/HPU/paddlenlp_ops-0.0.0-cp310-cp310-linux_x86_64.whl +pip install https://paddle-qa.bj.bcebos.com/suijiaxin/HPU/paddle_intel_hpu-0.0.2-cp310-cp310-linux_x86_64.whl +pip install https://paddle-qa.bj.bcebos.com/suijiaxin/HPU/paddlenlp_ops-0.0.2-cp310-cp310-linux_x86_64.whl #to build and install fastdeploy echo "build whl"