
Commit 8f75eb1

[LLM Runtime] Sync inference scripts (#961)

Sync inference scripts

1 parent: f29c1ec
File tree: 11 files changed (+542, −272 lines)

.github/workflows/cpp-graph-test.yml (15 additions, 5 deletions)

@@ -5,10 +5,9 @@ on:
     branches: [main]
     paths:
       - '.github/workflows/cpp-graph-test.yml'
-      - '.github/workflows/script/models/cpp_graph_inference.sh'
       - 'intel_extension_for_transformers/llm/runtime/graph/**'
       - 'intel_extension_for_transformers/llm/library/jblas/**'
-      - '!intel_extension_for_transformers/llm/runtime/graph/*.md'
+      - '!**/*.md'
   workflow_dispatch:
     inputs:
       compiler_version:
@@ -70,8 +69,19 @@ jobs:
 
       - name: BF16 Benchmark
         run: |
-          cd ${{ github.workspace }}/.github/workflows/script/models
-          bash cpp_graph_inference.sh cpp-graph-test ${{ matrix.modelName }} ${{ env.INPUT_COMPILER_VERSION }}
+          WORKSPACE=${{ env.WORKING_DIR }} bash -eo pipefail ${{ env.GRAPH_DIR }}/scripts/ci/cpp_graph_inference.sh \
+            --local_models="${{ github.workspace }}/.github/workflows/script/models/local_models.json" \
+            --cores_list="48," \
+            --input_list="32,1024" \
+            -- \
+            cpp-graph-test \
+            ${{ matrix.modelName }} \
+            ${{ env.GRAPH_DIR }} \
+            ${{ env.WORKING_DIR }} \
+            ${{ env.INPUT_COMPILER_VERSION }}
+        env:
+          WORKSPACE: ${{ env.WORKING_DIR }}
+          GRAPH_DIR: ${{ env.WORKING_DIR }}/intel_extension_for_transformers/llm/runtime/graph
 
       - name: Rename summary
         run: |
@@ -137,7 +147,7 @@ jobs:
           /usr/bin/bash generate_report.sh --workflow=deploy
           sed -n '/<body>/,/<\/body>/p' generated/report.html | sed -r '/^$/d' | sed -r 's/^ +//g' >> $GITHUB_STEP_SUMMARY
         env:
-          RUN_DISPLAY_URL: https://github.com/VincyZhang/intel-extension-for-transformers/actions/runs/${{ github.run_id }}
+          RUN_DISPLAY_URL: https://github.com/${{github.repository}}/actions/runs/${{ github.run_id }}
           BUILD_NUMBER: ${{ github.run_id }}
           JOB_STATUS: succeed
           MR_source_branch: ${{ github.head_ref }}
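
For orientation, here is a minimal sketch of the argument contract the new invocation implies, re-expressed in Python (not part of the commit; the real cpp_graph_inference.sh is a bash script). The option names and defaults come from the invocation above; the parsing approach and the meaning of each positional argument are assumptions:

# Sketch of the implied CLI contract. argparse treats a bare `--` as the end
# of named options, mirroring the separator in the workflow's invocation.
import argparse
import os

parser = argparse.ArgumentParser(description="cpp graph inference (sketch)")
parser.add_argument("--local_models", default=None,
                    help="JSON file mapping model IDs to local checkout paths")
parser.add_argument("--cores_list", default="48,",
                    help="comma-separated core counts, e.g. '48,'")
parser.add_argument("--input_list", default="32,1024",
                    help="comma-separated prompt lengths in tokens")
# Five positionals follow `--`: a run label, the model name, the graph
# directory, the working directory, and the compiler version (assumed order,
# matching the workflow's argument list above).
parser.add_argument("positionals", nargs=5, metavar="ARG")

args = parser.parse_args()
run_label, model_name, graph_dir, working_dir, compiler_version = args.positionals
workspace = os.environ.get("WORKSPACE", working_dir)  # workflow exports WORKSPACE
cores_list = [int(c) for c in args.cores_list.split(",") if c]
input_list = [int(n) for n in args.input_list.split(",") if n]
print(run_label, model_name, cores_list, input_list, workspace)

The bare -- separator is the conventional end-of-options marker: it lets positional values pass through even if they begin with dashes, and keeps the named options independent of positional order.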

.github/workflows/script/models/cpp_graph_inference.sh (249 deletions)

This file was deleted; the workflow now invokes the synced copy at intel_extension_for_transformers/llm/runtime/graph/scripts/ci/cpp_graph_inference.sh (see the BF16 Benchmark step above).
.github/workflows/script/models/local_models.json (new file, 17 additions)

@@ -0,0 +1,17 @@
+{
+    "meta-llama/Llama-2-7b-chat-hf": "/tf_dataset2/models/nlp_toolkit/llama-2-7b-chat/Llama-2-7b-chat-hf",
+    "EleutherAI/gpt-j-6b": "/tf_dataset2/models/pytorch/gpt-j-6B",
+    "EleutherAI/gpt-neox-20b": "/tf_dataset2/models/nlp_toolkit/gpt-neox-20b",
+    "mosaicml/mpt-7b": "/tf_dataset2/models/nlp_toolkit/mpt-7b",
+    "tiiuae/falcon-7b": "/tf_dataset2/models/nlp_toolkit/falcon-7b",
+    "bigcode/starcoder": "/tf_dataset2/models/pytorch/starcode_3b",
+    "bigscience/bloom-7b1": "/tf_dataset2/models/pytorch/bloom-7b1",
+    "facebook/opt-1.3b": "/tf_dataset2/models/pytorch/opt-1.3b",
+    "databricks/dolly-v2-3b": "/tf_dataset2/models/pytorch/dolly_v2_3b",
+    "THUDM/chatglm2-6b": "/tf_dataset2/models/pytorch/chatglm2-6b",
+    "THUDM/chatglm-6b": "/tf_dataset2/models/pytorch/chatglm-6b",
+    "baichuan-inc/Baichuan2-13B-Chat": "/tf_dataset2/models/pytorch/Baichuan2-13B-Chat",
+    "baichuan-inc/Baichuan-13B-Chat": "/tf_dataset2/models/pytorch/Baichuan-13B-Chat",
+    "mistralai/Mistral-7B-v0.1": "/tf_dataset2/models/pytorch/Mistral-7B-v0.1",
+    "Qwen/Qwen-7B-Chat": "/tf_dataset2/models/nlp_toolkit/Qwen-7B-Chat"
+}
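
This mapping pairs Hugging Face model IDs with checkpoints pre-staged on the CI hosts, so benchmark runs avoid re-downloading weights. As a minimal sketch (not part of the commit) of how a harness could consume it, where resolve_model_path is a hypothetical helper:

import json

def resolve_model_path(model_id, local_models_json):
    # Prefer a pre-staged local checkout; fall back to the hub ID itself,
    # which downstream tooling can download on demand.
    with open(local_models_json, "r", encoding="UTF-8") as f:
        local_models = json.load(f)
    return local_models.get(model_id, model_id)

# On a CI host this would print /tf_dataset2/models/pytorch/gpt-j-6B.
print(resolve_model_path("EleutherAI/gpt-j-6b", "local_models.json"))
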
A requirements file (1 addition, 11 deletions; the file path is not shown in this capture)

@@ -1,11 +1 @@
-torch
-transformers
-numpy
-sentencepiece
-protobuf<3.20
-einops
-accelerate
-peft
-datasets
-transformers_stream_generator
-tiktoken
+-r scripts/requirements/common.txt

The pinned dependency list is replaced by pip's -r include, which pulls the shared requirements from scripts/requirements/common.txt (resolved relative to the directory of the file containing the include).
.github/workflows/script/models/calculate_percentage.py renamed to intel_extension_for_transformers/llm/runtime/graph/scripts/ci/calculate_percentiles.py (29 additions, 7 deletions)

@@ -1,13 +1,31 @@
+# Copyright (c) 2023 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import numpy as np
 import re
 import sys
 import os
+
+
 def calculate_percentile(data, percentile):
     return np.percentile(data, percentile, method="closest_observation")
 
+
 def calculate_mean(data):
     return np.mean(data)
 
+
 def parse_output_file(file_path):
     predictions = []
     with open(file_path, 'r', encoding='UTF-8', errors='ignore') as file:
@@ -17,6 +35,8 @@ def parse_output_file(file_path):
             prediction_time = float(match.group(1))  # Assuming the prediction time is in the second column
             predictions.append(prediction_time)
     return predictions
+
+
 def parse_memory_file(memory_file):
     memory_values = []
     if os.path.exists(memory_file):
@@ -44,14 +64,15 @@ def parse_memory_file(memory_file):
 batch_size = sys.argv[5]
 model_input = sys.argv[6]
 model_output = sys.argv[7]
-memory_file = os.environ.get("WORKING_DIR") + "/memory.txt"
+memory_file = os.environ.get("WORKSPACE") + "/memory.txt"
 predictions = parse_output_file(output_file)
+assert len(predictions) > 0, "Model has no ouput tokens!"
 first_token_latency = predictions[0]
 p90 = calculate_percentile(predictions, 90)
 p99 = calculate_percentile(predictions, 99)
 latency_mean = calculate_mean(predictions[1:])
 total_latency = np.sum(predictions)
-
+
 print("P90: {:.2f} ms".format(p90))
 print("P99: {:.2f} ms".format(p99))
 print("average_latency: {:.2f} ms".format(latency_mean))
@@ -63,9 +84,10 @@ def parse_memory_file(memory_file):
     memory_mean = calculate_mean(top_50_percent)
 
     print("Memory Mean (Top 50%): {:.2f}".format(memory_mean))
-log_file = os.environ.get("WORKING_DIR") + "/cpp_graph_summary.log"
-link = os.environ.get("WORKING_DIR") + os.path.basename(output_file)
-with open (log_file, 'a') as f:
+log_file = os.environ.get("WORKSPACE") + "/cpp_graph_summary.log"
+log_prefix = os.environ.get("log_prefix")
+link = str(log_prefix) + os.path.basename(output_file)
+with open(log_file, 'a') as f:
     f.write("engine,")
     f.write("latency,")
     f.write(model + ",")
@@ -81,8 +103,8 @@ def parse_memory_file(memory_file):
     f.write(link + ",")
     f.write("{:.2f},".format(p90))
     f.write("{:.2f},".format(p99))
-    #f.write(",latency:")
-    #for latency in predictions:
+    # f.write(",latency:")
+    # for latency in predictions:
     #     f.write(",{:.2f}".format(latency))
     f.write("\n")
     f.close()
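
A usage sketch for the percentile helpers above, on fabricated latency values (not data from the commit). np.percentile with method="closest_observation" picks an actual observed sample rather than interpolating, so the reported P90/P99 are real measured per-token times; note the script averages predictions[1:], skipping the first token, whose latency typically includes prompt prefill:

import numpy as np

def calculate_percentile(data, percentile):
    return np.percentile(data, percentile, method="closest_observation")

# Fabricated per-token prediction times in milliseconds.
predictions = [180.0, 21.5, 22.0, 20.9, 23.4, 21.1, 25.0, 21.8]

first_token_latency = predictions[0]        # first token: prompt processing included
p90 = calculate_percentile(predictions, 90)
p99 = calculate_percentile(predictions, 99)
latency_mean = np.mean(predictions[1:])     # next-token mean, as in the script
print("P90: {:.2f} ms, P99: {:.2f} ms, mean: {:.2f} ms, first: {:.2f} ms".format(
    p90, p99, latency_mean, first_token_latency))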
