
Commit 8f75eb1

[LLM Runtime] Sync inference scripts (#961)

Sync inference scripts

1 parent: f29c1ec
File tree: 11 files changed (+542, −272 lines)

.github/workflows/cpp-graph-test.yml (15 additions, 5 deletions)

@@ -5,10 +5,9 @@ on:
     branches: [main]
     paths:
       - '.github/workflows/cpp-graph-test.yml'
-      - '.github/workflows/script/models/cpp_graph_inference.sh'
       - 'intel_extension_for_transformers/llm/runtime/graph/**'
       - 'intel_extension_for_transformers/llm/library/jblas/**'
-      - '!intel_extension_for_transformers/llm/runtime/graph/*.md'
+      - '!**/*.md'
   workflow_dispatch:
     inputs:
       compiler_version:
@@ -70,8 +69,19 @@ jobs:
 
       - name: BF16 Benchmark
         run: |
-          cd ${{ github.workspace }}/.github/workflows/script/models
-          bash cpp_graph_inference.sh cpp-graph-test ${{ matrix.modelName }} ${{ env.INPUT_COMPILER_VERSION }}
+          WORKSPACE=${{ env.WORKING_DIR }} bash -eo pipefail ${{ env.GRAPH_DIR }}/scripts/ci/cpp_graph_inference.sh \
+            --local_models="${{ github.workspace }}/.github/workflows/script/models/local_models.json" \
+            --cores_list="48," \
+            --input_list="32,1024" \
+            -- \
+            cpp-graph-test \
+            ${{ matrix.modelName }} \
+            ${{ env.GRAPH_DIR }} \
+            ${{ env.WORKING_DIR }} \
+            ${{ env.INPUT_COMPILER_VERSION }}
+        env:
+          WORKSPACE: ${{ env.WORKING_DIR }}
+          GRAPH_DIR: ${{ env.WORKING_DIR }}/intel_extension_for_transformers/llm/runtime/graph
 
       - name: Rename summary
         run: |
@@ -137,7 +147,7 @@ jobs:
           /usr/bin/bash generate_report.sh --workflow=deploy
           sed -n '/<body>/,/<\/body>/p' generated/report.html | sed -r '/^$/d' | sed -r 's/^ +//g' >> $GITHUB_STEP_SUMMARY
         env:
-          RUN_DISPLAY_URL: https://github.com/VincyZhang/intel-extension-for-transformers/actions/runs/${{ github.run_id }}
+          RUN_DISPLAY_URL: https://github.com/${{github.repository}}/actions/runs/${{ github.run_id }}
           BUILD_NUMBER: ${{ github.run_id }}
           JOB_STATUS: succeed
           MR_source_branch: ${{ github.head_ref }}
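
For orientation, here is a minimal sketch of the argument contract the new invocation implies, re-expressed in Python (not part of the commit; the real cpp_graph_inference.sh is a bash script). The option names and defaults come from the invocation above; the parsing approach and the meaning of each positional argument are assumptions:

# Sketch of the implied CLI contract. argparse treats a bare `--` as the end
# of named options, mirroring the separator in the workflow's invocation.
import argparse
import os

parser = argparse.ArgumentParser(description="cpp graph inference (sketch)")
parser.add_argument("--local_models", default=None,
                    help="JSON file mapping model IDs to local checkout paths")
parser.add_argument("--cores_list", default="48,",
                    help="comma-separated core counts, e.g. '48,'")
parser.add_argument("--input_list", default="32,1024",
                    help="comma-separated prompt lengths in tokens")
# Five positionals follow `--`: a run label, the model name, the graph
# directory, the working directory, and the compiler version (assumed order,
# matching the workflow's argument list above).
parser.add_argument("positionals", nargs=5, metavar="ARG")

args = parser.parse_args()
run_label, model_name, graph_dir, working_dir, compiler_version = args.positionals
workspace = os.environ.get("WORKSPACE", working_dir)  # workflow exports WORKSPACE
cores_list = [int(c) for c in args.cores_list.split(",") if c]
input_list = [int(n) for n in args.input_list.split(",") if n]
print(run_label, model_name, cores_list, input_list, workspace)

The bare -- separator is the conventional end-of-options marker: it lets positional values pass through even if they begin with dashes, and keeps the named options independent of positional order.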

.github/workflows/script/models/cpp_graph_inference.sh (249 deletions)

This file was deleted; the workflow now invokes the synced copy at intel_extension_for_transformers/llm/runtime/graph/scripts/ci/cpp_graph_inference.sh (see the BF16 Benchmark step above).
.github/workflows/script/models/local_models.json (new file, 17 additions)

@@ -0,0 +1,17 @@
+{
+    "meta-llama/Llama-2-7b-chat-hf": "/tf_dataset2/models/nlp_toolkit/llama-2-7b-chat/Llama-2-7b-chat-hf",
+    "EleutherAI/gpt-j-6b": "/tf_dataset2/models/pytorch/gpt-j-6B",
+    "EleutherAI/gpt-neox-20b": "/tf_dataset2/models/nlp_toolkit/gpt-neox-20b",
+    "mosaicml/mpt-7b": "/tf_dataset2/models/nlp_toolkit/mpt-7b",
+    "tiiuae/falcon-7b": "/tf_dataset2/models/nlp_toolkit/falcon-7b",
+    "bigcode/starcoder": "/tf_dataset2/models/pytorch/starcode_3b",
+    "bigscience/bloom-7b1": "/tf_dataset2/models/pytorch/bloom-7b1",
+    "facebook/opt-1.3b": "/tf_dataset2/models/pytorch/opt-1.3b",
+    "databricks/dolly-v2-3b": "/tf_dataset2/models/pytorch/dolly_v2_3b",
+    "THUDM/chatglm2-6b": "/tf_dataset2/models/pytorch/chatglm2-6b",
+    "THUDM/chatglm-6b": "/tf_dataset2/models/pytorch/chatglm-6b",
+    "baichuan-inc/Baichuan2-13B-Chat": "/tf_dataset2/models/pytorch/Baichuan2-13B-Chat",
+    "baichuan-inc/Baichuan-13B-Chat": "/tf_dataset2/models/pytorch/Baichuan-13B-Chat",
+    "mistralai/Mistral-7B-v0.1": "/tf_dataset2/models/pytorch/Mistral-7B-v0.1",
+    "Qwen/Qwen-7B-Chat": "/tf_dataset2/models/nlp_toolkit/Qwen-7B-Chat"
+}
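
This mapping pairs Hugging Face model IDs with checkpoints pre-staged on the CI hosts, so benchmark runs avoid re-downloading weights. As a minimal sketch (not part of the commit) of how a harness could consume it, where resolve_model_path is a hypothetical helper:

import json

def resolve_model_path(model_id, local_models_json):
    # Prefer a pre-staged local checkout; fall back to the hub ID itself,
    # which downstream tooling can download on demand.
    with open(local_models_json, "r", encoding="UTF-8") as f:
        local_models = json.load(f)
    return local_models.get(model_id, model_id)

# On a CI host this would print /tf_dataset2/models/pytorch/gpt-j-6B.
print(resolve_model_path("EleutherAI/gpt-j-6b", "local_models.json"))
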
A requirements file (1 addition, 11 deletions; the file path is not shown in this capture)

@@ -1,11 +1 @@
-torch
-transformers
-numpy
-sentencepiece
-protobuf<3.20
-einops
-accelerate
-peft
-datasets
-transformers_stream_generator
-tiktoken
+-r scripts/requirements/common.txt

The pinned dependency list is replaced by pip's -r include, which pulls the shared requirements from scripts/requirements/common.txt (resolved relative to the directory of the file containing the include).
.github/workflows/script/models/calculate_percentage.py renamed to intel_extension_for_transformers/llm/runtime/graph/scripts/ci/calculate_percentiles.py (29 additions, 7 deletions)

@@ -1,13 +1,31 @@
+# Copyright (c) 2023 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import numpy as np
 import re
 import sys
 import os
+
+
 def calculate_percentile(data, percentile):
     return np.percentile(data, percentile, method="closest_observation")
 
+
 def calculate_mean(data):
     return np.mean(data)
 
+
 def parse_output_file(file_path):
     predictions = []
     with open(file_path, 'r', encoding='UTF-8', errors='ignore') as file:
@@ -17,6 +35,8 @@ def parse_output_file(file_path):
             prediction_time = float(match.group(1))  # Assuming the prediction time is in the second column
             predictions.append(prediction_time)
     return predictions
+
+
 def parse_memory_file(memory_file):
     memory_values = []
     if os.path.exists(memory_file):
@@ -44,14 +64,15 @@ def parse_memory_file(memory_file):
 batch_size = sys.argv[5]
 model_input = sys.argv[6]
 model_output = sys.argv[7]
-memory_file = os.environ.get("WORKING_DIR") + "/memory.txt"
+memory_file = os.environ.get("WORKSPACE") + "/memory.txt"
 predictions = parse_output_file(output_file)
+assert len(predictions) > 0, "Model has no ouput tokens!"
 first_token_latency = predictions[0]
 p90 = calculate_percentile(predictions, 90)
 p99 = calculate_percentile(predictions, 99)
 latency_mean = calculate_mean(predictions[1:])
 total_latency = np.sum(predictions)
-
+
 print("P90: {:.2f} ms".format(p90))
 print("P99: {:.2f} ms".format(p99))
 print("average_latency: {:.2f} ms".format(latency_mean))
@@ -63,9 +84,10 @@ def parse_memory_file(memory_file):
     memory_mean = calculate_mean(top_50_percent)
 
     print("Memory Mean (Top 50%): {:.2f}".format(memory_mean))
-log_file = os.environ.get("WORKING_DIR") + "/cpp_graph_summary.log"
-link = os.environ.get("WORKING_DIR") + os.path.basename(output_file)
-with open (log_file, 'a') as f:
+log_file = os.environ.get("WORKSPACE") + "/cpp_graph_summary.log"
+log_prefix = os.environ.get("log_prefix")
+link = str(log_prefix) + os.path.basename(output_file)
+with open(log_file, 'a') as f:
     f.write("engine,")
     f.write("latency,")
     f.write(model + ",")
@@ -81,8 +103,8 @@ def parse_memory_file(memory_file):
     f.write(link + ",")
     f.write("{:.2f},".format(p90))
     f.write("{:.2f},".format(p99))
-    #f.write(",latency:")
-    #for latency in predictions:
+    # f.write(",latency:")
+    # for latency in predictions:
     #     f.write(",{:.2f}".format(latency))
     f.write("\n")
     f.close()
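
A usage sketch for the percentile helpers above, on fabricated latency values (not data from the commit). np.percentile with method="closest_observation" picks an actual observed sample rather than interpolating, so the reported P90/P99 are real measured per-token times; note the script averages predictions[1:], skipping the first token, whose latency typically includes prompt prefill:

import numpy as np

def calculate_percentile(data, percentile):
    return np.percentile(data, percentile, method="closest_observation")

# Fabricated per-token prediction times in milliseconds.
predictions = [180.0, 21.5, 22.0, 20.9, 23.4, 21.1, 25.0, 21.8]

first_token_latency = predictions[0]        # first token: prompt processing included
p90 = calculate_percentile(predictions, 90)
p99 = calculate_percentile(predictions, 99)
latency_mean = np.mean(predictions[1:])     # next-token mean, as in the script
print("P90: {:.2f} ms, P99: {:.2f} ms, mean: {:.2f} ms, first: {:.2f} ms".format(
    p90, p99, latency_mean, first_token_latency))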
