
Commit 68d224f

add benchmarking tools
1 parent 855aa8d commit 68d224f

6 files changed: +271 -0 lines changed


benchmarks/download_models.sh

Lines changed: 11 additions & 0 deletions
set -eo pipefail

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
mkdir -p "$SCRIPT_DIR/models"
#huggingface-cli download TheBloke/Llama-2-7B-GGUF llama-2-7b.Q4_K_M.gguf --local-dir "$SCRIPT_DIR/models" --local-dir-use-symlinks False
#huggingface-cli download TheBloke/Llama-2-13B-GGUF llama-2-13b.Q4_K_M.gguf --local-dir "$SCRIPT_DIR/models" --local-dir-use-symlinks False
#huggingface-cli download TheBloke/Llama-2-7B-GGUF llama-2-7b.Q8_0.gguf --local-dir "$SCRIPT_DIR/models" --local-dir-use-symlinks False
#huggingface-cli download TheBloke/Llama-2-13B-GGUF llama-2-13b.Q8_0.gguf --local-dir "$SCRIPT_DIR/models" --local-dir-use-symlinks False
#huggingface-cli download TheBloke/Llama-2-70B-GGUF llama-2-70b.Q4_K_M.gguf --local-dir "$SCRIPT_DIR/models" --local-dir-use-symlinks False
# only the Llama 3 8B Q8_0 build is fetched by default; uncomment lines above or below to cover other models
huggingface-cli download QuantFactory/Meta-Llama-3-8B-Instruct-GGUF Meta-Llama-3-8B-Instruct.Q8_0.gguf --local-dir "$SCRIPT_DIR/models" --local-dir-use-symlinks False
#huggingface-cli download QuantFactory/Meta-Llama-3-8B-Instruct-GGUF Meta-Llama-3-8B-Instruct.Q4_K_M.gguf --local-dir "$SCRIPT_DIR/models" --local-dir-use-symlinks False
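
Note: the huggingface-cli tool used above ships with the huggingface-hub package installed by setup_deb.sh below.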

benchmarks/run.py

Lines changed: 118 additions & 0 deletions
import os
import sys
import time
import signal
import psutil
import argparse
import subprocess
from utils.benchmark import parse_threads_range


def get_file_dir():
    return os.path.dirname(os.path.realpath(__file__))


def docker_init():
    tag = "amperecomputingai/llama.cpp:1.2.3"
    if subprocess.run(["docker", "pull", tag]).returncode != 0:
        print("Docker pull process failed!")
        sys.exit(1)
    container_name = "llama_benchmark"
    subprocess.run(["docker", "rm", "-f", container_name])  # remove any stale container first
    memory = (psutil.virtual_memory().total >> 30) - 30  # leave 30 GiB for the OS, e.g. a 256 GiB host yields a "226g" cap
    assert memory > 10, "less than 10GB of memory available on the system for llama.cpp"
    if subprocess.run(
            ["docker", "run", "--privileged=true", "--name", container_name, "-d", "-m", f"{memory}g", "-v",
             f"{get_file_dir()}:/runner", "--entrypoint", "/bin/bash", "-it", tag]).returncode != 0:
        print("Docker run process failed!")
        sys.exit(1)
    return container_name


def docker_restart(docker_name):
    break_time = 15

    def docker_stop():
        if subprocess.run(["docker", "stop", docker_name]).returncode != 0:
            print(f"Stopping docker container {docker_name} failed, retrying in {break_time} seconds.")
            time.sleep(break_time)
            docker_stop()

    def docker_start():
        if subprocess.run(["docker", "start", docker_name]).returncode != 0:
            print(f"Starting docker container {docker_name} failed, retrying in {break_time} seconds.")
            time.sleep(break_time)
            docker_start()

    print(f"\nRestarting docker container {docker_name} ...")
    docker_stop()
    docker_start()


def benchmark(docker_container_name, args):
    num_available_threads = len(parse_threads_range(args.threads_range))
    if num_available_threads < max(args.num_threads):
        print(f"Requested number of threads ({max(args.num_threads)}) exceeds threads available ({num_available_threads})")
        sys.exit(1)

    docker_restart(docker_container_name)
    for model in args.model_names:
        for prompt_size in sorted(args.prompt_sizes):
            for batch_size in sorted(args.batch_sizes):
                for num_threads in sorted(args.num_threads):
                    # pack as many processes as fit in the designated thread range
                    num_processes = num_available_threads // num_threads
                    case = f"{num_processes} x {num_threads} [proc x threads], bs = {batch_size}"
                    print(f"\nRunning {case}")

                    cmd = (f"cd /runner; python3 utils/benchmark.py -m models/{model} -n {num_processes} "
                           f"-t {num_threads} -b {batch_size} -p {prompt_size} -r {args.threads_range}")
                    cmd = ["docker", "exec", "-i", docker_container_name, "bash", "-c", cmd]

                    print(f"Executing: {' '.join(cmd)}")
                    success = False
                    start = time.time()
                    p = subprocess.Popen(cmd, start_new_session=True)
                    while time.time() - start < args.timeout:
                        time.sleep(1)
                        exit_code = p.poll()
                        if exit_code is not None:
                            success = exit_code == 0
                            break
                    else:
                        # timed out: kill the session spawned for the docker exec so it doesn't linger
                        os.killpg(os.getpgid(p.pid), signal.SIGKILL)
                    if success:
                        print(f"SUCCESS: {case}")
                    else:
                        print(f"FAIL: {case}")
                        docker_restart(docker_container_name)
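

# illustration (values assumed, mirroring run.sh below): with -r 0-79 there are 80
# designated threads, so each case packs num_available_threads // num_threads processes:
#   -t 10 -> 8 x 10, -t 16 -> 5 x 16, -t 32 -> 2 x 32,
#   -t 40 -> 2 x 40, -t 64 -> 1 x 64, -t 80 -> 1 x 80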


def parse_args():
    parser = argparse.ArgumentParser(description="Run set of benchmarks.")
    parser.add_argument("-m", "--model_names",
                        type=str, required=True, nargs="+",
                        help="model names, e.g. 'Meta-Llama-3-8B-Instruct.Q8_0.gguf'")
    parser.add_argument("-t", "--num_threads",
                        type=int, required=True, nargs="+",
                        help="numbers of threads per process to use")
    parser.add_argument("-b", "--batch_sizes",
                        type=int, required=True, nargs="+",
                        help="batch sizes to cover")
    parser.add_argument("-p", "--prompt_sizes",
                        type=int, required=True, nargs="+",
                        help="prompt sizes to cover")
    parser.add_argument("-r", "--threads_range",
                        type=str, required=True,
                        help="range of threads to use in offline mode, e.g. '0-63,128-191'; threads will be divided "
                             "between processes - hint: 'lscpu | grep NUMA'")
    parser.add_argument("--timeout",
                        type=float, default=900,
                        help="timeout in seconds to apply per single benchmark case")
    return parser.parse_args()


def main():
    args = parse_args()
    benchmark(docker_init(), args)


if __name__ == "__main__":
    main()

benchmarks/run.sh

Lines changed: 4 additions & 0 deletions
set -e

python3 run.py -m Meta-Llama-3-8B-Instruct.Q8_0.gguf -t 10 16 32 40 64 80 -b 1 2 4 8 16 32 64 -p 512 -r 0-79
rm -f /tmp/log_power
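
Note: this invocation sweeps 6 thread counts x 7 batch sizes x 1 prompt size = 42 benchmark cases for the one model; -r 0-79 designates threads 0 through 79, so e.g. the -t 10 cases run 8 processes of 10 threads each.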

benchmarks/setup_deb.sh

Lines changed: 5 additions & 0 deletions
set -eo pipefail

apt-get update && apt-get install -y docker.io python3 python3-pip
pip3 install huggingface-hub psutil
Binary file (5.71 KB) not shown.

benchmarks/utils/benchmark.py

Lines changed: 133 additions & 0 deletions
import os
import csv
import sys
import uuid
import time
import argparse
import subprocess

# number of tokens to generate per benchmark run
TOKENS = 256

# populated in main() with the intersection of online and designated threads
online_threads = None


def parse_args():
    parser = argparse.ArgumentParser(description="Run offline benchmark.")
    parser.add_argument("-m", "--model",
                        type=str, required=True,
                        help="name of the model")
    parser.add_argument("-b", "--batch_size",
                        type=int, required=True,
                        help="batch size to feed the model with")
    parser.add_argument("-p", "--prompt_size",
                        type=int, required=True,
                        help="prompt size to feed the model with")
    parser.add_argument("-r", "--threads_range",
                        type=str, required=True,
                        help="range of threads to use, e.g. '0-63,128-191'; threads will be divided between processes "
                             "- hint: 'lscpu | grep NUMA'")
    parser.add_argument("--kv_cache",
                        type=int, default=65536,
                        help="KV cache size")
    parser.add_argument("-n", "--num_processes",
                        type=int, default=1,
                        help="number of processes to spawn")
    parser.add_argument("-t", "--num_threads",
                        type=int, default=1,
                        help="number of threads to use per process")
    return parser.parse_args()


def parse_threads_range(threads_range: str) -> list[int]:
    threads_range = [s.split("-") for s in threads_range.split(",")]
    if not all([len(s) == 2 for s in threads_range]):
        print("Format of --threads_range argument must be '{idx}-{idx},{idx}-{idx},...', "
              "e.g. '88-88' to use just thread idx 88")
        sys.exit(1)
    designated_threads = []
    for s in threads_range:
        s_0, s_1 = int(s[0]), int(s[1])
        if s_1 < s_0:
            print(f"Range {s_0}-{s_1} is not valid, second value has to be equal to or greater than the first value")
            sys.exit(1)
        designated_threads += [i for i in range(s_0, s_1 + 1)]
    return designated_threads
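
# example: parse_threads_range("0-1,64-65") returns [0, 1, 64, 65]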


def gen_threads_config(num_threads, process_id):
    # each process gets a disjoint, contiguous slice of the online threads
    threads_to_use = [str(t) for t in online_threads[num_threads * process_id:num_threads * (process_id + 1)]]
    assert len(threads_to_use) == num_threads
    return ",".join(threads_to_use)
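
# example: with online_threads == [0, 1, ..., 79], gen_threads_config(10, 1) returns "10,11,12,13,14,15,16,17,18,19"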


def summarize_results(logs_dir, args, start, finish):
    ttfts = []
    tg_lats = []
    for n in range(args.num_processes):
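        # each log ends with batched-bench's summary table; the line 9 rows from the end is this
        # run's results row, whose '|'-separated fields (per llama.cpp's batched-bench) are:
        # | PP | TG | B | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s | T s | S t/s |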
        results = open(f"{logs_dir}/log_{n}", "r").readlines()[-9].split("|")
        prompt_size = int(results[1])
        assert prompt_size == args.prompt_size
        tokens_generated = int(results[2])
        assert tokens_generated == TOKENS
        batch_size = int(results[3])
        assert batch_size == args.batch_size
        ttfts.append(float(results[5]))
        tg_lats.append(float(results[7]))

    pp_throughput = sum([args.batch_size * args.prompt_size / ttft for ttft in ttfts])
    avg_pp_latency = sum(ttfts) / len(ttfts)
    tg_throughput = sum([args.batch_size * TOKENS / lat for lat in tg_lats])
    tg_per_token_lats = [lat / TOKENS for lat in tg_lats]
    avg_tg_latency = sum(tg_per_token_lats) / len(tg_per_token_lats)
    avg_total_speed = args.num_processes * args.batch_size * (args.prompt_size + TOKENS) / max([ttft + tg_lat for ttft, tg_lat in zip(ttfts, tg_lats)])

    results_filename = f"{args.model.split('/')[-1]}@PP{args.prompt_size}@TG{TOKENS}.csv"
    first_write = not os.path.exists(results_filename)
    with open(results_filename, "a") as f:
        writer = csv.writer(f)
        if first_write:
            writer.writerow(
                ["n_proc", "n_threads", "batch_size", "prompt_size", "output_tokens", "pp_throughput_tps",
                 "pp_avg_latency_sec", "tg_throughput_tps", "tg_avg_latency_sec", "pp+tg_throughput_tps", "concurrency", "start", "finish"])
        writer.writerow(
            [args.num_processes, args.num_threads, args.batch_size, args.prompt_size, TOKENS, pp_throughput,
             avg_pp_latency, tg_throughput, avg_tg_latency, avg_total_speed, args.batch_size * args.num_processes, start, finish])
    print(f"Result saved in {results_filename}")


def main():
    global online_threads

    args = parse_args()

    designated_threads = parse_threads_range(args.threads_range)
    # intersect the designated threads with the threads numactl reports as available
    numa_config = subprocess.run(["numactl", "--show"], capture_output=True, text=True, check=True)
    online_threads = [int(t) for t in numa_config.stdout.split("physcpubind: ")[1].split(" \ncpubind:")[0].split()
                      if int(t) in designated_threads]
    if len(online_threads) < args.num_processes * args.num_threads:
        print(f"Requested config requires {args.num_processes * args.num_threads} threads, while only {len(online_threads)} threads are both online and designated")
        sys.exit(1)

    logs_dir = os.path.join("/tmp", str(uuid.uuid4()))
    os.mkdir(logs_dir)
    current_subprocesses = list()
    for n in range(args.num_processes):
        logfile = f"{logs_dir}/log_{n}"
        # pin each batched-bench instance to its own slice of threads
        cmd = ["numactl", f"--physcpubind={gen_threads_config(args.num_threads, n)}",
               "/llm/batched-bench", args.model, str(args.kv_cache), "2048", "512", "0", "0", "0", str(args.prompt_size), str(TOKENS),
               str(args.batch_size), str(args.num_threads)]
        log = open(logfile, "wb")
        current_subprocesses.append(
            subprocess.Popen(cmd, stdout=log, stderr=log))
    start = time.time()
    if any(p.wait() != 0 for p in current_subprocesses):
        print("FAIL: At least one process returned exit code other than 0 or died!")
        sys.exit(1)
    finish = time.time()
    summarize_results(logs_dir, args, start, finish)


if __name__ == "__main__":
    main()
