
Commit 68d224f

add benchmarking tools
1 parent 855aa8d commit 68d224f

6 files changed: +271 -0 lines changed


benchmarks/download_models.sh

Lines changed: 11 additions & 0 deletions
set -eo pipefail

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
mkdir -p "$SCRIPT_DIR/models"
#huggingface-cli download TheBloke/Llama-2-7B-GGUF llama-2-7b.Q4_K_M.gguf --local-dir "$SCRIPT_DIR/models" --local-dir-use-symlinks False
#huggingface-cli download TheBloke/Llama-2-13B-GGUF llama-2-13b.Q4_K_M.gguf --local-dir "$SCRIPT_DIR/models" --local-dir-use-symlinks False
#huggingface-cli download TheBloke/Llama-2-7B-GGUF llama-2-7b.Q8_0.gguf --local-dir "$SCRIPT_DIR/models" --local-dir-use-symlinks False
#huggingface-cli download TheBloke/Llama-2-13B-GGUF llama-2-13b.Q8_0.gguf --local-dir "$SCRIPT_DIR/models" --local-dir-use-symlinks False
#huggingface-cli download TheBloke/Llama-2-70B-GGUF llama-2-70b.Q4_K_M.gguf --local-dir "$SCRIPT_DIR/models" --local-dir-use-symlinks False
# only the Llama 3 8B Q8_0 build is fetched by default; uncomment lines above or below to cover other models
huggingface-cli download QuantFactory/Meta-Llama-3-8B-Instruct-GGUF Meta-Llama-3-8B-Instruct.Q8_0.gguf --local-dir "$SCRIPT_DIR/models" --local-dir-use-symlinks False
#huggingface-cli download QuantFactory/Meta-Llama-3-8B-Instruct-GGUF Meta-Llama-3-8B-Instruct.Q4_K_M.gguf --local-dir "$SCRIPT_DIR/models" --local-dir-use-symlinks False
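
Note: the huggingface-cli tool used above ships with the huggingface-hub package installed by setup_deb.sh below.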

benchmarks/run.py

Lines changed: 118 additions & 0 deletions
import os
import sys
import time
import signal
import psutil
import argparse
import subprocess
from utils.benchmark import parse_threads_range


def get_file_dir():
    return os.path.dirname(os.path.realpath(__file__))


def docker_init():
    tag = "amperecomputingai/llama.cpp:1.2.3"
    if subprocess.run(["docker", "pull", tag]).returncode != 0:
        print("Docker pull process failed!")
        sys.exit(1)
    container_name = "llama_benchmark"
    subprocess.run(["docker", "rm", "-f", container_name])  # remove any stale container first
    memory = (psutil.virtual_memory().total >> 30) - 30  # leave 30 GiB for the OS, e.g. a 256 GiB host yields a "226g" cap
    assert memory > 10, "less than 10GB of memory available on the system for llama.cpp"
    if subprocess.run(
            ["docker", "run", "--privileged=true", "--name", container_name, "-d", "-m", f"{memory}g", "-v",
             f"{get_file_dir()}:/runner", "--entrypoint", "/bin/bash", "-it", tag]).returncode != 0:
        print("Docker run process failed!")
        sys.exit(1)
    return container_name


def docker_restart(docker_name):
    break_time = 15

    def docker_stop():
        if subprocess.run(["docker", "stop", docker_name]).returncode != 0:
            print(f"Stopping docker container {docker_name} failed, retrying in {break_time} seconds.")
            time.sleep(break_time)
            docker_stop()

    def docker_start():
        if subprocess.run(["docker", "start", docker_name]).returncode != 0:
            print(f"Starting docker container {docker_name} failed, retrying in {break_time} seconds.")
            time.sleep(break_time)
            docker_start()

    print(f"\nRestarting docker container {docker_name} ...")
    docker_stop()
    docker_start()


def benchmark(docker_container_name, args):
    num_available_threads = len(parse_threads_range(args.threads_range))
    if num_available_threads < max(args.num_threads):
        print(f"Requested number of threads ({max(args.num_threads)}) exceeds threads available ({num_available_threads})")
        sys.exit(1)

    docker_restart(docker_container_name)
    for model in args.model_names:
        for prompt_size in sorted(args.prompt_sizes):
            for batch_size in sorted(args.batch_sizes):
                for num_threads in sorted(args.num_threads):
                    # pack as many processes as fit in the designated thread range
                    num_processes = num_available_threads // num_threads
                    case = f"{num_processes} x {num_threads} [proc x threads], bs = {batch_size}"
                    print(f"\nRunning {case}")

                    cmd = (f"cd /runner; python3 utils/benchmark.py -m models/{model} -n {num_processes} "
                           f"-t {num_threads} -b {batch_size} -p {prompt_size} -r {args.threads_range}")
                    cmd = ["docker", "exec", "-i", docker_container_name, "bash", "-c", cmd]

                    print(f"Executing: {' '.join(cmd)}")
                    success = False
                    start = time.time()
                    p = subprocess.Popen(cmd, start_new_session=True)
                    while time.time() - start < args.timeout:
                        time.sleep(1)
                        exit_code = p.poll()
                        if exit_code is not None:
                            success = exit_code == 0
                            break
                    else:
                        # timed out: kill the session spawned for the docker exec so it doesn't linger
                        os.killpg(os.getpgid(p.pid), signal.SIGKILL)
                    if success:
                        print(f"SUCCESS: {case}")
                    else:
                        print(f"FAIL: {case}")
                        docker_restart(docker_container_name)
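

# illustration (values assumed, mirroring run.sh below): with -r 0-79 there are 80
# designated threads, so each case packs num_available_threads // num_threads processes:
#   -t 10 -> 8 x 10, -t 16 -> 5 x 16, -t 32 -> 2 x 32,
#   -t 40 -> 2 x 40, -t 64 -> 1 x 64, -t 80 -> 1 x 80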


def parse_args():
    parser = argparse.ArgumentParser(description="Run set of benchmarks.")
    parser.add_argument("-m", "--model_names",
                        type=str, required=True, nargs="+",
                        help="model names, e.g. 'Meta-Llama-3-8B-Instruct.Q8_0.gguf'")
    parser.add_argument("-t", "--num_threads",
                        type=int, required=True, nargs="+",
                        help="numbers of threads per process to use")
    parser.add_argument("-b", "--batch_sizes",
                        type=int, required=True, nargs="+",
                        help="batch sizes to cover")
    parser.add_argument("-p", "--prompt_sizes",
                        type=int, required=True, nargs="+",
                        help="prompt sizes to cover")
    parser.add_argument("-r", "--threads_range",
                        type=str, required=True,
                        help="range of threads to use in offline mode, e.g. '0-63,128-191'; threads will be divided "
                             "between processes - hint: 'lscpu | grep NUMA'")
    parser.add_argument("--timeout",
                        type=float, default=900,
                        help="timeout in seconds to apply per single benchmark case")
    return parser.parse_args()


def main():
    args = parse_args()
    benchmark(docker_init(), args)


if __name__ == "__main__":
    main()

benchmarks/run.sh

Lines changed: 4 additions & 0 deletions
set -e

python3 run.py -m Meta-Llama-3-8B-Instruct.Q8_0.gguf -t 10 16 32 40 64 80 -b 1 2 4 8 16 32 64 -p 512 -r 0-79
rm -f /tmp/log_power
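
Note: this invocation sweeps 6 thread counts x 7 batch sizes x 1 prompt size = 42 benchmark cases for the one model; -r 0-79 designates threads 0 through 79, so e.g. the -t 10 cases run 8 processes of 10 threads each.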

benchmarks/setup_deb.sh

Lines changed: 5 additions & 0 deletions
set -eo pipefail

apt-get update && apt-get install -y docker.io python3 python3-pip
pip3 install huggingface-hub psutil
Binary file (5.71 KB) not shown.

benchmarks/utils/benchmark.py

Lines changed: 133 additions & 0 deletions
import os
import csv
import sys
import uuid
import time
import argparse
import subprocess

# number of tokens to generate per benchmark run
TOKENS = 256

# populated in main() with the intersection of online and designated threads
online_threads = None


def parse_args():
    parser = argparse.ArgumentParser(description="Run offline benchmark.")
    parser.add_argument("-m", "--model",
                        type=str, required=True,
                        help="name of the model")
    parser.add_argument("-b", "--batch_size",
                        type=int, required=True,
                        help="batch size to feed the model with")
    parser.add_argument("-p", "--prompt_size",
                        type=int, required=True,
                        help="prompt size to feed the model with")
    parser.add_argument("-r", "--threads_range",
                        type=str, required=True,
                        help="range of threads to use, e.g. '0-63,128-191'; threads will be divided between processes "
                             "- hint: 'lscpu | grep NUMA'")
    parser.add_argument("--kv_cache",
                        type=int, default=65536,
                        help="KV cache size")
    parser.add_argument("-n", "--num_processes",
                        type=int, default=1,
                        help="number of processes to spawn")
    parser.add_argument("-t", "--num_threads",
                        type=int, default=1,
                        help="number of threads to use per process")
    return parser.parse_args()


def parse_threads_range(threads_range: str) -> list[int]:
    threads_range = [s.split("-") for s in threads_range.split(",")]
    if not all([len(s) == 2 for s in threads_range]):
        print("Format of --threads_range argument must be '{idx}-{idx},{idx}-{idx},...', "
              "e.g. '88-88' to use just thread idx 88")
        sys.exit(1)
    designated_threads = []
    for s in threads_range:
        s_0, s_1 = int(s[0]), int(s[1])
        if s_1 < s_0:
            print(f"Range {s_0}-{s_1} is not valid, second value has to be equal to or greater than the first value")
            sys.exit(1)
        designated_threads += [i for i in range(s_0, s_1 + 1)]
    return designated_threads
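
# example: parse_threads_range("0-1,64-65") returns [0, 1, 64, 65]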


def gen_threads_config(num_threads, process_id):
    # each process gets a disjoint, contiguous slice of the online threads
    threads_to_use = [str(t) for t in online_threads[num_threads * process_id:num_threads * (process_id + 1)]]
    assert len(threads_to_use) == num_threads
    return ",".join(threads_to_use)
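
# example: with online_threads == [0, 1, ..., 79], gen_threads_config(10, 1) returns "10,11,12,13,14,15,16,17,18,19"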


def summarize_results(logs_dir, args, start, finish):
    ttfts = []
    tg_lats = []
    for n in range(args.num_processes):
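        # each log ends with batched-bench's summary table; the line 9 rows from the end is this
        # run's results row, whose '|'-separated fields (per llama.cpp's batched-bench) are:
        # | PP | TG | B | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s | T s | S t/s |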
        results = open(f"{logs_dir}/log_{n}", "r").readlines()[-9].split("|")
        prompt_size = int(results[1])
        assert prompt_size == args.prompt_size
        tokens_generated = int(results[2])
        assert tokens_generated == TOKENS
        batch_size = int(results[3])
        assert batch_size == args.batch_size
        ttfts.append(float(results[5]))
        tg_lats.append(float(results[7]))

    pp_throughput = sum([args.batch_size * args.prompt_size / ttft for ttft in ttfts])
    avg_pp_latency = sum(ttfts) / len(ttfts)
    tg_throughput = sum([args.batch_size * TOKENS / lat for lat in tg_lats])
    tg_per_token_lats = [lat / TOKENS for lat in tg_lats]
    avg_tg_latency = sum(tg_per_token_lats) / len(tg_per_token_lats)
    avg_total_speed = args.num_processes * args.batch_size * (args.prompt_size + TOKENS) / max([ttft + tg_lat for ttft, tg_lat in zip(ttfts, tg_lats)])

    results_filename = f"{args.model.split('/')[-1]}@PP{args.prompt_size}@TG{TOKENS}.csv"
    first_write = not os.path.exists(results_filename)
    with open(results_filename, "a") as f:
        writer = csv.writer(f)
        if first_write:
            writer.writerow(
                ["n_proc", "n_threads", "batch_size", "prompt_size", "output_tokens", "pp_throughput_tps",
                 "pp_avg_latency_sec", "tg_throughput_tps", "tg_avg_latency_sec", "pp+tg_throughput_tps", "concurrency", "start", "finish"])
        writer.writerow(
            [args.num_processes, args.num_threads, args.batch_size, args.prompt_size, TOKENS, pp_throughput,
             avg_pp_latency, tg_throughput, avg_tg_latency, avg_total_speed, args.batch_size * args.num_processes, start, finish])
    print(f"Result saved in {results_filename}")


def main():
    global online_threads

    args = parse_args()

    designated_threads = parse_threads_range(args.threads_range)
    # intersect the designated threads with the threads numactl reports as available
    numa_config = subprocess.run(["numactl", "--show"], capture_output=True, text=True, check=True)
    online_threads = [int(t) for t in numa_config.stdout.split("physcpubind: ")[1].split(" \ncpubind:")[0].split()
                      if int(t) in designated_threads]
    if len(online_threads) < args.num_processes * args.num_threads:
        print(f"Requested config requires {args.num_processes * args.num_threads} threads, while only {len(online_threads)} threads are both online and designated")
        sys.exit(1)

    logs_dir = os.path.join("/tmp", str(uuid.uuid4()))
    os.mkdir(logs_dir)
    current_subprocesses = list()
    for n in range(args.num_processes):
        logfile = f"{logs_dir}/log_{n}"
        # pin each batched-bench instance to its own slice of threads
        cmd = ["numactl", f"--physcpubind={gen_threads_config(args.num_threads, n)}",
               "/llm/batched-bench", args.model, str(args.kv_cache), "2048", "512", "0", "0", "0", str(args.prompt_size), str(TOKENS),
               str(args.batch_size), str(args.num_threads)]
        log = open(logfile, "wb")
        current_subprocesses.append(
            subprocess.Popen(cmd, stdout=log, stderr=log))
    start = time.time()
    if any(p.wait() != 0 for p in current_subprocesses):
        print("FAIL: At least one process returned exit code other than 0 or died!")
        sys.exit(1)
    finish = time.time()
    summarize_results(logs_dir, args, start, finish)


if __name__ == "__main__":
    main()
