Skip to content

Commit 0c31b38

Browse files
authored
Merge pull request #11 from davidz-ampere/main
Adjust benchmark scripts for 2P (dual-socket NUMA) runs: per-node containers, NUMA memory placement, and a 2P driver script.
2 parents 9d153ec + 922ff5e commit 0c31b38

File tree

3 files changed

+63
-12
lines changed

3 files changed

+63
-12
lines changed

benchmarks/run.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,18 +11,18 @@ def get_file_dir():
1111
return os.path.dirname(os.path.realpath(__file__))
1212

1313

14-
def docker_init():
14+
def docker_init(node):
1515
tag = "amperecomputingai/llama.cpp:2.0.0"
1616
if subprocess.run(
1717
["docker", "pull", tag]).returncode != 0:
1818
print("Docker pull process failed!")
1919
sys.exit(1)
20-
container_name = "llama_benchmark"
20+
container_name = f"llama_benchmark_n{node}"
2121
subprocess.run(["docker", "rm", "-f", container_name])
2222
memory = (psutil.virtual_memory().total >> 30) - 30 # leave 30GB for OS
2323
assert memory > 10, "less than 10GB of memory available on the system for llama.cpp"
2424
if subprocess.run(
25-
["docker", "run", "--privileged=true", "--name", container_name, "-d", "-m", f"{str(memory)}g", "-v",
25+
["docker", "run", "--privileged=true", "--cpuset-mems", f"{str(node)}", "--name", container_name, "-d", "-m", f"{str(memory)}g", "-v",
2626
f"{get_file_dir()}:/runner", "--entrypoint", "/bin/bash", "-it", tag]).returncode != 0:
2727
print("Docker run process failed!")
2828
sys.exit(1)
@@ -106,12 +106,16 @@ def parse_args():
106106
parser.add_argument("--timeout",
107107
type=float, default=900,
108108
help="timeout to apply per single benchmark case")
109+
parser.add_argument("-n", "--numa",
110+
type=int, default=0,
111+
help="numa mode of the docker container")
112+
109113
return parser.parse_args()
110114

111115

112116
def main():
    """Entry point: parse CLI options, start the benchmark container on the
    requested NUMA node, then run the benchmark inside it."""
    options = parse_args()
    # docker_init returns the container handle/name for the chosen NUMA node;
    # benchmark drives the actual runs inside that container.
    container = docker_init(options.numa)
    benchmark(container, options)
115119

116120

117121
if __name__ == "__main__":

benchmarks/run_2p.sh

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
#!/usr/bin/env bash
# Run the llama.cpp benchmark on a 2P (dual-socket) system: tune kernel
# VM/NUMA settings, warm up each NUMA node, then run one benchmark process
# per node in parallel (cores 0-79 on node 0, cores 80-159 on node 1).
set -e

sync
echo 3 | sudo tee /proc/sys/vm/drop_caches       # flush page cache so runs start cold
echo 1 | sudo tee /proc/sys/vm/swappiness        # avoid swapping benchmark memory
echo 8 | sudo tee /proc/sys/vm/dirty_ratio
echo 1 | sudo tee /proc/sys/vm/zone_reclaim_mode # prefer reclaiming local-node memory
echo 0 | sudo tee /proc/sys/kernel/numa_balancing # disable auto page migration; placement is explicit

# Pick a transparent-hugepage policy matching the base page size.
# Quote the variable: if getconf failed and it were empty, an unquoted
# expansion would make `[ = 4096 ]` a test syntax error under `set -e`.
VAR_PAGESIZE=$(getconf PAGESIZE)
if [ "$VAR_PAGESIZE" = 4096 ]; then
    echo always | sudo tee /sys/kernel/mm/transparent_hugepage/enabled
elif [ "$VAR_PAGESIZE" = 65536 ]; then
    echo madvise | sudo tee /sys/kernel/mm/transparent_hugepage/enabled
fi

# Warm up
python3 run.py -m DeepSeek-R1-Distill-Qwen-7B-Q8R16_n0.gguf -t 80 -b 1 -p 512 -r 0-79 -n 0
python3 run.py -m DeepSeek-R1-Distill-Qwen-7B-Q8R16_n1.gguf -t 80 -b 1 -p 512 -r 80-159 -n 1

# Run
python3 run.py -m DeepSeek-R1-Distill-Qwen-7B-Q8R16_n0.gguf -t 80 64 48 40 32 24 20 16 12 10 8 -b 1 2 4 8 -p 512 -r 0-79 -n 0 &
python3 run.py -m DeepSeek-R1-Distill-Qwen-7B-Q8R16_n1.gguf -t 80 64 48 40 32 24 20 16 12 10 8 -b 1 2 4 8 -p 512 -r 80-159 -n 1 &
wait

benchmarks/utils/benchmark.py

Lines changed: 31 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,9 @@ def parse_args():
3535
parser.add_argument("-t", "--num_threads",
3636
type=int, default=1,
3737
help="number of threads to use per process")
38+
parser.add_argument("--mp",
39+
type=str, default="local",
40+
help="memory placement policy, 'local','interleave' or 'none'")
3841
return parser.parse_args()
3942

4043

@@ -93,8 +96,9 @@ def summarize_results(logs_dir, args, start, finish):
9396
["n_proc", "n_threads", "batch_size", "prompt_size", "output_tokens", "pp_throughput_tps",
9497
"pp_avg_latency_sec", "tg_throughput_tps", "tg_avg_latency_sec", "pp+tg_throughput_tps", "concurrency", "start", "finish"])
9598
writer.writerow(
96-
[args.num_processes, args.num_threads, args.batch_size, args.prompt_size, TOKENS, pp_throughput,
97-
avg_pp_latency, tg_throughput, avg_tg_latency, avg_total_speed, args.batch_size * args.num_processes, start, finish])
99+
[args.num_processes, args.num_threads, args.batch_size, args.prompt_size, TOKENS, f"{pp_throughput:.3f}",
100+
f"{avg_pp_latency:.3f}", f"{tg_throughput:.3f}", f"{avg_tg_latency:.3f}", f"{avg_total_speed:.3f}", args.batch_size * args.num_processes, f"{start:.3f}", f"{finish:.3f}"])
101+
98102
print(f"Result saved in {results_filename}")
99103

100104

@@ -114,21 +118,40 @@ def main():
114118
logs_dir = os.path.join("/tmp", str(uuid.uuid4()))
115119
os.mkdir(logs_dir)
116120
current_subprocesses = list()
121+
if args.mp == "local":
122+
mem_place = "--localalloc"
123+
elif args.mp == "interleave":
124+
mem_place = "--interleave=all"
125+
else:
126+
mem_place = "none"
127+
117128
for n in range(args.num_processes):
118129
logfile = f"{logs_dir}/log_{n}"
119130
if os.path.exists("/llm/batched-bench"):
120131
# command-line for v1
121-
cmd = ["numactl", f"--physcpubind={gen_threads_config(args.num_threads, n)}",
122-
"/llm/batched-bench", args.model, str(args.kv_cache), "2048", "512", "0", "0", "0", str(args.prompt_size), str(TOKENS),
123-
str(args.batch_size), str(args.num_threads)]
132+
if mem_place == "none":
133+
cmd = ["numactl", f"--physcpubind={gen_threads_config(args.num_threads, n)}",
134+
"/llm/batched-bench", args.model, str(args.kv_cache), "2048", "512", "0", "0", "0", str(args.prompt_size), str(TOKENS),
135+
str(args.batch_size), str(args.num_threads)]
136+
else:
137+
cmd = ["numactl", f"--physcpubind={gen_threads_config(args.num_threads, n)}", str(mem_place),
138+
"/llm/batched-bench", args.model, str(args.kv_cache), "2048", "512", "0", "0", "0", str(args.prompt_size), str(TOKENS),
139+
str(args.batch_size), str(args.num_threads)]
124140
elif os.path.exists("/llm/llama-batched-bench"):
125141
# command-line for v2
126-
cmd = ["numactl", f"--physcpubind={gen_threads_config(args.num_threads, n)}",
127-
"/llm/llama-batched-bench", "-m", args.model, "-c", str(args.kv_cache), "-b", "2048", "-ub", "512", "-npp", str(args.prompt_size), "-ntg", str(TOKENS),
128-
"-npl", str(args.batch_size), "-t", str(args.num_threads), "-tb", str(args.num_threads), "-td", str(args.num_threads)]
142+
if mem_place == "none":
143+
cmd = ["numactl", f"--physcpubind={gen_threads_config(args.num_threads, n)}",
144+
"/llm/llama-batched-bench", "-m", args.model, "-c", str(args.kv_cache), "-b", "2048", "-ub", "512", "-npp", str(args.prompt_size), "-ntg", str(TOKENS),
145+
"-npl", str(args.batch_size), "-t", str(args.num_threads), "-tb", str(args.num_threads), "-td", str(args.num_threads)]
146+
else:
147+
cmd = ["numactl", f"--physcpubind={gen_threads_config(args.num_threads, n)}",str(mem_place),
148+
"/llm/llama-batched-bench", "-m", args.model, "-c", str(args.kv_cache), "-b", "2048", "-ub", "512", "-npp", str(args.prompt_size), "-ntg", str(TOKENS),
149+
"-npl", str(args.batch_size), "-t", str(args.num_threads), "-tb", str(args.num_threads), "-td", str(args.num_threads)]
150+
129151
else:
130152
print("FAIL: batched-bench not found!")
131153
sys.exit(1)
154+
132155
current_subprocesses.append(
133156
subprocess.Popen(cmd, stdout=open(logfile, 'wb'), stderr=open(logfile, 'wb')))
134157
start = time.time()

0 commit comments

Comments
 (0)