
Commit 27ed848

Release of Gaudi Backend for TGI (#3091)
* feat(gaudi): release ready (docs, docker image and vlm ready)
* fix(gaudi): add default argument for the dockerfile
* fix(gaudi): remove use of latest for gaudi docker image + redid gaudi benchmarking section to include best practices
1 parent 83ef364 commit 27ed848

File tree: 10 files changed (+635, -12 lines)


.github/workflows/build.yaml

Lines changed: 9 additions & 0 deletions

```diff
@@ -124,6 +124,15 @@ jobs:
             export extra_pytest="--neuron"
             export target=""
             ;;
+          gaudi)
+            export dockerfile="Dockerfile_gaudi"
+            export label_extension="-gaudi"
+            export docker_volume="/mnt/cache"
+            export docker_devices=""
+            export runs_on="ubuntu-latest"
+            export platform=""
+            export extra_pytest=""
+            export target=""
         esac
         echo $dockerfile
         echo "Dockerfile=${dockerfile}"
```

.github/workflows/ci_build.yaml

Lines changed: 2 additions & 1 deletion

```diff
@@ -21,6 +21,7 @@ on:
       - "Dockerfile_amd"
       - "Dockerfile_intel"
       - "Dockerfile.neuron"
+      - "Dockerfile_gaudi"
     branches:
       - "main"
   workflow_dispatch:
@@ -38,7 +39,7 @@ jobs:
       # fail-fast is true by default
       fail-fast: false
       matrix:
-        hardware: ["cuda", "cuda-trtllm", "rocm", "intel-xpu", "intel-cpu", "neuron"]
+        hardware: ["cuda", "cuda-trtllm", "rocm", "intel-xpu", "intel-cpu", "neuron", "gaudi"]
     uses: ./.github/workflows/build.yaml # calls the one above ^
     permissions:
       contents: write
```

.gitignore

Lines changed: 1 addition & 0 deletions

```diff
@@ -28,3 +28,4 @@ server/fbgemmm
 hl-smi_log*.txt
 .graph_dumps
 out
+hqt_output
```

Dockerfile_gaudi

Lines changed: 2 additions & 2 deletions

```diff
@@ -1,6 +1,6 @@
 # Those arguments are required to build the image
-ARG HABANA_VERSION
-ARG PYTORCH_VERSION
+ARG HABANA_VERSION=1.19.0
+ARG PYTORCH_VERSION=2.5.1

 # Rust builder
 FROM lukemathwalker/cargo-chef:latest-rust-1.85.0 AS chef
```
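
With these defaults baked in, the image can now be built without passing any build arguments. A minimal sketch of both invocations (the `tgi-gaudi:local` tag is arbitrary):

```bash
# Build with the new defaults (HABANA_VERSION=1.19.0, PYTORCH_VERSION=2.5.1)
docker build -f Dockerfile_gaudi -t tgi-gaudi:local .

# The previous behaviour is still available by overriding the build arguments explicitly
docker build -f Dockerfile_gaudi \
    --build-arg HABANA_VERSION=1.19.0 \
    --build-arg PYTORCH_VERSION=2.5.1 \
    -t tgi-gaudi:local .
```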

Makefile

Lines changed: 3 additions & 0 deletions

```diff
@@ -53,3 +53,6 @@ run-falcon-7b-instruct-quantize:

 clean:
 	rm -rf target aml
+
+preview_doc:
+	doc-builder preview text-generation-inference docs/source --not_python_module
```
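
The new `preview_doc` target assumes the `doc-builder` CLI is available locally. A minimal sketch of using it, assuming the CLI comes from the `hf-doc-builder` package (preview mode may need extra dependencies such as `watchdog`):

```bash
pip install hf-doc-builder   # provides the doc-builder CLI
make preview_doc             # serves the docs/source content locally for preview
```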
Lines changed: 283 additions & 0 deletions (new file)

# Examples of Docker Commands for Gaudi Backend

This page lists example `docker run` commands for some of the most popular models.

> **Note:** The parameters below are tuned to maximize performance on Gaudi2 hardware; adjust them for your own hardware. For example, on Gaudi3 you may want to increase the batch size.

## Default Precision (BF16)

### Llama3.1-8B on 1 card (BF16)

```bash
model=meta-llama/Meta-Llama-3.1-8B-Instruct
hf_token=YOUR_ACCESS_TOKEN
volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run

docker run -p 8080:80 \
   --runtime=habana \
   --cap-add=sys_nice \
   --ipc=host \
   -v $volume:/data \
   -e HF_TOKEN=$hf_token \
   -e MAX_TOTAL_TOKENS=2048 \
   -e PREFILL_BATCH_BUCKET_SIZE=2 \
   -e BATCH_BUCKET_SIZE=32 \
   -e PAD_SEQUENCE_TO_MULTIPLE_OF=256 \
   ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
   --model-id $model \
   --max-input-tokens 1024 --max-total-tokens 2048 \
   --max-batch-prefill-tokens 2048 --max-batch-size 32 \
   --max-waiting-tokens 7 --waiting-served-ratio 1.2 --max-concurrent-requests 64
```
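
Once the server reports it is ready, you can smoke-test the deployment from the host against TGI's standard `/generate` route (the prompt and `max_new_tokens` value below are arbitrary):

```bash
curl 127.0.0.1:8080/generate \
    -X POST \
    -H 'Content-Type: application/json' \
    -d '{"inputs": "What is deep learning?", "parameters": {"max_new_tokens": 32}}'
```
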
### Llama3.1-70B on 8 cards (BF16)

```bash
model=meta-llama/Meta-Llama-3.1-70B-Instruct
hf_token=YOUR_ACCESS_TOKEN
volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run

docker run -p 8080:80 \
   --runtime=habana \
   --cap-add=sys_nice \
   --ipc=host \
   -v $volume:/data \
   -e HF_TOKEN=$hf_token \
   -e MAX_TOTAL_TOKENS=2048 \
   -e BATCH_BUCKET_SIZE=256 \
   -e PREFILL_BATCH_BUCKET_SIZE=4 \
   -e PAD_SEQUENCE_TO_MULTIPLE_OF=64 \
   ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
   --model-id $model \
   --sharded true --num-shard 8 \
   --max-input-tokens 1024 --max-total-tokens 2048 \
   --max-batch-prefill-tokens 4096 --max-batch-size 256 \
   --max-waiting-tokens 7 --waiting-served-ratio 1.2 --max-concurrent-requests 512
```

### Llama2-7B on 1 card (BF16)

```bash
model=meta-llama/Llama-2-7b-chat-hf
hf_token=YOUR_ACCESS_TOKEN
volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run

docker run -p 8080:80 \
   --runtime=habana \
   --cap-add=sys_nice \
   --ipc=host \
   -v $volume:/data \
   -e HF_TOKEN=$hf_token \
   -e MAX_TOTAL_TOKENS=2048 \
   -e PREFILL_BATCH_BUCKET_SIZE=2 \
   -e BATCH_BUCKET_SIZE=32 \
   -e PAD_SEQUENCE_TO_MULTIPLE_OF=256 \
   ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
   --model-id $model \
   --max-input-tokens 1024 --max-total-tokens 2048 \
   --max-batch-prefill-tokens 2048 --max-batch-size 32 \
   --max-waiting-tokens 7 --waiting-served-ratio 1.2 --max-concurrent-requests 64
```

### Llama2-70B on 8 cards (BF16)

```bash
model=meta-llama/Llama-2-70b-chat-hf
hf_token=YOUR_ACCESS_TOKEN
volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run

docker run -p 8080:80 \
   --runtime=habana \
   --cap-add=sys_nice \
   --ipc=host \
   -v $volume:/data \
   -e HF_TOKEN=$hf_token \
   -e MAX_TOTAL_TOKENS=2048 \
   -e BATCH_BUCKET_SIZE=256 \
   -e PREFILL_BATCH_BUCKET_SIZE=4 \
   -e PAD_SEQUENCE_TO_MULTIPLE_OF=64 \
   ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
   --model-id $model \
   --sharded true --num-shard 8 \
   --max-input-tokens 1024 --max-total-tokens 2048 \
   --max-batch-prefill-tokens 4096 --max-batch-size 256 \
   --max-waiting-tokens 7 --waiting-served-ratio 1.2 --max-concurrent-requests 512
```

### Llava-v1.6-Mistral-7B on 1 card (BF16)

```bash
model=llava-hf/llava-v1.6-mistral-7b-hf
volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run

docker run -p 8080:80 \
   --runtime=habana \
   --cap-add=sys_nice \
   --ipc=host \
   -v $volume:/data \
   -e PREFILL_BATCH_BUCKET_SIZE=1 \
   -e BATCH_BUCKET_SIZE=1 \
   ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
   --model-id $model \
   --max-input-tokens 4096 --max-batch-prefill-tokens 16384 \
   --max-total-tokens 8192 --max-batch-size 4
```

## FP8 Precision

You need to measure the model's statistics before running it in FP8 precision. Please refer to the [FP8 Precision](https://huggingface.co/docs/text-generation-inference/backends/gaudi_new#how-to-use-different-precision-formats) section for more details.
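
As a rough sketch of that calibration step (the measurement config name `maxabs_measure.json` and the mounted paths are assumptions based on the quantization examples in this repository; follow the linked FP8 documentation for the exact procedure), you would launch the container once with a measurement `QUANT_CONFIG`, send representative traffic so the statistics land in `hqt_output/`, and then relaunch with `maxabs_quant.json` as in the commands below. It reuses the same `$model`, `$hf_token`, and `$volume` shell variables:

```bash
# Hypothetical measurement pass: config file name and paths are assumptions
docker run -p 8080:80 \
   --runtime=habana \
   --cap-add=sys_nice \
   --ipc=host \
   -v $volume:/data \
   -v $PWD/quantization_config:/usr/src/quantization_config \
   -v $PWD/hqt_output:/usr/src/hqt_output \
   -e QUANT_CONFIG=./quantization_config/maxabs_measure.json \
   -e HF_TOKEN=$hf_token \
   ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
   --model-id $model \
   --max-input-tokens 1024 --max-total-tokens 2048
# ...then send a batch of representative requests; the collected statistics are
# written to hqt_output/ and reused by the FP8 commands below.
```
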
### Llama3.1-8B on 1 card (FP8)

```bash
model=meta-llama/Meta-Llama-3.1-8B-Instruct
hf_token=YOUR_ACCESS_TOKEN
volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run

docker run -p 8080:80 \
   --runtime=habana \
   --cap-add=sys_nice \
   --ipc=host \
   -v $volume:/data \
   -v $PWD/quantization_config:/usr/src/quantization_config \
   -v $PWD/hqt_output:/usr/src/hqt_output \
   -e QUANT_CONFIG=./quantization_config/maxabs_quant.json \
   -e HF_TOKEN=$hf_token \
   -e MAX_TOTAL_TOKENS=2048 \
   -e PREFILL_BATCH_BUCKET_SIZE=2 \
   -e BATCH_BUCKET_SIZE=32 \
   -e PAD_SEQUENCE_TO_MULTIPLE_OF=256 \
   ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
   --model-id $model \
   --max-input-tokens 1024 --max-total-tokens 2048 \
   --max-batch-prefill-tokens 2048 --max-batch-size 32 \
   --max-waiting-tokens 7 --waiting-served-ratio 1.2 --max-concurrent-requests 64
```

### Llama3.1-70B on 8 cards (FP8)

```bash
model=meta-llama/Meta-Llama-3.1-70B-Instruct
hf_token=YOUR_ACCESS_TOKEN
volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run

docker run -p 8080:80 \
   --runtime=habana \
   --cap-add=sys_nice \
   --ipc=host \
   -v $volume:/data \
   -v $PWD/quantization_config:/usr/src/quantization_config \
   -v $PWD/hqt_output:/usr/src/hqt_output \
   -e QUANT_CONFIG=./quantization_config/maxabs_quant.json \
   -e HF_TOKEN=$hf_token \
   -e MAX_TOTAL_TOKENS=2048 \
   -e BATCH_BUCKET_SIZE=256 \
   -e PREFILL_BATCH_BUCKET_SIZE=4 \
   -e PAD_SEQUENCE_TO_MULTIPLE_OF=64 \
   ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
   --model-id $model \
   --sharded true --num-shard 8 \
   --max-input-tokens 1024 --max-total-tokens 2048 \
   --max-batch-prefill-tokens 4096 --max-batch-size 256 \
   --max-waiting-tokens 7 --waiting-served-ratio 1.2 --max-concurrent-requests 512
```

### Llama2-7B on 1 card (FP8)

```bash
model=meta-llama/Llama-2-7b-chat-hf
hf_token=YOUR_ACCESS_TOKEN
volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run

docker run -p 8080:80 \
   --runtime=habana \
   --cap-add=sys_nice \
   --ipc=host \
   -v $volume:/data \
   -v $PWD/quantization_config:/usr/src/quantization_config \
   -v $PWD/hqt_output:/usr/src/hqt_output \
   -e QUANT_CONFIG=./quantization_config/maxabs_quant.json \
   -e HF_TOKEN=$hf_token \
   -e MAX_TOTAL_TOKENS=2048 \
   -e PREFILL_BATCH_BUCKET_SIZE=2 \
   -e BATCH_BUCKET_SIZE=32 \
   -e PAD_SEQUENCE_TO_MULTIPLE_OF=256 \
   ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
   --model-id $model \
   --max-input-tokens 1024 --max-total-tokens 2048 \
   --max-batch-prefill-tokens 2048 --max-batch-size 32 \
   --max-waiting-tokens 7 --waiting-served-ratio 1.2 --max-concurrent-requests 64
```

### Llama2-70B on 8 cards (FP8)

```bash
model=meta-llama/Llama-2-70b-chat-hf
hf_token=YOUR_ACCESS_TOKEN
volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run

docker run -p 8080:80 \
   --runtime=habana \
   --cap-add=sys_nice \
   --ipc=host \
   -v $volume:/data \
   -v $PWD/quantization_config:/usr/src/quantization_config \
   -v $PWD/hqt_output:/usr/src/hqt_output \
   -e QUANT_CONFIG=./quantization_config/maxabs_quant.json \
   -e HF_TOKEN=$hf_token \
   -e MAX_TOTAL_TOKENS=2048 \
   -e BATCH_BUCKET_SIZE=256 \
   -e PREFILL_BATCH_BUCKET_SIZE=4 \
   -e PAD_SEQUENCE_TO_MULTIPLE_OF=64 \
   ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
   --model-id $model \
   --sharded true --num-shard 8 \
   --max-input-tokens 1024 --max-total-tokens 2048 \
   --max-batch-prefill-tokens 4096 --max-batch-size 256 \
   --max-waiting-tokens 7 --waiting-served-ratio 1.2 --max-concurrent-requests 512
```

### Llava-v1.6-Mistral-7B on 1 card (FP8)

```bash
model=llava-hf/llava-v1.6-mistral-7b-hf
volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run

docker run -p 8080:80 \
   --runtime=habana \
   --cap-add=sys_nice \
   --ipc=host \
   -v $volume:/data \
   -v $PWD/quantization_config:/usr/src/quantization_config \
   -v $PWD/hqt_output:/usr/src/hqt_output \
   -e QUANT_CONFIG=./quantization_config/maxabs_quant.json \
   -e PREFILL_BATCH_BUCKET_SIZE=1 \
   -e BATCH_BUCKET_SIZE=1 \
   ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
   --model-id $model \
   --max-input-tokens 4096 --max-batch-prefill-tokens 16384 \
   --max-total-tokens 8192 --max-batch-size 4
```

### Llava-v1.6-Mistral-7B on 8 cards (FP8)

```bash
model=llava-hf/llava-v1.6-mistral-7b-hf
volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run

docker run -p 8080:80 \
   --runtime=habana \
   --cap-add=sys_nice \
   --ipc=host \
   -v $volume:/data \
   -v $PWD/quantization_config:/usr/src/quantization_config \
   -v $PWD/hqt_output:/usr/src/hqt_output \
   -e QUANT_CONFIG=./quantization_config/maxabs_quant.json \
   -e PREFILL_BATCH_BUCKET_SIZE=1 \
   -e BATCH_BUCKET_SIZE=1 \
   ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
   --model-id $model \
   --sharded true --num-shard 8 \
   --max-input-tokens 4096 --max-batch-prefill-tokens 16384 \
   --max-total-tokens 8192 --max-batch-size 4
```
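
Since Llava is a vision-language model, requests typically embed an image inline in the prompt. A sketch of a multimodal request using TGI's markdown image syntax (the image URL and prompt are only illustrative):

```bash
curl 127.0.0.1:8080/generate \
    -X POST \
    -H 'Content-Type: application/json' \
    -d '{"inputs": "![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png)What is this a picture of?\n\n", "parameters": {"max_new_tokens": 32}}'
```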
