
Commit 27ed848

Release of Gaudi Backend for TGI (#3091)
* feat(gaudi): release ready (docs, docker image and vlm ready)
* fix(gaudi): add default argument for the dockerfile
* fix(gaudi): remove use of latest for gaudi docker image + redid gaudi benchmarking section to include best practices
1 parent 83ef364 commit 27ed848

File tree: 10 files changed (+635, -12 lines)


.github/workflows/build.yaml

Lines changed: 9 additions & 0 deletions

```diff
@@ -124,6 +124,15 @@ jobs:
             export extra_pytest="--neuron"
             export target=""
             ;;
+          gaudi)
+            export dockerfile="Dockerfile_gaudi"
+            export label_extension="-gaudi"
+            export docker_volume="/mnt/cache"
+            export docker_devices=""
+            export runs_on="ubuntu-latest"
+            export platform=""
+            export extra_pytest=""
+            export target=""
         esac
         echo $dockerfile
         echo "Dockerfile=${dockerfile}"
```

.github/workflows/ci_build.yaml

Lines changed: 2 additions & 1 deletion

```diff
@@ -21,6 +21,7 @@ on:
       - "Dockerfile_amd"
       - "Dockerfile_intel"
       - "Dockerfile.neuron"
+      - "Dockerfile_gaudi"
     branches:
       - "main"
   workflow_dispatch:
@@ -38,7 +39,7 @@ jobs:
       # fail-fast is true by default
       fail-fast: false
       matrix:
-        hardware: ["cuda", "cuda-trtllm", "rocm", "intel-xpu", "intel-cpu", "neuron"]
+        hardware: ["cuda", "cuda-trtllm", "rocm", "intel-xpu", "intel-cpu", "neuron", "gaudi"]
     uses: ./.github/workflows/build.yaml # calls the one above ^
     permissions:
       contents: write
```

.gitignore

Lines changed: 1 addition & 0 deletions

```diff
@@ -28,3 +28,4 @@ server/fbgemmm
 hl-smi_log*.txt
 .graph_dumps
 out
+hqt_output
```

Dockerfile_gaudi

Lines changed: 2 additions & 2 deletions

```diff
@@ -1,6 +1,6 @@
 # Those arguments are required to build the image
-ARG HABANA_VERSION
-ARG PYTORCH_VERSION
+ARG HABANA_VERSION=1.19.0
+ARG PYTORCH_VERSION=2.5.1

 # Rust builder
 FROM lukemathwalker/cargo-chef:latest-rust-1.85.0 AS chef
```
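
With these defaults baked in, the image can now be built without passing any build arguments. A minimal sketch of both invocations (the `tgi-gaudi:local` tag is arbitrary):

```bash
# Build with the new defaults (HABANA_VERSION=1.19.0, PYTORCH_VERSION=2.5.1)
docker build -f Dockerfile_gaudi -t tgi-gaudi:local .

# The previous behaviour is still available by overriding the build arguments explicitly
docker build -f Dockerfile_gaudi \
    --build-arg HABANA_VERSION=1.19.0 \
    --build-arg PYTORCH_VERSION=2.5.1 \
    -t tgi-gaudi:local .
```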

Makefile

Lines changed: 3 additions & 0 deletions

```diff
@@ -53,3 +53,6 @@ run-falcon-7b-instruct-quantize:

 clean:
 	rm -rf target aml
+
+preview_doc:
+	doc-builder preview text-generation-inference docs/source --not_python_module
```
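
The new `preview_doc` target assumes the `doc-builder` CLI is available locally. A minimal sketch of using it, assuming the CLI comes from the `hf-doc-builder` package (preview mode may need extra dependencies such as `watchdog`):

```bash
pip install hf-doc-builder   # provides the doc-builder CLI
make preview_doc             # serves the docs/source content locally for preview
```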
Lines changed: 283 additions & 0 deletions (new file)

# Examples of Docker Commands for Gaudi Backend

This page lists example `docker run` commands for some of the most popular models.

> **Note:** The parameters below are tuned to maximize performance on Gaudi2 hardware; adjust them for your own hardware. For example, on Gaudi3 you may want to increase the batch size.

## Default Precision (BF16)

### Llama3.1-8B on 1 card (BF16)

```bash
model=meta-llama/Meta-Llama-3.1-8B-Instruct
hf_token=YOUR_ACCESS_TOKEN
volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run

docker run -p 8080:80 \
   --runtime=habana \
   --cap-add=sys_nice \
   --ipc=host \
   -v $volume:/data \
   -e HF_TOKEN=$hf_token \
   -e MAX_TOTAL_TOKENS=2048 \
   -e PREFILL_BATCH_BUCKET_SIZE=2 \
   -e BATCH_BUCKET_SIZE=32 \
   -e PAD_SEQUENCE_TO_MULTIPLE_OF=256 \
   ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
   --model-id $model \
   --max-input-tokens 1024 --max-total-tokens 2048 \
   --max-batch-prefill-tokens 2048 --max-batch-size 32 \
   --max-waiting-tokens 7 --waiting-served-ratio 1.2 --max-concurrent-requests 64
```
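
Once the server reports it is ready, you can smoke-test the deployment from the host against TGI's standard `/generate` route (the prompt and `max_new_tokens` value below are arbitrary):

```bash
curl 127.0.0.1:8080/generate \
    -X POST \
    -H 'Content-Type: application/json' \
    -d '{"inputs": "What is deep learning?", "parameters": {"max_new_tokens": 32}}'
```
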
### Llama3.1-70B on 8 cards (BF16)

```bash
model=meta-llama/Meta-Llama-3.1-70B-Instruct
hf_token=YOUR_ACCESS_TOKEN
volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run

docker run -p 8080:80 \
   --runtime=habana \
   --cap-add=sys_nice \
   --ipc=host \
   -v $volume:/data \
   -e HF_TOKEN=$hf_token \
   -e MAX_TOTAL_TOKENS=2048 \
   -e BATCH_BUCKET_SIZE=256 \
   -e PREFILL_BATCH_BUCKET_SIZE=4 \
   -e PAD_SEQUENCE_TO_MULTIPLE_OF=64 \
   ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
   --model-id $model \
   --sharded true --num-shard 8 \
   --max-input-tokens 1024 --max-total-tokens 2048 \
   --max-batch-prefill-tokens 4096 --max-batch-size 256 \
   --max-waiting-tokens 7 --waiting-served-ratio 1.2 --max-concurrent-requests 512
```

### Llama2-7B on 1 card (BF16)

```bash
model=meta-llama/Llama-2-7b-chat-hf
hf_token=YOUR_ACCESS_TOKEN
volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run

docker run -p 8080:80 \
   --runtime=habana \
   --cap-add=sys_nice \
   --ipc=host \
   -v $volume:/data \
   -e HF_TOKEN=$hf_token \
   -e MAX_TOTAL_TOKENS=2048 \
   -e PREFILL_BATCH_BUCKET_SIZE=2 \
   -e BATCH_BUCKET_SIZE=32 \
   -e PAD_SEQUENCE_TO_MULTIPLE_OF=256 \
   ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
   --model-id $model \
   --max-input-tokens 1024 --max-total-tokens 2048 \
   --max-batch-prefill-tokens 2048 --max-batch-size 32 \
   --max-waiting-tokens 7 --waiting-served-ratio 1.2 --max-concurrent-requests 64
```

### Llama2-70B on 8 cards (BF16)

```bash
model=meta-llama/Llama-2-70b-chat-hf
hf_token=YOUR_ACCESS_TOKEN
volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run

docker run -p 8080:80 \
   --runtime=habana \
   --cap-add=sys_nice \
   --ipc=host \
   -v $volume:/data \
   -e HF_TOKEN=$hf_token \
   -e MAX_TOTAL_TOKENS=2048 \
   -e BATCH_BUCKET_SIZE=256 \
   -e PREFILL_BATCH_BUCKET_SIZE=4 \
   -e PAD_SEQUENCE_TO_MULTIPLE_OF=64 \
   ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
   --model-id $model \
   --sharded true --num-shard 8 \
   --max-input-tokens 1024 --max-total-tokens 2048 \
   --max-batch-prefill-tokens 4096 --max-batch-size 256 \
   --max-waiting-tokens 7 --waiting-served-ratio 1.2 --max-concurrent-requests 512
```

### Llava-v1.6-Mistral-7B on 1 card (BF16)

```bash
model=llava-hf/llava-v1.6-mistral-7b-hf
volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run

docker run -p 8080:80 \
   --runtime=habana \
   --cap-add=sys_nice \
   --ipc=host \
   -v $volume:/data \
   -e PREFILL_BATCH_BUCKET_SIZE=1 \
   -e BATCH_BUCKET_SIZE=1 \
   ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
   --model-id $model \
   --max-input-tokens 4096 --max-batch-prefill-tokens 16384 \
   --max-total-tokens 8192 --max-batch-size 4
```

## FP8 Precision

You need to measure the model's statistics before running it in FP8 precision. Please refer to the [FP8 Precision](https://huggingface.co/docs/text-generation-inference/backends/gaudi_new#how-to-use-different-precision-formats) section for more details.
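
As a rough sketch of that calibration step (the measurement config name `maxabs_measure.json` and the mounted paths are assumptions based on the quantization examples in this repository; follow the linked FP8 documentation for the exact procedure), you would launch the container once with a measurement `QUANT_CONFIG`, send representative traffic so the statistics land in `hqt_output/`, and then relaunch with `maxabs_quant.json` as in the commands below. It reuses the same `$model`, `$hf_token`, and `$volume` shell variables:

```bash
# Hypothetical measurement pass: config file name and paths are assumptions
docker run -p 8080:80 \
   --runtime=habana \
   --cap-add=sys_nice \
   --ipc=host \
   -v $volume:/data \
   -v $PWD/quantization_config:/usr/src/quantization_config \
   -v $PWD/hqt_output:/usr/src/hqt_output \
   -e QUANT_CONFIG=./quantization_config/maxabs_measure.json \
   -e HF_TOKEN=$hf_token \
   ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
   --model-id $model \
   --max-input-tokens 1024 --max-total-tokens 2048
# ...then send a batch of representative requests; the collected statistics are
# written to hqt_output/ and reused by the FP8 commands below.
```
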
### Llama3.1-8B on 1 card (FP8)

```bash
model=meta-llama/Meta-Llama-3.1-8B-Instruct
hf_token=YOUR_ACCESS_TOKEN
volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run

docker run -p 8080:80 \
   --runtime=habana \
   --cap-add=sys_nice \
   --ipc=host \
   -v $volume:/data \
   -v $PWD/quantization_config:/usr/src/quantization_config \
   -v $PWD/hqt_output:/usr/src/hqt_output \
   -e QUANT_CONFIG=./quantization_config/maxabs_quant.json \
   -e HF_TOKEN=$hf_token \
   -e MAX_TOTAL_TOKENS=2048 \
   -e PREFILL_BATCH_BUCKET_SIZE=2 \
   -e BATCH_BUCKET_SIZE=32 \
   -e PAD_SEQUENCE_TO_MULTIPLE_OF=256 \
   ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
   --model-id $model \
   --max-input-tokens 1024 --max-total-tokens 2048 \
   --max-batch-prefill-tokens 2048 --max-batch-size 32 \
   --max-waiting-tokens 7 --waiting-served-ratio 1.2 --max-concurrent-requests 64
```

### Llama3.1-70B on 8 cards (FP8)

```bash
model=meta-llama/Meta-Llama-3.1-70B-Instruct
hf_token=YOUR_ACCESS_TOKEN
volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run

docker run -p 8080:80 \
   --runtime=habana \
   --cap-add=sys_nice \
   --ipc=host \
   -v $volume:/data \
   -v $PWD/quantization_config:/usr/src/quantization_config \
   -v $PWD/hqt_output:/usr/src/hqt_output \
   -e QUANT_CONFIG=./quantization_config/maxabs_quant.json \
   -e HF_TOKEN=$hf_token \
   -e MAX_TOTAL_TOKENS=2048 \
   -e BATCH_BUCKET_SIZE=256 \
   -e PREFILL_BATCH_BUCKET_SIZE=4 \
   -e PAD_SEQUENCE_TO_MULTIPLE_OF=64 \
   ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
   --model-id $model \
   --sharded true --num-shard 8 \
   --max-input-tokens 1024 --max-total-tokens 2048 \
   --max-batch-prefill-tokens 4096 --max-batch-size 256 \
   --max-waiting-tokens 7 --waiting-served-ratio 1.2 --max-concurrent-requests 512
```

### Llama2-7B on 1 card (FP8)

```bash
model=meta-llama/Llama-2-7b-chat-hf
hf_token=YOUR_ACCESS_TOKEN
volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run

docker run -p 8080:80 \
   --runtime=habana \
   --cap-add=sys_nice \
   --ipc=host \
   -v $volume:/data \
   -v $PWD/quantization_config:/usr/src/quantization_config \
   -v $PWD/hqt_output:/usr/src/hqt_output \
   -e QUANT_CONFIG=./quantization_config/maxabs_quant.json \
   -e HF_TOKEN=$hf_token \
   -e MAX_TOTAL_TOKENS=2048 \
   -e PREFILL_BATCH_BUCKET_SIZE=2 \
   -e BATCH_BUCKET_SIZE=32 \
   -e PAD_SEQUENCE_TO_MULTIPLE_OF=256 \
   ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
   --model-id $model \
   --max-input-tokens 1024 --max-total-tokens 2048 \
   --max-batch-prefill-tokens 2048 --max-batch-size 32 \
   --max-waiting-tokens 7 --waiting-served-ratio 1.2 --max-concurrent-requests 64
```

### Llama2-70B on 8 cards (FP8)

```bash
model=meta-llama/Llama-2-70b-chat-hf
hf_token=YOUR_ACCESS_TOKEN
volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run

docker run -p 8080:80 \
   --runtime=habana \
   --cap-add=sys_nice \
   --ipc=host \
   -v $volume:/data \
   -v $PWD/quantization_config:/usr/src/quantization_config \
   -v $PWD/hqt_output:/usr/src/hqt_output \
   -e QUANT_CONFIG=./quantization_config/maxabs_quant.json \
   -e HF_TOKEN=$hf_token \
   -e MAX_TOTAL_TOKENS=2048 \
   -e BATCH_BUCKET_SIZE=256 \
   -e PREFILL_BATCH_BUCKET_SIZE=4 \
   -e PAD_SEQUENCE_TO_MULTIPLE_OF=64 \
   ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
   --model-id $model \
   --sharded true --num-shard 8 \
   --max-input-tokens 1024 --max-total-tokens 2048 \
   --max-batch-prefill-tokens 4096 --max-batch-size 256 \
   --max-waiting-tokens 7 --waiting-served-ratio 1.2 --max-concurrent-requests 512
```

### Llava-v1.6-Mistral-7B on 1 card (FP8)

```bash
model=llava-hf/llava-v1.6-mistral-7b-hf
volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run

docker run -p 8080:80 \
   --runtime=habana \
   --cap-add=sys_nice \
   --ipc=host \
   -v $volume:/data \
   -v $PWD/quantization_config:/usr/src/quantization_config \
   -v $PWD/hqt_output:/usr/src/hqt_output \
   -e QUANT_CONFIG=./quantization_config/maxabs_quant.json \
   -e PREFILL_BATCH_BUCKET_SIZE=1 \
   -e BATCH_BUCKET_SIZE=1 \
   ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
   --model-id $model \
   --max-input-tokens 4096 --max-batch-prefill-tokens 16384 \
   --max-total-tokens 8192 --max-batch-size 4
```

### Llava-v1.6-Mistral-7B on 8 cards (FP8)

```bash
model=llava-hf/llava-v1.6-mistral-7b-hf
volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run

docker run -p 8080:80 \
   --runtime=habana \
   --cap-add=sys_nice \
   --ipc=host \
   -v $volume:/data \
   -v $PWD/quantization_config:/usr/src/quantization_config \
   -v $PWD/hqt_output:/usr/src/hqt_output \
   -e QUANT_CONFIG=./quantization_config/maxabs_quant.json \
   -e PREFILL_BATCH_BUCKET_SIZE=1 \
   -e BATCH_BUCKET_SIZE=1 \
   ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
   --model-id $model \
   --sharded true --num-shard 8 \
   --max-input-tokens 4096 --max-batch-prefill-tokens 16384 \
   --max-total-tokens 8192 --max-batch-size 4
```
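
Since Llava is a vision-language model, requests typically embed an image inline in the prompt. A sketch of a multimodal request using TGI's markdown image syntax (the image URL and prompt are only illustrative):

```bash
curl 127.0.0.1:8080/generate \
    -X POST \
    -H 'Content-Type: application/json' \
    -d '{"inputs": "![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png)What is this a picture of?\n\n", "parameters": {"max_new_tokens": 32}}'
```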
