4 files changed, +24 -7 lines changed

+# [Code Llama](https://huggingface.co/collections/meta-llama/code-llama-family-661da32d0a9d678b6f55b933)
+
+| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
+|:----------:|:----------:|:----------:|:----------:|
+| [**`7b-hf`**](https://huggingface.co/meta-llama/CodeLlama-7b-hf) | 1x a40 | - tokens/s | - tokens/s |
+| [`7b-Instruct-hf`](https://huggingface.co/meta-llama/CodeLlama-7b-Instruct-hf) | 1x a40 | - tokens/s | - tokens/s |
+| [`13b-hf`](https://huggingface.co/meta-llama/CodeLlama-13b-hf) | 1x a40 | - tokens/s | - tokens/s |
+| [`13b-Instruct-hf`](https://huggingface.co/meta-llama/CodeLlama-13b-Instruct-hf) | 1x a40 | - tokens/s | - tokens/s |
+| [`34b-hf`](https://huggingface.co/meta-llama/CodeLlama-34b-hf) | 2x a40 | - tokens/s | - tokens/s |
+| [`34b-Instruct-hf`](https://huggingface.co/meta-llama/CodeLlama-34b-Instruct-hf) | 2x a40 | - tokens/s | - tokens/s |
+| [`70b-hf`](https://huggingface.co/meta-llama/CodeLlama-70b-hf) | 4x a40 | - tokens/s | - tokens/s |
+| [`70b-Instruct-hf`](https://huggingface.co/meta-llama/CodeLlama-70b-Instruct-hf) | 4x a40 | - tokens/s | - tokens/s |
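
The "Suggested resource allocation" column corresponds to the launch variables introduced in the new config file below. As a minimal sketch, not part of this diff (the variable names are taken from that config; the specific values are an assumption for the 2x a40 variants), overriding the defaults for a larger Code Llama variant might look like:

```bash
# Sketch only: assumes the same variables as the default config below are honoured.
export MODEL_NAME="CodeLlama"
export MODEL_VARIANT="34b-Instruct-hf"   # the 34b variants suggest 2x a40 in the table
export NUM_NODES=1                       # two a40s fit on a single node
export NUM_GPUS=2                        # matches the "2x a40" suggestion
export VLLM_MAX_LOGPROBS=32000
```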

+export MODEL_NAME="CodeLlama"
+export MODEL_VARIANT="7b-hf"
+export NUM_NODES=1
+export NUM_GPUS=1
+export VLLM_MAX_LOGPROBS=32000
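
For context, a minimal sketch of how a Slurm batch script might consume these variables and pass them to vLLM's OpenAI-compatible server. The script layout, weights path, and port are hypothetical, and mapping `VLLM_MAX_LOGPROBS` onto vLLM's `--max-logprobs` argument is an assumption; `--model` and `--tensor-parallel-size` are standard vLLM server arguments.

```bash
#!/bin/bash
#SBATCH --nodes=1
#SBATCH --gres=gpu:a40:1        # keep in sync with NUM_NODES / NUM_GPUS above
#SBATCH --time=08:00:00

# Hypothetical: pull in the per-model variables shown above.
source config.sh

# Hypothetical weights location; substitute the actual path on your cluster.
MODEL_PATH="/model-weights/${MODEL_NAME}-${MODEL_VARIANT}"

# Launch vLLM's OpenAI-compatible server; tensor parallelism spans the allocated GPUs.
python -m vllm.entrypoints.openai.api_server \
    --model "$MODEL_PATH" \
    --tensor-parallel-size "$NUM_GPUS" \
    --max-logprobs "$VLLM_MAX_LOGPROBS" \
    --port 8080
```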

| Variant | Suggested resource allocation |
|:----------:|:----------:|
-| [**`7b`**](https://huggingface.co/meta-llama/Llama-2-7b-hf) | 1x a40 |
-| [`7b-chat`](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) | 1x a40 |
-| [`13b`](https://huggingface.co/meta-llama/Llama-2-13b-hf) | 1x a40 |
-| [`13b-chat`](https://huggingface.co/meta-llama/Llama-2-13b-chat-hf) | 1x a40 |
-| [`70b`](https://huggingface.co/meta-llama/Llama-2-70b-hf) | 4x a40 |
-| [`70b-chat`](https://huggingface.co/meta-llama/Llama-2-70b-chat-hf) | 4x a40 |
+| [**`7b-hf`**](https://huggingface.co/meta-llama/Llama-2-7b-hf) | 1x a40 |
+| [`7b-chat-hf`](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) | 1x a40 |
+| [`13b-hf`](https://huggingface.co/meta-llama/Llama-2-13b-hf) | 1x a40 |
+| [`13b-chat-hf`](https://huggingface.co/meta-llama/Llama-2-13b-chat-hf) | 1x a40 |
+| [`70b-hf`](https://huggingface.co/meta-llama/Llama-2-70b-hf) | 4x a40 |
+| [`70b-chat-hf`](https://huggingface.co/meta-llama/Llama-2-70b-chat-hf) | 4x a40 |

[tool.poetry]
name = "vector-inference"
-version = "0.2.0"
+version = "0.2.1"
description = "Efficient LLM inference on Slurm clusters using vLLM."
authors = ["XkunW <marshall.wang@vectorinstitute.ai>"]
license = "MIT license"