Skip to content
This repository was archived by the owner on Oct 9, 2024. It is now read-only.

Commit 4a316ba

Browse files
authored
Support for deepspeed int8 model (#36)
1 parent 556ccac commit 4a316ba

File tree

1 file changed

+11
-0
lines changed

1 file changed

+11
-0
lines changed

Makefile

Lines changed: 11 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -34,6 +34,17 @@ bloomz-176b:
3434
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
3535
gunicorn -t 0 -w 1 -b 127.0.0.1:5000 inference_server.server:app --access-logfile - --access-logformat '%(h)s %(t)s "%(r)s" %(s)s %(b)s'
3636

37+
bloomz-176b-int8:
38+
TOKENIZERS_PARALLELISM=false \
39+
MODEL_NAME=microsoft/bloom-deepspeed-inference-int8 \
40+
MODEL_CLASS=AutoModelForCausalLM \
41+
DEPLOYMENT_FRAMEWORK=ds_inference \
42+
DTYPE=int8 \
43+
MAX_INPUT_LENGTH=2048 \
44+
MAX_BATCH_SIZE=4 \
45+
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
46+
gunicorn -t 0 -w 1 -b 127.0.0.1:5000 inference_server.server:app --access-logfile - --access-logformat '%(h)s %(t)s "%(r)s" %(s)s %(b)s'
47+
3748
bloom-560m:
3849
TOKENIZERS_PARALLELISM=false \
3950
MODEL_NAME=bigscience/bloom-560m \

0 commit comments

Comments (0)