Skip to content
This repository was archived by the owner on Oct 9, 2024. It is now read-only.

Commit 4a316ba

Browse files
authored
Support for deepspeed int8 model (#36)
1 parent 556ccac commit 4a316ba

File tree

1 file changed

+11
-0
lines changed

1 file changed

+11
-0
lines changed

Makefile

Lines changed: 11 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -34,6 +34,17 @@ bloomz-176b:
3434
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
3535
gunicorn -t 0 -w 1 -b 127.0.0.1:5000 inference_server.server:app --access-logfile - --access-logformat '%(h)s %(t)s "%(r)s" %(s)s %(b)s'
3636

37+
bloomz-176b-int8:
38+
TOKENIZERS_PARALLELISM=false \
39+
MODEL_NAME=microsoft/bloom-deepspeed-inference-int8 \
40+
MODEL_CLASS=AutoModelForCausalLM \
41+
DEPLOYMENT_FRAMEWORK=ds_inference \
42+
DTYPE=int8 \
43+
MAX_INPUT_LENGTH=2048 \
44+
MAX_BATCH_SIZE=4 \
45+
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
46+
gunicorn -t 0 -w 1 -b 127.0.0.1:5000 inference_server.server:app --access-logfile - --access-logformat '%(h)s %(t)s "%(r)s" %(s)s %(b)s'
47+
3748
bloom-560m:
3849
TOKENIZERS_PARALLELISM=false \
3950
MODEL_NAME=bigscience/bloom-560m \

0 commit comments

Comments (0)