This repository was archived by the owner on Oct 9, 2024. It is now read-only.

Commit 25e19f5

add model_class argument (#29)
1 parent 223481c commit 25e19f5

10 files changed: 30 additions, 13 deletions

Makefile

Lines changed: 7 additions & 1 deletion
@@ -15,6 +15,7 @@ gen-proto:
 bloom-176b:
 	TOKENIZERS_PARALLELISM=false \
 	MODEL_NAME=microsoft/bloom-deepspeed-inference-fp16 \
+	MODEL_CLASS=AutoModelForCausalLM \
 	DEPLOYMENT_FRAMEWORK=ds_inference \
 	DTYPE=fp16 \
 	MAX_INPUT_LENGTH=2048 \
@@ -25,16 +26,18 @@ bloom-176b:
 bloomz-176b:
 	TOKENIZERS_PARALLELISM=false \
 	MODEL_NAME=bigscience/bloomz \
+	MODEL_CLASS=AutoModelForCausalLM \
 	DEPLOYMENT_FRAMEWORK=ds_inference \
 	DTYPE=fp16 \
 	MAX_INPUT_LENGTH=2048 \
 	MAX_BATCH_SIZE=4 \
 	CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
 	gunicorn -t 0 -w 1 -b 127.0.0.1:5000 inference_server.server:app --access-logfile - --access-logformat '%(h)s %(t)s "%(r)s" %(s)s %(b)s'
 
-bloomz-560m:
+bloom-560m:
 	TOKENIZERS_PARALLELISM=false \
 	MODEL_NAME=bigscience/bloom-560m \
+	MODEL_CLASS=AutoModelForCausalLM \
 	DEPLOYMENT_FRAMEWORK=hf_accelerate \
 	DTYPE=bf16 \
 	MAX_INPUT_LENGTH=2048 \
@@ -45,6 +48,7 @@ bloomz-560m:
 flan-t5-xxl:
 	TOKENIZERS_PARALLELISM=false \
 	MODEL_NAME=google/flan-t5-xxl \
+	MODEL_CLASS=AutoModelForSeq2SeqLM
 	DEPLOYMENT_FRAMEWORK=hf_accelerate \
 	DTYPE=fp \
 	MAX_INPUT_LENGTH=2048 \
@@ -55,6 +59,7 @@ flan-t5-xxl:
 ul2:
 	TOKENIZERS_PARALLELISM=false \
 	MODEL_NAME=google/ul2 \
+	MODEL_CLASS=AutoModelForSeq2SeqLM \
 	DEPLOYMENT_FRAMEWORK=hf_accelerate \
 	DTYPE=fp16 \
 	MAX_INPUT_LENGTH=2048 \
@@ -65,6 +70,7 @@ ul2:
 codegen-mono:
 	TOKENIZERS_PARALLELISM=false \
 	MODEL_NAME=Salesforce/codegen-16B-mono \
+	MODEL_CLASS=AutoModelForSeq2SeqLM \
 	DEPLOYMENT_FRAMEWORK=hf_accelerate \
 	DTYPE=fp16 \
 	MAX_INPUT_LENGTH=2048 \
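
Every deployment target now exports MODEL_CLASS alongside MODEL_NAME. The server reads both from the environment and resolves the class name against the transformers namespace (see the server.py and model.py hunks below). A minimal sketch of that lookup, assuming only Python's standard library and an installed transformers package; the example values are illustrative:

    import os
    import transformers

    # values that a target such as bloom-560m would export (illustrative)
    os.environ.setdefault("MODEL_NAME", "bigscience/bloom-560m")
    os.environ.setdefault("MODEL_CLASS", "AutoModelForCausalLM")

    model_name = os.getenv("MODEL_NAME")
    model_class = os.getenv("MODEL_CLASS")

    # the class name is looked up on the transformers module, so any exported
    # Auto* class (AutoModelForCausalLM, AutoModelForSeq2SeqLM, ...) can be requested
    hf_class = getattr(transformers, model_class)
    print(model_name, hf_class.__name__)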

inference_server/README.md

Lines changed: 1 addition & 1 deletion
@@ -17,7 +17,7 @@ CFLAGS="-I$CONDA_PREFIX/include/" LDFLAGS="-L$CONDA_PREFIX/lib/" TORCH_CUDA_ARCH
 
 All the provided scripts are tested on 8 A100 80GB GPUs for BLOOM 176B (fp16/bf16) and 4 A100 80GB GPUs for BLOOM 176B (int8). These scripts might not work for other models or a different number of GPUs.
 
-DS inference is deployed using the DeepSpeed MII library which requires the resharded checkpoints for 8 x Tensor Parallel.
+DS inference is deployed using logic borrowed from DeepSpeed MII library.
 
 Note: Sometimes GPU memory is not freed when DS inference deployment crashes. You can free this memory by running `killall python` in terminal.
 
inference_server/model_handler/deployment.py

Lines changed: 3 additions & 0 deletions
@@ -1,3 +1,6 @@
+"""
+Copyright 2022 The Microsoft DeepSpeed Team
+"""
 import argparse
 import asyncio
 import os

inference_server/model_handler/launch.py

Lines changed: 3 additions & 0 deletions
@@ -1,3 +1,6 @@
+"""
+Copyright 2022 The Microsoft DeepSpeed Team
+"""
 import argparse
 
 import torch.distributed as dist

inference_server/models/ds_inference.py

Lines changed: 1 addition & 1 deletion
@@ -33,7 +33,7 @@ def __init__(self, args: Namespace) -> None:
         # the actual weights while calling deepspeed.init_inference in the
         # following code
         with deepspeed.OnDevice(dtype=torch.float16, device="meta"):
-            self.model = get_hf_model_class(args.model_name).from_config(
+            self.model = get_hf_model_class(args.model_class).from_config(
                 AutoConfig.from_pretrained(downloaded_model_path), torch_dtype=torch.bfloat16
             )
         self.model = self.model.eval()
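
The DS-inference path still instantiates the architecture on the meta device from its config, so no real weights are allocated until deepspeed.init_inference loads the checkpoint; the only change is that the auto class comes from args.model_class instead of being guessed from the model name. A sketch of the pattern, assuming deepspeed and transformers are installed; the checkpoint path and class name below are placeholders:

    import deepspeed
    import torch
    import transformers
    from transformers import AutoConfig

    downloaded_model_path = "/path/to/checkpoint"   # placeholder
    model_class = "AutoModelForCausalLM"            # value passed via MODEL_CLASS / --model_class

    # build the model skeleton on the meta device: parameters have shapes but no storage,
    # and are materialized later when deepspeed.init_inference loads the real weights
    with deepspeed.OnDevice(dtype=torch.float16, device="meta"):
        model = getattr(transformers, model_class).from_config(
            AutoConfig.from_pretrained(downloaded_model_path), torch_dtype=torch.bfloat16
        )
    model = model.eval()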

inference_server/models/ds_zero.py

Lines changed: 3 additions & 1 deletion
@@ -57,7 +57,9 @@ def __init__(self, args: Namespace) -> None:
         self.tokenizer = AutoTokenizer.from_pretrained(downloaded_model_path)
         self.pad = self.tokenizer.pad_token_id
 
-        self.model = get_hf_model_class(args.model_name).from_pretrained(downloaded_model_path, torch_dtype=args.dtype)
+        self.model = get_hf_model_class(args.model_class).from_pretrained(
+            downloaded_model_path, torch_dtype=args.dtype
+        )
         self.model = self.model.eval()
 
         # convert model to a fully sharded model using ZeRO

inference_server/models/hf_accelerate.py

Lines changed: 1 addition & 1 deletion
@@ -32,7 +32,7 @@ def __init__(self, args: Namespace) -> None:
 
         # this is the CUDA device for the current process. This will be used
         # later to identify the GPU on which to transfer tensors
-        self.model = get_hf_model_class(args.model_name).from_pretrained(**kwargs)
+        self.model = get_hf_model_class(args.model_class).from_pretrained(**kwargs)
 
         self.model.requires_grad_(False)
         self.model.eval()

inference_server/models/model.py

Lines changed: 3 additions & 7 deletions
@@ -5,6 +5,7 @@
 
 import torch
 
+import transformers
 from huggingface_hub import snapshot_download
 from transformers import AutoModelForCausalLM, AutoModelForSeq2SeqLM
 from transformers.utils import is_offline_mode
@@ -112,10 +113,5 @@ def check_batch_size(batch_size: int, max_batch_size: int) -> None:
 
 
 # this is a hack for now
-def get_hf_model_class(model_name: str) -> Union[AutoModelForCausalLM, AutoModelForSeq2SeqLM]:
-    if "bloom" in model_name:
-        return AutoModelForCausalLM
-    elif "t5" in model_name:
-        return AutoModelForSeq2SeqLM
-    elif "ul2" in model_name:
-        return AutoModelForSeq2SeqLM
+def get_hf_model_class(model_class: str) -> Union[AutoModelForCausalLM, AutoModelForSeq2SeqLM]:
+    return getattr(transformers, model_class)
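
get_hf_model_class previously inferred the auto class by matching substrings of the model name and silently returned None for anything it did not recognize; it now resolves the requested class name directly on the transformers module. A short sketch of the new behaviour, assuming transformers is installed (the try/except only illustrates standard getattr semantics):

    import transformers
    from transformers import AutoModelForCausalLM, AutoModelForSeq2SeqLM

    def get_hf_model_class(model_class: str):
        # resolve e.g. "AutoModelForSeq2SeqLM" to transformers.AutoModelForSeq2SeqLM
        return getattr(transformers, model_class)

    assert get_hf_model_class("AutoModelForCausalLM") is AutoModelForCausalLM
    assert get_hf_model_class("AutoModelForSeq2SeqLM") is AutoModelForSeq2SeqLM

    # an unknown class name now fails loudly instead of returning None
    try:
        get_hf_model_class("NotARealAutoClass")
    except AttributeError as exc:
        print(exc)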

inference_server/server.py

Lines changed: 1 addition & 0 deletions
@@ -29,6 +29,7 @@ class Args:
     def __init__(self) -> None:
         self.deployment_framework = os.getenv("DEPLOYMENT_FRAMEWORK", HF_ACCELERATE)
         self.model_name = os.getenv("MODEL_NAME")
+        self.model_class = os.getenv("MODEL_CLASS")
         self.dtype = get_torch_dtype(os.getenv("DTYPE"))
         self.allowed_max_new_tokens = int(os.getenv("ALLOWED_MAX_NEW_TOKENS", 100))
         self.max_input_length = int(os.getenv("MAX_INPUT_LENGTH", 512))

inference_server/utils/utils.py

Lines changed: 7 additions & 1 deletion
@@ -38,7 +38,13 @@ def get_argument_parser() -> argparse.ArgumentParser:
         "--model_name",
         type=str,
         required=True,
-        help="model to use",
+        help="model name to use",
+    )
+    group.add_argument(
+        "--model_class",
+        type=str,
+        required=True,
+        help="model class to use",
     )
     group.add_argument("--dtype", type=str, required=True, choices=["bf16", "fp16", "int8"], help="dtype for model")
     group.add_argument(
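
On the CLI side, --model_class becomes a second required flag next to --model_name. A self-contained sketch of how the two flags parse; the argument-group title and the example values are illustrative and not taken from the repository:

    import argparse

    parser = argparse.ArgumentParser()
    group = parser.add_argument_group(title="model")  # illustrative group title
    group.add_argument("--model_name", type=str, required=True, help="model name to use")
    group.add_argument("--model_class", type=str, required=True, help="model class to use")

    args = parser.parse_args(
        ["--model_name", "bigscience/bloom-560m", "--model_class", "AutoModelForCausalLM"]
    )
    print(args.model_name, args.model_class)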
