[Neural Speed]load model from modelscope (#1382)

LJ-underdog · VincyZhang · web-flow · commit 20ae003938f7 · 2024-03-27T19:45:50.000+08:00
Co-authored-by: Wenxin Zhang <wenxin.zhang@intel.com>
diff --git a/README.md b/README.md
@@ -216,6 +216,22 @@ model = AutoModelForCausalLM.from_pretrained(model_name, load_in_4bit=True)
 outputs = model.generate(inputs)
 ```
 
+You can also load a PyTorch model from ModelScope.
+>**Note**: This requires the `modelscope` package to be installed.
+```python
+from transformers import TextStreamer
+from modelscope import AutoTokenizer
+from intel_extension_for_transformers.transformers import AutoModelForCausalLM
+model_name = "qwen/Qwen-7B"     # Modelscope model_id or local model
+prompt = "Once upon a time, there existed a little girl,"
+
+model = AutoModelForCausalLM.from_pretrained(model_name, load_in_4bit=True, model_hub="modelscope")
+tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+inputs = tokenizer(prompt, return_tensors="pt").input_ids
+streamer = TextStreamer(tokenizer)
+outputs = model.generate(inputs, streamer=streamer, max_new_tokens=300)
+```
+
 You can also load the low-bit model quantized by GPTQ/AWQ/RTN/AutoRound algorithm.
 ```python
 from transformers import AutoTokenizer
diff --git a/examples/huggingface/neural_speed/perplexity/requirements.txt b/examples/huggingface/neural_speed/perplexity/requirements.txt
@@ -13,4 +13,4 @@ tiktoken
 py-cpuinfo
 cmake
 gguf
-neural-speed
+neural-speed==1.0a0
diff --git a/examples/huggingface/neural_speed/requirements.txt b/examples/huggingface/neural_speed/requirements.txt
@@ -1,5 +1,5 @@
 intel_extension_for_transformers
-neural-speed
+neural-speed==1.0a0
 git+https://github.com/EleutherAI/lm-evaluation-harness.git@cc9778fbe4fa1a709be2abed9deb6180fd40e7e2
 sentencepiece
 gguf
diff --git a/intel_extension_for_transformers/llm/runtime/neural_speed/requirements.txt b/intel_extension_for_transformers/llm/runtime/neural_speed/requirements.txt
@@ -0,0 +1,17 @@
+--extra-index-url https://download.pytorch.org/whl/cpu
+accelerate
+auto-gptq
+cmake
+datasets
+einops
+gguf
+neural-speed==1.0a0
+numpy
+peft
+protobuf<3.20
+py-cpuinfo
+sentencepiece
+tiktoken
+torch==2.2.0+cpu
+transformers
+transformers_stream_generator
diff --git a/intel_extension_for_transformers/neural_chat/requirements.txt b/intel_extension_for_transformers/neural_chat/requirements.txt
@@ -8,7 +8,7 @@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@cc9778fbe4fa1a709be2
 huggingface_hub
 intel_extension_for_pytorch==2.2.0
 neural-compressor
-neural_speed
+neural_speed==1.0a0
 numpy==1.23.5
 onnx>=1.15.0
 optimum
diff --git a/intel_extension_for_transformers/neural_chat/requirements_cpu.txt b/intel_extension_for_transformers/neural_chat/requirements_cpu.txt
@@ -7,7 +7,7 @@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@cc9778fbe4fa1a709be2
 huggingface_hub
 intel_extension_for_pytorch==2.2.0
 neural-compressor
-neural_speed
+neural_speed==1.0a0
 numpy==1.23.5
 optimum
 optimum-intel
diff --git a/intel_extension_for_transformers/neural_chat/tests/requirements.txt b/intel_extension_for_transformers/neural_chat/tests/requirements.txt
@@ -40,7 +40,7 @@ langid
 librosa
 markdown
 neural-compressor
-neural_speed
+neural_speed==1.0a0
 num2words
 numba
 numpy==1.23.5
diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_auto.py b/intel_extension_for_transformers/transformers/modeling/modeling_auto.py
@@ -281,6 +281,8 @@ class _BaseQBitsAutoModelClass:
         "qwen",
         "phi",
         "whisper",
+        "qwen2",
+        "gemma",
     ]
 
     model_type_list_for_gptq = [
@@ -361,12 +363,19 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
         )
 
         config = kwargs.pop("config", None)
+        model_hub = kwargs.pop("model_hub", "huggingface")
 
         if not isinstance(config, PretrainedConfig):
-            config, _ = AutoConfig.from_pretrained(
-                pretrained_model_name_or_path,
-                return_unused_kwargs=True,
-                **kwargs,
+            if model_hub == "modelscope":
+                import modelscope # pylint: disable=E0401
+                config = modelscope.AutoConfig.from_pretrained(pretrained_model_name_or_path,
+                                            trust_remote_code=True)
+            else:
+                config, _ = AutoConfig.from_pretrained(
+                    pretrained_model_name_or_path,
+                    return_unused_kwargs=True,
+                    **kwargs,
+
             )
 
         quantization_config = kwargs.pop("quantization_config", None)
@@ -541,7 +550,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
                 from neural_speed import Model
 
                 model = Model()
-                model.init(
+                model.init( # pylint: disable=E1123
                     pretrained_model_name_or_path,
                     weight_dtype=quantization_config.weight_dtype,
                     alg=quantization_config.scheme,
@@ -557,6 +566,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
                     use_gptq=quantization_config.quant_method.value == "gptq"
                     or quantization_config.quant_method.value == "autoround",
                     use_awq=quantization_config.quant_method.value == "awq",
+                    model_hub=model_hub,
                 )
                 model.quantization_config = quantization_config
                 return model
diff --git a/tests/requirements.txt b/tests/requirements.txt
@@ -11,7 +11,7 @@ git+https://github.com/intel/neural-compressor.git
 intel-extension-for-pytorch==2.2.0
 intel-tensorflow==2.14.0
 mlflow
-neural-speed
+neural-speed==1.0a0
 nlpaug==1.1.9
 onnx==1.15.0
 onnxruntime==1.17.1