This repository was archived by the owner on Oct 25, 2024. It is now read-only.

Commit 1967445: [LLM] Fix llm models extension issue (#955)
Parent: ab408d7

File tree: 11 files changed, +116 −52 lines

examples/huggingface/pytorch/code-generation/quantization/README.md
Lines changed: 6 additions & 6 deletions

@@ -83,7 +83,7 @@ python run_generation.py \
     --allow_code_execution \
     --temperature 0.2 \
     --do_sample \
-    --tasks "humaneval" \
+    --tasks "humaneval"
 # mixedprecision
 python run_generation.py \
     --model bigcode/starcoder \
@@ -94,7 +94,7 @@ python run_generation.py \
     --allow_code_execution \
     --temperature 0.2 \
     --do_sample \
-    --tasks "humaneval" \
+    --tasks "humaneval"
 # smoothquant
 # [alternative] --int8 is used for int8 only, --int8_bf16_mixed is used for int8 mixed bfloat16 precision.
 python run_generation.py \
@@ -108,7 +108,7 @@ python run_generation.py \
     --allow_code_execution \
     --temperature 0.2 \
     --do_sample \
-    --tasks "humaneval" \
+    --tasks "humaneval"
 # weightonlyquant
 python run_generation.py \
     --model bigcode/starcoder \
@@ -120,7 +120,7 @@ python run_generation.py \
     --allow_code_execution \
     --temperature 0.2 \
     --do_sample \
-    --tasks "humaneval" \
+    --tasks "humaneval"
 # load_in_4bit
 python run_generation.py \
     --model bigcode/starcoder \
@@ -131,7 +131,7 @@ python run_generation.py \
     --allow_code_execution \
     --temperature 0.2 \
     --do_sample \
-    --tasks "humaneval" \
+    --tasks "humaneval"
 # load_in_8bit
 python run_generation.py \
     --model bigcode/starcoder \
@@ -142,7 +142,7 @@ python run_generation.py \
     --allow_code_execution \
     --temperature 0.2 \
     --do_sample \
-    --tasks "humaneval" \
+    --tasks "humaneval"
 ```
 
 >Note:
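Why this fixes the README: each example previously ended with `--tasks "humaneval" \`, and that trailing backslash is a shell line continuation, so a command copied on its own either leaves the shell waiting for more input or absorbs whatever line follows it. Dropping the backslash lets every command terminate cleanly before the next section's comment.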

examples/huggingface/pytorch/code-generation/quantization/requirements.txt
Lines changed: 1 addition & 0 deletions

@@ -7,6 +7,7 @@ sentencepiece != 0.1.92
 torch==2.1.0+cpu
 peft==0.6.2
 transformers >= 4.35.0
+tiktoken #code_gen
 neural-compressor
 intel_extension_for_pytorch
 git+https://github.com/huggingface/optimum.git@927e94739447b13f7eefe085c8d3662654b6a11c

examples/huggingface/pytorch/code-generation/quantization/run_generation.py
Lines changed: 8 additions & 6 deletions

@@ -28,7 +28,7 @@
     "--model", nargs="?", default="bigcode/starcoderbase", const="bigcode/starcoderbase"
 )
 parser.add_argument("--trust_remote_code", default=False)
-parser.add_argument("--revision", default="main", type=str)
+parser.add_argument("--_commit_hash", default="main", type=str)
 parser.add_argument("--dataset", nargs="?", default="mbpp", const="mbpp")
 parser.add_argument("--dtype", type=str, default="int8")
 parser.add_argument(
@@ -137,7 +137,9 @@
     args.model,
     truncation_side="left",
     padding_side="right",
+    trust_remote_code=args.trust_remote_code
 )
+
 config = AutoConfig.from_pretrained(
     args.model,
     torchscript=True
@@ -149,7 +151,7 @@
     else False, # torchscript will force `return_dict=False` to avoid jit errors
     use_cache=True, # to use kv cache.
     trust_remote_code=args.trust_remote_code,
-    revision=args.revision,
+    _commit_hash=args._commit_hash,
 )
 if not tokenizer.eos_token:
     if tokenizer.bos_token:
@@ -206,7 +208,7 @@
         args.model,
         quantization_config=quantization_config,
         trust_remote_code=args.trust_remote_code,
-        revision=args.revision,
+        _commit_hash=args._commit_hash,
         use_llm_runtime=False,
     )
 elif args.load_in_4bit or args.load_in_8bit:
@@ -215,15 +217,15 @@
         args.model,
         load_in_4bit=args.load_in_4bit,
         load_in_8bit=args.load_in_8bit,
-        revision=args.revision,
+        _commit_hash=args._commit_hash,
         use_llm_runtime=False,
     )
 elif not args.int8 and not args.int8_bf16_mixed:
     user_model = AutoModelForCausalLM.from_pretrained(
         args.model,
         config=config,
         trust_remote_code=args.trust_remote_code,
-        revision=args.revision,
+        _commit_hash=args._commit_hash,
         use_llm_runtime=False,
     )
 
@@ -248,7 +250,7 @@
         args.output_dir,
         file_name="best_model.pt",
         trust_remote_code=args.trust_remote_code,
-        revision=args.revision,
+        _commit_hash=args._commit_hash,
     )
 
 if args.benchmark:
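For context on the rename: the commit swaps the public `revision` argument for the internal transformers `_commit_hash` kwarg throughout, so the pinned hash flows unchanged through config, tokenizer, and model loading. A minimal sketch of the resulting call pattern; the model id and hash are the values this commit uses in run_benchmark.sh below, and treating `_commit_hash` as a pass-through argument, as this diff does, is the only assumption:

```python
from transformers import AutoConfig, AutoTokenizer

# Pin a trust_remote_code model to one exact commit of its hub repository.
# _commit_hash is an internal transformers kwarg that this commit forwards
# verbatim; the model id and hash below are taken from run_benchmark.sh.
COMMIT = "14d5b0e204542744900f6fb52422c6d633bdcb00"
MODEL = "baichuan-inc/Baichuan-13B-Base"

config = AutoConfig.from_pretrained(
    MODEL, trust_remote_code=True, _commit_hash=COMMIT
)
tokenizer = AutoTokenizer.from_pretrained(
    MODEL, trust_remote_code=True, _commit_hash=COMMIT
)
```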

examples/huggingface/pytorch/text-generation/quantization/requirements.txt
Lines changed: 5 additions & 1 deletion

@@ -4,9 +4,13 @@ peft
 protobuf
 sentencepiece != 0.1.92
 --extra-index-url https://download.pytorch.org/whl/cpu
-torch==2.1.0+cpu
+torch==2.1.1+cpu
 transformers
 intel_extension_for_pytorch
+bitsandbytes #baichuan
+transformers_stream_generator
+tiktoken #qwen
+einops #qwen
 git+https://github.com/intel/neural-compressor.git
 git+https://github.com/huggingface/optimum-intel.git@f95dea1ae8966dee4d75d622e7b2468c514ba02d
 git+https://github.com/huggingface/optimum.git@927e94739447b13f7eefe085c8d3662654b6a11c

examples/huggingface/pytorch/text-generation/quantization/run_benchmark.sh
Lines changed: 5 additions & 1 deletion

@@ -129,7 +129,7 @@ function run_benchmark {
     elif [ "${topology}" = "baichuan_13b" ]; then
         model_name_or_path="baichuan-inc/Baichuan-13B-Base"
         extra_cmd=$extra_cmd" --trust_remote_code True"
-        extra_cmd=$extra_cmd" --revision 14d5b0e204542744900f6fb52422c6d633bdcb00"
+        extra_cmd=$extra_cmd" --_commit_hash 14d5b0e204542744900f6fb52422c6d633bdcb00"
         pip install transformers==4.33
     elif [ "${topology}" = "baichuan2_7b" ]; then
         model_name_or_path="baichuan-inc/Baichuan2-7B-Base"
@@ -142,12 +142,16 @@ function run_benchmark {
     elif [ "${topology}" = "qwen_7b" ]; then
         model_name_or_path="Qwen/Qwen-7B"
         extra_cmd=$extra_cmd" --trust_remote_code True"
+        extra_cmd=$extra_cmd" --_commit_hash f7bc352f27bb1c02ee371a4576942a7d96c8bb97"
+        pip install transformers==4.35.2
     elif [ "${topology}" = "mistral_7b" ]; then
         model_name_or_path="Intel/neural-chat-7b-v3"
     elif [ "${topology}" = "phi_1b" ]; then
         model_name_or_path="susnato/phi-1_dev"
+        pip install transformers==4.36.1
     elif [ "${topology}" = "phi_1_5b" ]; then
         model_name_or_path="susnato/phi-1_5_dev"
+        pip install transformers==4.36.1
     fi
 
     if [[ ${int8} == "true" ]]; then
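Note the pairing in these branches: a hub-side pin (`--_commit_hash f7bc352f...` for Qwen-7B) together with a client-side pin (`pip install transformers==4.35.2`, and `transformers==4.36.1` for the phi topologies). Remote modeling code evolves against specific transformers APIs, so pinning only one side can still break at import time.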

examples/huggingface/pytorch/text-generation/quantization/run_generation.py
Lines changed: 9 additions & 8 deletions

@@ -125,7 +125,7 @@
 # ============AutoModel parameters==============
 parser.add_argument("--load_in_4bit", type=bool, default=False)
 parser.add_argument("--load_in_8bit", type=bool, default=False)
-parser.add_argument("--revision", default="main", type=str)
+parser.add_argument("--_commit_hash", default="main", type=str)
 parser.add_argument("--trust_remote_code", default=False)
 parser.add_argument("--use_llm_runtime", action="store_true")
 # =======================================
@@ -156,7 +156,7 @@
     else False, # torchscript will force `return_dict=False` to avoid jit errors
     use_cache=True, # to use kv cache.
     trust_remote_code=args.trust_remote_code,
-    revision=args.revision,
+    _commit_hash=args._commit_hash,
 )
 
 # chatglm
@@ -255,32 +255,33 @@
         args.model,
         quantization_config=quantization_config,
         trust_remote_code=args.trust_remote_code,
-        revision=args.revision,
+        _commit_hash=args._commit_hash,
         use_llm_runtime=args.use_llm_runtime,
+
     )
 elif args.load_in_4bit or args.load_in_8bit:
     # CPU device usage is provided by intel-extension-for-transformers.
     user_model = AutoModelForCausalLM.from_pretrained(
         args.model,
         load_in_4bit=args.load_in_4bit,
         load_in_8bit=args.load_in_8bit,
-        revision=args.revision,
+        _commit_hash=args._commit_hash,
         use_llm_runtime=args.use_llm_runtime,
     )
 elif (not args.int8 and not args.int8_bf16_mixed) or args.restore:
     if args.peft_model_id is not None:
         user_model = AutoModelForCausalLM.from_pretrained(
             args.peft_model_id,
             trust_remote_code=args.trust_remote_code,
-            revision=args.revision,
+            _commit_hash=args._commit_hash,
             use_llm_runtime=args.use_llm_runtime,
         )
     else:
         user_model = AutoModelForCausalLM.from_pretrained(
             args.model,
             config=config,
             trust_remote_code=args.trust_remote_code,
-            revision=args.revision,
+            _commit_hash=args._commit_hash,
             use_llm_runtime=args.use_llm_runtime,
         )
 
@@ -389,8 +390,8 @@
     + ",tokenizer="
     + args.model
     + ",dtype=float32"
-    + ",revision="
-    + args.revision
+    + ",_commit_hash="
+    + args._commit_hash
     + ",trust_remote_code="
     + str(args.trust_remote_code),
     user_model=user_model,
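The final hunk threads the hash into the evaluation harness through its `model_args` string. A sketch of the string the concatenation above builds; the `pretrained=` prefix is assumed from the surrounding code, which this diff does not show, and the placeholder values are the Qwen pins used elsewhere in this commit:

```python
# Stand-ins for args.model and args._commit_hash from the argparse flags above.
model = "Qwen/Qwen-7B"
commit_hash = "f7bc352f27bb1c02ee371a4576942a7d96c8bb97"

model_args = (
    "pretrained=" + model          # assumed prefix, not shown in the diff
    + ",tokenizer=" + model
    + ",dtype=float32"
    + ",_commit_hash=" + commit_hash
    + ",trust_remote_code=" + str(True)
)
print(model_args)
# pretrained=Qwen/Qwen-7B,tokenizer=Qwen/Qwen-7B,dtype=float32,
# _commit_hash=f7bc352f27bb1c02ee371a4576942a7d96c8bb97,trust_remote_code=True
```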

examples/huggingface/pytorch/text-generation/quantization/run_tuning.sh
Lines changed: 6 additions & 2 deletions

@@ -145,7 +145,7 @@ function run_tuning {
         model_name_or_path="tiiuae/falcon-7b-instruct"
         extra_cmd=$extra_cmd" --sq --alpha ${alpha}"
         extra_cmd=$extra_cmd" --output_dir ${tuned_checkpoint}"
-        pip install transformers==4.33
+        pip install transformers==4.33.3
     elif [ "${topology}" = "baichuan_7b" ]; then
         alpha=0.85
         model_name_or_path="baichuan-inc/Baichuan-7B"
@@ -159,7 +159,7 @@ function run_tuning {
         extra_cmd=$extra_cmd" --sq --alpha ${alpha}"
         extra_cmd=$extra_cmd" --output_dir ${tuned_checkpoint}"
         extra_cmd=$extra_cmd" --trust_remote_code True"
-        extra_cmd=$extra_cmd" --revision 14d5b0e204542744900f6fb52422c6d633bdcb00"
+        extra_cmd=$extra_cmd" --_commit_hash 14d5b0e204542744900f6fb52422c6d633bdcb00"
         pip install transformers==4.33
     elif [ "${topology}" = "baichuan2_7b" ]; then
         alpha=0.85
@@ -181,6 +181,8 @@ function run_tuning {
         extra_cmd=$extra_cmd" --sq --alpha ${alpha}"
         extra_cmd=$extra_cmd" --output_dir ${tuned_checkpoint}"
         extra_cmd=$extra_cmd" --trust_remote_code True"
+        extra_cmd=$extra_cmd" --_commit_hash f7bc352f27bb1c02ee371a4576942a7d96c8bb97"
+        pip install transformers==4.35.2
     elif [ "${topology}" = "mistral_7b" ]; then
         alpha=0.8
         model_name_or_path="Intel/neural-chat-7b-v3"
@@ -193,12 +195,14 @@ function run_tuning {
         extra_cmd=$extra_cmd" --sq --alpha ${alpha}"
         extra_cmd=$extra_cmd" --output_dir ${tuned_checkpoint}"
         extra_cmd=$extra_cmd" --trust_remote_code True"
+        pip install transformers==4.36.1
     elif [ "${topology}" = "phi_1_5b" ]; then
         alpha=0.5
         model_name_or_path="susnato/phi-1_5_dev"
         extra_cmd=$extra_cmd" --sq --alpha ${alpha}"
         extra_cmd=$extra_cmd" --output_dir ${tuned_checkpoint}"
         extra_cmd=$extra_cmd" --trust_remote_code True"
+        pip install transformers==4.36.1
     fi
 
     if [ ${script} = "run_generation.py" ];then

intel_extension_for_transformers/llm/evaluation/lm_eval/models/huggingface.py
Lines changed: 2 additions & 1 deletion

@@ -115,7 +115,8 @@ def __init__(
     bnb_4bit_compute_dtype: Optional[Union[str, torch.dtype]] = None,
     bnb_4bit_use_double_quant: Optional[bool] = False,
     init_empty_weights: Optional[bool] = False,
-    model_format: Optional[str] = "torch"
+    model_format: Optional[str] = "torch",
+    _commit_hash: Optional[str] = None
 ):
     """Initializes a HuggingFace `AutoModel` and `AutoTokenizer` for evaluation.
     Args:
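This is the receiving end of the `model_args` change above: the evaluation wrapper's `__init__` now accepts `_commit_hash` (defaulting to None) instead of rejecting it as an unexpected keyword, and `model_format` gains a trailing comma because it is no longer the last parameter.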

intel_extension_for_transformers/llm/evaluation/models.py
Lines changed: 4 additions & 7 deletions

@@ -22,7 +22,7 @@
 from transformers.modeling_outputs import CausalLMOutputWithPast
 from optimum.intel.generation.modeling import TSModelForCausalLM
 from intel_extension_for_transformers.transformers.utils.utility import (
-    generate_dummy_past_key_values,
+    generate_dummy_past_key_values_for_inference,
     generate_dummy_past_key_values_for_opt_llm,
     MODEL_TYPES_REQUIRING_POSITION_IDS,
     IPEX_OPT_LLM_SUPPORTED,
@@ -166,18 +166,16 @@ def forward(
     input_bs, input_len = input_ids.shape
     if self.use_cache and past_key_values is None:
         if model_type in IPEX_OPT_LLM_SUPPORTED:
-            if (model_type == "falcon" and transformers.__version__ > "4.33") or (
-                model_type == "llama" and transformers.__version__ >= "4.36"
-            ):
-                past_key_values = generate_dummy_past_key_values(
+            if model_type == "llama" and transformers.__version__ >= "4.36":
+                past_key_values = generate_dummy_past_key_values_for_inference(
                     config=self.config, input_bs=input_bs
                 )
             else:
                 past_key_values = generate_dummy_past_key_values_for_opt_llm(
                     config=self.config, input_bs=input_bs, num_beams=1
                 )
         else:
-            past_key_values = generate_dummy_past_key_values(
+            past_key_values = generate_dummy_past_key_values_for_inference(
                 config=self.config, input_bs=input_bs
             )
         inputs["past_key_values"] = past_key_values
@@ -195,7 +193,6 @@ def forward(
         inputs["position_ids"] = position_ids
     else:
         inputs["position_ids"] = torch.arange(input_len).repeat(input_bs, 1)
-
     outputs = self.model(**inputs)
 
 if isinstance(outputs, (list, tuple)):
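The renamed helper `generate_dummy_past_key_values_for_inference` lives in `intel_extension_for_transformers.transformers.utils.utility` and is not shown in this diff; note that the falcon special case is dropped here in step with the Falcon handling change in modeling_auto.py below. A rough sketch of what such a helper plausibly does, so the branches above are readable; the shapes are an assumption, not the real implementation:

```python
import torch

def dummy_past_key_values_for_inference_sketch(config, input_bs):
    """Assumed behavior: an empty (zero-length) KV cache per layer, so a
    traced model whose forward signature requires past_key_values can run
    its first step. Layout (batch, heads, past_len, head_dim) is the common
    convention; the real helper may differ per architecture."""
    head_dim = config.hidden_size // config.num_attention_heads
    shape = (input_bs, config.num_attention_heads, 0, head_dim)
    return tuple(
        (torch.zeros(shape), torch.zeros(shape))
        for _ in range(config.num_hidden_layers)
    )
```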

intel_extension_for_transformers/transformers/modeling/modeling_auto.py
Lines changed: 8 additions & 16 deletions

@@ -228,11 +228,10 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
     model = model.float()
     model.eval()
     model_type = model.config.model_type.replace("_", "-")
-    if "falcon" in model_type and transformers.__version__ > "4.33":
-        ipex.nn.utils._model_convert.replace_customized_linear_with_linear(
-            model.eval()
+    if "falcon" in model_type:
+        logger.warning(
+            "Please use transformers 4.33.3 if you would like to apply smoothquant to Falcon."
         )
-        quantization_config.ipex_opt_llm = False
     if "llama" in model_type and transformers.__version__ >= "4.36.0":
         quantization_config.ipex_opt_llm = False
     logger.info("Applying SmoothQuant.")
@@ -334,7 +333,11 @@ def collate_batch(batch):
             )
 
             last_ind.append(input_ids.shape[0] - 1)
-            attention_mask = torch.ones(len(input_ids))
+            if model_type in ["bloom", "qwen"]:
+                attention_mask = torch.ones(len(input_ids) + 1)
+                attention_mask[0] = 0
+            else:
+                attention_mask = torch.ones(len(input_ids))
             position_ids = torch.arange(len(input_ids))
             input_ids_padded.append(input_ids)
             attention_mask_padded.append(attention_mask)
@@ -450,17 +453,6 @@ def calib_func(model):
                 "position_ids": inputs["position_ids"],
                 "past_key_values": inputs["past_key_values"],
             }
-        elif model_type == "falcon":
-            input_bs, input_len = inputs["input_ids"].shape
-            outputs = model(inputs["input_ids"])
-            example_inputs["past_key_values"] = outputs[1]
-            example_inputs["attention_mask"] = torch.ones(
-                input_bs, input_len
-            )
-            example_inputs["position_ids"] = (
-                inputs["position_ids"][:, -1:] + 1
-            )
-            example_inputs["input_ids"] = inputs["input_ids"][:, -1:]
         else:
             example_inputs = inputs
     else:
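The collate change is the behavioral core of this file: for bloom and qwen the calibration attention mask is one element longer and masks out position 0, while every other model type keeps the all-ones mask. A standalone repro of exactly that branch:

```python
import torch

def calib_attention_mask(input_ids, model_type):
    # Mirrors the collate_batch branch above: bloom/qwen get a mask one
    # element longer with the first position zeroed; others are all ones.
    if model_type in ["bloom", "qwen"]:
        attention_mask = torch.ones(len(input_ids) + 1)
        attention_mask[0] = 0
    else:
        attention_mask = torch.ones(len(input_ids))
    return attention_mask

print(calib_attention_mask(torch.tensor([1, 2, 3]), "qwen"))   # tensor([0., 1., 1., 1.])
print(calib_attention_mask(torch.tensor([1, 2, 3]), "llama"))  # tensor([1., 1., 1.])
```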
