This repository was archived by the owner on Oct 25, 2024. It is now read-only.

Commit 2ebe14d

[vLLM] QBits Perf Enhence (#1581)
* add customized config & update benchmark script

Signed-off-by: Zhenzhong1 <109137058+Zhenzhong1@users.noreply.github.com>

---------

Signed-off-by: Zhenzhong1 <109137058+Zhenzhong1@users.noreply.github.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent 50676da commit 2ebe14d

File tree

3 files changed: +46 -36 lines changed


examples/vllm/vllm_acceleration_example.py

Lines changed: 34 additions & 30 deletions
@@ -37,42 +37,46 @@ def main(args_in: Optional[List[str]] = None) -> None:
     print(args)
 
     if args.benchmark:
-        if args.use_neural_speed:
-            os.environ["NEURAL_SPEED_VERBOSE"] = "1"
-            woq_config = RtnConfig(bits=4, weight_dtype="int4", compute_dtype="int8", scale_dtype="bf16")
-            model_with_ns = AutoModelForCausalLM.from_pretrained(args.model_path, quantization_config=woq_config)
-
-            tokenizer = AutoTokenizer.from_pretrained(args.model_path, trust_remote_code=True)
-            inputs = tokenizer(args.prompt, return_tensors="pt").input_ids
+        sampling_params = SamplingParams(max_tokens=32)
+        config = RtnConfig(compute_dtype="int8",
+                           group_size=128,
+                           scale_dtype="bf16",
+                           weight_dtype="int4_clip",
+                           bits=4)
+        print(config)
+        prompts = [args.prompt]
+        llm = LLM(model=args.model_path, trust_remote_code=True)
+        model = AutoModelForCausalLM.from_pretrained(args.model_path, use_vllm=True, config=config)
 
-            T5 = time.time()
-            output = model_with_ns.generate(inputs, max_new_tokens=32)
-            T6 = time.time()
-            print("neural speed output = ", output)
+        for prompt in prompts:
+            vllm_outputs = llm.generate(prompt, sampling_params)  # Generate texts from the prompts.
+            qbits_output = model.generate(prompt, sampling_params)
 
-        llm = LLM(model=args.model_path, trust_remote_code=True)
-        sampling_params = SamplingParams(max_tokens=32)
-        T1 = time.time()
-        original_outputs = llm.generate(args.prompt, sampling_params)  # Generate texts from the prompts.
-        T2 = time.time()
-        vllm_latency = (T2 - T1) * 1000
+            print("vLLM input_tokens_length = ", len(vllm_outputs[0].prompt_token_ids),
+                  "output_tokens_length = ", len(vllm_outputs[0].outputs[0].token_ids))
+            print('The vLLM generate = ',
+                  vllm_outputs[0].metrics.finished_time - vllm_outputs[0].metrics.arrival_time, "s")
+            print("The vLLM first token time = ",
+                  vllm_outputs[0].metrics.first_token_time - vllm_outputs[0].metrics.first_scheduled_time)
 
-        model = AutoModelForCausalLM.from_pretrained(args.model_path, use_vllm=True)
-        T3 = time.time()
-        optimized_output = model.generate(args.prompt, sampling_params)
-        T4 = time.time()
-        qbits_latency = (T4 - T3) * 1000
+            print("QBits_vLLM input_tokens_length = ", len(qbits_output[0].prompt_token_ids),
+                  "output_tokens_length = ", len(qbits_output[0].outputs[0].token_ids))
+            print('The QBits optimized generate = ',
+                  qbits_output[0].metrics.finished_time - qbits_output[0].metrics.arrival_time, "s")
+            print("The QBits first token time = ",
+                  qbits_output[0].metrics.first_token_time - qbits_output[0].metrics.first_scheduled_time)
 
-        print("original outputs = ", original_outputs)
-        print("input_tokens_length = ", len(original_outputs[0].prompt_token_ids))
-        print("output_tokens_length = ", len(original_outputs[0].outputs[0].token_ids))
+        if args.use_neural_speed:
+            os.environ["NEURAL_SPEED_VERBOSE"] = "1"
+            woq_config = RtnConfig(bits=4, weight_dtype="int4", compute_dtype="int8", scale_dtype="bf16")
+            model_with_ns = AutoModelForCausalLM.from_pretrained(args.model_path,
+                                                                 quantization_config=woq_config)
 
-        print("optimized outputs = ", optimized_output)
-        print("input_tokens_length = ", len(optimized_output[0].prompt_token_ids))
-        print("output_tokens_length = ", len(optimized_output[0].outputs[0].token_ids))
+            tokenizer = AutoTokenizer.from_pretrained(args.model_path, trust_remote_code=True)
+            inputs = tokenizer(args.prompt, return_tensors="pt").input_ids
 
-        print('The qbits optimized generate:%.2f ms' % qbits_latency)
-        print('The original vLLM generate:%.2f ms' % vllm_latency)
+            output = model_with_ns.generate(inputs, max_new_tokens=32)
+            print("neural speed output = ", output)
 
         return
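As a usage note, the rewritten benchmark derives latency from vLLM's per-request metrics instead of time.time() pairs. Below is a minimal standalone sketch of that measurement pattern; the model path is a placeholder and the attribute names are the RequestOutput/RequestMetrics fields already used in the diff above.

# Minimal sketch (model path is a placeholder): latency comes from vLLM's
# RequestMetrics rather than wall-clock timers as in the old benchmark.
from vllm import LLM, SamplingParams

llm = LLM(model="/path/to/model", trust_remote_code=True)
outputs = llm.generate("Hello, my name is", SamplingParams(max_tokens=32))

metrics = outputs[0].metrics
print("end-to-end generate:", metrics.finished_time - metrics.arrival_time, "s")
print("first token time   :", metrics.first_token_time - metrics.first_scheduled_time, "s")
print("input_tokens_length =", len(outputs[0].prompt_token_ids),
      "output_tokens_length =", len(outputs[0].outputs[0].token_ids))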

intel_extension_for_transformers/transformers/llm/quantization/nn/modules.py

Lines changed: 3 additions & 1 deletion
@@ -146,8 +146,10 @@ def forward(self, x: torch.Tensor):
         bias = None if self.bias is None else self.bias.data.float()
         if not x.is_contiguous():
             x = x.contiguous()
+
+        # Only FP32 activation supports gemv which benefits next-token.
         out = matmul_kbit(
-            x.view(m, shape[-1]),
+            x.view(m, shape[-1]).float(),
             self.weight,
             bias,
             out,
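The .float() cast routes single-token decoding onto the FP32-activation GEMV kernel. A small illustrative sketch of the activation pre-processing the layer's forward() performs before the 4-bit matmul; prepare_activation is a hypothetical helper for illustration only, not part of the library.

import torch

def prepare_activation(x: torch.Tensor) -> torch.Tensor:
    shape = x.shape
    m = x.numel() // shape[-1]          # collapse batch/sequence dims into rows
    if not x.is_contiguous():
        x = x.contiguous()
    # Only FP32 activation supports the gemv path, which benefits next-token decode.
    return x.view(m, shape[-1]).float()

x = torch.randn(2, 5, 64, dtype=torch.bfloat16)
print(prepare_activation(x).shape, prepare_activation(x).dtype)  # torch.Size([10, 64]) torch.float32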

intel_extension_for_transformers/transformers/modeling/modeling_auto.py

Lines changed: 9 additions & 5 deletions
@@ -419,11 +419,15 @@ def forward(self, input: torch.Tensor) -> tuple[torch.Tensor, None]:
         model.load_weights(weights_iterator)
 
         print("INC quantizing...")
-        config = RtnConfig(compute_dtype="bf16",
-                           group_size=128,
-                           scale_dtype="bf16",
-                           weight_dtype="int4_clip",
-                           bits=4)
+        config = kwargs.pop("config", None)
+        if config is None:
+            config = RtnConfig(compute_dtype="int8",
+                               group_size=128,
+                               scale_dtype="bf16",
+                               weight_dtype="int4_clip",
+                               bits=4)
+            print("using default RTNConfig = ", config)
+        print("Using customized config = ", config)
         model = convert_to_quantized_model(model, config)
 
         return llm
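For callers of the vLLM-backed loader, a short usage sketch of the new "config" keyword; the import path mirrors the example script above and the model path is a placeholder.

from intel_extension_for_transformers.transformers import AutoModelForCausalLM, RtnConfig

custom = RtnConfig(compute_dtype="int8", group_size=128, scale_dtype="bf16",
                   weight_dtype="int4_clip", bits=4)

# Pass a customized RtnConfig; it is picked up via kwargs.pop("config", None) above.
model = AutoModelForCausalLM.from_pretrained("/path/to/model", use_vllm=True, config=custom)

# Omit it and the loader falls back to the default int8 / int4_clip RtnConfig shown in the diff.
model_default = AutoModelForCausalLM.from_pretrained("/path/to/model", use_vllm=True)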
