@@ -37,42 +37,46 @@ def main(args_in: Optional[List[str]] = None) -> None:
     print(args)
 
     if args.benchmark:
-        if args.use_neural_speed:
-            os.environ["NEURAL_SPEED_VERBOSE"] = "1"
-            woq_config = RtnConfig(bits=4, weight_dtype="int4", compute_dtype="int8", scale_dtype="bf16")
-            model_with_ns = AutoModelForCausalLM.from_pretrained(args.model_path, quantization_config=woq_config)
-
-            tokenizer = AutoTokenizer.from_pretrained(args.model_path, trust_remote_code=True)
-            inputs = tokenizer(args.prompt, return_tensors="pt").input_ids
+        sampling_params = SamplingParams(max_tokens=32)
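+        # RTN (round-to-nearest) weight-only quantization: 4-bit "int4_clip" weights,
+        # one scale per 128-element group, int8 compute and bf16 scales.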
+        config = RtnConfig(compute_dtype="int8",
+                           group_size=128,
+                           scale_dtype="bf16",
+                           weight_dtype="int4_clip",
+                           bits=4)
+        print(config)
+        prompts = [args.prompt]
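+        # Baseline: plain vLLM engine. Comparison: the extension's AutoModelForCausalLM with
+        # use_vllm=True, which (as this example assumes) serves generation through vLLM
+        # using the QBits weight-only-quantized path configured above.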
+        llm = LLM(model=args.model_path, trust_remote_code=True)
+        model = AutoModelForCausalLM.from_pretrained(args.model_path, use_vllm=True, config=config)
 
-            T5 = time.time()
-            output = model_with_ns.generate(inputs, max_new_tokens=32)
-            T6 = time.time()
-            print("neural speed output = ", output)
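+        # Run each prompt through both engines and compare per-request metrics below.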
+        for prompt in prompts:
+            vllm_outputs = llm.generate(prompt, sampling_params)  # Generate texts from the prompts.
+            qbits_output = model.generate(prompt, sampling_params)
 
-        llm = LLM(model=args.model_path, trust_remote_code=True)
-        sampling_params = SamplingParams(max_tokens=32)
-        T1 = time.time()
-        original_outputs = llm.generate(args.prompt, sampling_params)  # Generate texts from the prompts.
-        T2 = time.time()
-        vllm_latency = (T2 - T1) * 1000
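+            # vLLM RequestMetrics timestamps are in seconds: finished_time - arrival_time
+            # gives end-to-end request latency, first_token_time - first_scheduled_time
+            # gives time-to-first-token once the request is scheduled.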
+            print("vLLM input_tokens_length = ", len(vllm_outputs[0].prompt_token_ids),
+                  "output_tokens_length = ", len(vllm_outputs[0].outputs[0].token_ids))
+            print('The vLLM generate = ',
+                  vllm_outputs[0].metrics.finished_time - vllm_outputs[0].metrics.arrival_time, "s")
+            print("The vLLM first token time = ",
+                  vllm_outputs[0].metrics.first_token_time - vllm_outputs[0].metrics.first_scheduled_time)
 
-        model = AutoModelForCausalLM.from_pretrained(args.model_path, use_vllm=True)
-        T3 = time.time()
-        optimized_output = model.generate(args.prompt, sampling_params)
-        T4 = time.time()
-        qbits_latency = (T4 - T3) * 1000
+            print("QBits_vLLM input_tokens_length = ", len(qbits_output[0].prompt_token_ids),
+                  "output_tokens_length = ", len(qbits_output[0].outputs[0].token_ids))
+            print('The QBits optimized generate = ',
+                  qbits_output[0].metrics.finished_time - qbits_output[0].metrics.arrival_time, "s")
+            print("The QBits first token time = ",
+                  qbits_output[0].metrics.first_token_time - qbits_output[0].metrics.first_scheduled_time)
 
-        print("original outputs = ", original_outputs)
-        print("input_tokens_length = ", len(original_outputs[0].prompt_token_ids))
-        print("output_tokens_length = ", len(original_outputs[0].outputs[0].token_ids))
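+        # Optional Neural Speed path; NEURAL_SPEED_VERBOSE=1 is assumed to enable
+        # Neural Speed's verbose timing/profiling output.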
+        if args.use_neural_speed:
+            os.environ["NEURAL_SPEED_VERBOSE"] = "1"
+            woq_config = RtnConfig(bits=4, weight_dtype="int4", compute_dtype="int8", scale_dtype="bf16")
+            model_with_ns = AutoModelForCausalLM.from_pretrained(args.model_path,
+                                                                 quantization_config=woq_config)
 
-        print("optimized outputs = ", optimized_output)
-        print("input_tokens_length = ", len(optimized_output[0].prompt_token_ids))
-        print("output_tokens_length = ", len(optimized_output[0].outputs[0].token_ids))
+            tokenizer = AutoTokenizer.from_pretrained(args.model_path, trust_remote_code=True)
+            inputs = tokenizer(args.prompt, return_tensors="pt").input_ids
 
-        print('The qbits optimized generate:%.2f ms' % qbits_latency)
-        print('The original vLLM generate:%.2f ms' % vllm_latency)
+            output = model_with_ns.generate(inputs, max_new_tokens=32)
+            print("neural speed output = ", output)
 
     return
 