examples/quantization_w4a4_fp4 (1 file changed: +35, -0 lines)

from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier
from llmcompressor.utils import dispatch_for_generation

MODEL_ID = "meta-llama/Llama-3.1-8B-Instruct"

# Load model.
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Configure the quantization algorithm and scheme.
# In this case, we:
#   * quantize the weights to FP4 with a group size of 16 via PTQ
#     (weight-only MXFP4; activations stay in 16-bit precision)
recipe = QuantizationModifier(targets="Linear", scheme="MXFP4A16", ignore=["lm_head"])

# Apply quantization.
oneshot(model=model, recipe=recipe)

print("\n\n")
print("========== SAMPLE GENERATION ==============")
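# Dispatch the model onto the available device(s) before running a sample generation.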
dispatch_for_generation(model)
input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to(
    model.device
)
output = model.generate(input_ids, max_new_tokens=100)
print(tokenizer.decode(output[0]))
print("==========================================\n\n")


# Save to disk in compressed-tensors format.
SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-MXFP4A16"
model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)
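
For reference, a checkpoint saved in compressed-tensors format can usually be reloaded directly with transformers. The sketch below is an illustration rather than part of the example above; it assumes the compressed-tensors package is installed and that your transformers version supports decompressing the MXFP4A16 scheme.

# Reload the compressed checkpoint for inference (sketch; see assumptions above).
reloaded_model = AutoModelForCausalLM.from_pretrained(SAVE_DIR, torch_dtype="auto")
reloaded_tokenizer = AutoTokenizer.from_pretrained(SAVE_DIR)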