Commit e0e5fcf

update
1 parent 4cfc0e6 commit e0e5fcf

1 file changed: 35 additions, 0 deletions

@@ -0,0 +1,35 @@
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier
from llmcompressor.utils import dispatch_for_generation

MODEL_ID = "meta-llama/Llama-3.1-8B-Instruct"

# Load model.
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Configure the quantization algorithm and scheme.
# In this case, we:
#   * quantize the weights to FP4 with microscaling group-wise scales (MXFP4)
#     via PTQ, keeping activations in their original 16-bit precision (A16)
recipe = QuantizationModifier(targets="Linear", scheme="MXFP4A16", ignore=["lm_head"])

# Apply quantization.
oneshot(model=model, recipe=recipe)

# Confirm that the quantized model still generates coherent text.
print("\n\n")
print("========== SAMPLE GENERATION ==============")
dispatch_for_generation(model)
input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to(
    model.device
)
output = model.generate(input_ids, max_new_tokens=100)
print(tokenizer.decode(output[0]))
print("==========================================\n\n")


# Save to disk in compressed-tensors format.
SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-MXFP4A16"
model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)
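
Not part of the diff above, but a minimal reload sketch can help verify the saved artifact. It assumes the compressed-tensors package is installed, that the installed transformers version can decompress MXFP4A16 checkpoints, and that accelerate is available for device_map="auto"; the save directory name and prompt simply mirror the script above.

from transformers import AutoModelForCausalLM, AutoTokenizer

# Hypothetical reload of the checkpoint written by the script above; the path
# assumes the default SAVE_DIR produced for meta-llama/Llama-3.1-8B-Instruct.
SAVE_DIR = "Llama-3.1-8B-Instruct-MXFP4A16"

model = AutoModelForCausalLM.from_pretrained(
    SAVE_DIR, torch_dtype="auto", device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(SAVE_DIR)

# Quick generation check on the reloaded, compressed checkpoint.
inputs = tokenizer("Hello my name is", return_tensors="pt").to(model.device)
output = model.generate(**inputs, max_new_tokens=50)
print(tokenizer.decode(output[0]))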
