Commit 848e553 (parent: 7cd115e)

add scripts for inspecting torchao and llm-compressor output

3 files changed: +85 −0 lines changed
Lines changed: 21 additions & 0 deletions

# inspects the output of a model created with llm-compressor
# via the `run_llm_compressor.py` script

import json

import safetensors

# inspect the config
dir_name = 'opt-125m-FP8-Dynamic'
json_config_name = f'{dir_name}/config.json'
with open(json_config_name, 'r') as f:
    data = json.load(f)
print(json.dumps(data, indent=2))

# inspect the model, saved in safetensors format
model_name = f'{dir_name}/model.safetensors'
with safetensors.safe_open(model_name, framework='pt', device='cpu') as f:
    print(f.metadata())
    for k in f.keys():
        t = f.get_tensor(k)
        print(k, t.shape, t.dtype)
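
If the checkpoint follows the compressed-tensors convention of storing each FP8 weight as a `weight` / `weight_scale` pair (which FP8_DYNAMIC checkpoints from llm-compressor typically do), the loop above can be extended into a quick numeric spot check by dequantizing a tensor. A minimal sketch, assuming that key-naming convention:

# sketch: dequantize one FP8 weight; assumes the compressed-tensors
# convention of `<prefix>.weight` (fp8) plus `<prefix>.weight_scale`
import torch
import safetensors

dir_name = 'opt-125m-FP8-Dynamic'
with safetensors.safe_open(f'{dir_name}/model.safetensors',
                           framework='pt', device='cpu') as f:
    keys = set(f.keys())
    for k in sorted(keys):
        if k.endswith('.weight') and f'{k}_scale' in keys:
            w_fp8 = f.get_tensor(k)
            scale = f.get_tensor(f'{k}_scale')
            # upcast, then rescale to approximate the original weight
            w = w_fp8.to(torch.float32) * scale.to(torch.float32)
            print(k, w.shape, w.abs().max().item())
            break  # one tensor is enough for a spot check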
Lines changed: 24 additions & 0 deletions

# inspects the output of a model created with torchao
# via the `torchao_hf_script.py` script

import json

import torch
import torchao  # needed so that torch.serialization.add_safe_globals([torchao.quantization.Float8Tensor]) can run

# not sure why I still need this
torch.serialization.add_safe_globals([getattr])

dir_name = 'data/fp8-opt-125m'
json_config_name = f'{dir_name}/config.json'

# inspect the config
with open(json_config_name, 'r') as f:
    data = json.load(f)
print(json.dumps(data, indent=2))

# inspect the data
model_name = f'{dir_name}/pytorch_model.bin'
state_dict = torch.load(model_name, weights_only=True)
for k, v in state_dict.items():
    print(k, v.shape, type(v))
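
The `add_safe_globals` calls above are part of how `torch.load(..., weights_only=True)` works: the restricted unpickler only reconstructs objects whose globals are on an allowlist, and anything else (here `getattr`, which the tensor-subclass serialization apparently pulls in) has to be registered explicitly. A minimal self-contained sketch of the mechanism, using a hypothetical `Point` class purely for illustration:

# sketch of the weights_only allowlist mechanism; `Point` is a
# hypothetical class used only for illustration
import torch

class Point:
    def __init__(self, x, y):
        self.x, self.y = x, y

torch.save({'p': Point(1, 2)}, 'point.pt')

# without this registration, the load below raises an UnpicklingError,
# because the restricted unpickler refuses unknown globals
torch.serialization.add_safe_globals([Point])

obj = torch.load('point.pt', weights_only=True)
print(obj['p'].x, obj['p'].y)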
Lines changed: 40 additions & 0 deletions

# https://github.com/vllm-project/llm-compressor/blob/main/examples/quantization_w8a8_fp8/llama3_example.py

from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier
from llmcompressor.utils import dispatch_for_generation

# MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
MODEL_ID = "facebook/opt-125m"

# Load model.
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Configure the quantization algorithm and scheme.
# In this case, we:
#   * quantize the weights to fp8 with per-channel scales via PTQ
#   * quantize the activations to fp8 with dynamic per-token scales
recipe = QuantizationModifier(
    targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"]
)

# Apply quantization.
oneshot(model=model, recipe=recipe)

# Confirm generations of the quantized model look sane.
print("========== SAMPLE GENERATION ==============")
dispatch_for_generation(model)
input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to(
    model.device
)
output = model.generate(input_ids, max_new_tokens=20)
print(tokenizer.decode(output[0]))
print("==========================================")

# Save to disk in compressed-tensors format.
SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-FP8-Dynamic"
model.save_pretrained(SAVE_DIR)
tokenizer.save_pretrained(SAVE_DIR)
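
For `facebook/opt-125m` this writes to `opt-125m-FP8-Dynamic`, which is exactly the directory the first inspection script reads. To reload the compressed checkpoint for inference, recent transformers releases can deserialize compressed-tensors checkpoints directly; a minimal sketch, assuming a transformers version with that support and the `compressed-tensors` package installed:

# sketch: reload the saved checkpoint; assumes transformers with
# compressed-tensors support and the `compressed-tensors` package
from transformers import AutoModelForCausalLM, AutoTokenizer

SAVE_DIR = "opt-125m-FP8-Dynamic"
model = AutoModelForCausalLM.from_pretrained(SAVE_DIR, torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(SAVE_DIR)

inputs = tokenizer("Hello my name is", return_tensors="pt")
output = model.generate(**inputs, max_new_tokens=20)
print(tokenizer.decode(output[0]))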
