
Commit da4bdc1

Merge pull request #67 from vkuzo/20251003_refactor
refactor hf scripts
2 parents 3dcb958 + 376b961

9 files changed, +104 -87 lines

.gitignore

Lines changed: 1 addition & 0 deletions

@@ -1,2 +1,3 @@
 __pycache__/
 hf_torchao_vllm/data
+hf_torchao_vllm/sparse_logs

hf_torchao_vllm/README.md

Lines changed: 1 addition & 1 deletion

@@ -7,5 +7,5 @@ Example
 python quantize_hf_model_with_torchao.py --model_name "Qwen/Qwen1.5-MoE-A2.7B" --experts_only_qwen_1_5_moe_a_2_7b True --save_model_to_disk True --quant_type nvfp4

 # run the model from above in vLLM
-python run_quantized_model_in_vllm.py --model_name "data/nvfp4-Qwen1.5-MoE-A2.7B" --compile False
+python run_quantized_model_in_vllm.py --model_name "data/torchao/nvfp4-Qwen1.5-MoE-A2.7B" --compile False
 ```
hf_torchao_vllm/inspect_llm_compressor_output.py

Lines changed: 26 additions & 0 deletions

@@ -0,0 +1,26 @@
+# inspects the output of a model created with llm-compressor
+# via the `run_llm_compressor.py` script
+
+import safetensors
+import json
+import fire
+
+def run(
+    dir_name: str = 'data/llmcompressor/opt-125m-FP8-Dynamic',
+):
+    json_config_name = f'{dir_name}/config.json'
+    with open(json_config_name, 'r') as f:
+        data = json.load(f)
+    # TODO: pretty print
+    print(json.dumps(data, indent=2))
+
+    # inspect the model, saved in safetensors format
+    model_name = f'{dir_name}/model.safetensors'
+    with safetensors.safe_open(model_name, framework='pt', device='cpu') as f:
+        print(f.metadata())
+        for k in f.keys():
+            t = f.get_tensor(k)
+            print(k, t.shape, t.dtype)
+
+if __name__ == '__main__':
+    fire.Fire(run)
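The per-tensor loop above prints one line per checkpoint entry, which gets long for bigger models. A minimal companion sketch (a hypothetical `summarize_dtypes` helper, built on the same `safetensors.safe_open` API the script already uses) aggregates element counts by dtype, which makes it easy to confirm the bulk of the weights actually landed in an fp8 dtype:

```python
import collections
import safetensors

def summarize_dtypes(model_name: str):
    # tally total element counts per dtype across the checkpoint
    by_dtype = collections.Counter()
    with safetensors.safe_open(model_name, framework='pt', device='cpu') as f:
        for k in f.keys():
            t = f.get_tensor(k)
            by_dtype[str(t.dtype)] += t.numel()
    for dtype, numel in by_dtype.most_common():
        print(f'{dtype}: {numel} elements')

# e.g. summarize_dtypes('data/llmcompressor/opt-125m-FP8-Dynamic/model.safetensors')
```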
hf_torchao_vllm/inspect_torchao_output.py

Lines changed: 28 additions & 0 deletions

@@ -0,0 +1,28 @@
+# inspects the output of a model created with torchao
+# via the `quantize_hf_model_with_torchao.py` script
+
+import json
+import torch
+import torchao  # needed so torch.serialization.add_safe_globals([torchao.quantization.Float8Tensor]) can run
+import fire
+
+# not sure why I still need this
+torch.serialization.add_safe_globals([getattr])
+
+def run(dir_name: str = 'data/torchao/fp8-opt-125m'):
+    json_config_name = f'{dir_name}/config.json'
+
+    # inspect the config
+    with open(json_config_name, 'r') as f:
+        data = json.load(f)
+    # TODO: pretty print
+    print(json.dumps(data, indent=2))
+
+    # inspect the data
+    model_name = f'{dir_name}/pytorch_model.bin'
+    state_dict = torch.load(model_name, weights_only=True)
+    for k, v in state_dict.items():
+        print(k, v.shape, type(v))
+
+if __name__ == '__main__':
+    fire.Fire(run)
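For context on the safe-globals dance above: with `weights_only=True`, `torch.load` only reconstructs types on an explicit allow-list, and torchao checkpoints contain tensor subclasses rather than plain tensors. A sketch of the registration written out by hand, assuming the default checkpoint path from the script (the torchao import is expected to register its own subclasses; spelling out `Float8Tensor` here is just for clarity):

```python
import torch
import torchao

# Register the quantized tensor subclass plus `getattr`, which the pickled
# subclass state references; without these, torch.load(weights_only=True)
# fails with an UnpicklingError naming the blocked global.
torch.serialization.add_safe_globals(
    [torchao.quantization.Float8Tensor, getattr]
)
state_dict = torch.load(
    'data/torchao/fp8-opt-125m/pytorch_model.bin', weights_only=True
)
print(f'{len(state_dict)} entries loaded')
```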
hf_torchao_vllm/run_llm_compressor.py

Lines changed: 47 additions & 0 deletions

@@ -0,0 +1,47 @@
+# https://github.com/vllm-project/llm-compressor/blob/main/examples/quantization_w8a8_fp8/llama3_example.py
+
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+from llmcompressor import oneshot
+from llmcompressor.modifiers.quantization import QuantizationModifier
+from llmcompressor.utils import dispatch_for_generation
+
+import fire
+
+def run():
+
+    # MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
+    MODEL_ID = "facebook/opt-125m"
+
+    # Load model.
+    model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto")
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+
+    # Configure the quantization algorithm and scheme.
+    # In this case, we:
+    # * quantize the weights to fp8 with per-channel scales via PTQ
+    # * quantize the activations to fp8 with dynamic per-token scales
+    recipe = QuantizationModifier(
+        targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"]
+    )
+
+    # Apply quantization.
+    oneshot(model=model, recipe=recipe)
+
+    # Confirm generations of the quantized model look sane.
+    print("========== SAMPLE GENERATION ==============")
+    dispatch_for_generation(model)
+    input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to(
+        model.device
+    )
+    output = model.generate(input_ids, max_new_tokens=20)
+    print(tokenizer.decode(output[0]))
+    print("==========================================")
+
+    # Save to disk in compressed-tensors format.
+    SAVE_DIR = "data/llmcompressor/" + MODEL_ID.rstrip("/").split("/")[-1] + "-FP8-Dynamic"
+    model.save_pretrained(SAVE_DIR)
+    tokenizer.save_pretrained(SAVE_DIR)
+
+if __name__ == '__main__':
+    fire.Fire(run)
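Once saved, the compressed-tensors checkpoint can be loaded straight into vLLM. A short sketch using vLLM's standard offline `LLM` API (this parallels the repo's `run_quantized_model_in_vllm.py`; that script's exact flags may differ):

```python
from vllm import LLM, SamplingParams

# load the compressed-tensors checkpoint produced by the script above
llm = LLM(model='data/llmcompressor/opt-125m-FP8-Dynamic')
outputs = llm.generate(['Hello my name is'], SamplingParams(max_tokens=20))
print(outputs[0].outputs[0].text)
```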

hf_torchao_vllm/quantize_hf_model_with_torchao.py

Lines changed: 1 addition & 1 deletion

@@ -252,7 +252,7 @@ def main(
     # Set default output directory based on model base name if not provided
     if output_dir is None:
         model_base_name = model_name.split("/")[-1]
-        output_dir = f"data/{quant_type}-{model_base_name}"
+        output_dir = f"data/torchao/{quant_type}-{model_base_name}"

     # Convert to args-like object for compatibility with the rest of the code
     args = Namespace(
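A worked example of the new default, using the values from the README command earlier in this diff:

```python
# inside main(), with output_dir=None, the default is derived as:
model_name = 'Qwen/Qwen1.5-MoE-A2.7B'
quant_type = 'nvfp4'
model_base_name = model_name.split('/')[-1]  # 'Qwen1.5-MoE-A2.7B'
output_dir = f'data/torchao/{quant_type}-{model_base_name}'
print(output_dir)  # data/torchao/nvfp4-Qwen1.5-MoE-A2.7B, matching the README
```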

hf_torchao_vllm/utils/inspect_llm_compressor_output.py

Lines changed: 0 additions & 21 deletions
This file was deleted.

hf_torchao_vllm/utils/inspect_torchao_output.py

Lines changed: 0 additions & 24 deletions
This file was deleted.

hf_torchao_vllm/utils/quantize_hf_model_with_llm_compressor.py

Lines changed: 0 additions & 40 deletions
This file was deleted.
