
Commit da4bdc1

Merge pull request #67 from vkuzo/20251003_refactor
refactor hf scripts
2 parents 3dcb958 + 376b961

9 files changed, +104 -87 lines

.gitignore

Lines changed: 1 addition & 0 deletions

@@ -1,2 +1,3 @@
 __pycache__/
 hf_torchao_vllm/data
+hf_torchao_vllm/sparse_logs

hf_torchao_vllm/README.md

Lines changed: 1 addition & 1 deletion

@@ -7,5 +7,5 @@ Example
 python quantize_hf_model_with_torchao.py --model_name "Qwen/Qwen1.5-MoE-A2.7B" --experts_only_qwen_1_5_moe_a_2_7b True --save_model_to_disk True --quant_type nvfp4

 # run the model from above in vLLM
-python run_quantized_model_in_vllm.py --model_name "data/nvfp4-Qwen1.5-MoE-A2.7B" --compile False
+python run_quantized_model_in_vllm.py --model_name "data/torchao/nvfp4-Qwen1.5-MoE-A2.7B" --compile False
 ```
hf_torchao_vllm/inspect_llm_compressor_output.py

Lines changed: 26 additions & 0 deletions

@@ -0,0 +1,26 @@
+# inspects the output of a model created with llm-compressor
+# via the `run_llm_compressor.py` script
+
+import safetensors
+import json
+import fire
+
+def run(
+    dir_name: str = 'data/llmcompressor/opt-125m-FP8-Dynamic',
+):
+    json_config_name = f'{dir_name}/config.json'
+    with open(json_config_name, 'r') as f:
+        data = json.load(f)
+    # TODO: pretty print
+    print(json.dumps(data, indent=2))
+
+    # inspect the model, saved in safetensors format
+    model_name = f'{dir_name}/model.safetensors'
+    with safetensors.safe_open(model_name, framework='pt', device='cpu') as f:
+        print(f.metadata())
+        for k in f.keys():
+            t = f.get_tensor(k)
+            print(k, t.shape, t.dtype)
+
+if __name__ == '__main__':
+    fire.Fire(run)
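The per-tensor loop above prints one line per checkpoint entry, which gets long for bigger models. A minimal companion sketch (a hypothetical `summarize_dtypes` helper, built on the same `safetensors.safe_open` API the script already uses) aggregates element counts by dtype, which makes it easy to confirm the bulk of the weights actually landed in an fp8 dtype:

```python
import collections
import safetensors

def summarize_dtypes(model_name: str):
    # tally total element counts per dtype across the checkpoint
    by_dtype = collections.Counter()
    with safetensors.safe_open(model_name, framework='pt', device='cpu') as f:
        for k in f.keys():
            t = f.get_tensor(k)
            by_dtype[str(t.dtype)] += t.numel()
    for dtype, numel in by_dtype.most_common():
        print(f'{dtype}: {numel} elements')

# e.g. summarize_dtypes('data/llmcompressor/opt-125m-FP8-Dynamic/model.safetensors')
```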
hf_torchao_vllm/inspect_torchao_output.py

Lines changed: 28 additions & 0 deletions

@@ -0,0 +1,28 @@
+# inspects the output of a model created with torchao
+# via the `quantize_hf_model_with_torchao.py` script
+
+import json
+import torch
+import torchao  # needed so torch.serialization.add_safe_globals([torchao.quantization.Float8Tensor]) can run
+import fire
+
+# not sure why I still need this
+torch.serialization.add_safe_globals([getattr])
+
+def run(dir_name: str = 'data/torchao/fp8-opt-125m'):
+    json_config_name = f'{dir_name}/config.json'
+
+    # inspect the config
+    with open(json_config_name, 'r') as f:
+        data = json.load(f)
+    # TODO: pretty print
+    print(json.dumps(data, indent=2))
+
+    # inspect the data
+    model_name = f'{dir_name}/pytorch_model.bin'
+    state_dict = torch.load(model_name, weights_only=True)
+    for k, v in state_dict.items():
+        print(k, v.shape, type(v))
+
+if __name__ == '__main__':
+    fire.Fire(run)
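For context on the safe-globals dance above: with `weights_only=True`, `torch.load` only reconstructs types on an explicit allow-list, and torchao checkpoints contain tensor subclasses rather than plain tensors. A sketch of the registration written out by hand, assuming the default checkpoint path from the script (the torchao import is expected to register its own subclasses; spelling out `Float8Tensor` here is just for clarity):

```python
import torch
import torchao

# Register the quantized tensor subclass plus `getattr`, which the pickled
# subclass state references; without these, torch.load(weights_only=True)
# fails with an UnpicklingError naming the blocked global.
torch.serialization.add_safe_globals(
    [torchao.quantization.Float8Tensor, getattr]
)
state_dict = torch.load(
    'data/torchao/fp8-opt-125m/pytorch_model.bin', weights_only=True
)
print(f'{len(state_dict)} entries loaded')
```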
hf_torchao_vllm/run_llm_compressor.py

Lines changed: 47 additions & 0 deletions

@@ -0,0 +1,47 @@
+# https://github.com/vllm-project/llm-compressor/blob/main/examples/quantization_w8a8_fp8/llama3_example.py
+
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+from llmcompressor import oneshot
+from llmcompressor.modifiers.quantization import QuantizationModifier
+from llmcompressor.utils import dispatch_for_generation
+
+import fire
+
+def run():
+
+    # MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
+    MODEL_ID = "facebook/opt-125m"
+
+    # Load model.
+    model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto")
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+
+    # Configure the quantization algorithm and scheme.
+    # In this case, we:
+    # * quantize the weights to fp8 with per-channel scales via PTQ
+    # * quantize the activations to fp8 with dynamic per-token scales
+    recipe = QuantizationModifier(
+        targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"]
+    )
+
+    # Apply quantization.
+    oneshot(model=model, recipe=recipe)
+
+    # Confirm generations of the quantized model look sane.
+    print("========== SAMPLE GENERATION ==============")
+    dispatch_for_generation(model)
+    input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to(
+        model.device
+    )
+    output = model.generate(input_ids, max_new_tokens=20)
+    print(tokenizer.decode(output[0]))
+    print("==========================================")
+
+    # Save to disk in compressed-tensors format.
+    SAVE_DIR = "data/llmcompressor/" + MODEL_ID.rstrip("/").split("/")[-1] + "-FP8-Dynamic"
+    model.save_pretrained(SAVE_DIR)
+    tokenizer.save_pretrained(SAVE_DIR)
+
+if __name__ == '__main__':
+    fire.Fire(run)
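Once saved, the compressed-tensors checkpoint can be loaded straight into vLLM. A short sketch using vLLM's standard offline `LLM` API (this parallels the repo's `run_quantized_model_in_vllm.py`; that script's exact flags may differ):

```python
from vllm import LLM, SamplingParams

# load the compressed-tensors checkpoint produced by the script above
llm = LLM(model='data/llmcompressor/opt-125m-FP8-Dynamic')
outputs = llm.generate(['Hello my name is'], SamplingParams(max_tokens=20))
print(outputs[0].outputs[0].text)
```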

hf_torchao_vllm/quantize_hf_model_with_torchao.py

Lines changed: 1 addition & 1 deletion

@@ -252,7 +252,7 @@ def main(
     # Set default output directory based on model base name if not provided
     if output_dir is None:
         model_base_name = model_name.split("/")[-1]
-        output_dir = f"data/{quant_type}-{model_base_name}"
+        output_dir = f"data/torchao/{quant_type}-{model_base_name}"

     # Convert to args-like object for compatibility with the rest of the code
     args = Namespace(
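A worked example of the new default, using the values from the README command earlier in this diff:

```python
# inside main(), with output_dir=None, the default is derived as:
model_name = 'Qwen/Qwen1.5-MoE-A2.7B'
quant_type = 'nvfp4'
model_base_name = model_name.split('/')[-1]  # 'Qwen1.5-MoE-A2.7B'
output_dir = f'data/torchao/{quant_type}-{model_base_name}'
print(output_dir)  # data/torchao/nvfp4-Qwen1.5-MoE-A2.7B, matching the README
```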

hf_torchao_vllm/utils/inspect_llm_compressor_output.py

Lines changed: 0 additions & 21 deletions
This file was deleted.

hf_torchao_vllm/utils/inspect_torchao_output.py

Lines changed: 0 additions & 24 deletions
This file was deleted.

hf_torchao_vllm/utils/quantize_hf_model_with_llm_compressor.py

Lines changed: 0 additions & 40 deletions
This file was deleted.
