Fix dtype (#517): cast rounded integer values to the target quantized dtype

dsikka · web-flow · commit 2763f81524d1 · 2025-11-10T18:14:00.000-05:00
* try float16

* update

* update
diff --git a/src/compressed_tensors/quantization/quant_args.py b/src/compressed_tensors/quantization/quant_args.py
@@ -427,7 +427,7 @@ def round_to_quantized_type_dtype(
         rounded = torch.clamp(tensor, finfo.min, finfo.max).to(dtype)
     else:
         iinfo = torch.iinfo(dtype)
-        rounded = torch.round(torch.clamp(tensor, iinfo.min, iinfo.max))
+        rounded = torch.round(torch.clamp(tensor, iinfo.min, iinfo.max)).to(dtype)
 
     if cast_to_original_dtype:
         return rounded.to(original_dtype)