Commit de70b25

[safetensors] Fix GPTQ/AWQ quantized model parameter counting (#1770)
### Fix GPTQ/AWQ quantized model parameter counting

Fixes the parameter count calculation for GPTQ/AWQ quantized models by:

- Applying an 8x multiplier to `qweight` tensors based on quantization bits (32 / 4 = 8 for 4-bit)
- Skipping auxiliary tensors (`qzeros`, `g_idx`, `scales`) in the parameter count
- Defaulting to quantized tensor detection when no exclusion list is provided

Before: [RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w4a16](https://huggingface.co/RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w4a16) reported ~2B parameters
After: correctly reports ~8B parameters
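For intuition, here is the packing arithmetic behind the 8x multiplier, using the counts asserted in the new test below (a standalone sketch, not the library code):

```ts
// 4-bit GPTQ/AWQ packs 32 / 4 = 8 weights into each 32-bit qweight element.
const bits = 4;
const multiplier = Math.floor(32 / bits); // 8

// Counts from the new spec for the w4a16 checkpoint:
const packedQweightElements = 872_415_232; // raw I32 elements on disk (6_979_321_856 / 8)
const quantizedParams = packedQweightElements * multiplier; // 6_979_321_856
const f16Params = 1_052_315_648; // tensors kept in half precision (e.g. embeddings, norms)

console.log(quantizedParams + f16Params); // 8_031_637_504 ≈ 8B
```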
1 parent 3c573a5 commit de70b25

File tree

2 files changed (+73, -2 lines):

- packages/hub/src/lib/parse-safetensors-metadata.spec.ts
- packages/hub/src/lib/parse-safetensors-metadata.ts

packages/hub/src/lib/parse-safetensors-metadata.spec.ts

Lines changed: 25 additions & 0 deletions
```diff
@@ -207,6 +207,31 @@ describe("parseSafetensorsMetadata", () => {
 		assert.strictEqual(parameterCount.E8M0, 24);
 	});
 
+	it("fetch info for GPTQ quantized 8B model", async () => {
+		const parse = await parseSafetensorsMetadata({
+			repo: "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w4a16",
+			revision: "3921b6aee65496a708b0af456c964ceca7423193",
+			computeParametersCount: true,
+		});
+
+		const parameterCount = parse.parameterCount;
+		assert.ok(parameterCount);
+		assert.ok(parameterCount.I32);
+		assert.ok(parameterCount.F16);
+		assert.strictEqual(parameterCount.I32, 6_979_321_856);
+		assert.strictEqual(parameterCount.F16, 1_052_315_648);
+
+		const parameterCountTotal =
+			parse.parameterTotal ??
+			sum(
+				Object.entries(parameterCount)
+					.filter(([, value]) => typeof value === "number")
+					.map(([, value]) => value as number)
+			);
+
+		assert.strictEqual(parameterCountTotal, 8_031_637_504);
+	});
+
 	it("fetch info for openai/gpt-oss-20b (large sharded model)", async () => {
 		const parse = await parseSafetensorsMetadata({
 			repo: "openai/gpt-oss-20b",
```

packages/hub/src/lib/parse-safetensors-metadata.ts

Lines changed: 48 additions & 2 deletions
```diff
@@ -36,6 +36,7 @@ export function parseSafetensorsShardFilename(filename: string): SafetensorsShar
 
 const PARALLEL_DOWNLOADS = 20;
 const MAX_HEADER_LENGTH = 25_000_000;
+const GPTQ_QWEIGHT_SUFFIX = "qweight";
 
 class SafetensorParseError extends Error {}
 
@@ -362,10 +363,14 @@ export interface ModelConfig {
  * Determines if a tensor is quantized based on quantization config and tensor name
  */
 function isQuantizedTensor(tensorName: string, quantConfig?: QuantizationConfig): boolean {
-	if (!quantConfig || !quantConfig.modules_to_not_convert) {
+	if (!quantConfig) {
 		return false;
 	}
 
+	if (!quantConfig.modules_to_not_convert || quantConfig.modules_to_not_convert.length === 0) {
+		return true;
+	}
+
 	for (const pattern of quantConfig.modules_to_not_convert) {
 		const regexPattern = pattern.replace(/\*/g, ".*");
 		const regex = new RegExp(regexPattern);
@@ -385,7 +390,9 @@ function getQuantizationMultiplier(tensorName: string, dtype: Dtype, quantConfig
 		return 1;
 	}
 
-	switch (quantConfig.quant_method) {
+	const quantMethod = quantConfig.quant_method?.toLowerCase();
+
+	switch (quantMethod) {
 		case "mxfp4":
 			if (dtype === "U8" && tensorName.includes("_blocks")) {
 				return 2;
@@ -394,6 +401,10 @@ function getQuantizationMultiplier(tensorName: string, dtype: Dtype, quantConfig
 
 		case "gptq":
 		case "awq":
+			if (getTensorSuffix(tensorName) === GPTQ_QWEIGHT_SUFFIX) {
+				const bits = quantConfig.bits && quantConfig.bits > 0 ? quantConfig.bits : 4;
+				return Math.max(1, Math.floor(32 / bits));
+			}
 			if (quantConfig.bits === 4 && dtype === "U8") {
 				return 2;
 			}
@@ -430,12 +441,18 @@ function computeNumOfParamsByDtypeSingleFile(
 	const tensors = omit(header, "__metadata__");
 
 	for (const [tensorName, v] of typedEntries(tensors)) {
+		if (shouldSkipTensor(tensorName, quantConfig)) {
+			continue;
+		}
 		if (v.shape.length === 0) {
 			continue;
 		}
 
 		const elements = v.shape.reduce((a, b) => a * b);
 		const multiplier = quantConfig ? getQuantizationMultiplier(tensorName, v.dtype, quantConfig) : 1;
+		if (multiplier === 0) {
+			continue;
+		}
 		counter[v.dtype] = (counter[v.dtype] ?? 0) + elements * multiplier;
 	}
 	return counter;
@@ -453,3 +470,32 @@ function computeNumOfParamsByDtypeSharded(
 	}
 	return counter;
 }
+
+function getTensorSuffix(tensorName: string): string {
+	const lastDotIndex = tensorName.lastIndexOf(".");
+	return lastDotIndex === -1 ? tensorName : tensorName.slice(lastDotIndex + 1);
+}
+
+function shouldSkipTensor(tensorName: string, quantConfig?: QuantizationConfig): boolean {
+	const GPTQ_AWQ_AUXILIARY_SUFFIXES = ["qzeros", "g_idx", "scales"];
+
+	if (!quantConfig) {
+		return false;
+	}
+
+	const quantMethod = quantConfig.quant_method?.toLowerCase();
+	if (!quantMethod || (quantMethod !== "gptq" && quantMethod !== "awq")) {
+		return false;
+	}
+
+	if (!isQuantizedTensor(tensorName, quantConfig)) {
+		return false;
+	}
+
+	const suffix = getTensorSuffix(tensorName);
+	if (suffix === GPTQ_QWEIGHT_SUFFIX) {
+		return false;
+	}
+
+	return GPTQ_AWQ_AUXILIARY_SUFFIXES.includes(suffix);
+}
```
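To see the skip logic and the qweight multiplier working together, here is a small standalone sketch with an illustrative 4-bit GPTQ layer; the tensor names and shapes are made up for the example and are not taken from the checkpoint:

```ts
// Standalone sketch mirroring the diff's counting logic; names/shapes illustrative.
const bits = 4;
const multiplier = Math.max(1, Math.floor(32 / bits)); // 8 packed weights per int32

// A hypothetical 4-bit GPTQ linear layer as stored in safetensors:
const tensors: Record<string, { dtype: string; shape: number[] }> = {
	"model.layers.0.self_attn.q_proj.qweight": { dtype: "I32", shape: [512, 4096] }, // packed weights → counted ×8
	"model.layers.0.self_attn.q_proj.qzeros": { dtype: "I32", shape: [32, 512] }, // auxiliary → skipped
	"model.layers.0.self_attn.q_proj.scales": { dtype: "F16", shape: [32, 4096] }, // auxiliary → skipped
	"model.layers.0.self_attn.q_proj.g_idx": { dtype: "I32", shape: [4096] }, // auxiliary → skipped
};

const auxiliary = ["qzeros", "g_idx", "scales"];
let count = 0;
for (const [name, t] of Object.entries(tensors)) {
	const suffix = name.slice(name.lastIndexOf(".") + 1);
	if (auxiliary.includes(suffix)) continue; // shouldSkipTensor equivalent
	const elements = t.shape.reduce((a, b) => a * b, 1);
	count += suffix === "qweight" ? elements * multiplier : elements;
}
console.log(count); // 512 * 4096 * 8 = 16_777_216 unpacked weights
```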
