
Commit f6f6e1f

[#9102][feat] AutoDeploy: Support fp8 kv cache (#9107)
Signed-off-by: Chenghao Zhang <211069071+nvchenghaoz@users.noreply.github.com>
1 parent: c6cce39


2 files changed: +2, -2 lines


tensorrt_llm/_torch/auto_deploy/custom_ops/mamba/cuda_backend_causal_conv.py

Lines changed: 1 addition & 1 deletion
@@ -284,7 +284,7 @@ def _get_conv_cache(si: SequenceInfo):
         in_channels,
         max(1, kernel_size - 1),
         device=si.device,
-        dtype=cache_config.dtype or inp_fake.dtype,
+        dtype=inp_fake.dtype,
     )

     return {"conv_state_cache": _get_conv_cache}
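For context, a minimal sketch of what this allocation presumably looks like after the change. The hunk only shows the trailing arguments, so the enclosing torch.empty call, the function signature, and the leading shape dimensions are assumptions here; si, inp_fake, cache_config, in_channels, and kernel_size are names taken from the diff itself:

import torch

def _get_conv_cache_sketch(si, inp_fake, cache_config, in_channels, kernel_size):
    # Hypothetical reconstruction of the cache allocation; only the
    # arguments shown in the hunk are taken from the source.
    return torch.empty(
        in_channels,
        max(1, kernel_size - 1),  # CUDA backend keeps kernel_size - 1 conv states
        device=si.device,
        # Before this commit: dtype=cache_config.dtype or inp_fake.dtype.
        # With an fp8 kv-cache dtype configured, that expression would have
        # forced the conv state cache to fp8 as well; after the change the
        # conv state always follows the activation (fake input) dtype.
        dtype=inp_fake.dtype,
    )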

tensorrt_llm/_torch/auto_deploy/custom_ops/mamba/torch_backend_causal_conv.py

Lines changed: 1 addition & 1 deletion
@@ -342,7 +342,7 @@ def _get_conv_cache(si: SequenceInfo):
         in_channels,
         kernel_size,
         device=si.device,
-        dtype=cache_config.dtype or inp_fake.dtype,
+        dtype=inp_fake.dtype,
     )

     return {"conv_state_cache": _get_conv_cache}
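Taken together, the two hunks decouple the Mamba causal-conv state cache dtype from cache_config.dtype in both the CUDA and Torch backends. Judging by the commit title, the intent appears to be that cache_config.dtype can now carry an fp8 setting for the attention kv cache without also quantizing the causal-conv state cache, which stays in the activation dtype.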
