[None][fix] Remove unnecessary attention workspace memory check (#9064)

jiaganc · web-flow · commit 1a5672269727 · 2025-11-12T11:18:50.000+08:00
Signed-off-by: Jiagan Cheng <jiaganc@nvidia.com>
diff --git a/cpp/tensorrt_llm/thop/attentionOp.cpp b/cpp/tensorrt_llm/thop/attentionOp.cpp
@@ -816,16 +816,6 @@ void attention(torch::Tensor q, std::optional<torch::Tensor> k, std::optional<to
         = runner->getWorkspaceSize(*op, num_tokens, max_attention_window_size, num_gen_tokens, max_blocks_per_sequence);
     TLLM_LOG_TRACE("Expected workspace size is %ld bytes", workspace_size);
 
-    if (workspace_size >= (16l << 30))
-    {
-        auto const [free_mem, total_mem] = tensorrt_llm::common::getDeviceMemoryInfo(false);
-        if (workspace_size >= static_cast<int64_t const>(free_mem))
-        {
-            throw std::runtime_error("attention workspace size " + std::to_string(workspace_size)
-                + " bytes, exceeds available CUDA memory " + std::to_string(free_mem) + " bytes");
-        }
-    }
-
     torch::Tensor workspace;
     if (workspace_.has_value())
     {
@@ -839,6 +829,7 @@ void attention(torch::Tensor q, std::optional<torch::Tensor> k, std::optional<to
     }
     else
     {
+        TLLM_LOG_TRACE("Allocate new attention workspace with size %ld bytes", workspace_size);
         workspace = torch::empty({workspace_size}, torch::dtype(torch::kByte).device(qkv_or_q.device()));
     }
 

Original file line number	Diff line number	Diff line change
`@@ -816,16 +816,6 @@ void attention(torch::Tensor q, std::optional<torch::Tensor> k, std::optional<to`
`816`	`816`	`= runner->getWorkspaceSize(*op, num_tokens, max_attention_window_size, num_gen_tokens, max_blocks_per_sequence);`
`817`	`817`	`TLLM_LOG_TRACE("Expected workspace size is %ld bytes", workspace_size);`
`818`	`818`
`819`		`- if (workspace_size >= (16l << 30))`
`820`		`- {`
`821`		`- auto const [free_mem, total_mem] = tensorrt_llm::common::getDeviceMemoryInfo(false);`
`822`		`- if (workspace_size >= static_cast<int64_t const>(free_mem))`
`823`		`- {`
`824`		`- throw std::runtime_error("attention workspace size " + std::to_string(workspace_size)`
`825`		`- + " bytes, exceeds available CUDA memory " + std::to_string(free_mem) + " bytes");`
`826`		`- }`
`827`		`- }`
`828`		`-`
`829`	`819`	`torch::Tensor workspace;`
`830`	`820`	`if (workspace_.has_value())`
`831`	`821`	`{`
`@@ -839,6 +829,7 @@ void attention(torch::Tensor q, std::optional<torch::Tensor> k, std::optional<to`
`839`	`829`	`}`
`840`	`830`	`else`
`841`	`831`	`{`
	`832`	`+ TLLM_LOG_TRACE("Allocate new attention workspace with size %ld bytes", workspace_size);`
`842`	`833`	`workspace = torch::empty({workspace_size}, torch::dtype(torch::kByte).device(qkv_or_q.device()));`
`843`	`834`	`}`
`844`	`835`