diff --git a/cpp/tensorrt_llm/thop/attentionOp.cpp b/cpp/tensorrt_llm/thop/attentionOp.cpp
index 3dfb0ba0745..6365d7f6123 100644
--- a/cpp/tensorrt_llm/thop/attentionOp.cpp
+++ b/cpp/tensorrt_llm/thop/attentionOp.cpp
@@ -816,16 +816,6 @@ void attention(torch::Tensor q, std::optional<torch::Tensor> k, std::optional<to
         getWorkspaceSize(*op, num_tokens, max_attention_window_size, num_gen_tokens, max_blocks_per_sequence);
     TLLM_LOG_TRACE("Expected workspace size is %ld bytes", workspace_size);
-    if (workspace_size >= (16l << 30))
-    {
-        auto const [free_mem, total_mem] = tensorrt_llm::common::getDeviceMemoryInfo(false);
-        if (workspace_size >= static_cast<int64_t>(free_mem))
-        {
-            throw std::runtime_error("attention workspace size " + std::to_string(workspace_size)
-                + " bytes, exceeds available CUDA memory " + std::to_string(free_mem) + " bytes");
-        }
-    }
-
     torch::Tensor workspace;
     if (workspace_.has_value())
     {
@@ -839,6 +829,7 @@ void attention(torch::Tensor q, std::optional<torch::Tensor> k, std::optional