4 changes: 3 additions & 1 deletion tests/kernels/attention/test_flashinfer_mla_decode.py
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import os
 import pytest
 import torch
 import torch.nn.functional as F
@@ -8,7 +9,8 @@
 
 from vllm.platforms import current_platform
 
-FLASHINFER_WORKSPACE_BUFFER_SIZE = 128 * 1024 * 1024
+FLASHINFER_WORKSPACE_BUFFER_SIZE = int(
Collaborator:
Please move this to vllm/envs.py

Collaborator:
+1

+    os.environ.get("VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE", 128 * 1024 * 1024))
 
 if not current_platform.has_device_capability(100):
     pytest.skip(
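
Following the reviewers' suggestion above, here is a minimal sketch of what a `vllm/envs.py` entry could look like. The lambda-in-dict pattern mirrors how `vllm/envs.py` already defines its variables; the helper name `_maybe_int` and the None-when-unset convention (so each backend keeps its own default) are assumptions, not part of this PR:

```python
# Sketch for vllm/envs.py, assuming the existing lambda-per-variable pattern.
# _maybe_int and the None-when-unset behavior are hypothetical: returning
# None lets each backend fall back to its own default (128/256/394 MiB).
import os
from typing import Any, Callable, Optional


def _maybe_int(name: str) -> Optional[int]:
    """Return the env var parsed as an int, or None when it is unset."""
    value = os.getenv(name)
    return int(value) if value is not None else None


environment_variables: dict[str, Callable[[], Any]] = {
    "VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE":
    lambda: _maybe_int("VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE"),
}
```

A backend would then read something like `envs.VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE or 256 * 1024 * 1024` instead of calling `os.environ.get` directly at import time.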
4 changes: 3 additions & 1 deletion vllm/v1/attention/backends/flashinfer.py
@@ -3,6 +3,7 @@
"""Attention layer with FlashInfer."""
from __future__ import annotations

import os
from dataclasses import dataclass
from typing import ClassVar, Optional, Union

@@ -41,7 +42,8 @@
 # yapf: enable
 from vllm.v1.kv_cache_interface import AttentionSpec
 
-FLASHINFER_WORKSPACE_BUFFER_SIZE = 256 * 1024 * 1024
+FLASHINFER_WORKSPACE_BUFFER_SIZE = int(
+    os.environ.get("VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE", 256 * 1024 * 1024))
 
 FP8_DTYPE = current_platform.fp8_dtype()
 FP4_DTYPE = torch.uint8
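
One thing worth noting about the approach in this diff: the constant is evaluated once at module import time, so the override must be in place before the backend is first imported. A usage sketch (the 512 MiB value is an arbitrary example):

```python
# Usage sketch: set the override before importing the backend, because
# FLASHINFER_WORKSPACE_BUFFER_SIZE is computed when the module is imported.
import os

os.environ["VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE"] = str(512 * 1024 * 1024)

from vllm.v1.attention.backends import flashinfer  # noqa: E402

assert flashinfer.FLASHINFER_WORKSPACE_BUFFER_SIZE == 512 * 1024 * 1024
```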
4 changes: 3 additions & 1 deletion vllm/v1/attention/backends/mla/common.py
@@ -188,6 +188,7 @@
"""

import functools
import os
from abc import abstractmethod
from dataclasses import dataclass, field
from typing import ClassVar, Generic, Optional, TypeVar, Union
@@ -426,7 +427,8 @@ def use_cudnn_prefill() -> bool:
 # Currently 394MB, this can be tuned based on GEMM sizes used.
 # Chosen to be the same as sglang:
 # https://github.com/sgl-project/sglang/blob/766392c6bda2558b61ce6d1c1bfd8081a549e1f1/python/sglang/global_config.py#L37
-FLASHINFER_WORKSPACE_BUFFER_SIZE = 394 * 1024 * 1024
+FLASHINFER_WORKSPACE_BUFFER_SIZE = int(
+    os.environ.get("VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE", 394 * 1024 * 1024))
 
 
 class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]):
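
For context on the tuning comment above: FlashInfer wrappers take a caller-allocated workspace tensor, which is where this byte count ends up. A hedged sketch of the typical allocation, using FlashInfer's documented decode wrapper as an illustration (vLLM's actual call site, and the MLA-specific wrapper it uses, may differ):

```python
# Sketch: handing a workspace buffer of this size to FlashInfer.
# BatchDecodeWithPagedKVCacheWrapper and the "NHD" layout argument follow
# FlashInfer's public API; the constant is restated here for self-containment.
import torch
import flashinfer

FLASHINFER_WORKSPACE_BUFFER_SIZE = 394 * 1024 * 1024  # as in mla/common.py

workspace_buffer = torch.empty(
    FLASHINFER_WORKSPACE_BUFFER_SIZE,  # size in bytes, held as uint8
    dtype=torch.uint8,
    device="cuda",
)
decode_wrapper = flashinfer.BatchDecodeWithPagedKVCacheWrapper(
    workspace_buffer, "NHD")
```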