diff --git a/.github/workflows/nightly-release.yml b/.github/workflows/nightly-release.yml index 4d5acdfe63..2e7230cfa5 100644 --- a/.github/workflows/nightly-release.yml +++ b/.github/workflows/nightly-release.yml @@ -145,7 +145,7 @@ jobs: - name: Build wheel in container env: DOCKER_IMAGE: ${{ matrix.arch == 'aarch64' && format('pytorch/manylinuxaarch64-builder:cuda{0}', matrix.cuda) || format('pytorch/manylinux2_28-builder:cuda{0}', matrix.cuda) }} - FLASHINFER_CUDA_ARCH_LIST: ${{ matrix.cuda == '12.8' && '7.5 8.0 8.9 9.0a 10.0a 12.0a' || '7.5 8.0 8.9 9.0a 10.0a 10.3a 12.0a' }} + FLASHINFER_CUDA_ARCH_LIST: ${{ matrix.cuda < '13.0' && '7.5 8.0 8.9 9.0a 10.0a 12.0a' || '7.5 8.0 8.9 9.0a 10.0a 10.3a 11.0f 12.0f' }} FLASHINFER_DEV_RELEASE_SUFFIX: ${{ needs.setup.outputs.dev_suffix }} run: | # Extract CUDA major and minor versions diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 7e406ff2ac..0c95611c50 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -182,7 +182,7 @@ jobs: - name: Build wheel in container env: DOCKER_IMAGE: ${{ matrix.arch == 'aarch64' && format('pytorch/manylinuxaarch64-builder:cuda{0}', matrix.cuda) || format('pytorch/manylinux2_28-builder:cuda{0}', matrix.cuda) }} - FLASHINFER_CUDA_ARCH_LIST: ${{ matrix.cuda == '12.8' && '7.5 8.0 8.9 9.0a 10.0a 12.0a' || '7.5 8.0 8.9 9.0a 10.0a 10.3a 12.0a' }} + FLASHINFER_CUDA_ARCH_LIST: ${{ matrix.cuda < '13.0' && '7.5 8.0 8.9 9.0a 10.0a 12.0a' || '7.5 8.0 8.9 9.0a 10.0a 10.3a 11.0f 12.0f' }} run: | # Extract CUDA major and minor versions CUDA_MAJOR=$(echo "${{ matrix.cuda }}" | cut -d'.' -f1) diff --git a/README.md b/README.md index 8f93c97f7a..88b579b180 100644 --- a/README.md +++ b/README.md @@ -90,7 +90,7 @@ python -m pip install dist/*.whl `flashinfer-jit-cache` (customize `FLASHINFER_CUDA_ARCH_LIST` for your target GPUs): ```bash -export FLASHINFER_CUDA_ARCH_LIST="7.5 8.0 8.9 10.0a 10.3a 12.0a" +export FLASHINFER_CUDA_ARCH_LIST="7.5 8.0 8.9 10.0a 10.3a 11.0f 12.0f" cd flashinfer-jit-cache python -m build --no-isolation --wheel python -m pip install dist/*.whl diff --git a/csrc/xqa/mha.cu b/csrc/xqa/mha.cu index 9359eb5d12..016a4f982a 100644 --- a/csrc/xqa/mha.cu +++ b/csrc/xqa/mha.cu @@ -93,7 +93,7 @@ __constant__ constexpr uint32_t cacheVTileSeqLen = 32; constexpr uint32_t preferedKHeadPartBytes = 64; __constant__ constexpr uint32_t cacheVTileSeqLen = 32; #elif __CUDA_ARCH__ == 800 || __CUDA_ARCH__ == 870 || __CUDA_ARCH__ == 900 || \ - __CUDA_ARCH__ == 1000 || __CUDA_ARCH__ == 1030 + __CUDA_ARCH__ == 1000 || __CUDA_ARCH__ == 1030 || __CUDA_ARCH__ == 1100 constexpr uint32_t preferedKHeadPartBytes = 128; __constant__ constexpr uint32_t cacheVTileSeqLen = 64; #else diff --git a/csrc/xqa/utils.cuh b/csrc/xqa/utils.cuh index f96d83f5f5..6302d4e20b 100644 --- a/csrc/xqa/utils.cuh +++ b/csrc/xqa/utils.cuh @@ -46,7 +46,8 @@ __constant__ constexpr float kE4M3_MAX = 448.F; constexpr uint32_t kMAX_SMEM_SIZE = (99u << 10); #elif __CUDA_ARCH__ == 800 || __CUDA_ARCH__ == 870 constexpr uint32_t kMAX_SMEM_SIZE = (163u << 10); -#elif __CUDA_ARCH__ == 900 || __CUDA_ARCH__ == 1000 || __CUDA_ARCH__ == 1030 +#elif __CUDA_ARCH__ == 900 || __CUDA_ARCH__ == 1000 || __CUDA_ARCH__ == 1030 || \ + __CUDA_ARCH__ == 1100 constexpr uint32_t kMAX_SMEM_SIZE = (227u << 10); #endif #endif diff --git a/docs/installation.rst b/docs/installation.rst index 4f628f7094..9087e87471 100644 --- a/docs/installation.rst +++ b/docs/installation.rst @@ -92,7 +92,7 @@ You can follow the steps below to install FlashInfer from source code: .. code-block:: bash - export FLASHINFER_CUDA_ARCH_LIST="7.5 8.0 8.9 10.0a 10.3a 12.0a" + export FLASHINFER_CUDA_ARCH_LIST="7.5 8.0 8.9 10.0a 10.3a 11.0f 12.0f" cd flashinfer-jit-cache python -m build --no-isolation --wheel python -m pip install dist/*.whl diff --git a/scripts/task_test_jit_cache_package_build_import.sh b/scripts/task_test_jit_cache_package_build_import.sh index e2e4a824aa..d03937bc47 100755 --- a/scripts/task_test_jit_cache_package_build_import.sh +++ b/scripts/task_test_jit_cache_package_build_import.sh @@ -43,7 +43,16 @@ arches = ["7.5", "8.0", "8.9", "9.0a"] if cuda_ver is not None: try: major, minor = map(int, cuda_ver.split(".")[:2]) - if (major, minor) >= (12, 8): + if (major, minor) >= (13, 0): + arches.append("10.0a") + arches.append("10.3a") + arches.append("11.0f") + arches.append("12.0f") + elif (major, minor) >= (12, 9): + arches.append("10.0a") + arches.append("10.3a") + arches.append("12.0f") + elif (major, minor) >= (12, 8): arches.append("10.0a") arches.append("12.0a") except Exception: