Commit 72320b9

Author: yefubao
Message: update CUDA_VERSION to 12.8.1
Parent: bc57c76

File tree: 2 files changed, +18 -14 lines


docker/Dockerfile.cuda

Lines changed: 4 additions & 4 deletions
@@ -1,4 +1,4 @@
-ARG CUDA_VERSION=12.4.1
+ARG CUDA_VERSION=12.8.1
 FROM nvcr.io/nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04

 ENV DEBIAN_FRONTEND noninteractive
@@ -110,14 +110,14 @@ RUN cd ${STAGE_DIR} && \
 ##############################################################################
 # Install cuDNN (NVIDIA CUDA Deep Neural Network library)
 ##############################################################################
-ARG CUDNN_VERSION=9.5.0
+ARG CUDNN_VERSION=9.7.1
 RUN cd ${STAGE_DIR} && \
 CUDNN_MAJOR=$(echo "${CUDNN_VERSION}" | cut -d. -f1) && \
 CUDNN_MINOR=$(echo "${CUDNN_VERSION}" | cut -d. -f2) && \
 CUDA_MAJOR=$(echo "${CUDA_VERSION}" | cut -d. -f1) && \
 CUDA_MINOR=$(echo "${CUDA_VERSION}" | cut -d. -f2) && \
-wget https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/cudnn-linux-x86_64-${CUDNN_VERSION}.50_cuda${CUDA_MAJOR}-archive.tar.xz && \
-tar -xJvf cudnn-linux-x86_64-${CUDNN_VERSION}.50_cuda12-archive.tar.xz && \
+wget https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/cudnn-linux-x86_64-${CUDNN_VERSION}.26_cuda${CUDA_MAJOR}-archive.tar.xz && \
+tar -xJvf cudnn-linux-x86_64-${CUDNN_VERSION}.26_cuda12-archive.tar.xz && \
 mkdir -p /usr/local/cuda-${CUDA_MAJOR}.${CUDA_MINOR}/include && \
 mkdir -p /usr/local/cuda-${CUDA_MAJOR}.${CUDA_MINOR}/lib64 && \
 cp cudnn-*-archive/include/cudnn*.h /usr/local/cuda-${CUDA_MAJOR}.${CUDA_MINOR}/include/ && \
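
Note: the archive filename encodes a four-part cuDNN build number. CUDNN_VERSION supplies the first three parts, while the build suffix (".50" before this commit, ".26" after it) is hard-coded in both the wget and tar lines, so the two have to be bumped together. A minimal sketch (not part of the commit) of how the name resolves with the new values:

    # Sketch only: resolve the cuDNN archive name from the values in this commit.
    CUDA_VERSION=12.8.1
    CUDNN_VERSION=9.7.1
    CUDA_MAJOR=$(echo "${CUDA_VERSION}" | cut -d. -f1)    # -> 12
    echo "cudnn-linux-x86_64-${CUDNN_VERSION}.26_cuda${CUDA_MAJOR}-archive.tar.xz"
    # -> cudnn-linux-x86_64-9.7.1.26_cuda12-archive.tar.xz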

install/install-requirements.sh

Lines changed: 14 additions & 10 deletions
@@ -77,19 +77,23 @@ cd ..
 rm -r ./TransformerEngine

 # cudnn frontend
-pip install nvidia-cudnn-cu12==9.5.0.50
+pip install nvidia-cudnn-cu12==9.7.1.26
 CMAKE_ARGS="-DCMAKE_POLICY_VERSION_MINIMUM=3.5" pip install nvidia-cudnn-frontend
 python -c "import torch; print('cuDNN version:', torch.backends.cudnn.version());"
 python -c "from transformer_engine.pytorch.utils import get_cudnn_version; get_cudnn_version()"

-# Megatron-LM requires flash-attn >= 2.1.1, <= 2.7.3
-cu=$(nvcc --version | grep "Cuda compilation tools" | awk '{print $5}' | cut -d '.' -f 1)
-torch=$(pip show torch | grep Version | awk '{print $2}' | cut -d '+' -f 1 | cut -d '.' -f 1,2)
-cp=$(python3 --version | awk '{print $2}' | awk -F. '{print $1$2}')
-cxx=$(g++ --version | grep 'g++' | awk '{print $3}' | cut -d '.' -f 1)
-wget https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.3/flash_attn-2.7.3+cu${cu}torch${torch}cxx${cxx}abiFALSE-cp${cp}-cp${cp}-linux_x86_64.whl
-pip install flash_attn-2.7.3+cu${cu}torch${torch}cxx${cxx}abiFALSE-cp${cp}-cp${cp}-linux_x86_64.whl
-rm flash_attn-2.7.3+cu${cu}torch${torch}cxx${cxx}abiFALSE-cp${cp}-cp${cp}-linux_x86_64.whl
+# # Megatron-LM requires flash-attn >= 2.1.1, <= 2.7.3
+# cu=$(nvcc --version | grep "Cuda compilation tools" | awk '{print $5}' | cut -d '.' -f 1)
+# torch=$(pip show torch | grep Version | awk '{print $2}' | cut -d '+' -f 1 | cut -d '.' -f 1,2)
+# cp=$(python3 --version | awk '{print $2}' | awk -F. '{print $1$2}')
+# cxx=$(g++ --version | grep 'g++' | awk '{print $3}' | cut -d '.' -f 1)
+# wget https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.3/flash_attn-2.7.3+cu${cu}torch${torch}cxx${cxx}abiFALSE-cp${cp}-cp${cp}-linux_x86_64.whl
+# pip install flash_attn-2.7.3+cu${cu}torch${torch}cxx${cxx}abiFALSE-cp${cp}-cp${cp}-linux_x86_64.whl
+# rm flash_attn-2.7.3+cu${cu}torch${torch}cxx${cxx}abiFALSE-cp${cp}-cp${cp}-linux_x86_64.whl
+wget https://github.com/Dao-AILab/flash-attention/releases/download/v2.8.0.post2/flash_attn-2.8.0.post2+cu12torch2.7cxx11abiFALSE-cp312-cp312-linux_x86_64.whl
+pip install flash_attn-2.8.0.post2+cu12torch2.7cxx11abiFALSE-cp312-cp312-linux_x86_64.whl
+rm flash_attn-2.8.0.post2+cu12torch2.7cxx11abiFALSE-cp312-cp312-linux_x86_64.whl
+

 # From Megatron-LM log
 pip install "git+https://github.com/Dao-AILab/flash-attention.git@v2.7.2#egg=flashattn-hopper&subdirectory=hopper"
@@ -153,7 +157,7 @@ if [ "${env}" == "train" ]; then
 fi

 # Replace the following code with torch version 2.6.0
-if [[ $torch_version == *"2.6.0"* ]];then
+if [[ $torch_version == *"2.6.0"* ]] || [[ $torch_version == *"2.7.0"* ]];then
 # Check and replace line 908
 LINE_908=$(sed -n '908p' "$FILE")
 EXPECTED_908=' if num_nodes_waiting > 0:'
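
Note: the guard around the line-908 patch now also matches torch 2.7.0 builds. A minimal sketch (not part of the commit) of how the widened glob match behaves; the version string below is only an example of what pip might report:

    # Sketch only: the same [[ ... == *"x.y.z"* ]] glob test with an example value.
    torch_version="2.7.0+cu128"
    if [[ $torch_version == *"2.6.0"* ]] || [[ $torch_version == *"2.7.0"* ]]; then
        echo "patch will be applied for torch ${torch_version}"
    fi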
