@@ -38,14 +38,6 @@ parameters:
3838 type : number
3939 default : 0
4040
41- resources :
42- repositories :
43- - repository : LLaMa2Onnx
44- type : Github
45- endpoint : Microsoft
46- name : Microsoft/Llama-2-Onnx
47- ref : main
48-
4941variables :
5042 - template : templates/common-variables.yml
5143 - name : docker_base_image
@@ -287,11 +279,12 @@ stages:
287279 workingDirectory: $(Build.SourcesDirectory)
288280 condition: ne(variables.hitAnother, 'True')
289281
290- - stage : Llama2_ONNX_FP16
282+ - stage : Llama2_7B_ONNX
291283 dependsOn :
292284 - Build_Onnxruntime_Cuda
293285 jobs :
294- - job : Llama2_ONNX_FP16
286+ - job : Llama2_7B_ONNX
287+ timeoutInMinutes : 120
295288 variables :
296289 skipComponentGovernanceDetection : true
297290 workspace :
@@ -319,15 +312,15 @@ stages:
319312
320313 - template : templates/get-docker-image-steps.yml
321314 parameters :
322- Dockerfile : tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0
315+ Dockerfile : tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0_torch
323316 Context : tools/ci_build/github/linux/docker/
324317 ScriptName : tools/ci_build/get_docker_image.py
325318 DockerBuildArgs : "
326319 --build-arg BUILD_UID=$( id -u )
327320 --build-arg BASEIMAGE=${{ variables.docker_base_image }}
328321 --build-arg TRT_VERSION=${{ variables.linux_trt_version }}
329322 "
330- Repository : onnxruntimeubi8packagestest
323+ Repository : onnxruntimeubi8packagestest_torch
331324 UpdateDepsTxt : false
332325
333326 - task : DownloadPackage@1
@@ -343,7 +336,7 @@ stages:
343336 docker run --rm --gpus all -v $(Build.SourcesDirectory):/workspace \
344337 -v $(Build.BinariesDirectory)/ort-artifact/:/ort-artifact \
345338 -v $(Agent.TempDirectory)/meta_llama2_7b_hf:/meta-llama2 \
346- onnxruntimeubi8packagestest \
339+ onnxruntimeubi8packagestest_torch \
347340 bash -c "
348341 set -ex; \
349342 pushd /workspace/onnxruntime/python/tools/transformers/ ; \
@@ -352,14 +345,56 @@ stages:
352345 python3 -m pip install -r requirements.txt ; \
353346 popd ; \
354347 python3 -m pip install /ort-artifact/*.whl ; \
355- python3 -m pip uninstall -y torch ; \
356- python3 -m pip install torch --index-url https://download.pytorch.org/whl/cu118 ; \
357- python3 -m models.llama.convert_to_onnx -m meta-llama/Llama-2-7b-hf --output llama2-7b-fp16 --precision fp16 --execution_provider cuda --input /meta-llama2 --small_gpu ; \
348+ python3 -m models.llama.convert_to_onnx -m meta-llama/Llama-2-7b-hf --input /meta-llama2 --output llama2-7b-fp16 --precision fp16 --execution_provider cuda --small_gpu ; \
349+ ls -l llama2-7b-fp16 ; \
350+ du -sh llama2-7b-fp16; \
358351 popd ; \
359352 "
360353 displayName: 'Run Llama2 to Onnx F16 and parity Test'
361354 workingDirectory: $(Build.SourcesDirectory)
362355
356+ - script : |
357+ docker run --rm --gpus all -v $(Build.SourcesDirectory):/workspace \
358+ -v $(Build.BinariesDirectory)/ort-artifact/:/ort-artifact \
359+ -v $(Agent.TempDirectory)/meta_llama2_7b_hf:/meta-llama2 \
360+ onnxruntimeubi8packagestest_torch \
361+ bash -c "
362+ set -ex; \
363+ pushd /workspace/onnxruntime/python/tools/transformers/ ; \
364+ python3 -m pip install --upgrade pip ; \
365+ pushd models/llama ; \
366+ python3 -m pip install -r requirements.txt ; \
367+ popd ; \
368+ python3 -m pip install /ort-artifact/*.whl ; \
369+ python3 -m models.llama.convert_to_onnx -m meta-llama/Llama-2-7b-hf --input /meta-llama2 --output llama2-7b-fp32-gpu --precision fp32 --execution_provider cuda ; \
370+ ls -l llama2-7b-fp32-gpu; \
371+ du -sh llama2-7b-fp32-gpu; \
372+ popd ; \
373+ "
374+ displayName: 'Run Llama2 to Onnx fp32 and parity Test'
375+ workingDirectory: $(Build.SourcesDirectory)
376+
377+ - script : |
378+ docker run --rm --gpus all -v $(Build.SourcesDirectory):/workspace \
379+ -v $(Build.BinariesDirectory)/ort-artifact/:/ort-artifact \
380+ -v $(Agent.TempDirectory)/meta_llama2_7b_hf:/meta-llama2 \
381+ onnxruntimeubi8packagestest_torch \
382+ bash -c "
383+ set -ex; \
384+ pushd /workspace/onnxruntime/python/tools/transformers/ ; \
385+ python3 -m pip install --upgrade pip ; \
386+ pushd models/llama ; \
387+ python3 -m pip install -r requirements.txt ; \
388+ popd ; \
389+ python3 -m pip install /ort-artifact/*.whl ; \
390+ python3 -m models.llama.convert_to_onnx -m meta-llama/Llama-2-7b-hf --input /meta-llama2 --output llama2-7b-int4-gpu --precision int4 --execution_provider cuda --use_gqa ; \
391+ ls -l llama2-7b-int4-gpu; \
392+ du -sh llama2-7b-int4-gpu; \
393+ popd ; \
394+ "
395+ displayName: 'Run Llama2 to Onnx INT4 and parity Test'
396+ workingDirectory: $(Build.SourcesDirectory)
397+
363398 - stage : Whisper_ONNX
364399 dependsOn :
365400 - Build_Onnxruntime_Cuda
0 commit comments