@@ -38,14 +38,6 @@ parameters:
3838 type : number
3939 default : 0
4040
41- resources :
42- repositories :
43- - repository : LLaMa2Onnx
44- type : Github
45- endpoint : Microsoft
46- name : Microsoft/Llama-2-Onnx
47- ref : main
48-
4941variables :
5042 - template : templates/common-variables.yml
5143 - name : docker_base_image
@@ -287,11 +279,12 @@ stages:
287279 workingDirectory: $(Build.SourcesDirectory)
288280 condition: ne(variables.hitAnother, 'True')
289281
290- - stage : Llama2_ONNX_FP16
282+ - stage : Llama2_7B_ONNX
291283 dependsOn :
292284 - Build_Onnxruntime_Cuda
293285 jobs :
294- - job : Llama2_ONNX_FP16
286+ - job : Llama2_7B_ONNX
287+ timeoutInMinutes : 120
295288 variables :
296289 skipComponentGovernanceDetection : true
297290 workspace :
@@ -319,15 +312,15 @@ stages:
319312
320313 - template : templates/get-docker-image-steps.yml
321314 parameters :
322- Dockerfile : tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0
315+ Dockerfile : tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0_torch
323316 Context : tools/ci_build/github/linux/docker/
324317 ScriptName : tools/ci_build/get_docker_image.py
325318 DockerBuildArgs : "
326319 --build-arg BUILD_UID=$( id -u )
327320 --build-arg BASEIMAGE=${{ variables.docker_base_image }}
328321 --build-arg TRT_VERSION=${{ variables.linux_trt_version }}
329322 "
330- Repository : onnxruntimeubi8packagestest
323+ Repository : onnxruntimeubi8packagestest_torch
331324 UpdateDepsTxt : false
332325
333326 - task : DownloadPackage@1
@@ -343,7 +336,7 @@ stages:
343336 docker run --rm --gpus all -v $(Build.SourcesDirectory):/workspace \
344337 -v $(Build.BinariesDirectory)/ort-artifact/:/ort-artifact \
345338 -v $(Agent.TempDirectory)/meta_llama2_7b_hf:/meta-llama2 \
346- onnxruntimeubi8packagestest \
339+ onnxruntimeubi8packagestest_torch \
347340 bash -c "
348341 set -ex; \
349342 pushd /workspace/onnxruntime/python/tools/transformers/ ; \
@@ -352,14 +345,56 @@ stages:
352345 python3 -m pip install -r requirements.txt ; \
353346 popd ; \
354347 python3 -m pip install /ort-artifact/*.whl ; \
355- python3 -m pip uninstall -y torch ; \
356- python3 -m pip install torch --index-url https://download.pytorch.org/whl/cu118 ; \
357- python3 -m models.llama.convert_to_onnx -m meta-llama/Llama-2-7b-hf --output llama2-7b-fp16 --precision fp16 --execution_provider cuda --input /meta-llama2 --small_gpu ; \
348+ python3 -m models.llama.convert_to_onnx -m meta-llama/Llama-2-7b-hf --input /meta-llama2 --output llama2-7b-fp16 --precision fp16 --execution_provider cuda --small_gpu ; \
349+ ls -l llama2-7b-fp16 ; \
350+ du -sh llama2-7b-fp16; \
358351 popd ; \
359352 "
360353 displayName: 'Run Llama2 to Onnx F16 and parity Test'
361354 workingDirectory: $(Build.SourcesDirectory)
362355
356+ - script : |
357+ docker run --rm --gpus all -v $(Build.SourcesDirectory):/workspace \
358+ -v $(Build.BinariesDirectory)/ort-artifact/:/ort-artifact \
359+ -v $(Agent.TempDirectory)/meta_llama2_7b_hf:/meta-llama2 \
360+ onnxruntimeubi8packagestest_torch \
361+ bash -c "
362+ set -ex; \
363+ pushd /workspace/onnxruntime/python/tools/transformers/ ; \
364+ python3 -m pip install --upgrade pip ; \
365+ pushd models/llama ; \
366+ python3 -m pip install -r requirements.txt ; \
367+ popd ; \
368+ python3 -m pip install /ort-artifact/*.whl ; \
369+ python3 -m models.llama.convert_to_onnx -m meta-llama/Llama-2-7b-hf --input /meta-llama2 --output llama2-7b-fp32-gpu --precision fp32 --execution_provider cuda ; \
370+ ls -l llama2-7b-fp32-gpu; \
371+ du -sh llama2-7b-fp32-gpu; \
372+ popd ; \
373+ "
374+ displayName: 'Run Llama2 to Onnx fp32 and parity Test'
375+ workingDirectory: $(Build.SourcesDirectory)
376+
377+ - script : |
378+ docker run --rm --gpus all -v $(Build.SourcesDirectory):/workspace \
379+ -v $(Build.BinariesDirectory)/ort-artifact/:/ort-artifact \
380+ -v $(Agent.TempDirectory)/meta_llama2_7b_hf:/meta-llama2 \
381+ onnxruntimeubi8packagestest_torch \
382+ bash -c "
383+ set -ex; \
384+ pushd /workspace/onnxruntime/python/tools/transformers/ ; \
385+ python3 -m pip install --upgrade pip ; \
386+ pushd models/llama ; \
387+ python3 -m pip install -r requirements.txt ; \
388+ popd ; \
389+ python3 -m pip install /ort-artifact/*.whl ; \
390+ python3 -m models.llama.convert_to_onnx -m meta-llama/Llama-2-7b-hf --input /meta-llama2 --output llama2-7b-int4-gpu --precision int4 --execution_provider cuda --use_gqa ; \
391+ ls -l llama2-7b-int4-gpu; \
392+ du -sh llama2-7b-int4-gpu; \
393+ popd ; \
394+ "
395+ displayName: 'Run Llama2 to Onnx INT4 and parity Test'
396+ workingDirectory: $(Build.SourcesDirectory)
397+
363398 - stage : Whisper_ONNX
364399 dependsOn :
365400 - Build_Onnxruntime_Cuda
0 commit comments