Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
59 commits
Select commit Hold shift + click to select a range
7b55ce5
first commit
MarcelWilnicki Sep 25, 2025
1ede310
wip
MarcelWilnicki Sep 25, 2025
a31e5e1
wip
MarcelWilnicki Sep 25, 2025
6d5003d
wip
MarcelWilnicki Sep 26, 2025
776e70d
wip
MarcelWilnicki Sep 26, 2025
b3c4a0c
wip
MarcelWilnicki Sep 26, 2025
f8c8b06
wip
MarcelWilnicki Sep 26, 2025
df07b65
wip
MarcelWilnicki Sep 26, 2025
6626f91
wip
MarcelWilnicki Sep 26, 2025
01cb4de
wip
MarcelWilnicki Sep 30, 2025
bc405d6
wip
MarcelWilnicki Sep 30, 2025
ddf25ed
wip
MarcelWilnicki Oct 1, 2025
c4e81b0
wip
MarcelWilnicki Oct 1, 2025
c7764d4
wip
MarcelWilnicki Oct 2, 2025
146e3b0
wip
MarcelWilnicki Oct 2, 2025
83d284e
wip
MarcelWilnicki Oct 2, 2025
93ed7b4
wip
MarcelWilnicki Oct 3, 2025
07a1a34
wip
MarcelWilnicki Oct 3, 2025
ccddf0a
wip
MarcelWilnicki Oct 7, 2025
2a518e7
wip
MarcelWilnicki Oct 7, 2025
fe213de
wip
MarcelWilnicki Oct 7, 2025
02d4de6
wip
MarcelWilnicki Oct 7, 2025
d720b06
wip
MarcelWilnicki Oct 7, 2025
c682f18
wip
MarcelWilnicki Oct 7, 2025
2b56ab0
wip
MarcelWilnicki Oct 8, 2025
23e4287
wip
MarcelWilnicki Oct 8, 2025
871de3a
wip
MarcelWilnicki Oct 16, 2025
5ff4486
wip
MarcelWilnicki Oct 17, 2025
ee2acbd
wip
MarcelWilnicki Oct 17, 2025
0b9196f
wip
MarcelWilnicki Oct 17, 2025
5e97ac4
wip
MarcelWilnicki Oct 17, 2025
3ae639f
wip
MarcelWilnicki Oct 17, 2025
27af2ee
wip
MarcelWilnicki Oct 17, 2025
84eaf81
wip
MarcelWilnicki Oct 17, 2025
7181c43
wip
MarcelWilnicki Oct 17, 2025
47e0779
wip
MarcelWilnicki Oct 17, 2025
5ca5249
wip
MarcelWilnicki Oct 17, 2025
c2e309d
wip
MarcelWilnicki Oct 20, 2025
fba70f6
wip
MarcelWilnicki Oct 20, 2025
242ca72
wip
MarcelWilnicki Oct 20, 2025
e5f63ab
wip
MarcelWilnicki Oct 20, 2025
8628d9e
wip
MarcelWilnicki Oct 20, 2025
572eb52
wip
MarcelWilnicki Oct 20, 2025
58b26c7
wip
MarcelWilnicki Oct 22, 2025
6c47e2f
wip
MarcelWilnicki Oct 22, 2025
006ebb1
wip
MarcelWilnicki Oct 22, 2025
13e9fd7
wip
MarcelWilnicki Oct 22, 2025
706fe6e
wip
MarcelWilnicki Oct 22, 2025
ad8cf9f
wip
MarcelWilnicki Oct 22, 2025
8983fae
wip
MarcelWilnicki Oct 22, 2025
b22bd26
wip
MarcelWilnicki Oct 23, 2025
51f3b94
wip
MarcelWilnicki Oct 23, 2025
3f70599
wip
MarcelWilnicki Oct 24, 2025
4d8ec72
wip
MarcelWilnicki Oct 30, 2025
cfb802a
wip
MarcelWilnicki Oct 30, 2025
0face5a
wip
MarcelWilnicki Oct 31, 2025
14f0ab3
wip
MarcelWilnicki Oct 31, 2025
8351e91
wip
MarcelWilnicki Oct 31, 2025
0f8b11a
wip
MarcelWilnicki Oct 31, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 10 additions & 8 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ jobs:
python3 -m unittest tests.test_pytorch_models

- name: End-user smoke test
run: |
run: |
wget https://ampereaimodelzoo.s3.eu-central-1.amazonaws.com/aio_objdet_dataset.tar.gz > /dev/null 2>&1
tar -xf aio_objdet_dataset.tar.gz > /dev/null

Expand Down Expand Up @@ -115,6 +115,7 @@ jobs:
COCO_IMG_PATH: aio_objdet_dataset
COCO_ANNO_PATH: aio_objdet_dataset/annotations.json
OMP_NUM_THREADS: 32
AIO_NUM_THREADS: 32
S3_URL_CRITEO_DATASET: ${{ secrets.S3_URL_CRITEO_DATASET }}
S3_URL_RESNET_50_V15_TF_FP32: ${{ secrets.S3_URL_RESNET_50_V15_TF_FP32 }}
S3_URL_SSD_INCEPTION_V2_TF_FP32: ${{ secrets.S3_URL_SSD_INCEPTION_V2_TF_FP32 }}
Expand Down Expand Up @@ -244,6 +245,7 @@ jobs:
- name: Unittest
run: |
AIO_IMPLICIT_FP16_TRANSFORM_FILTER=".*" python3 -m unittest tests.test_pytorch_models
echo HERE1

- name: benchmark.py test
run: |
Expand All @@ -257,21 +259,21 @@ jobs:
tar -xf aio_objdet_dataset.tar.gz > /dev/null

wget https://github.com/tloen/alpaca-lora/raw/main/alpaca_data.json > /dev/null 2>&1
AIO_IMPLICIT_FP16_TRANSFORM_FILTER=".*" python3 natural_language_processing/text_generation/llama2/run.py -m meta-llama/Llama-2-7b-chat-hf --dataset_path=alpaca_data.json
OMP_NUM_THREADS=32 AIO_NUM_THREADS=32 AIO_IMPLICIT_FP16_TRANSFORM_FILTER=".*" python3 natural_language_processing/text_generation/llama2/run.py -m meta-llama/Llama-2-7b-chat-hf --dataset_path=alpaca_data.json

AIO_IMPLICIT_FP16_TRANSFORM_FILTER=".*" python3 recommendation/dlrm_torchbench/run.py -p fp32
OMP_NUM_THREADS=32 AIO_NUM_THREADS=32 AIO_IMPLICIT_FP16_TRANSFORM_FILTER=".*" python3 recommendation/dlrm_torchbench/run.py -p fp32

IGNORE_DATASET_LIMITS=1 AIO_IMPLICIT_FP16_TRANSFORM_FILTER=".*" python3 computer_vision/classification/resnet_50_v15/run.py -m resnet50 -p fp32 -b 16 -f pytorch
OMP_NUM_THREADS=32 AIO_NUM_THREADS=32 IGNORE_DATASET_LIMITS=1 AIO_IMPLICIT_FP16_TRANSFORM_FILTER=".*" python3 computer_vision/classification/resnet_50_v15/run.py -m resnet50 -p fp32 -b 16 -f pytorch

AIO_IMPLICIT_FP16_TRANSFORM_FILTER=".*" python3 speech_recognition/whisper/run.py -m tiny.en
OMP_NUM_THREADS=32 AIO_NUM_THREADS=32 AIO_IMPLICIT_FP16_TRANSFORM_FILTER=".*" python3 speech_recognition/whisper/run.py -m tiny.en

IGNORE_DATASET_LIMITS=1 python3 computer_vision/classification/mobilenet_v2/run.py -p fp32 -f pytorch --timeout=60
OMP_NUM_THREADS=32 AIO_NUM_THREADS=32 IGNORE_DATASET_LIMITS=1 python3 computer_vision/classification/mobilenet_v2/run.py -p fp32 -f pytorch --timeout=60

wget https://github.com/ultralytics/assets/releases/download/v0.0.0/yolov8l.pt > /dev/null 2>&1
IGNORE_DATASET_LIMITS=1 AIO_IMPLICIT_FP16_TRANSFORM_FILTER=".*" python3 computer_vision/object_detection/yolo_v8/run.py -m yolov8l.pt -p fp32 -f pytorch
OMP_NUM_THREADS=32 AIO_NUM_THREADS=32 IGNORE_DATASET_LIMITS=1 AIO_IMPLICIT_FP16_TRANSFORM_FILTER=".*" python3 computer_vision/object_detection/yolo_v8/run.py -m yolov8l.pt -p fp32 -f pytorch

wget -O bert_large_mlperf.pt https://zenodo.org/records/3733896/files/model.pytorch?download=1 > /dev/null 2>&1
AIO_IMPLICIT_FP16_TRANSFORM_FILTER=".*" python3 natural_language_processing/extractive_question_answering/bert_large/run_mlperf.py -m bert_large_mlperf.pt -p fp32 -f pytorch
OMP_NUM_THREADS=32 AIO_NUM_THREADS=32 AIO_IMPLICIT_FP16_TRANSFORM_FILTER=".*" python3 natural_language_processing/extractive_question_answering/bert_large/run_mlperf.py -m bert_large_mlperf.pt -p fp32 -f pytorch

test_tensorflow_arm64:
runs-on: self-hosted
Expand Down
2 changes: 1 addition & 1 deletion LICENSE
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,7 @@
same "printed page" as the copyright notice for easier
identification within third-party archives.

Copyright (c) 2024, Ampere Computing LLC
Copyright (c) 2025, Ampere Computing LLC
Copyright (c) 2022 Andrej Karpathy
Copyright (c) 2022 OpenAI
Copyright (c) 2022 Stability AI
Expand Down
2 changes: 1 addition & 1 deletion computer_vision/object_detection/yolo_v5/run.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# Copyright (c) 2024, Ampere Computing LLC
# Copyright (c) 2025, Ampere Computing LLC
try:
from utils import misc # noqa
except ModuleNotFoundError:
Expand Down
12 changes: 6 additions & 6 deletions computer_vision/object_detection/yolo_v8/run.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# Copyright (c) 2024, Ampere Computing LLC
# Copyright (c) 2025, Ampere Computing LLC
try:
from utils import misc # noqa
except ModuleNotFoundError:
Expand Down Expand Up @@ -61,15 +61,15 @@ def run_ort_fp32(model_path, batch_size, num_runs, timeout, images_path, anno_pa
# Ultralytics sets it to True by default. This way we suppress the logging by default while still allowing the user
# to set it to True if needed
from utils.ort import OrtRunner
from ultralytics.yolo.utils import ops
from ultralytics.utils import nms

def run_single_pass(ort_runner, coco):
shape = (640, 640)
ort_runner.set_input_tensor("images", coco.get_input_array(shape).astype("float32"))
output = ort_runner.run(batch_size)

output = torch.from_numpy(output[0])
output = ops.non_max_suppression(output)
output = nms.non_max_suppression(output)

for i in range(batch_size):
for d in range(output[i].shape[0]):
Expand Down Expand Up @@ -97,11 +97,11 @@ def run_pytorch_fp(model_path, batch_size, num_runs, timeout, images_path, anno_
# Ultralytics sets it to True by default. This way we suppress the logging by default while still allowing the user
# to set it to True if needed
from utils.pytorch import PyTorchRunner
from ultralytics.yolo.utils import ops
from ultralytics.utils import nms

def run_single_pass(pytorch_runner, coco):
output = pytorch_runner.run(batch_size, coco.get_input_array((640, 640)))
output = ops.non_max_suppression(output)
output = nms.non_max_suppression(output)

for i in range(batch_size):
for d in range(output[i].shape[0]):
Expand All @@ -121,7 +121,7 @@ def run_single_pass(pytorch_runner, coco):

runner = PyTorchRunner(torch.jit.load(torchscript_model),
disable_jit_freeze=disable_jit_freeze,
example_inputs=torch.stack(dataset.get_input_array((640, 640))))
example_inputs=torch.stack((dataset.get_input_array((640, 640)),)))

return run_model(run_single_pass, runner, dataset, batch_size, num_runs, timeout)

Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# Copyright (c) 2024, Ampere Computing LLC
# Copyright (c) 2025, Ampere Computing LLC
try:
from utils import misc # noqa
except ModuleNotFoundError:
Expand Down Expand Up @@ -43,6 +43,8 @@ def parse_args():
# Location of the SQuAD data used by this benchmark (the help text previously
# described ImageNet images, a copy-paste leftover from another model's script).
parser.add_argument("--squad_path",
type=str,
help="path to the SQuAD v1.1 dataset")
parser.add_argument("--fixed_input_size", type=int,
help='size of the input')
parser.add_argument("--disable_jit_freeze", action='store_true',
help="if true model will be run not in jit freeze mode")
return parser.parse_args()
Expand Down Expand Up @@ -93,7 +95,7 @@ def run_tf_fp16(model_path, batch_size, num_runs, timeout, squad_path, **kwargs)
return run_tf_fp(model_path, batch_size, num_runs, timeout, squad_path)


def run_pytorch_fp(model_path, batch_size, num_runs, timeout, squad_path, disable_jit_freeze=False):
def run_pytorch_fp(model_path, batch_size, num_runs, timeout, squad_path, fixed_input_size, disable_jit_freeze=False):
from utils.benchmark import run_model
from utils.nlp.squad import Squad_v1_1
from transformers import AutoTokenizer, BertConfig, BertForQuestionAnswering
Expand All @@ -117,7 +119,11 @@ def run_single_pass(pytorch_runner, squad):
padding=True, truncation=True, model_max_length=512)

def tokenize(question, text):
return tokenizer(question, text, padding=True, truncation=True, return_tensors="pt")
if fixed_input_size is not None:
return tokenizer(question, text, padding="max_length", truncation=True,
max_length=fixed_input_size, return_tensors="pt")
else:
return tokenizer(question, text, padding=True, truncation=True, return_tensors="pt")

def detokenize(answer):
return tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(answer))
Expand Down Expand Up @@ -199,8 +205,9 @@ def detokenize(answer):
return run_model(run_single_pass, runner, dataset, batch_size, num_runs, timeout)


def run_pytorch_fp32(model_path, batch_size, num_runs, timeout, squad_path, disable_jit_freeze, **kwargs):
return run_pytorch_fp(model_path, batch_size, num_runs, timeout, squad_path, disable_jit_freeze)
def run_pytorch_fp32(model_path, batch_size, num_runs, timeout, squad_path, fixed_input_size, disable_jit_freeze,
**kwargs):
return run_pytorch_fp(model_path, batch_size, num_runs, timeout, squad_path, fixed_input_size, disable_jit_freeze)


def main():
Expand Down
2 changes: 1 addition & 1 deletion recommendation/dlrm/run.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# Copyright (c) 2024, Ampere Computing LLC
# Copyright (c) 2025, Ampere Computing LLC
try:
from utils import misc # noqa
except ModuleNotFoundError:
Expand Down
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ tiktoken
ultralytics
evaluate
datasets
datasets[audio]
soundfile
librosa
numba
Expand All @@ -35,4 +36,4 @@ kornia
open-clip-torch<2.26.1
diffusers
accelerate
boto3==1.29.0; python_version>='3.12'
boto3==1.29.0; python_version>='3.12'
15 changes: 12 additions & 3 deletions setup_deb.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@

set -eo pipefail

# Set the system timezone to Europe/Warsaw: replace /etc/localtime with a symlink
# to the matching zoneinfo entry and record the zone name in /etc/timezone.
# NOTE(review): presumably done up front so that a later tzdata install does not
# stop for an interactive timezone prompt - confirm.
ln -fs /usr/share/zoneinfo/Europe/Warsaw /etc/localtime
echo "Europe/Warsaw" | tee /etc/timezone >/dev/null

log() {
COLOR_DEFAULT='\033[0m'
COLOR_CYAN='\033[1;36m'
Expand Down Expand Up @@ -46,13 +49,15 @@ fi
log "Installing system dependencies ..."
sleep 1
apt-get update -y
apt-get install -y build-essential ffmpeg libsm6 libxext6 wget git unzip numactl libhdf5-dev cmake
apt-get install -y build-essential libsm6 libxext6 wget git unzip numactl libhdf5-dev cmake
if ! python3 -c ""; then
apt-get update -y
apt-get install -y python3 python3-pip
fi
if ! pip3 --version; then
apt-get install -y python3-pip
fi

PYTHON_VERSION=$(python3 -c 'import sys; print(".".join(map(str, sys.version_info[0:2])))')
PYTHON_DEV_SEARCH=$(apt-cache search --names-only "python${PYTHON_VERSION}-dev")
if [[ -n "$PYTHON_DEV_SEARCH" ]]; then
Expand All @@ -76,8 +81,9 @@ sleep 1
ARCH=$ARCH python3 "$SCRIPT_DIR"/utils/setup/install_frameworks.py

# get almost all python deps
pip3 install --break-system-packages -r "$(dirname "$0")/requirements.txt" ||
pip3 install -r "$(dirname "$0")/requirements.txt"
# Upgrade pip itself first. PIP_BREAK_SYSTEM_PACKAGES=1 is the environment-variable
# form of the --break-system-packages option.
PIP_BREAK_SYSTEM_PACKAGES=1 python3 -m pip install --ignore-installed --upgrade pip
# Install the project requirements, first with --break-system-packages and, if that
# attempt fails, again without the flag. The fallback must invoke the "pip" module:
# "python3 -m pip3" does not exist and would make the fallback fail unconditionally.
python3 -m pip install --break-system-packages -r "$(dirname "$0")/requirements.txt" ||
  python3 -m pip install -r "$(dirname "$0")/requirements.txt"

apt install -y autoconf autogen automake build-essential libasound2-dev \
libflac-dev libogg-dev libtool libvorbis-dev libopus-dev libmp3lame-dev \
Expand All @@ -98,6 +104,9 @@ if [ "$(python3 -c 'import torch; print(torch.cuda.is_available())')" == "True"
fi
log "done.\n"

apt-get update -y
apt-get install -y ffmpeg

if [ -f "/etc/machine-id" ]; then
cat /etc/machine-id >"$SCRIPT_DIR"/.setup_completed
else
Expand Down
Loading
Loading