Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
56 commits
Select commit Hold shift + click to select a range
7b55ce5
first commit
MarcelWilnicki Sep 25, 2025
1ede310
wip
MarcelWilnicki Sep 25, 2025
a31e5e1
wip
MarcelWilnicki Sep 25, 2025
6d5003d
wip
MarcelWilnicki Sep 26, 2025
776e70d
wip
MarcelWilnicki Sep 26, 2025
b3c4a0c
wip
MarcelWilnicki Sep 26, 2025
f8c8b06
wip
MarcelWilnicki Sep 26, 2025
df07b65
wip
MarcelWilnicki Sep 26, 2025
6626f91
wip
MarcelWilnicki Sep 26, 2025
01cb4de
wip
MarcelWilnicki Sep 30, 2025
bc405d6
wip
MarcelWilnicki Sep 30, 2025
ddf25ed
wip
MarcelWilnicki Oct 1, 2025
c4e81b0
wip
MarcelWilnicki Oct 1, 2025
c7764d4
wip
MarcelWilnicki Oct 2, 2025
146e3b0
wip
MarcelWilnicki Oct 2, 2025
83d284e
wip
MarcelWilnicki Oct 2, 2025
93ed7b4
wip
MarcelWilnicki Oct 3, 2025
07a1a34
wip
MarcelWilnicki Oct 3, 2025
ccddf0a
wip
MarcelWilnicki Oct 7, 2025
2a518e7
wip
MarcelWilnicki Oct 7, 2025
fe213de
wip
MarcelWilnicki Oct 7, 2025
02d4de6
wip
MarcelWilnicki Oct 7, 2025
d720b06
wip
MarcelWilnicki Oct 7, 2025
c682f18
wip
MarcelWilnicki Oct 7, 2025
2b56ab0
wip
MarcelWilnicki Oct 8, 2025
23e4287
wip
MarcelWilnicki Oct 8, 2025
871de3a
wip
MarcelWilnicki Oct 16, 2025
5ff4486
wip
MarcelWilnicki Oct 17, 2025
ee2acbd
wip
MarcelWilnicki Oct 17, 2025
0b9196f
wip
MarcelWilnicki Oct 17, 2025
5e97ac4
wip
MarcelWilnicki Oct 17, 2025
3ae639f
wip
MarcelWilnicki Oct 17, 2025
27af2ee
wip
MarcelWilnicki Oct 17, 2025
84eaf81
wip
MarcelWilnicki Oct 17, 2025
7181c43
wip
MarcelWilnicki Oct 17, 2025
47e0779
wip
MarcelWilnicki Oct 17, 2025
5ca5249
wip
MarcelWilnicki Oct 17, 2025
c2e309d
wip
MarcelWilnicki Oct 20, 2025
fba70f6
wip
MarcelWilnicki Oct 20, 2025
242ca72
wip
MarcelWilnicki Oct 20, 2025
e5f63ab
wip
MarcelWilnicki Oct 20, 2025
8628d9e
wip
MarcelWilnicki Oct 20, 2025
572eb52
wip
MarcelWilnicki Oct 20, 2025
58b26c7
wip
MarcelWilnicki Oct 22, 2025
6c47e2f
wip
MarcelWilnicki Oct 22, 2025
006ebb1
wip
MarcelWilnicki Oct 22, 2025
13e9fd7
wip
MarcelWilnicki Oct 22, 2025
706fe6e
wip
MarcelWilnicki Oct 22, 2025
ad8cf9f
wip
MarcelWilnicki Oct 22, 2025
8983fae
wip
MarcelWilnicki Oct 22, 2025
b22bd26
wip
MarcelWilnicki Oct 23, 2025
51f3b94
wip
MarcelWilnicki Oct 23, 2025
3f70599
wip
MarcelWilnicki Oct 24, 2025
d918720
wip
MarcelWilnicki Oct 28, 2025
b1e41d2
Merge branch 'marcel/bert_fixed_input' into marcel/add_yolo_v11
MarcelWilnicki Oct 28, 2025
2b55c04
wip
MarcelWilnicki Oct 28, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 10 additions & 8 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,8 @@ jobs:
python3 -m unittest tests.test_pytorch_models

- name: End-user smoke test
run: |
run: |
ffmpeg -version
wget https://ampereaimodelzoo.s3.eu-central-1.amazonaws.com/aio_objdet_dataset.tar.gz > /dev/null 2>&1
tar -xf aio_objdet_dataset.tar.gz > /dev/null

Expand Down Expand Up @@ -115,6 +116,7 @@ jobs:
COCO_IMG_PATH: aio_objdet_dataset
COCO_ANNO_PATH: aio_objdet_dataset/annotations.json
OMP_NUM_THREADS: 32
AIO_NUM_THREADS: 32
S3_URL_CRITEO_DATASET: ${{ secrets.S3_URL_CRITEO_DATASET }}
S3_URL_RESNET_50_V15_TF_FP32: ${{ secrets.S3_URL_RESNET_50_V15_TF_FP32 }}
S3_URL_SSD_INCEPTION_V2_TF_FP32: ${{ secrets.S3_URL_SSD_INCEPTION_V2_TF_FP32 }}
Expand Down Expand Up @@ -257,21 +259,21 @@ jobs:
tar -xf aio_objdet_dataset.tar.gz > /dev/null

wget https://github.com/tloen/alpaca-lora/raw/main/alpaca_data.json > /dev/null 2>&1
AIO_IMPLICIT_FP16_TRANSFORM_FILTER=".*" python3 natural_language_processing/text_generation/llama2/run.py -m meta-llama/Llama-2-7b-chat-hf --dataset_path=alpaca_data.json
OMP_NUM_THREADS=32 AIO_NUM_THREADS=32 AIO_IMPLICIT_FP16_TRANSFORM_FILTER=".*" python3 natural_language_processing/text_generation/llama2/run.py -m meta-llama/Llama-2-7b-chat-hf --dataset_path=alpaca_data.json

AIO_IMPLICIT_FP16_TRANSFORM_FILTER=".*" python3 recommendation/dlrm_torchbench/run.py -p fp32
OMP_NUM_THREADS=32 AIO_NUM_THREADS=32 AIO_IMPLICIT_FP16_TRANSFORM_FILTER=".*" python3 recommendation/dlrm_torchbench/run.py -p fp32

IGNORE_DATASET_LIMITS=1 AIO_IMPLICIT_FP16_TRANSFORM_FILTER=".*" python3 computer_vision/classification/resnet_50_v15/run.py -m resnet50 -p fp32 -b 16 -f pytorch
OMP_NUM_THREADS=32 AIO_NUM_THREADS=32 IGNORE_DATASET_LIMITS=1 AIO_IMPLICIT_FP16_TRANSFORM_FILTER=".*" python3 computer_vision/classification/resnet_50_v15/run.py -m resnet50 -p fp32 -b 16 -f pytorch

AIO_IMPLICIT_FP16_TRANSFORM_FILTER=".*" python3 speech_recognition/whisper/run.py -m tiny.en
OMP_NUM_THREADS=32 AIO_NUM_THREADS=32 AIO_IMPLICIT_FP16_TRANSFORM_FILTER=".*" python3 speech_recognition/whisper/run.py -m tiny.en

IGNORE_DATASET_LIMITS=1 python3 computer_vision/classification/mobilenet_v2/run.py -p fp32 -f pytorch --timeout=60
OMP_NUM_THREADS=32 AIO_NUM_THREADS=32 IGNORE_DATASET_LIMITS=1 python3 computer_vision/classification/mobilenet_v2/run.py -p fp32 -f pytorch --timeout=60

wget https://github.com/ultralytics/assets/releases/download/v0.0.0/yolov8l.pt > /dev/null 2>&1
IGNORE_DATASET_LIMITS=1 AIO_IMPLICIT_FP16_TRANSFORM_FILTER=".*" python3 computer_vision/object_detection/yolo_v8/run.py -m yolov8l.pt -p fp32 -f pytorch
OMP_NUM_THREADS=32 AIO_NUM_THREADS=32 IGNORE_DATASET_LIMITS=1 AIO_IMPLICIT_FP16_TRANSFORM_FILTER=".*" python3 computer_vision/object_detection/yolo_v8/run.py -m yolov8l.pt -p fp32 -f pytorch

wget -O bert_large_mlperf.pt https://zenodo.org/records/3733896/files/model.pytorch?download=1 > /dev/null 2>&1
AIO_IMPLICIT_FP16_TRANSFORM_FILTER=".*" python3 natural_language_processing/extractive_question_answering/bert_large/run_mlperf.py -m bert_large_mlperf.pt -p fp32 -f pytorch
OMP_NUM_THREADS=32 AIO_NUM_THREADS=32 AIO_IMPLICIT_FP16_TRANSFORM_FILTER=".*" python3 natural_language_processing/extractive_question_answering/bert_large/run_mlperf.py -m bert_large_mlperf.pt -p fp32 -f pytorch

test_tensorflow_arm64:
runs-on: self-hosted
Expand Down
2 changes: 1 addition & 1 deletion LICENSE
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,7 @@
same "printed page" as the copyright notice for easier
identification within third-party archives.

Copyright (c) 2024, Ampere Computing LLC
Copyright (c) 2025, Ampere Computing LLC
Copyright (c) 2022 Andrej Karpathy
Copyright (c) 2022 OpenAI
Copyright (c) 2022 Stability AI
Expand Down
118 changes: 118 additions & 0 deletions computer_vision/object_detection/yolo_v11/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
# YOLO v11

This folder contains the script to run YOLO v11 on the COCO object detection task.

Variants supplied below for PyTorch and ONNX Runtime in fp32 precision accept input of shape 640x640.

The original documentation of the model is available here: https://docs.ultralytics.com/#ultralytics-yolov8


### Metrics

Based on 1000 images from COCO Dataset for YOLOv8n model in PyTorch framework in fp32 precision

| Metric | IoU | Area | maxDets |Score |
|:---: |:---: |:---: |:---: |:---: |
| Average Precision (AP) |0.50:0.95 | all | 100 | 0.338 |
| Average Precision (AP) |0.50 | all | 100 | 0.452 |
| Average Precision (AP) |0.75 | all | 100 | 0.370 |
| Average Precision (AP) |0.50:0.95 | small | 100 | 0.122 |
| Average Precision (AP) |0.50:0.95 | medium | 100 | 0.351 |
| Average Precision (AP) |0.50:0.95 | large | 100 | 0.504 |
| Average Recall (AR) |0.50:0.95 | all | 1 | 0.265 |
| Average Recall (AR) |0.50:0.95 | all | 10 | 0.375 |
| Average Recall (AR) |0.50:0.95 | all | 100 | 0.381 |
| Average Recall (AR) |0.50:0.95 | small | 100 | 0.133 |
| Average Recall (AR) |0.50:0.95 | medium | 100 | 0.385 |
| Average Recall (AR) |0.50:0.95 | large | 100 | 0.569 |

Based on 1000 images from COCO Dataset for YOLOv8n model in ONNX Runtime framework in fp32 precision

| Metric | IoU | Area | maxDets |Score |
|:---: |:---: |:---: |:---: |:---: |
| Average Precision (AP) |0.50:0.95 | all | 100 | 0.338|
| Average Precision (AP) |0.50 | all | 100 | 0.452|
| Average Precision (AP) |0.75 | all | 100 | 0.370|
| Average Precision (AP) |0.50:0.95 | small | 100 | 0.122|
| Average Precision (AP) |0.50:0.95 | medium | 100 | 0.351|
| Average Precision (AP) |0.50:0.95 | large | 100 | 0.504|
| Average Recall (AR) |0.50:0.95 | all | 1 | 0.265|
| Average Recall (AR) |0.50:0.95 | all | 10 | 0.375|
| Average Recall (AR) |0.50:0.95 | all | 100 | 0.381|
| Average Recall (AR) |0.50:0.95 | small | 100 | 0.133|
| Average Recall (AR) |0.50:0.95 | medium | 100 | 0.385|
| Average Recall (AR) |0.50:0.95 | large | 100 | 0.569|

Based on 1000 images from COCO Dataset for YOLOv8x model in ONNX Runtime framework in fp32 precision

| Metric | IoU | Area | maxDets |Score |
|:---: |:---: |:---: |:---: |:---: |
| Average Precision (AP) |0.50:0.95 | all | 100 | 0.575|
| Average Precision (AP) |0.50 | all | 100 | 0.714|
| Average Precision (AP) |0.75 | all | 100 | 0.639|
| Average Precision (AP) |0.50:0.95 | small | 100 | 0.336|
| Average Precision (AP) |0.50:0.95 | medium | 100 | 0.633|
| Average Precision (AP) |0.50:0.95 | large | 100 | 0.812|
| Average Recall (AR) |0.50:0.95 | all | 1 | 0.409|
| Average Recall (AR) |0.50:0.95 | all | 10 | 0.611|
| Average Recall (AR) |0.50:0.95 | all | 100 | 0.620|
| Average Recall (AR) |0.50:0.95 | small | 100 | 0.361|
| Average Recall (AR) |0.50:0.95 | medium | 100 | 0.676|
| Average Recall (AR) |0.50:0.95 | large | 100 | 0.849|


### Dataset and model

Dataset can be downloaded from here: https://cocodataset.org/#download

PyTorch models in fp32 precision can be downloaded here:
```
wget https://github.com/ultralytics/assets/releases/download/v0.0.0/yolov8n.pt
wget https://github.com/ultralytics/assets/releases/download/v0.0.0/yolov8s.pt
wget https://github.com/ultralytics/assets/releases/download/v0.0.0/yolov8m.pt
wget https://github.com/ultralytics/assets/releases/download/v0.0.0/yolov8l.pt
wget https://github.com/ultralytics/assets/releases/download/v0.0.0/yolov8x.pt
```

You can export a PyTorch model to ONNX Runtime model using the following Python code:

```python
from ultralytics import YOLO
model = YOLO('/path/to/yolov8n.pt')
model.export(format='onnx')
```

### Running instructions

Before running any code you should first export the PYTHONPATH variable with path pointing to the model zoo directory,
as well as AIO_NUM_THREADS specifying the number of threads to be used.

```
export PYTHONPATH=/path/to/model_zoo
export AIO_NUM_THREADS=1
```

For the best experience we also recommend setting environment variables as specified below.

```
export COCO_IMG_PATH=/path/to/images
export COCO_ANNO_PATH=/path/to/annotations
```

Now you are able to run the run.py script.

To get detailed information on the script's recognized arguments run it with -h flag for help.

The path to the model (flag "-m") and the framework (flag "-f" / "--framework") have to be specified. The precision (flag "-p") is optional and defaults to fp32.

Please note that the default batch size is 1 and if not specified otherwise the script will run for 1 minute.

Example command:

```
python3 run.py -m /path/to/model.onnx -p fp32 --framework ort
```

```
python3 run.py -m /path/to/model.pt -p fp32 --framework pytorch
```
116 changes: 116 additions & 0 deletions computer_vision/object_detection/yolo_v11/run.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
# SPDX-License-Identifier: Apache-2.0
# Copyright (c) 2025, Ampere Computing LLC
try:
    from utils import misc  # noqa
except ModuleNotFoundError:
    # The project's `utils` package could not be imported. Look through the parent directories of this file for
    # the environment setup script so the user can be told which file to source, then abort with exit code 1.
    import os
    import sys
    filename = "set_env_variables.sh"
    directory = os.path.realpath(__file__).split("/")[:-1]
    # Candidate directories, nearest parent first; evaluated lazily so the listing stops at the first match.
    ancestors = ("/".join(directory[:-idx]) for idx in range(1, len(directory) - 1))
    subdir = next((candidate for candidate in ancestors if filename in os.listdir(candidate)), None)
    if subdir is None:
        print(f"\n\033[91mFAIL: Couldn't find {filename}, are you running this script as part of Ampere Model Library?"
              f"\033[0m")
    else:
        print(f"\nPlease run \033[91m'source {os.path.join(subdir, filename)}'\033[0m first.")
    sys.exit(1)


def parse_args():
    """Build the command-line parser for the YOLOv11 runner and parse ``sys.argv``.

    Returns:
        argparse.Namespace with the attributes ``model_path``, ``precision``, ``batch_size``, ``framework``,
        ``timeout``, ``num_runs``, ``images_path``, ``anno_path`` and ``disable_jit_freeze``.
    """
    from argparse import ArgumentParser

    # Each entry is (option strings, keyword arguments for add_argument), registered in this order.
    option_table = (
        (("-m", "--model_path"),
         dict(type=str, required=True, help="path to the model")),
        (("-p", "--precision"),
         dict(type=str, choices=["fp32"], default="fp32", help="precision of the model provided")),
        (("-b", "--batch_size"),
         dict(type=int, default=1, help="batch size to feed the model with")),
        (("-f", "--framework"),
         dict(type=str, choices=["pytorch"], required=True,
              help="specify the framework in which a model should be run")),
        (("--timeout",),
         dict(type=float, default=60.0, help="timeout in seconds")),
        (("--num_runs",),
         dict(type=int, help="number of passes through network to execute")),
        (("--images_path",),
         dict(type=str, help="path to directory with COCO validation images")),
        (("--anno_path",),
         dict(type=str, help="path to file with validation annotations")),
        (("--disable_jit_freeze",),
         dict(action='store_true', help="if true model will be run not in jit freeze mode")),
    )

    cli = ArgumentParser(description="Run YOLOv11 model.")
    for option_strings, settings in option_table:
        cli.add_argument(*option_strings, **settings)
    return cli.parse_args()


def run_pytorch_fp(model_path, batch_size, num_runs, timeout, images_path, anno_path, disable_jit_freeze=False):
    """Run a YOLO checkpoint through the PyTorch runner on COCO images.

    The checkpoint at ``model_path`` is loaded with Ultralytics and exported to TorchScript; the exported module is
    loaded with ``torch.jit.load`` and wrapped in the project's ``PyTorchRunner``. The project's ``run_model`` then
    drives ``run_single_pass`` with the given ``batch_size``, ``num_runs`` and ``timeout``.

    Args:
        model_path: path handed to ``ultralytics.YOLO``.
        batch_size: number of images per pass; also the number of per-image results read back after NMS.
        num_runs: forwarded to ``run_model``.
        timeout: forwarded to ``run_model``.
        images_path: forwarded to ``COCODataset``.
        anno_path: forwarded to ``COCODataset``.
        disable_jit_freeze: forwarded to ``PyTorchRunner``.

    Returns:
        Whatever ``run_model`` returns.
    """
    import torch
    import os
    from utils.cv.coco import COCODataset
    from utils.benchmark import run_model

    # Ultralytics sets it to True by default. This way we suppress the logging by default while still allowing the user
    # to set it to True if needed. It has to be in place before anything from ultralytics is imported below.
    os.environ.setdefault("YOLO_VERBOSE", "False")
    from utils.pytorch import PyTorchRunner
    from ultralytics.utils import nms

    input_shape = (640, 640)

    def run_single_pass(pytorch_runner, coco):
        # One inference pass followed by non-max suppression; every surviving detection is reported to the dataset.
        raw_output = pytorch_runner.run(batch_size, coco.get_input_array(input_shape))
        detections_per_image = nms.non_max_suppression(raw_output)

        for image_idx in range(batch_size):
            for detection in detections_per_image[image_idx]:
                # Elements 0-3 are passed on as the box, element 4 as the score and element 5 as the category id.
                coco.submit_bbox_prediction(
                    image_idx,
                    coco.convert_bbox_to_coco_order(detection[:4].tolist()),
                    detection[4].item(),
                    coco.translate_cat_id_to_coco(detection[5].item())
                )

    dataset = COCODataset(batch_size, "RGB", "COCO_val2014_000000000000", images_path,
                          anno_path, pre_processing="PyTorch_objdet", sort_ascending=True, order="NCHW")

    from ultralytics import YOLO
    exported_path = YOLO(model_path).export(format="torchscript")

    scripted_module = torch.jit.load(exported_path)
    example_batch = torch.stack((dataset.get_input_array(input_shape),))
    runner = PyTorchRunner(scripted_module,
                           disable_jit_freeze=disable_jit_freeze,
                           example_inputs=example_batch)

    return run_model(run_single_pass, runner, dataset, batch_size, num_runs, timeout)


def run_pytorch_fp32(model_path, batch_size, num_runs, timeout, images_path, anno_path, disable_jit_freeze, **kwargs):
    """fp32 entry point: forward the relevant arguments to ``run_pytorch_fp``.

    ``**kwargs`` absorbs the remaining parsed options (``main`` calls this with ``**vars(args)``); they are
    accepted and ignored.
    """
    return run_pytorch_fp(
        model_path=model_path,
        batch_size=batch_size,
        num_runs=num_runs,
        timeout=timeout,
        images_path=images_path,
        anno_path=anno_path,
        disable_jit_freeze=disable_jit_freeze,
    )


def main():
    """Parse the command line and dispatch to the runner matching the requested framework and precision.

    Only the PyTorch framework in fp32 precision is implemented in this script. Any other combination is reported
    through ``print_goodbye_message_and_die``.
    """
    from utils.misc import print_goodbye_message_and_die
    args = parse_args()

    if args.framework == "pytorch":
        # This script previously routed CUDA-capable hosts to `run_pytorch_cuda`, which is not defined in this
        # file, so such hosts failed with a NameError. Dispatch now depends on precision only.
        if args.precision == "fp32":
            run_pytorch_fp32(**vars(args))
        else:
            print_goodbye_message_and_die(
                "this model seems to be unsupported in a specified precision: " + args.precision)
    else:
        print_goodbye_message_and_die(
            "this model seems to be unsupported in a specified framework: " + args.framework)


if __name__ == "__main__":
    main()
2 changes: 1 addition & 1 deletion computer_vision/object_detection/yolo_v5/run.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# Copyright (c) 2024, Ampere Computing LLC
# Copyright (c) 2025, Ampere Computing LLC
try:
from utils import misc # noqa
except ModuleNotFoundError:
Expand Down
12 changes: 6 additions & 6 deletions computer_vision/object_detection/yolo_v8/run.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# Copyright (c) 2024, Ampere Computing LLC
# Copyright (c) 2025, Ampere Computing LLC
try:
from utils import misc # noqa
except ModuleNotFoundError:
Expand Down Expand Up @@ -61,15 +61,15 @@ def run_ort_fp32(model_path, batch_size, num_runs, timeout, images_path, anno_pa
# Ultralytics sets it to True by default. This way we suppress the logging by default while still allowing the user
# to set it to True if needed
from utils.ort import OrtRunner
from ultralytics.yolo.utils import ops
from ultralytics.utils import nms

def run_single_pass(ort_runner, coco):
shape = (640, 640)
ort_runner.set_input_tensor("images", coco.get_input_array(shape).astype("float32"))
output = ort_runner.run(batch_size)

output = torch.from_numpy(output[0])
output = ops.non_max_suppression(output)
output = nms.non_max_suppression(output)

for i in range(batch_size):
for d in range(output[i].shape[0]):
Expand Down Expand Up @@ -97,11 +97,11 @@ def run_pytorch_fp(model_path, batch_size, num_runs, timeout, images_path, anno_
# Ultralytics sets it to True by default. This way we suppress the logging by default while still allowing the user
# to set it to True if needed
from utils.pytorch import PyTorchRunner
from ultralytics.yolo.utils import ops
from ultralytics.utils import nms

def run_single_pass(pytorch_runner, coco):
output = pytorch_runner.run(batch_size, coco.get_input_array((640, 640)))
output = ops.non_max_suppression(output)
output = nms.non_max_suppression(output)

for i in range(batch_size):
for d in range(output[i].shape[0]):
Expand All @@ -121,7 +121,7 @@ def run_single_pass(pytorch_runner, coco):

runner = PyTorchRunner(torch.jit.load(torchscript_model),
disable_jit_freeze=disable_jit_freeze,
example_inputs=torch.stack(dataset.get_input_array((640, 640))))
example_inputs=torch.stack((dataset.get_input_array((640, 640)),)))

return run_model(run_single_pass, runner, dataset, batch_size, num_runs, timeout)

Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# Copyright (c) 2024, Ampere Computing LLC
# Copyright (c) 2025, Ampere Computing LLC
try:
from utils import misc # noqa
except ModuleNotFoundError:
Expand Down Expand Up @@ -43,6 +43,8 @@ def parse_args():
parser.add_argument("--squad_path",
type=str,
help="path to directory with ImageNet validation images")
parser.add_argument("--fixed_input_size", type=int,
help='size of the input')
parser.add_argument("--disable_jit_freeze", action='store_true',
help="if true model will be run not in jit freeze mode")
return parser.parse_args()
Expand Down Expand Up @@ -93,7 +95,7 @@ def run_tf_fp16(model_path, batch_size, num_runs, timeout, squad_path, **kwargs)
return run_tf_fp(model_path, batch_size, num_runs, timeout, squad_path)


def run_pytorch_fp(model_path, batch_size, num_runs, timeout, squad_path, disable_jit_freeze=False):
def run_pytorch_fp(model_path, batch_size, num_runs, timeout, squad_path, fixed_input_size, disable_jit_freeze=False):
from utils.benchmark import run_model
from utils.nlp.squad import Squad_v1_1
from transformers import AutoTokenizer, BertConfig, BertForQuestionAnswering
Expand All @@ -117,7 +119,11 @@ def run_single_pass(pytorch_runner, squad):
padding=True, truncation=True, model_max_length=512)

def tokenize(question, text):
return tokenizer(question, text, padding=True, truncation=True, return_tensors="pt")
if fixed_input_size is not None:
return tokenizer(question, text, padding="max_length", truncation=True,
max_length=fixed_input_size, return_tensors="pt")
else:
return tokenizer(question, text, padding=True, truncation=True, return_tensors="pt")

def detokenize(answer):
return tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(answer))
Expand Down Expand Up @@ -199,8 +205,9 @@ def detokenize(answer):
return run_model(run_single_pass, runner, dataset, batch_size, num_runs, timeout)


def run_pytorch_fp32(model_path, batch_size, num_runs, timeout, squad_path, disable_jit_freeze, **kwargs):
return run_pytorch_fp(model_path, batch_size, num_runs, timeout, squad_path, disable_jit_freeze)
def run_pytorch_fp32(model_path, batch_size, num_runs, timeout, squad_path, fixed_input_size, disable_jit_freeze,
**kwargs):
return run_pytorch_fp(model_path, batch_size, num_runs, timeout, squad_path, fixed_input_size, disable_jit_freeze)


def main():
Expand Down
Loading
Loading