AmpereComputingAI · MarcelWilnicki · Sep 25, 2025 · Sep 25, 2025 · Sep 25, 2025 · Sep 26, 2025
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -80,7 +80,8 @@ jobs:
           python3 -m unittest tests.test_pytorch_models
 
       - name: End-user smoke test
-        run: |            
+        run: |                      
+          ffmpeg -version
           wget https://ampereaimodelzoo.s3.eu-central-1.amazonaws.com/aio_objdet_dataset.tar.gz > /dev/null 2>&1
           tar -xf aio_objdet_dataset.tar.gz > /dev/null
 
@@ -115,6 +116,7 @@ jobs:
       COCO_IMG_PATH: aio_objdet_dataset
       COCO_ANNO_PATH: aio_objdet_dataset/annotations.json
       OMP_NUM_THREADS: 32
+      AIO_NUM_THREADS: 32
       S3_URL_CRITEO_DATASET: ${{ secrets.S3_URL_CRITEO_DATASET }}
       S3_URL_RESNET_50_V15_TF_FP32: ${{ secrets.S3_URL_RESNET_50_V15_TF_FP32 }}
       S3_URL_SSD_INCEPTION_V2_TF_FP32: ${{ secrets.S3_URL_SSD_INCEPTION_V2_TF_FP32 }}
@@ -257,21 +259,21 @@ jobs:
           tar -xf aio_objdet_dataset.tar.gz > /dev/null
 
           wget https://github.com/tloen/alpaca-lora/raw/main/alpaca_data.json > /dev/null 2>&1
-          AIO_IMPLICIT_FP16_TRANSFORM_FILTER=".*" python3 natural_language_processing/text_generation/llama2/run.py -m meta-llama/Llama-2-7b-chat-hf --dataset_path=alpaca_data.json
+          OMP_NUM_THREADS=32 AIO_NUM_THREADS=32 AIO_IMPLICIT_FP16_TRANSFORM_FILTER=".*" python3 natural_language_processing/text_generation/llama2/run.py -m meta-llama/Llama-2-7b-chat-hf --dataset_path=alpaca_data.json
 
-          AIO_IMPLICIT_FP16_TRANSFORM_FILTER=".*" python3 recommendation/dlrm_torchbench/run.py -p fp32
+          OMP_NUM_THREADS=32 AIO_NUM_THREADS=32 AIO_IMPLICIT_FP16_TRANSFORM_FILTER=".*" python3 recommendation/dlrm_torchbench/run.py -p fp32
 
-          IGNORE_DATASET_LIMITS=1 AIO_IMPLICIT_FP16_TRANSFORM_FILTER=".*" python3 computer_vision/classification/resnet_50_v15/run.py -m resnet50 -p fp32 -b 16 -f pytorch
+          OMP_NUM_THREADS=32 AIO_NUM_THREADS=32 IGNORE_DATASET_LIMITS=1 AIO_IMPLICIT_FP16_TRANSFORM_FILTER=".*" python3 computer_vision/classification/resnet_50_v15/run.py -m resnet50 -p fp32 -b 16 -f pytorch
 
-          AIO_IMPLICIT_FP16_TRANSFORM_FILTER=".*" python3 speech_recognition/whisper/run.py -m tiny.en 
+          OMP_NUM_THREADS=32 AIO_NUM_THREADS=32 AIO_IMPLICIT_FP16_TRANSFORM_FILTER=".*" python3 speech_recognition/whisper/run.py -m tiny.en 
 
-          IGNORE_DATASET_LIMITS=1 python3 computer_vision/classification/mobilenet_v2/run.py -p fp32 -f pytorch --timeout=60
+          OMP_NUM_THREADS=32 AIO_NUM_THREADS=32 IGNORE_DATASET_LIMITS=1 python3 computer_vision/classification/mobilenet_v2/run.py -p fp32 -f pytorch --timeout=60
 
           wget https://github.com/ultralytics/assets/releases/download/v0.0.0/yolov8l.pt > /dev/null 2>&1
-          IGNORE_DATASET_LIMITS=1 AIO_IMPLICIT_FP16_TRANSFORM_FILTER=".*" python3 computer_vision/object_detection/yolo_v8/run.py -m yolov8l.pt -p fp32 -f pytorch              
+          OMP_NUM_THREADS=32 AIO_NUM_THREADS=32 IGNORE_DATASET_LIMITS=1 AIO_IMPLICIT_FP16_TRANSFORM_FILTER=".*" python3 computer_vision/object_detection/yolo_v8/run.py -m yolov8l.pt -p fp32 -f pytorch              
 
           wget -O bert_large_mlperf.pt https://zenodo.org/records/3733896/files/model.pytorch?download=1 > /dev/null 2>&1
-          AIO_IMPLICIT_FP16_TRANSFORM_FILTER=".*" python3 natural_language_processing/extractive_question_answering/bert_large/run_mlperf.py -m bert_large_mlperf.pt -p fp32 -f pytorch
+          OMP_NUM_THREADS=32 AIO_NUM_THREADS=32 AIO_IMPLICIT_FP16_TRANSFORM_FILTER=".*" python3 natural_language_processing/extractive_question_answering/bert_large/run_mlperf.py -m bert_large_mlperf.pt -p fp32 -f pytorch
 
   test_tensorflow_arm64:
     runs-on: self-hosted

diff --git a/LICENSE b/LICENSE
@@ -187,7 +187,7 @@
       same "printed page" as the copyright notice for easier
       identification within third-party archives.
 
-   Copyright (c) 2024, Ampere Computing LLC
+   Copyright (c) 2025, Ampere Computing LLC
    Copyright (c) 2022 Andrej Karpathy
    Copyright (c) 2022 OpenAI
    Copyright (c) 2022 Stability AI

diff --git a/computer_vision/object_detection/yolo_v11/README.md b/computer_vision/object_detection/yolo_v11/README.md
@@ -0,0 +1,118 @@
+# YOLO v8
+
+This folder contains the script to run YOLO v8 on COCO object detection task.
+
+Variants supplied below for PyTorch and ONNX Runtime in fp32 precision accept input of shape 640x640.
+
+The original documentation of the model is available here: https://docs.ultralytics.com/#ultralytics-yolov8
+
+
+### Metrics
+
+Based on 1000 images from COCO Dataset for YOLOv8n model in PyTorch framework in fp32 precision
+
+| Metric                  | IoU       | Area   | maxDets |Score  |
+|:---:                    |:---:      |:---:   |:---:    |:---:  |
+| Average Precision  (AP) |0.50:0.95 |    all | 100 | 0.338 |
+| Average Precision  (AP) |0.50      |    all | 100 | 0.452 |
+| Average Precision  (AP) |0.75      |    all | 100 | 0.370 |
+| Average Precision  (AP) |0.50:0.95 |  small | 100 | 0.122 |
+| Average Precision  (AP) |0.50:0.95 | medium | 100 | 0.351 |
+| Average Precision  (AP) |0.50:0.95 |  large | 100 | 0.504 |
+| Average Recall     (AR) |0.50:0.95 |    all |   1 | 0.265 |
+| Average Recall     (AR) |0.50:0.95 |    all |  10 | 0.375 |
+| Average Recall     (AR) |0.50:0.95 |    all | 100 | 0.381 |
+| Average Recall     (AR) |0.50:0.95 |  small | 100 | 0.133 |
+| Average Recall     (AR) |0.50:0.95 | medium | 100 | 0.385 |
+| Average Recall     (AR) |0.50:0.95 |  large | 100 | 0.569 |
+
+Based on 1000 images from COCO Dataset for YOLOv8n model in ONNX Runtime framework in fp32 precision
+
+| Metric                  | IoU       | Area   | maxDets |Score  |
+|:---:                    |:---:      |:---:   |:---:    |:---:  |
+| Average Precision  (AP) |0.50:0.95 |    all | 100 | 0.338|
+| Average Precision  (AP) |0.50      |    all | 100 | 0.452|
+| Average Precision  (AP) |0.75      |    all | 100 | 0.370|
+| Average Precision  (AP) |0.50:0.95 |  small | 100 | 0.122|
+| Average Precision  (AP) |0.50:0.95 | medium | 100 | 0.351|
+| Average Precision  (AP) |0.50:0.95 |  large | 100 | 0.504|
+| Average Recall     (AR) |0.50:0.95 |    all |   1 | 0.265|
+| Average Recall     (AR) |0.50:0.95 |    all |  10 | 0.375|
+| Average Recall     (AR) |0.50:0.95 |    all | 100 | 0.381|
+| Average Recall     (AR) |0.50:0.95 |  small | 100 | 0.133|
+| Average Recall     (AR) |0.50:0.95 | medium | 100 | 0.385|
+| Average Recall     (AR) |0.50:0.95 |  large | 100 | 0.569|
+
+Based on 1000 images from COCO Dataset for YOLOv8x model in ONNX Runtime framework in fp32 precision
+
+| Metric                  | IoU       | Area   | maxDets |Score  |
+|:---:                    |:---:      |:---:   |:---:    |:---:  |
+| Average Precision  (AP) |0.50:0.95 |    all | 100 | 0.575|
+| Average Precision  (AP) |0.50      |    all | 100 | 0.714|
+| Average Precision  (AP) |0.75      |    all | 100 | 0.639|
+| Average Precision  (AP) |0.50:0.95 |  small | 100 | 0.336|
+| Average Precision  (AP) |0.50:0.95 | medium | 100 | 0.633|
+| Average Precision  (AP) |0.50:0.95 |  large | 100 | 0.812|
+| Average Recall     (AR) |0.50:0.95 |    all |   1 | 0.409|
+| Average Recall     (AR) |0.50:0.95 |    all |  10 | 0.611|
+| Average Recall     (AR) |0.50:0.95 |    all | 100 | 0.620|
+| Average Recall     (AR) |0.50:0.95 |  small | 100 | 0.361|
+| Average Recall     (AR) |0.50:0.95 | medium | 100 | 0.676|
+| Average Recall     (AR) |0.50:0.95 |  large | 100 | 0.849|
+
+
+### Dataset and model
+
+Dataset can be downloaded from here: https://cocodataset.org/#download
+
+PyTorch models in fp32 precision can be downloaded here:
+```
+wget https://github.com/ultralytics/assets/releases/download/v0.0.0/yolov8n.pt
+wget https://github.com/ultralytics/assets/releases/download/v0.0.0/yolov8s.pt
+wget https://github.com/ultralytics/assets/releases/download/v0.0.0/yolov8m.pt
+wget https://github.com/ultralytics/assets/releases/download/v0.0.0/yolov8l.pt
+wget https://github.com/ultralytics/assets/releases/download/v0.0.0/yolov8x.pt
+```
+
+You can export a PyTorch model to ONNX Runtime model using the following Python code:
+
+```python
+from ultralytics import YOLO
+model = YOLO('/path/to/yolov8n.pt')
+model.export(format='onnx')
+```
+
+### Running instructions
+
+Before running any code you should first export the PYTHONPATH variable with path pointing to the model zoo directory,
+as well as AIO_NUM_THREADS specifying the number of threads to be used.
+
+```
+export PYTHONPATH=/path/to/model_zoo
+export AIO_NUM_THREADS=1
+```
+
+For the best experience we also recommend setting environment variables as specified below.
+
+```
+export COCO_IMG_PATH=/path/to/images
+export COCO_ANNO_PATH=/path/to/annotations
+```
+
+Now you are able to run the run.py script. 
+
+To get detailed information on the script's recognized arguments run it with -h flag for help.
+
+The path to model (with a flag "-m") as well as its precision (with a flag "-p") have to be specified.
+
+Please note that the default batch size is 1 and if not specified otherwise the script will run for 1 minute.
+
+Example command: 
+
+```
+python3 run.py -m /path/to/model.onnx -p fp32 --framework ort
+```
+
+```
+python3 run.py -m /path/to/model.pt -p fp32 --framework pytorch
+```
diff --git a/computer_vision/object_detection/yolo_v11/run.py b/computer_vision/object_detection/yolo_v11/run.py
@@ -0,0 +1,116 @@
+# SPDX-License-Identifier: Apache-2.0
+# Copyright (c) 2025, Ampere Computing LLC
+try:
+    from utils import misc  # noqa
+except ModuleNotFoundError:
+    import os
+    import sys
+    filename = "set_env_variables.sh"
+    directory = os.path.realpath(__file__).split("/")[:-1]
+    for idx in range(1, len(directory) - 1):
+        subdir = "/".join(directory[:-idx])
+        if filename in os.listdir(subdir):
+            print(f"\nPlease run \033[91m'source {os.path.join(subdir, filename)}'\033[0m first.")
+            break
+    else:
+        print(f"\n\033[91mFAIL: Couldn't find {filename}, are you running this script as part of Ampere Model Library?"
+              f"\033[0m")
+    sys.exit(1)
+
+
+def parse_args():
+    import argparse
+    parser = argparse.ArgumentParser(description="Run YOLOv11 model.")
+    parser.add_argument("-m", "--model_path",
+                        type=str, required=True,
+                        help="path to the model")
+    parser.add_argument("-p", "--precision",
+                        type=str, choices=["fp32"], default="fp32",
+                        help="precision of the model provided")
+    parser.add_argument("-b", "--batch_size",
+                        type=int, default=1,
+                        help="batch size to feed the model with")
+    parser.add_argument("-f", "--framework",
+                        type=str,
+                        choices=["pytorch"], required=True,
+                        help="specify the framework in which a model should be run")
+    parser.add_argument("--timeout",
+                        type=float, default=60.0,
+                        help="timeout in seconds")
+    parser.add_argument("--num_runs",
+                        type=int,
+                        help="number of passes through network to execute")
+    parser.add_argument("--images_path",
+                        type=str,
+                        help="path to directory with COCO validation images")
+    parser.add_argument("--anno_path",
+                        type=str,
+                        help="path to file with validation annotations")
+    parser.add_argument("--disable_jit_freeze", action='store_true',
+                        help="if true model will be run not in jit freeze mode")
+    return parser.parse_args()
+
+
+def run_pytorch_fp(model_path, batch_size, num_runs, timeout, images_path, anno_path, disable_jit_freeze=False):
+    import torch
+    import os
+    from utils.cv.coco import COCODataset
+    from utils.benchmark import run_model
+
+    os.environ["YOLO_VERBOSE"] = os.getenv("YOLO_VERBOSE", "False")
+    # Ultralytics sets it to True by default. This way we suppress the logging by default while still allowing the user
+    # to set it to True if needed
+    from utils.pytorch import PyTorchRunner
+    from ultralytics.utils import nms
+
+    def run_single_pass(pytorch_runner, coco):
+        output = pytorch_runner.run(batch_size, coco.get_input_array((640, 640)))
+        output = nms.non_max_suppression(output)
+
+        for i in range(batch_size):
+            for d in range(output[i].shape[0]):
+                coco.submit_bbox_prediction(
+                    i,
+                    coco.convert_bbox_to_coco_order(output[i][d][:4].tolist()),
+                    output[i][d][4].item(),
+                    coco.translate_cat_id_to_coco(output[i][d][5].item())
+                )
+
+    dataset = COCODataset(batch_size, "RGB", "COCO_val2014_000000000000", images_path,
+                          anno_path, pre_processing="PyTorch_objdet", sort_ascending=True, order="NCHW")
+
+    from ultralytics import YOLO
+    model = YOLO(model_path)
+    torchscript_model = model.export(format="torchscript")
+
+    runner = PyTorchRunner(torch.jit.load(torchscript_model),
+                           disable_jit_freeze=disable_jit_freeze,
+                           example_inputs=torch.stack((dataset.get_input_array((640, 640)),)))
+
+    return run_model(run_single_pass, runner, dataset, batch_size, num_runs, timeout)
+
+
+def run_pytorch_fp32(model_path, batch_size, num_runs, timeout, images_path, anno_path, disable_jit_freeze, **kwargs):
+    return run_pytorch_fp(model_path, batch_size, num_runs, timeout, images_path, anno_path, disable_jit_freeze)
+
+
+def main():
+    from utils.misc import print_goodbye_message_and_die
+    args = parse_args()
+
+    if args.framework == "pytorch":
+        import torch
+        if torch.cuda.is_available():
+            run_pytorch_cuda(**vars(args))
+        elif args.precision == "fp32":
+            run_pytorch_fp32(**vars(args))
+        else:
+            print_goodbye_message_and_die(
+                "this model seems to be unsupported in a specified precision: " + args.precision)
+    else:
+        print_goodbye_message_and_die(
+            "this model seems to be unsupported in a specified framework: " + args.framework)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/computer_vision/object_detection/yolo_v5/run.py b/computer_vision/object_detection/yolo_v5/run.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
-# Copyright (c) 2024, Ampere Computing LLC
+# Copyright (c) 2025, Ampere Computing LLC
 try:
     from utils import misc  # noqa
 except ModuleNotFoundError:

diff --git a/computer_vision/object_detection/yolo_v8/run.py b/computer_vision/object_detection/yolo_v8/run.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
-# Copyright (c) 2024, Ampere Computing LLC
+# Copyright (c) 2025, Ampere Computing LLC
 try:
     from utils import misc  # noqa
 except ModuleNotFoundError:
@@ -61,15 +61,15 @@ def run_ort_fp32(model_path, batch_size, num_runs, timeout, images_path, anno_pa
     # Ultralytics sets it to True by default. This way we suppress the logging by default while still allowing the user
     # to set it to True if needed
     from utils.ort import OrtRunner
-    from ultralytics.yolo.utils import ops
+    from ultralytics.utils import nms
 
     def run_single_pass(ort_runner, coco):
         shape = (640, 640)
         ort_runner.set_input_tensor("images", coco.get_input_array(shape).astype("float32"))
         output = ort_runner.run(batch_size)
 
         output = torch.from_numpy(output[0])
-        output = ops.non_max_suppression(output)
+        output = nms.non_max_suppression(output)
 
         for i in range(batch_size):
             for d in range(output[i].shape[0]):
@@ -97,11 +97,11 @@ def run_pytorch_fp(model_path, batch_size, num_runs, timeout, images_path, anno_
     # Ultralytics sets it to True by default. This way we suppress the logging by default while still allowing the user
     # to set it to True if needed
     from utils.pytorch import PyTorchRunner
-    from ultralytics.yolo.utils import ops
+    from ultralytics.utils import nms
 
     def run_single_pass(pytorch_runner, coco):
         output = pytorch_runner.run(batch_size, coco.get_input_array((640, 640)))
-        output = ops.non_max_suppression(output)
+        output = nms.non_max_suppression(output)
 
         for i in range(batch_size):
             for d in range(output[i].shape[0]):
@@ -121,7 +121,7 @@ def run_single_pass(pytorch_runner, coco):
 
     runner = PyTorchRunner(torch.jit.load(torchscript_model),
                            disable_jit_freeze=disable_jit_freeze,
-                           example_inputs=torch.stack(dataset.get_input_array((640, 640))))
+                           example_inputs=torch.stack((dataset.get_input_array((640, 640)),)))
 
     return run_model(run_single_pass, runner, dataset, batch_size, num_runs, timeout)
 

diff --git a/natural_language_processing/extractive_question_answering/bert_large/run_mlperf.py b/natural_language_processing/extractive_question_answering/bert_large/run_mlperf.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
-# Copyright (c) 2024, Ampere Computing LLC
+# Copyright (c) 2025, Ampere Computing LLC
 try:
     from utils import misc  # noqa
 except ModuleNotFoundError:
@@ -43,6 +43,8 @@ def parse_args():
     parser.add_argument("--squad_path",
                         type=str,
                         help="path to directory with ImageNet validation images")
+    parser.add_argument("--fixed_input_size", type=int,
+                        help='size of the input')
     parser.add_argument("--disable_jit_freeze", action='store_true',
                         help="if true model will be run not in jit freeze mode")
     return parser.parse_args()
@@ -93,7 +95,7 @@ def run_tf_fp16(model_path, batch_size, num_runs, timeout, squad_path, **kwargs)
     return run_tf_fp(model_path, batch_size, num_runs, timeout, squad_path)
 
 
-def run_pytorch_fp(model_path, batch_size, num_runs, timeout, squad_path, disable_jit_freeze=False):
+def run_pytorch_fp(model_path, batch_size, num_runs, timeout, squad_path, fixed_input_size, disable_jit_freeze=False):
     from utils.benchmark import run_model
     from utils.nlp.squad import Squad_v1_1
     from transformers import AutoTokenizer, BertConfig, BertForQuestionAnswering
@@ -117,7 +119,11 @@ def run_single_pass(pytorch_runner, squad):
         padding=True, truncation=True, model_max_length=512)
 
     def tokenize(question, text):
-        return tokenizer(question, text, padding=True, truncation=True, return_tensors="pt")
+        if fixed_input_size is not None:
+            return tokenizer(question, text, padding="max_length", truncation=True,
+                             max_length=fixed_input_size, return_tensors="pt")
+        else:
+            return tokenizer(question, text, padding=True, truncation=True, return_tensors="pt")
 
     def detokenize(answer):
         return tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(answer))
@@ -199,8 +205,9 @@ def detokenize(answer):
     return run_model(run_single_pass, runner, dataset, batch_size, num_runs, timeout)
 
 
-def run_pytorch_fp32(model_path, batch_size, num_runs, timeout, squad_path, disable_jit_freeze, **kwargs):
-    return run_pytorch_fp(model_path, batch_size, num_runs, timeout, squad_path, disable_jit_freeze)
+def run_pytorch_fp32(model_path, batch_size, num_runs, timeout, squad_path, fixed_input_size, disable_jit_freeze,
+                     **kwargs):
+    return run_pytorch_fp(model_path, batch_size, num_runs, timeout, squad_path, fixed_input_size, disable_jit_freeze)
 
 
 def main():