From 4c91fd6b82f450515b69301b2fa7fa736afc3768 Mon Sep 17 00:00:00 2001
From: Damian Kosowski <damian@roboflow.com>
Date: Tue, 31 Mar 2026 13:27:27 +0200
Subject: [PATCH 01/20] Add function to retrieve comma-separated list of
 strings from environment variables in utils/environment.py

---
 .../inference_models/utils/environment.py     | 21 +++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/inference_models/inference_models/utils/environment.py b/inference_models/inference_models/utils/environment.py
index e43fd35553..3e5604fd60 100644
--- a/inference_models/inference_models/utils/environment.py
+++ b/inference_models/inference_models/utils/environment.py
@@ -72,6 +72,27 @@ def get_comma_separated_list_of_integers_from_env(
         )
 
 
+def get_comma_separated_list_of_strings_from_env(
+    variable_name: str,
+    default: Optional[List[str]] = None,
+) -> List[str]:
+    value = os.getenv(variable_name)
+    if value is None:
+        if default is None:
+            raise InvalidEnvVariable(
+                message=f"Environment variable {variable_name} is required",
+                help_url="https://inference-models.roboflow.com/errors/runtime-environment/#invalidenvvariable",
+            )
+        return default
+    try:
+        return [v.strip() for v in parse_comma_separated_values(value)]
+    except ValueError:
+        raise InvalidEnvVariable(
+            message=f"Expected a environment variable `{variable_name}` to be comma separated list of strings but got '{value}'",
+            help_url="https://inference-models.roboflow.com/errors/runtime-environment/#invalidenvvariable",
+        )
+
+
 def parse_comma_separated_values(values: str) -> List[str]:
     if not values:
         return []

From a1e760514d9392510c86a5672aa62c9d88ef26d6 Mon Sep 17 00:00:00 2001
From: Damian Kosowski <damian@roboflow.com>
Date: Tue, 31 Mar 2026 13:30:22 +0200
Subject: [PATCH 02/20] Add function to retrieve a string from environment
 variables with optional default value in utils/environment.py

---
 .../inference_models/utils/environment.py            | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/inference_models/inference_models/utils/environment.py b/inference_models/inference_models/utils/environment.py
index 3e5604fd60..b23c007d17 100644
--- a/inference_models/inference_models/utils/environment.py
+++ b/inference_models/inference_models/utils/environment.py
@@ -93,6 +93,18 @@ def get_comma_separated_list_of_strings_from_env(
         )
 
 
+def get_string_from_env(variable_name: str, default: Optional[str] = None) -> str:
+    value = os.getenv(variable_name)
+    if value is None:
+        if default is None:
+            raise InvalidEnvVariable(
+                message=f"Environment variable {variable_name} is required",
+                help_url="https://inference-models.roboflow.com/errors/runtime-environment/#invalidenvvariable",
+            )
+        return default
+    return value.strip()
+
+
 def parse_comma_separated_values(values: str) -> List[str]:
     if not values:
         return []

From b3b25cba6da0987af487873da9e452d0897d731d Mon Sep 17 00:00:00 2001
From: Damian Kosowski <damian@roboflow.com>
Date: Tue, 31 Mar 2026 13:30:48 +0200
Subject: [PATCH 03/20] Add default input name configurations for YOLO
 Ultralytics in configuration.py, utilizing new environment variable retrieval
 functions for strings and comma-separated lists.

---
 .../inference_models/configuration.py         | 27 +++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/inference_models/inference_models/configuration.py b/inference_models/inference_models/configuration.py
index a0a34b39fb..2e9370a1d0 100644
--- a/inference_models/inference_models/configuration.py
+++ b/inference_models/inference_models/configuration.py
@@ -5,8 +5,10 @@
 from inference_models.utils.environment import (
     get_boolean_from_env,
     get_comma_separated_list_of_integers_from_env,
+    get_comma_separated_list_of_strings_from_env,
     get_float_from_env,
     get_integer_from_env,
+    get_string_from_env,
     parse_comma_separated_values,
 )
 
@@ -350,6 +352,31 @@
     variable_name="INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_CLASS_AGNOSTIC_NMS",
     default=INFERENCE_MODELS_DEFAULT_CLASS_AGNOSTIC_NMS,
 )
+INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_IMAGES_INPUT_NAME = get_string_from_env(
+    variable_name="INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_IMAGES_INPUT_NAME",
+    default="images",
+)
+INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_CONFIDENCE_INPUT_NAME = get_string_from_env(
+    variable_name="INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_CONFIDENCE_INPUT_NAME",
+    default="confidence",
+)
+INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_IOU_THRESHOLD_INPUT_NAME = get_string_from_env(
+    variable_name="INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_IOU_THRESHOLD_INPUT_NAME",
+    default="iou_threshold",
+)
+INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_MAX_DETECTIONS_INPUT_NAME = get_string_from_env(
+    variable_name="INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_MAX_DETECTIONS_INPUT_NAME",
+    default="max_detections",
+)
+INFERENCE_MODELS_YOLO_ULTRALYTICS_DECLARED_FUSED_NMS_INPUT_NAMES = get_comma_separated_list_of_strings_from_env(
+    variable_name="INFERENCE_MODELS_YOLO_ULTRALYTICS_DECLARED_FUSED_NMS_INPUT_NAMES",
+    default=[
+        INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_IMAGES_INPUT_NAME,
+        INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_CONFIDENCE_INPUT_NAME,
+        INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_IOU_THRESHOLD_INPUT_NAME,
+        INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_MAX_DETECTIONS_INPUT_NAME,
+    ],
+)
 INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_KEY_POINTS_THRESHOLD = get_float_from_env(
     variable_name="INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_KEY_POINTS_THRESHOLD",
     default=0.0,

From f4aef5cec347c8f9b74a0497ed8bb33d9e53dc39 Mon Sep 17 00:00:00 2001
From: Damian Kosowski <damian@roboflow.com>
Date: Tue, 31 Mar 2026 13:31:27 +0200
Subject: [PATCH 04/20] Enhance YOLOv8 ONNX model input handling by adding
 support for multiple input names and validating against declared fused NMS
 input names. Update forward method to dynamically build input tensors based
 on configuration, improving flexibility and error handling.

---
 .../yolov8/yolov8_object_detection_onnx.py    | 73 ++++++++++++++++---
 1 file changed, 63 insertions(+), 10 deletions(-)

diff --git a/inference_models/inference_models/models/yolov8/yolov8_object_detection_onnx.py b/inference_models/inference_models/models/yolov8/yolov8_object_detection_onnx.py
index e7607a7c12..1342dfc7af 100644
--- a/inference_models/inference_models/models/yolov8/yolov8_object_detection_onnx.py
+++ b/inference_models/inference_models/models/yolov8/yolov8_object_detection_onnx.py
@@ -11,6 +11,11 @@
     INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_CONFIDENCE,
     INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_IOU_THRESHOLD,
     INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_MAX_DETECTIONS,
+    INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_IMAGES_INPUT_NAME,
+    INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_CONFIDENCE_INPUT_NAME,
+    INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_IOU_THRESHOLD_INPUT_NAME,
+    INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_MAX_DETECTIONS_INPUT_NAME,
+    INFERENCE_MODELS_YOLO_ULTRALYTICS_DECLARED_FUSED_NMS_INPUT_NAMES,
 )
 from inference_models.entities import ColorFormat
 from inference_models.errors import (
@@ -127,13 +132,28 @@ def from_pretrained(
             path_or_bytes=model_package_content["weights.onnx"],
             providers=onnx_execution_providers,
         )
-        input_batch_size = session.get_inputs()[0].shape[0]
+        onnx_graph_inputs = session.get_inputs()
+
+        input_batch_size = onnx_graph_inputs[0].shape[0]
         if isinstance(input_batch_size, str):
             input_batch_size = None
-        input_name = session.get_inputs()[0].name
+
+        input_names = [input.name for input in onnx_graph_inputs]
+
+        if inference_config.post_processing.fused:
+            if input_names != INFERENCE_MODELS_YOLO_ULTRALYTICS_DECLARED_FUSED_NMS_INPUT_NAMES:
+                raise CorruptedModelPackageError(
+                    message=(
+                        f"Fused NMS YOLOv8 ONNX model must declare input names exactly as: "
+                        f"{INFERENCE_MODELS_YOLO_ULTRALYTICS_DECLARED_FUSED_NMS_INPUT_NAMES}. "
+                        f"Got: {input_names}"
+                    ),
+                    help_url="https://inference-models.roboflow.com/errors/model-loading/#corruptedmodelpackageerror",
+                )
+
         return cls(
             session=session,
-            input_name=input_name,
+            input_names=input_names,
             class_names=class_names,
             inference_config=inference_config,
             device=device,
@@ -143,14 +163,14 @@ def from_pretrained(
     def __init__(
         self,
         session: onnxruntime.InferenceSession,
-        input_name: str,
+        input_names: List[str],
         inference_config: InferenceConfig,
         class_names: List[str],
         device: torch.device,
         input_batch_size: Optional[int],
     ):
         self._session = session
-        self._input_name = input_name
+        self._input_names = input_names
         self._inference_config = inference_config
         self._class_names = class_names
         self._device = device
@@ -184,11 +204,46 @@ def pre_process(
             pre_processing_overrides=pre_processing_overrides,
         )
 
-    def forward(self, pre_processed_images: torch.Tensor, **kwargs) -> torch.Tensor:
+    def forward(
+        self,
+        pre_processed_images: torch.Tensor,
+        confidence: float = INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_CONFIDENCE,
+        iou_threshold: float = INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_IOU_THRESHOLD,
+        max_detections: int = INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_MAX_DETECTIONS,
+        **kwargs,
+    ) -> torch.Tensor:
         with self._session_thread_lock:
+            device = pre_processed_images.device
+
+            input_builders = {
+                INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_IMAGES_INPUT_NAME: lambda: pre_processed_images,
+            }
+
+            if self._inference_config.post_processing.fused:
+                input_builders[INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_CONFIDENCE_INPUT_NAME] = lambda: torch.tensor(
+                    float(confidence), dtype=torch.float32, device=device
+                )
+                input_builders[INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_IOU_THRESHOLD_INPUT_NAME] = lambda: torch.tensor(
+                    float(iou_threshold), dtype=torch.float32, device=device
+                )
+                input_builders[INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_MAX_DETECTIONS_INPUT_NAME] = lambda: torch.tensor(
+                    int(max_detections), dtype=torch.int32, device=device
+                )
+
+            try:
+                inputs = {name: input_builders[name]() for name in self._input_names}
+            except KeyError as e:
+                raise CorruptedModelPackageError(
+                    message=(
+                        f"Unknown ONNX input name declared by model: {e.args[0]}. "
+                        f"Available runtime builders: {list(input_builders.keys())}."
+                    ),
+                    help_url="https://inference-models.roboflow.com/errors/model-loading/#corruptedmodelpackageerror",
+                )
+
             return run_onnx_session_with_batch_size_limit(
                 session=self._session,
-                inputs={self._input_name: pre_processed_images},
+                inputs=inputs,
                 min_batch_size=self._min_batch_size,
                 max_batch_size=self._max_batch_size,
             )[0]
@@ -204,9 +259,7 @@ def post_process(
         **kwargs,
     ) -> List[Detections]:
         if self._inference_config.post_processing.fused:
-            nms_results = post_process_nms_fused_model_output(
-                output=model_results, conf_thresh=confidence
-            )
+            nms_results = model_results
         else:
             nms_results = run_nms_for_object_detection(
                 output=model_results,

From 9164869601836e3372bcb5c50c06b67d728c9e69 Mon Sep 17 00:00:00 2001
From: Damian Kosowski <damian@roboflow.com>
Date: Wed, 1 Apr 2026 10:35:59 +0200
Subject: [PATCH 05/20] Refactor YOLOv8 ONNX model input validation to enhance
 error handling and logging. Update input name checks for fused NMS and modify
 forward method to conditionally build input tensors based on provided
 parameters, improving flexibility in model configuration.

---
 .../inference_models/configuration.py         |  2 +-
 .../yolov8/yolov8_object_detection_onnx.py    | 77 ++++++++++++-------
 2 files changed, 52 insertions(+), 27 deletions(-)

diff --git a/inference_models/inference_models/configuration.py b/inference_models/inference_models/configuration.py
index 2e9370a1d0..21d8b88b0d 100644
--- a/inference_models/inference_models/configuration.py
+++ b/inference_models/inference_models/configuration.py
@@ -366,7 +366,7 @@
 )
 INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_MAX_DETECTIONS_INPUT_NAME = get_string_from_env(
     variable_name="INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_MAX_DETECTIONS_INPUT_NAME",
-    default="max_detections",
+    default="max_output_boxes_per_class",
 )
 INFERENCE_MODELS_YOLO_ULTRALYTICS_DECLARED_FUSED_NMS_INPUT_NAMES = get_comma_separated_list_of_strings_from_env(
     variable_name="INFERENCE_MODELS_YOLO_ULTRALYTICS_DECLARED_FUSED_NMS_INPUT_NAMES",
diff --git a/inference_models/inference_models/models/yolov8/yolov8_object_detection_onnx.py b/inference_models/inference_models/models/yolov8/yolov8_object_detection_onnx.py
index 1342dfc7af..d9b306fb60 100644
--- a/inference_models/inference_models/models/yolov8/yolov8_object_detection_onnx.py
+++ b/inference_models/inference_models/models/yolov8/yolov8_object_detection_onnx.py
@@ -18,6 +18,7 @@
     INFERENCE_MODELS_YOLO_ULTRALYTICS_DECLARED_FUSED_NMS_INPUT_NAMES,
 )
 from inference_models.entities import ColorFormat
+from inference_models.logger import LOGGER
 from inference_models.errors import (
     CorruptedModelPackageError,
     EnvironmentConfigurationError,
@@ -36,7 +37,6 @@
     parse_inference_config,
 )
 from inference_models.models.common.roboflow.post_processing import (
-    post_process_nms_fused_model_output,
     rescale_detections,
     run_nms_for_object_detection,
 )
@@ -141,16 +141,47 @@ def from_pretrained(
         input_names = [input.name for input in onnx_graph_inputs]
 
         if inference_config.post_processing.fused:
-            if input_names != INFERENCE_MODELS_YOLO_ULTRALYTICS_DECLARED_FUSED_NMS_INPUT_NAMES:
+            expected_fused_inputs = (
+                INFERENCE_MODELS_YOLO_ULTRALYTICS_DECLARED_FUSED_NMS_INPUT_NAMES
+            )
+            expected_fused_input_set = set(expected_fused_inputs)
+            if (
+                INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_IMAGES_INPUT_NAME
+                not in input_names
+            ):
                 raise CorruptedModelPackageError(
                     message=(
-                        f"Fused NMS YOLOv8 ONNX model must declare input names exactly as: "
-                        f"{INFERENCE_MODELS_YOLO_ULTRALYTICS_DECLARED_FUSED_NMS_INPUT_NAMES}. "
+                        f"Fused NMS YOLOv8 ONNX model must declare the images input "
+                        f"({INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_IMAGES_INPUT_NAME!r}). "
                         f"Got: {input_names}"
                     ),
                     help_url="https://inference-models.roboflow.com/errors/model-loading/#corruptedmodelpackageerror",
                 )
 
+            unexpected_inputs = [
+                n for n in input_names if n not in expected_fused_input_set
+            ]
+            if unexpected_inputs:
+                raise CorruptedModelPackageError(
+                    message=(
+                        f"Fused NMS YOLOv8 ONNX model has unexpected inputs {unexpected_inputs}. "
+                        f"Expected each name to be one of: {expected_fused_inputs}"
+                    ),
+                    help_url="https://inference-models.roboflow.com/errors/model-loading/#corruptedmodelpackageerror",
+                )
+
+            missing_fused_inputs = [
+                n for n in expected_fused_inputs if n not in input_names
+            ]
+            if missing_fused_inputs:
+                LOGGER.warning(
+                    "Fused NMS ONNX graph omits inputs %s; they will not be passed at "
+                    "inference time and ONNX Runtime will use graph initializer defaults for those parameters. "
+                    "Python arguments matching omitted inputs (e.g. confidence, iou_threshold, max_detections) "
+                    "will not affect the fused NMS stage.",
+                    missing_fused_inputs,
+                )
+
         return cls(
             session=session,
             input_names=input_names,
@@ -207,9 +238,9 @@ def pre_process(
     def forward(
         self,
         pre_processed_images: torch.Tensor,
-        confidence: float = INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_CONFIDENCE,
-        iou_threshold: float = INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_IOU_THRESHOLD,
-        max_detections: int = INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_MAX_DETECTIONS,
+        confidence: Optional[float] = None,
+        iou_threshold: Optional[float] = None,
+        max_detections: Optional[int] = None,
         **kwargs,
     ) -> torch.Tensor:
         with self._session_thread_lock:
@@ -220,26 +251,20 @@ def forward(
             }
 
             if self._inference_config.post_processing.fused:
-                input_builders[INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_CONFIDENCE_INPUT_NAME] = lambda: torch.tensor(
-                    float(confidence), dtype=torch.float32, device=device
-                )
-                input_builders[INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_IOU_THRESHOLD_INPUT_NAME] = lambda: torch.tensor(
-                    float(iou_threshold), dtype=torch.float32, device=device
-                )
-                input_builders[INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_MAX_DETECTIONS_INPUT_NAME] = lambda: torch.tensor(
-                    int(max_detections), dtype=torch.int32, device=device
-                )
+                if confidence is not None:
+                    input_builders[INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_CONFIDENCE_INPUT_NAME] = lambda: torch.tensor(
+                        float(confidence), dtype=torch.float32, device=device
+                    )
+                if iou_threshold is not None:
+                    input_builders[INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_IOU_THRESHOLD_INPUT_NAME] = lambda: torch.tensor(
+                        float(iou_threshold), dtype=torch.float32, device=device
+                    )
+                if max_detections is not None:
+                    input_builders[INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_MAX_DETECTIONS_INPUT_NAME] = lambda: torch.tensor(
+                        int(max_detections), dtype=torch.int32, device=device
+                    )
 
-            try:
-                inputs = {name: input_builders[name]() for name in self._input_names}
-            except KeyError as e:
-                raise CorruptedModelPackageError(
-                    message=(
-                        f"Unknown ONNX input name declared by model: {e.args[0]}. "
-                        f"Available runtime builders: {list(input_builders.keys())}."
-                    ),
-                    help_url="https://inference-models.roboflow.com/errors/model-loading/#corruptedmodelpackageerror",
-                )
+            inputs = {name: builder_fn() for name, builder_fn in input_builders.items()}
 
             return run_onnx_session_with_batch_size_limit(
                 session=self._session,

From f53b1e868df2c2fb2711cf1f84075adf4c10db55 Mon Sep 17 00:00:00 2001
From: Damian Kosowski <damian@roboflow.com>
Date: Wed, 1 Apr 2026 10:54:05 +0200
Subject: [PATCH 06/20] Add a new script for running fused NMS inference with
 YOLOv8 model. The script includes command-line options for specifying image
 and model paths, as well as parameters for confidence, IOU threshold, and
 maximum detections. It handles image loading, model initialization, and
 outputs detection results with bounding box coordinates and confidence
 scores.

---
 examples/fused-nms/run_fused_nms_inference.py | 88 +++++++++++++++++++
 1 file changed, 88 insertions(+)
 create mode 100644 examples/fused-nms/run_fused_nms_inference.py

diff --git a/examples/fused-nms/run_fused_nms_inference.py b/examples/fused-nms/run_fused_nms_inference.py
new file mode 100644
index 0000000000..55c2a3f4a5
--- /dev/null
+++ b/examples/fused-nms/run_fused_nms_inference.py
@@ -0,0 +1,88 @@
+from pathlib import Path
+
+import click
+import cv2
+from typing import Optional
+from inference_models import AutoModel
+
+
+@click.command()
+@click.option(
+    "--image-path",
+    type=click.Path(path_type=Path, exists=True, dir_okay=False, readable=True),
+    required=True,
+    help="Path to the input image.",
+)
+@click.option(
+    "--model-path",
+    type=click.Path(path_type=Path, exists=True, dir_okay=True, readable=True),
+    required=True,
+    help="Path to the model directory.",
+)
+@click.option(
+    "--confidence",
+    type=float,
+    help="Confidence threshold used by post-processing.",
+)
+@click.option(
+    "--iou-threshold",
+    type=float,
+    help="IOU threshold used by post-processing.",
+)
+@click.option(
+    "--max-detections",
+    type=int,
+    help="Maximum number of detections used by post-processing.",
+)
+def main(
+    image_path: Path,
+    model_path: Path,
+    confidence: Optional[float] = None,
+    iou_threshold: Optional[float] = None,
+    max_detections: Optional[int] = None,
+) -> None:
+    image = cv2.imread(str(image_path))
+    if image is None:
+        raise click.ClickException(f"Could not load image from: {image_path}")
+
+    click.echo(f"Loading model: {model_path}")
+    model = AutoModel.from_pretrained(
+        model_path,
+        confidence=confidence,
+        iou_threshold=iou_threshold,
+        max_detections=max_detections,
+    )
+
+    click.echo(f"Fused NMS available: {model._inference_config.post_processing.fused}")
+
+    forward_pass = model._inference_config.forward_pass
+    if forward_pass.static_batch_size is None:
+        max_dyn = forward_pass.max_dynamic_batch_size
+        if max_dyn is not None:
+            click.echo(
+                "Batching: dynamic mode (no static batch size); "
+                f"maximum batch size is {max_dyn}."
+            )
+        else:
+            click.echo(
+                "Batching: dynamic mode (no static batch size); "
+                "max_dynamic_batch_size is not set in the model config."
+            )
+
+    click.echo("Running inference...")
+    predictions = model(image, confidence=confidence)
+    detections = predictions[0].to_supervision()
+
+    click.echo(f"Detected {len(detections)} objects")
+    for idx, (xyxy, class_id, conf) in enumerate(
+        zip(detections.xyxy, detections.class_id, detections.confidence), start=1
+    ):
+        x1, y1, x2, y2 = [int(v) for v in xyxy.tolist()]
+        click.echo(
+            f"[{idx}] class_id={int(class_id)} confidence={float(conf):.4f} "
+            f"bbox=({x1}, {y1}, {x2}, {y2})"
+        )
+
+
+if __name__ == "__main__":
+    main()

From 91b1359b7732a2ad4ac22fb0157e5bddad62c91a Mon Sep 17 00:00:00 2001
From: Damian Kosowski <damian@roboflow.com>
Date: Wed, 1 Apr 2026 19:07:54 +0200
Subject: [PATCH 07/20] Move the fused NMS inference script under
 inference_models/examples. The script now loads the model on CPU with the
 CPUExecutionProvider and passes only the user-provided NMS parameters
 (confidence, IOU threshold, maximum detections) at call time instead of at
 model load. In the YOLOv8 ONNX forward method, build fused NMS input tensors
 of shape (batch_size,) only for the inputs the ONNX graph declares.

---
 .../fused-nms/run_fused_nms_inference.py      | 17 ++++++++++---
 .../yolov8/yolov8_object_detection_onnx.py    | 25 ++++++++++---------
 2 files changed, 26 insertions(+), 16 deletions(-)
 rename {examples => inference_models/examples}/fused-nms/run_fused_nms_inference.py (84%)

diff --git a/examples/fused-nms/run_fused_nms_inference.py b/inference_models/examples/fused-nms/run_fused_nms_inference.py
similarity index 84%
rename from examples/fused-nms/run_fused_nms_inference.py
rename to inference_models/examples/fused-nms/run_fused_nms_inference.py
index 55c2a3f4a5..a36ba41b75 100644
--- a/examples/fused-nms/run_fused_nms_inference.py
+++ b/inference_models/examples/fused-nms/run_fused_nms_inference.py
@@ -45,12 +45,21 @@ def main(
     if image is None:
         raise click.ClickException(f"Could not load image from: {image_path}")
 
+    nms_params = {
+        "confidence": confidence,
+        "iou_threshold": iou_threshold,
+        "max_detections": max_detections,
+    }
+
+    nms_params = {name: value for name, value in nms_params.items() if value is not None}
+    if nms_params:
+        click.echo(f"User provided NMS parameters: {nms_params}")
+
     click.echo(f"Loading model: {model_path}")
     model = AutoModel.from_pretrained(
         model_path,
-        confidence=confidence,
-        iou_threshold=iou_threshold,
-        max_detections=max_detections,
+        onnx_execution_providers=["CPUExecutionProvider"],
+        device="cpu",
     )
 
     click.echo(f"Fused NMS available: {model._inference_config.post_processing.fused}")
@@ -70,7 +79,7 @@ def main(
             )
 
     click.echo("Running inference...")
-    predictions = model(image, confidence=confidence)
+    predictions = model(image, **nms_params)
     detections = predictions[0].to_supervision()
 
     click.echo(f"Detected {len(detections)} objects")
diff --git a/inference_models/inference_models/models/yolov8/yolov8_object_detection_onnx.py b/inference_models/inference_models/models/yolov8/yolov8_object_detection_onnx.py
index d9b306fb60..cdcccd616b 100644
--- a/inference_models/inference_models/models/yolov8/yolov8_object_detection_onnx.py
+++ b/inference_models/inference_models/models/yolov8/yolov8_object_detection_onnx.py
@@ -238,30 +238,31 @@ def pre_process(
     def forward(
         self,
         pre_processed_images: torch.Tensor,
-        confidence: Optional[float] = None,
-        iou_threshold: Optional[float] = None,
-        max_detections: Optional[int] = None,
+        confidence: Optional[float] = INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_CONFIDENCE,
+        iou_threshold: Optional[float] = INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_IOU_THRESHOLD,
+        max_detections: Optional[int] = INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_MAX_DETECTIONS,
         **kwargs,
     ) -> torch.Tensor:
         with self._session_thread_lock:
             device = pre_processed_images.device
+            batch_size = pre_processed_images.shape[0]
 
             input_builders = {
                 INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_IMAGES_INPUT_NAME: lambda: pre_processed_images,
             }
 
             if self._inference_config.post_processing.fused:
-                if confidence is not None:
-                    input_builders[INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_CONFIDENCE_INPUT_NAME] = lambda: torch.tensor(
-                        float(confidence), dtype=torch.float32, device=device
+                if INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_CONFIDENCE_INPUT_NAME in self._input_names:
+                    input_builders[INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_CONFIDENCE_INPUT_NAME] = lambda: torch.full(
+                        (batch_size,), float(confidence), dtype=torch.float32, device=device
                     )
-                if iou_threshold is not None:
-                    input_builders[INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_IOU_THRESHOLD_INPUT_NAME] = lambda: torch.tensor(
-                        float(iou_threshold), dtype=torch.float32, device=device
+                if INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_IOU_THRESHOLD_INPUT_NAME in self._input_names:
+                    input_builders[INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_IOU_THRESHOLD_INPUT_NAME] = lambda: torch.full(
+                        (batch_size,), float(iou_threshold), dtype=torch.float32, device=device
                     )
-                if max_detections is not None:
-                    input_builders[INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_MAX_DETECTIONS_INPUT_NAME] = lambda: torch.tensor(
-                        int(max_detections), dtype=torch.int32, device=device
+                if INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_MAX_DETECTIONS_INPUT_NAME in self._input_names:
+                    input_builders[INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_MAX_DETECTIONS_INPUT_NAME] = lambda: torch.full(
+                        (batch_size,), int(max_detections), dtype=torch.int64, device=device
                     )
 
             inputs = {name: builder_fn() for name, builder_fn in input_builders.items()}

From 33acdfcf11715448966d29d144fd6b6edbe88963 Mon Sep 17 00:00:00 2001
From: Damian Kosowski <damian@roboflow.com>
Date: Wed, 1 Apr 2026 22:14:22 +0200
Subject: [PATCH 08/20] Restore fused NMS post-processing in YOLOv8 ONNX
 model. Update the post_process method to call the
 post_process_nms_fused_model_output function again (re-adding its import)
 when fused NMS is enabled, instead of passing raw model results through.

---
 .../models/yolov8/yolov8_object_detection_onnx.py            | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/inference_models/inference_models/models/yolov8/yolov8_object_detection_onnx.py b/inference_models/inference_models/models/yolov8/yolov8_object_detection_onnx.py
index cdcccd616b..aebf2a3091 100644
--- a/inference_models/inference_models/models/yolov8/yolov8_object_detection_onnx.py
+++ b/inference_models/inference_models/models/yolov8/yolov8_object_detection_onnx.py
@@ -37,6 +37,7 @@
     parse_inference_config,
 )
 from inference_models.models.common.roboflow.post_processing import (
+    post_process_nms_fused_model_output,
     rescale_detections,
     run_nms_for_object_detection,
 )
@@ -285,7 +286,9 @@ def post_process(
         **kwargs,
     ) -> List[Detections]:
         if self._inference_config.post_processing.fused:
-            nms_results = model_results
+            nms_results = post_process_nms_fused_model_output(
+                output=model_results, conf_thresh=confidence
+            )
         else:
             nms_results = run_nms_for_object_detection(
                 output=model_results,

From f82585450b9d6684491a3fcb22c1437b873a0f44 Mon Sep 17 00:00:00 2001
From: Damian Kosowski <damian@roboflow.com>
Date: Wed, 1 Apr 2026 22:25:05 +0200
Subject: [PATCH 09/20] Add benchmarking options to fused NMS inference script.
 Introduced command-line parameters for specifying the number of benchmark
 iterations and warmup runs, allowing users to measure inference latency with
 mean, median, and standard deviation statistics. Enhanced the main function
 to handle these new options while maintaining existing inference
 functionality.

---
 .../fused-nms/run_fused_nms_inference.py      | 60 ++++++++++++++++++-
 1 file changed, 57 insertions(+), 3 deletions(-)

diff --git a/inference_models/examples/fused-nms/run_fused_nms_inference.py b/inference_models/examples/fused-nms/run_fused_nms_inference.py
index a36ba41b75..a256e59f10 100644
--- a/inference_models/examples/fused-nms/run_fused_nms_inference.py
+++ b/inference_models/examples/fused-nms/run_fused_nms_inference.py
@@ -1,8 +1,11 @@
+import statistics
+import time
 from pathlib import Path
+from typing import Optional
 
 import click
 import cv2
-from typing import Optional
+
 from inference_models import AutoModel
 
 
@@ -34,12 +37,32 @@
     type=int,
     help="Maximum number of detections used by post-processing.",
 )
+@click.option(
+    "-n",
+    "--benchmark-iters",
+    type=click.IntRange(min=0),
+    default=0,
+    show_default=True,
+    help=(
+        "Number of timed inference runs for benchmarking (mean/median/std in ms). "
+        "0 runs inference once without benchmark stats."
+    ),
+)
+@click.option(
+    "--warmup",
+    type=click.IntRange(min=0),
+    default=5,
+    show_default=True,
+    help="Untimed warmup runs before timed iterations (only used when -n > 0).",
+)
 def main(
     image_path: Path,
     model_path: Path,
     confidence: Optional[float] = None,
     iou_threshold: Optional[float] = None,
     max_detections: Optional[int] = None,
+    benchmark_iters: int = 0,
+    warmup: int = 5,
 ) -> None:
     image = cv2.imread(str(image_path))
     if image is None:
@@ -78,8 +101,39 @@ def main(
                 "max_dynamic_batch_size is not set in the model config."
             )
 
-    click.echo("Running inference...")
-    predictions = model(image, **nms_params)
+    if benchmark_iters > 0:
+        if warmup > 0:
+            click.echo(f"Warmup: {warmup} untimed run(s)...")
+            for _ in range(warmup):
+                model(image, **nms_params)
+
+        click.echo(f"Benchmark: {benchmark_iters} timed run(s)...")
+
+        latencies_s: list[float] = []
+        predictions = None
+        for _ in range(benchmark_iters):
+            t0 = time.perf_counter()
+            predictions = model(image, **nms_params)
+            latencies_s.append(time.perf_counter() - t0)
+
+        latencies_ms = [t * 1000.0 for t in latencies_s]
+        mean_ms = statistics.mean(latencies_ms)
+        median_ms = statistics.median(latencies_ms)
+
+        if len(latencies_ms) > 1:
+            stdev_str = f"{statistics.stdev(latencies_ms):.4f}"
+        else:
+            stdev_str = "n/a (use -n 2 or more for std)"
+            
+        click.echo(
+            f"Inference latency (ms): mean={mean_ms:.4f}, median={median_ms:.4f}, "
+            f"std={stdev_str}"
+        )
+    else:
+        click.echo("Running inference...")
+        predictions = model(image, **nms_params)
+
+    assert predictions is not None
     detections = predictions[0].to_supervision()
 
     click.echo(f"Detected {len(detections)} objects")

From ad6e986d5c4d5364d40da3e4794218f9133da82d Mon Sep 17 00:00:00 2001
From: Damian Kosowski <damian@roboflow.com>
Date: Wed, 1 Apr 2026 23:15:57 +0200
Subject: [PATCH 10/20] Enhance fused NMS inference script with JSON reporting.
 Added functions to generate and write latency and prediction reports in JSON
 format, including detailed statistics on inference latencies. Updated
 command-line options to specify target directory for output files, improving
 usability and organization of results.

---
 .../fused-nms/run_fused_nms_inference.py      | 132 ++++++++++++------
 1 file changed, 92 insertions(+), 40 deletions(-)

diff --git a/inference_models/examples/fused-nms/run_fused_nms_inference.py b/inference_models/examples/fused-nms/run_fused_nms_inference.py
index a256e59f10..a53e52009d 100644
--- a/inference_models/examples/fused-nms/run_fused_nms_inference.py
+++ b/inference_models/examples/fused-nms/run_fused_nms_inference.py
@@ -1,14 +1,59 @@
+import json
 import statistics
 import time
 from pathlib import Path
-from typing import Optional
+from typing import Any, Optional
 
 import click
 import cv2
+import supervision as sv
 
 from inference_models import AutoModel
 
 
+def _detections_to_json_dict(detections: sv.Detections) -> dict[str, Any]:
+    return {
+        "xyxy": detections.xyxy.tolist(),
+        "mask": detections.mask.tolist() if detections.mask is not None else None,
+        "confidence": (
+            detections.confidence.tolist()
+            if detections.confidence is not None
+            else None
+        ),
+        "class_id": (
+            detections.class_id.tolist() if detections.class_id is not None else None
+        ),
+    }
+
+
+def _latency_report_dict(
+    *,
+    image_path: Path,
+    model_path: Path,
+    warmup_runs: int,
+    latencies_ms: list[float],
+) -> dict[str, Any]:
+    n = len(latencies_ms)
+    return {
+        "image_path": str(image_path.resolve()),
+        "model_path": str(model_path.resolve()),
+        "warmup_runs": warmup_runs,
+        "timed_runs": n,
+        "unit": "ms",
+        "latencies_ms": latencies_ms,
+        "mean_ms": statistics.mean(latencies_ms) if n else None,
+        "median_ms": statistics.median(latencies_ms) if n else None,
+        "min_ms": min(latencies_ms) if n else None,
+        "max_ms": max(latencies_ms) if n else None,
+        "std_ms": statistics.stdev(latencies_ms) if n > 1 else None,
+    }
+
+
+def _write_json(path: Path, payload: dict[str, Any]) -> None:
+    path.parent.mkdir(parents=True, exist_ok=True)
+    path.write_text(json.dumps(payload, indent=2) + "\n")
+
+
 @click.command()
 @click.option(
     "--image-path",
@@ -40,8 +85,8 @@
 @click.option(
     "-n",
     "--benchmark-iters",
-    type=click.IntRange(min=0),
-    default=0,
+    type=click.IntRange(min=1),
+    default=1,
     show_default=True,
     help=(
         "Number of timed inference runs for benchmarking (mean/median/std in ms). "
@@ -53,11 +98,18 @@
     type=click.IntRange(min=0),
     default=5,
     show_default=True,
-    help="Untimed warmup runs before timed iterations (only used when -n > 0).",
+    help="Untimed warmup runs before timed iterations.",
+)
+@click.option(
+    "--target-dir",
+    type=click.Path(path_type=Path, file_okay=False),
+    required=True,
+    help="Directory for latency.json and prediction.json (created if missing).",
 )
 def main(
     image_path: Path,
     model_path: Path,
+    target_dir: Path,
     confidence: Optional[float] = None,
     iou_threshold: Optional[float] = None,
     max_detections: Optional[int] = None,
@@ -101,51 +153,51 @@ def main(
                 "max_dynamic_batch_size is not set in the model config."
             )
 
-    if benchmark_iters > 0:
-        if warmup > 0:
-            click.echo(f"Warmup: {warmup} untimed run(s)...")
-            for _ in range(warmup):
-                model(image, **nms_params)
-
-        click.echo(f"Benchmark: {benchmark_iters} timed run(s)...")
+    latencies_ms: list[float] = []
 
-        latencies_s: list[float] = []
-        predictions = None
-        for _ in range(benchmark_iters):
-            t0 = time.perf_counter()
-            predictions = model(image, **nms_params)
-            latencies_s.append(time.perf_counter() - t0)
+    if warmup > 0:
+        click.echo(f"Warmup: {warmup} untimed runs...")
+        for _ in range(warmup):
+            model(image, **nms_params)
 
-        latencies_ms = [t * 1000.0 for t in latencies_s]
-        mean_ms = statistics.mean(latencies_ms)
-        median_ms = statistics.median(latencies_ms)
+    click.echo(f"Benchmarking: {benchmark_iters} timed runs...")
 
-        if len(latencies_ms) > 1:
-            stdev_str = f"{statistics.stdev(latencies_ms):.4f}"
-        else:
-            stdev_str = "n/a (use -n 2 or more for std)"
-            
-        click.echo(
-            f"Inference latency (ms): mean={mean_ms:.4f}, median={median_ms:.4f}, "
-            f"std={stdev_str}"
-        )
-    else:
-        click.echo("Running inference...")
+    predictions = None
+    for _ in range(benchmark_iters):
+        t0 = time.perf_counter()
         predictions = model(image, **nms_params)
+        latencies_ms.append((time.perf_counter() - t0) * 1000.0)
 
     assert predictions is not None
+
+    click.echo("Writing reports ...")
+
     detections = predictions[0].to_supervision()
+    pred_payload: dict[str, Any] = {
+        "image_path": str(image_path.resolve()),
+        "detections": _detections_to_json_dict(detections),
+    }
 
-    click.echo(f"Detected {len(detections)} objects")
-    for idx, (xyxy, class_id, conf) in enumerate(
-        zip(detections.xyxy, detections.class_id, detections.confidence), start=1
-    ):
-        x1, y1, x2, y2 = [int(v) for v in xyxy.tolist()]
-        click.echo(
-            f"[{idx}] class_id={int(class_id)} confidence={float(conf):.4f} "
-            f"bbox=({x1}, {y1}, {x2}, {y2})"
-        )
+    target_dir.mkdir(parents=True, exist_ok=True)
+    latency_path = target_dir / "latency.json"
+    prediction_path = target_dir / "prediction.json"
+    nms_params_path = target_dir / "nms_params.json"
+    inference_config_path = target_dir / "inference_config.json"
+
+    _write_json(
+        latency_path,
+        _latency_report_dict(
+            image_path=image_path,
+            model_path=model_path,
+            warmup_runs=warmup,
+            latencies_ms=latencies_ms,
+        ),
+    )
+    _write_json(prediction_path, pred_payload)
+    _write_json(nms_params_path, nms_params)
+    _write_json(inference_config_path, model._inference_config.model_dump_json())
 
+    click.echo("Done!")
 
 if __name__ == "__main__":
     main()

From 178a6fe41460e26b1cf86b11af1b8e27cf691dcf Mon Sep 17 00:00:00 2001
From: Damian Kosowski <damian@roboflow.com>
Date: Wed, 1 Apr 2026 23:20:27 +0200
Subject: [PATCH 11/20] Add a run-name command-line option to the fused NMS
 inference script and move the target-directory option next to the other path
 options. Updated output file paths to include the run name as a subdirectory,
 enhancing organization of results and improving usability for reporting.

---
 .../fused-nms/run_fused_nms_inference.py      | 27 ++++++++++++-------
 1 file changed, 17 insertions(+), 10 deletions(-)

diff --git a/inference_models/examples/fused-nms/run_fused_nms_inference.py b/inference_models/examples/fused-nms/run_fused_nms_inference.py
index a53e52009d..53839464f0 100644
--- a/inference_models/examples/fused-nms/run_fused_nms_inference.py
+++ b/inference_models/examples/fused-nms/run_fused_nms_inference.py
@@ -55,6 +55,12 @@ def _write_json(path: Path, payload: dict[str, Any]) -> None:
 
 
 @click.command()
+@click.option(
+    "--run-name",
+    type=str,
+    required=True,
+    help="Name of the run for reporting. Will be used as a subdirectory in the target directory.",
+)
 @click.option(
     "--image-path",
     type=click.Path(path_type=Path, exists=True, dir_okay=False, readable=True),
@@ -67,6 +73,12 @@ def _write_json(path: Path, payload: dict[str, Any]) -> None:
     required=True,
     help="Path to the model directory.",
 )
+@click.option(
+    "--target-dir",
+    type=click.Path(path_type=Path, file_okay=False),
+    required=True,
+    help="Directory for latency.json and prediction.json (created if missing).",
+)
 @click.option(
     "--confidence",
     type=float,
@@ -100,13 +112,8 @@ def _write_json(path: Path, payload: dict[str, Any]) -> None:
     show_default=True,
     help="Untimed warmup runs before timed iterations.",
 )
-@click.option(
-    "--target-dir",
-    type=click.Path(path_type=Path, file_okay=False),
-    required=True,
-    help="Directory for latency.json and prediction.json (created if missing).",
-)
 def main(
+    run_name: str,
     image_path: Path,
     model_path: Path,
     target_dir: Path,
@@ -179,10 +186,10 @@ def main(
     }
 
     target_dir.mkdir(parents=True, exist_ok=True)
-    latency_path = target_dir / "latency.json"
-    prediction_path = target_dir / "prediction.json"
-    nms_params_path = target_dir / "nms_params.json"
-    inference_config_path = target_dir / "inference_config.json"
+    latency_path = target_dir / run_name / "latency.json"
+    prediction_path = target_dir / run_name / "prediction.json"
+    nms_params_path = target_dir / run_name / "nms_params.json"
+    inference_config_path = target_dir / run_name / "inference_config.json"
 
     _write_json(
         latency_path,

From cbef2c6dd18b05d9b48e4d17d8de72bf31d48473 Mon Sep 17 00:00:00 2001
From: Damian Kosowski <damian@roboflow.com>
Date: Wed, 1 Apr 2026 23:35:58 +0200
Subject: [PATCH 12/20] Add ONNX execution provider options to fused NMS
 inference script. Introduced a new command-line option for selecting ONNX
 Runtime execution providers (cpu, cuda, tensorrt) and updated the model
 loading process to utilize the selected provider and device. Enhanced latency
 reporting to include execution provider details, improving configurability
 and transparency in inference performance.

---
 .../fused-nms/run_fused_nms_inference.py      | 54 +++++++++++++++++--
 1 file changed, 51 insertions(+), 3 deletions(-)

diff --git a/inference_models/examples/fused-nms/run_fused_nms_inference.py b/inference_models/examples/fused-nms/run_fused_nms_inference.py
index 53839464f0..39fbb84f1b 100644
--- a/inference_models/examples/fused-nms/run_fused_nms_inference.py
+++ b/inference_models/examples/fused-nms/run_fused_nms_inference.py
@@ -11,6 +11,26 @@
 from inference_models import AutoModel
 
 
+def _onnx_ep_preset_to_providers_and_device(
+    preset: str,
+) -> tuple[list[str], str]:
+    """Map CLI preset to ONNX Runtime provider chain and PyTorch device string."""
+    if preset == "cpu":
+        return (["CPUExecutionProvider"], "cpu")
+    if preset == "cuda":
+        return (["CUDAExecutionProvider", "CPUExecutionProvider"], "cuda")
+    if preset == "tensorrt":
+        return (
+            [
+                "TensorrtExecutionProvider",
+                "CUDAExecutionProvider",
+                "CPUExecutionProvider",
+            ],
+            "cuda",
+        )
+    raise click.ClickException(f"Unknown onnx-execution-providers preset: {preset!r}")
+
+
 def _detections_to_json_dict(detections: sv.Detections) -> dict[str, Any]:
     return {
         "xyxy": detections.xyxy.tolist(),
@@ -32,6 +52,9 @@ def _latency_report_dict(
     model_path: Path,
     warmup_runs: int,
     latencies_ms: list[float],
+    onnx_execution_providers_preset: str,
+    onnx_execution_providers: list[str],
+    device: str,
 ) -> dict[str, Any]:
     n = len(latencies_ms)
     return {
@@ -46,6 +69,9 @@ def _latency_report_dict(
         "min_ms": min(latencies_ms) if n else None,
         "max_ms": max(latencies_ms) if n else None,
         "std_ms": statistics.stdev(latencies_ms) if n > 1 else None,
+        "onnx_execution_providers_preset": onnx_execution_providers_preset,
+        "onnx_execution_providers": onnx_execution_providers,
+        "device": device,
     }
 
 
@@ -112,6 +138,19 @@ def _write_json(path: Path, payload: dict[str, Any]) -> None:
     show_default=True,
     help="Untimed warmup runs before timed iterations.",
 )
+@click.option(
+    "--onnx-execution-providers",
+    "onnx_ep_preset",
+    type=click.Choice(["cpu", "cuda", "tensorrt"], case_sensitive=False),
+    default="cpu",
+    show_default=True,
+    help=(
+        "ONNX Runtime execution provider chain: "
+        "cpu (CPUExecutionProvider); "
+        "cuda (CUDAExecutionProvider then CPUExecutionProvider); "
+        "tensorrt (TensorrtExecutionProvider, CUDA, then CPU fallbacks)."
+    ),
+)
 def main(
     run_name: str,
     image_path: Path,
@@ -122,6 +161,7 @@ def main(
     max_detections: Optional[int] = None,
     benchmark_iters: int = 0,
     warmup: int = 5,
+    onnx_ep_preset: str = "cpu",
 ) -> None:
     image = cv2.imread(str(image_path))
     if image is None:
@@ -137,11 +177,16 @@ def main(
     if nms_params:
         click.echo(f"User provided NMS parameters: {nms_params}")
 
-    click.echo(f"Loading model: {model_path}")
+    onnx_ep_preset = onnx_ep_preset.lower()
+    onnx_providers, device_str = _onnx_ep_preset_to_providers_and_device(onnx_ep_preset)
+    click.echo(
+        f"Loading model: {model_path} "
+        f"(onnx_execution_providers={onnx_providers!r}, device={device_str!r})"
+    )
     model = AutoModel.from_pretrained(
         model_path,
-        onnx_execution_providers=["CPUExecutionProvider"],
-        device="cpu",
+        onnx_execution_providers=list(onnx_providers),
+        device=device_str,
     )
 
     click.echo(f"Fused NMS available: {model._inference_config.post_processing.fused}")
@@ -198,6 +243,9 @@ def main(
             model_path=model_path,
             warmup_runs=warmup,
             latencies_ms=latencies_ms,
+            onnx_execution_providers_preset=onnx_ep_preset,
+            onnx_execution_providers=list(onnx_providers),
+            device=device_str,
         ),
     )
     _write_json(prediction_path, pred_payload)

From 161a073cd9f2f62bc6d23f0d82a61a4a9ca6b0be Mon Sep 17 00:00:00 2001
From: Damian Kosowski <damian@roboflow.com>
Date: Wed, 1 Apr 2026 23:57:22 +0200
Subject: [PATCH 13/20] Add batch processing capability to fused NMS inference
 script. Introduced a new command-line option for specifying batch size,
 allowing users to duplicate the input image for batched inference. Updated
 the main function and related methods to handle batch inputs, enhancing
 performance measurement and flexibility in inference execution.

---
 .../fused-nms/run_fused_nms_inference.py      | 30 +++++++++++++++++--
 1 file changed, 27 insertions(+), 3 deletions(-)

diff --git a/inference_models/examples/fused-nms/run_fused_nms_inference.py b/inference_models/examples/fused-nms/run_fused_nms_inference.py
index 39fbb84f1b..fc94378a20 100644
--- a/inference_models/examples/fused-nms/run_fused_nms_inference.py
+++ b/inference_models/examples/fused-nms/run_fused_nms_inference.py
@@ -2,10 +2,11 @@
 import statistics
 import time
 from pathlib import Path
-from typing import Any, Optional
+from typing import Any, Optional, Union
 
 import click
 import cv2
+import numpy as np
 import supervision as sv
 
 from inference_models import AutoModel
@@ -55,6 +56,7 @@ def _latency_report_dict(
     onnx_execution_providers_preset: str,
     onnx_execution_providers: list[str],
     device: str,
+    batch_size: int,
 ) -> dict[str, Any]:
     n = len(latencies_ms)
     return {
@@ -72,9 +74,17 @@ def _latency_report_dict(
         "onnx_execution_providers_preset": onnx_execution_providers_preset,
         "onnx_execution_providers": onnx_execution_providers,
         "device": device,
+        "batch_size": batch_size,
     }
 
 
+def _batch_input_from_image(image: np.ndarray, batch_size: int) -> Union[np.ndarray, list[np.ndarray]]:
+    """Build model input: one HxWxC array for batch 1, else a list of `batch_size` copies."""
+    if batch_size == 1:
+        return image
+    return [np.ascontiguousarray(image) for _ in range(batch_size)]
+
+
 def _write_json(path: Path, payload: dict[str, Any]) -> None:
     path.parent.mkdir(parents=True, exist_ok=True)
     path.write_text(json.dumps(payload, indent=2) + "\n")
@@ -151,6 +161,13 @@ def _write_json(path: Path, payload: dict[str, Any]) -> None:
         "tensorrt (TensorrtExecutionProvider, CUDA, then CPU fallbacks)."
     ),
 )
+@click.option(
+    "--batch-size",
+    type=click.IntRange(min=1),
+    default=1,
+    show_default=True,
+    help="Duplicate the input image this many times and run a single batched forward pass.",
+)
 def main(
     run_name: str,
     image_path: Path,
@@ -162,11 +179,16 @@ def main(
     benchmark_iters: int = 0,
     warmup: int = 5,
     onnx_ep_preset: str = "cpu",
+    batch_size: int = 1,
 ) -> None:
     image = cv2.imread(str(image_path))
     if image is None:
         raise click.ClickException(f"Could not load image from: {image_path}")
 
+    batch_input = _batch_input_from_image(image, batch_size)
+    if batch_size > 1:
+        click.echo(f"Batching: batch_size={batch_size} (repeated single image).")
+
     nms_params = {
         "confidence": confidence,
         "iou_threshold": iou_threshold,
@@ -210,14 +232,14 @@ def main(
     if warmup > 0:
         click.echo(f"Warmup: {warmup} untimed runs...")
         for _ in range(warmup):
-            model(image, **nms_params)
+            model(batch_input, **nms_params)
 
     click.echo(f"Benchmarking: {benchmark_iters} timed runs...")
 
     predictions = None
     for _ in range(benchmark_iters):
         t0 = time.perf_counter()
-        predictions = model(image, **nms_params)
+        predictions = model(batch_input, **nms_params)
         latencies_ms.append((time.perf_counter() - t0) * 1000.0)
 
     assert predictions is not None
@@ -227,6 +249,7 @@ def main(
     detections = predictions[0].to_supervision()
     pred_payload: dict[str, Any] = {
         "image_path": str(image_path.resolve()),
+        "batch_size": batch_size,
         "detections": _detections_to_json_dict(detections),
     }
 
@@ -246,6 +269,7 @@ def main(
             onnx_execution_providers_preset=onnx_ep_preset,
             onnx_execution_providers=list(onnx_providers),
             device=device_str,
+            batch_size=batch_size,
         ),
     )
     _write_json(prediction_path, pred_payload)

From 3b6e7ee247a70f5605cb3b1bcda681f6e02357cd Mon Sep 17 00:00:00 2001
From: Damian Kosowski <damian@roboflow.com>
Date: Thu, 2 Apr 2026 01:17:38 +0200
Subject: [PATCH 14/20] Refactor fused NMS inference script to support
 directory input for images. Updated command-line options to accept an image
 directory instead of a single image path, enabling batch processing of
 multiple images. Enhanced latency reporting with additional metrics and
 improved JSON output structure for better organization of results.

---
 .../fused-nms/run_fused_nms_inference.py      | 110 ++++++------------
 1 file changed, 37 insertions(+), 73 deletions(-)

diff --git a/inference_models/examples/fused-nms/run_fused_nms_inference.py b/inference_models/examples/fused-nms/run_fused_nms_inference.py
index fc94378a20..91bdd40d97 100644
--- a/inference_models/examples/fused-nms/run_fused_nms_inference.py
+++ b/inference_models/examples/fused-nms/run_fused_nms_inference.py
@@ -1,13 +1,11 @@
 import json
-import statistics
 import time
 from pathlib import Path
-from typing import Any, Optional, Union
+from typing import Any, Optional
 
 import click
 import cv2
 import numpy as np
-import supervision as sv
 
 from inference_models import AutoModel
 
@@ -32,24 +30,8 @@ def _onnx_ep_preset_to_providers_and_device(
     raise click.ClickException(f"Unknown onnx-execution-providers preset: {preset!r}")
 
 
-def _detections_to_json_dict(detections: sv.Detections) -> dict[str, Any]:
-    return {
-        "xyxy": detections.xyxy.tolist(),
-        "mask": detections.mask.tolist() if detections.mask is not None else None,
-        "confidence": (
-            detections.confidence.tolist()
-            if detections.confidence is not None
-            else None
-        ),
-        "class_id": (
-            detections.class_id.tolist() if detections.class_id is not None else None
-        ),
-    }
-
-
 def _latency_report_dict(
     *,
-    image_path: Path,
     model_path: Path,
     warmup_runs: int,
     latencies_ms: list[float],
@@ -58,33 +40,23 @@ def _latency_report_dict(
     device: str,
     batch_size: int,
 ) -> dict[str, Any]:
-    n = len(latencies_ms)
     return {
-        "image_path": str(image_path.resolve()),
         "model_path": str(model_path.resolve()),
-        "warmup_runs": warmup_runs,
-        "timed_runs": n,
-        "unit": "ms",
-        "latencies_ms": latencies_ms,
-        "mean_ms": statistics.mean(latencies_ms) if n else None,
-        "median_ms": statistics.median(latencies_ms) if n else None,
-        "min_ms": min(latencies_ms) if n else None,
-        "max_ms": max(latencies_ms) if n else None,
-        "std_ms": statistics.stdev(latencies_ms) if n > 1 else None,
         "onnx_execution_providers_preset": onnx_execution_providers_preset,
         "onnx_execution_providers": onnx_execution_providers,
         "device": device,
         "batch_size": batch_size,
+        "warmup_runs": warmup_runs,
+        "timed_runs": len(latencies_ms),
+        "mean_ms": np.mean(latencies_ms),
+        "p_50_ms": np.percentile(latencies_ms, 50),
+        "p_95_ms": np.percentile(latencies_ms, 95),
+        "p_99_ms": np.percentile(latencies_ms, 99),
+        "mean_per_image_ms": np.mean(latencies_ms) / batch_size,
+        "throughput_fps": (batch_size * len(latencies_ms)) / (np.sum(latencies_ms) / 1000),
     }
 
 
-def _batch_input_from_image(image: np.ndarray, batch_size: int) -> Union[np.ndarray, list[np.ndarray]]:
-    """Build model input: one HxWxC array for batch 1, else a list of `batch_size` copies."""
-    if batch_size == 1:
-        return image
-    return [np.ascontiguousarray(image) for _ in range(batch_size)]
-
-
 def _write_json(path: Path, payload: dict[str, Any]) -> None:
     path.parent.mkdir(parents=True, exist_ok=True)
     path.write_text(json.dumps(payload, indent=2) + "\n")
@@ -98,10 +70,10 @@ def _write_json(path: Path, payload: dict[str, Any]) -> None:
     help="Name of the run for reporting. Will be used as a subdirectory in the target directory.",
 )
 @click.option(
-    "--image-path",
-    type=click.Path(path_type=Path, exists=True, dir_okay=False, readable=True),
+    "--image-dir",
+    type=click.Path(path_type=Path, exists=True, dir_okay=True, readable=True),
     required=True,
-    help="Path to the input image.",
+    help="Path to the input image directory.",
 )
 @click.option(
     "--model-path",
@@ -113,7 +85,7 @@ def _write_json(path: Path, payload: dict[str, Any]) -> None:
     "--target-dir",
     type=click.Path(path_type=Path, file_okay=False),
     required=True,
-    help="Directory for latency.json and prediction.json (created if missing).",
+    help="Directory for latency.json (created if missing).",
 )
 @click.option(
     "--confidence",
@@ -162,15 +134,13 @@ def _write_json(path: Path, payload: dict[str, Any]) -> None:
     ),
 )
 @click.option(
-    "--batch-size",
-    type=click.IntRange(min=1),
-    default=1,
-    show_default=True,
-    help="Duplicate the input image this many times and run a single batched forward pass.",
+    "--batch-images",
+    type=click.Flag(default=False),
+    help="Batch the input images and run a single batched forward pass.",
 )
 def main(
     run_name: str,
-    image_path: Path,
+    image_dir: Path,
     model_path: Path,
     target_dir: Path,
     confidence: Optional[float] = None,
@@ -179,15 +149,21 @@ def main(
     benchmark_iters: int = 0,
     warmup: int = 5,
     onnx_ep_preset: str = "cpu",
-    batch_size: int = 1,
+    batch_images: bool = False,
 ) -> None:
-    image = cv2.imread(str(image_path))
-    if image is None:
-        raise click.ClickException(f"Could not load image from: {image_path}")
+    image_paths = list(image_dir.glob("*.jpg"))
 
-    batch_input = _batch_input_from_image(image, batch_size)
-    if batch_size > 1:
-        click.echo(f"Batching: batch_size={batch_size} (repeated single image).")
+    images = []
+    for image_path in image_paths:
+        image = cv2.imread(str(image_path))
+        if image is None:
+            raise click.ClickException(f"Could not load image from: {image_path}")
+        images.append(image)
+
+    if batch_images:
+        batch_input = images[:4]
+    else:
+        batch_input = images[0]
 
     nms_params = {
         "confidence": confidence,
@@ -227,52 +203,40 @@ def main(
                 "max_dynamic_batch_size is not set in the model config."
             )
 
-    latencies_ms: list[float] = []
+    click.echo(f"Warmup: {warmup} untimed runs..." if warmup > 0 else "No warmup runs.")
 
-    if warmup > 0:
-        click.echo(f"Warmup: {warmup} untimed runs...")
-        for _ in range(warmup):
-            model(batch_input, **nms_params)
+    for _ in range(warmup):
+        predictions = model(batch_input, **nms_params)
+        _ = predictions[0].to_supervision()
 
     click.echo(f"Benchmarking: {benchmark_iters} timed runs...")
 
-    predictions = None
+    latencies_ms: list[float] = []
     for _ in range(benchmark_iters):
         t0 = time.perf_counter()
         predictions = model(batch_input, **nms_params)
+        _ = predictions[0].to_supervision()
         latencies_ms.append((time.perf_counter() - t0) * 1000.0)
 
-    assert predictions is not None
-
     click.echo("Writing reports ...")
 
-    detections = predictions[0].to_supervision()
-    pred_payload: dict[str, Any] = {
-        "image_path": str(image_path.resolve()),
-        "batch_size": batch_size,
-        "detections": _detections_to_json_dict(detections),
-    }
-
     target_dir.mkdir(parents=True, exist_ok=True)
     latency_path = target_dir / run_name / "latency.json"
-    prediction_path = target_dir / run_name / "prediction.json"
     nms_params_path = target_dir / run_name / "nms_params.json"
     inference_config_path = target_dir / run_name / "inference_config.json"
 
     _write_json(
         latency_path,
         _latency_report_dict(
-            image_path=image_path,
             model_path=model_path,
             warmup_runs=warmup,
             latencies_ms=latencies_ms,
             onnx_execution_providers_preset=onnx_ep_preset,
             onnx_execution_providers=list(onnx_providers),
             device=device_str,
-            batch_size=batch_size,
+            batch_size=len(batch_input) if isinstance(batch_input, list) else 1,
         ),
     )
-    _write_json(prediction_path, pred_payload)
     _write_json(nms_params_path, nms_params)
     _write_json(inference_config_path, model._inference_config.model_dump_json())
 

From a8aa807b97a0d66c0ebf40f113dfce585b527a89 Mon Sep 17 00:00:00 2001
From: Damian Kosowski <damian@roboflow.com>
Date: Thu, 2 Apr 2026 01:53:40 +0200
Subject: [PATCH 15/20] Update fused NMS inference script to improve batch
 processing and latency reporting. Introduced a constant for test batch size,
 enhanced JSON output to include image paths, and updated command-line options
 for benchmark iterations and warmup runs. Refactored image loading logic to
 support dynamic batching based on model configuration.

---
 .../fused-nms/run_fused_nms_inference.py      | 92 +++++++++----------
 1 file changed, 42 insertions(+), 50 deletions(-)

diff --git a/inference_models/examples/fused-nms/run_fused_nms_inference.py b/inference_models/examples/fused-nms/run_fused_nms_inference.py
index 91bdd40d97..d59be4276b 100644
--- a/inference_models/examples/fused-nms/run_fused_nms_inference.py
+++ b/inference_models/examples/fused-nms/run_fused_nms_inference.py
@@ -9,6 +9,8 @@
 
 from inference_models import AutoModel
 
+TEST_BATCH_SIZE = 4
+
 
 def _onnx_ep_preset_to_providers_and_device(
     preset: str,
@@ -39,9 +41,11 @@ def _latency_report_dict(
     onnx_execution_providers: list[str],
     device: str,
     batch_size: int,
+    images: list[Path],
 ) -> dict[str, Any]:
     return {
         "model_path": str(model_path.resolve()),
+        "images": [str(image.resolve()) for image in images],
         "onnx_execution_providers_preset": onnx_execution_providers_preset,
         "onnx_execution_providers": onnx_execution_providers,
         "device": device,
@@ -106,7 +110,7 @@ def _write_json(path: Path, payload: dict[str, Any]) -> None:
     "-n",
     "--benchmark-iters",
     type=click.IntRange(min=1),
-    default=1,
+    default=200,
     show_default=True,
     help=(
         "Number of timed inference runs for benchmarking (mean/median/std in ms). "
@@ -116,7 +120,7 @@ def _write_json(path: Path, payload: dict[str, Any]) -> None:
 @click.option(
     "--warmup",
     type=click.IntRange(min=0),
-    default=5,
+    default=20,
     show_default=True,
     help="Untimed warmup runs before timed iterations.",
 )
@@ -133,11 +137,6 @@ def _write_json(path: Path, payload: dict[str, Any]) -> None:
         "tensorrt (TensorrtExecutionProvider, CUDA, then CPU fallbacks)."
     ),
 )
-@click.option(
-    "--batch-images",
-    type=click.Flag(default=False),
-    help="Batch the input images and run a single batched forward pass.",
-)
 def main(
     run_name: str,
     image_dir: Path,
@@ -146,37 +145,13 @@ def main(
     confidence: Optional[float] = None,
     iou_threshold: Optional[float] = None,
     max_detections: Optional[int] = None,
-    benchmark_iters: int = 0,
-    warmup: int = 5,
+    benchmark_iters: int = 200,
+    warmup: int = 20,
     onnx_ep_preset: str = "cpu",
-    batch_images: bool = False,
 ) -> None:
-    image_paths = list(image_dir.glob("*.jpg"))
-
-    images = []
-    for image_path in image_paths:
-        image = cv2.imread(str(image_path))
-        if image is None:
-            raise click.ClickException(f"Could not load image from: {image_path}")
-        images.append(image)
-
-    if batch_images:
-        batch_input = images[:4]
-    else:
-        batch_input = images[0]
-
-    nms_params = {
-        "confidence": confidence,
-        "iou_threshold": iou_threshold,
-        "max_detections": max_detections,
-    }
-
-    nms_params = {name: value for name, value in nms_params.items() if value is not None}
-    if nms_params:
-        click.echo(f"User provided NMS parameters: {nms_params}")
-
     onnx_ep_preset = onnx_ep_preset.lower()
     onnx_providers, device_str = _onnx_ep_preset_to_providers_and_device(onnx_ep_preset)
+
     click.echo(
         f"Loading model: {model_path} "
         f"(onnx_execution_providers={onnx_providers!r}, device={device_str!r})"
@@ -188,25 +163,41 @@ def main(
     )
 
     click.echo(f"Fused NMS available: {model._inference_config.post_processing.fused}")
+    
+    nms_params = {
+        "confidence": confidence,
+        "iou_threshold": iou_threshold,
+        "max_detections": max_detections,
+    }
+    nms_params = {name: value for name, value in nms_params.items() if value is not None}
+
+    if nms_params:
+        click.echo(f"User provided NMS parameters: {nms_params}")
 
     forward_pass = model._inference_config.forward_pass
-    if forward_pass.static_batch_size is None:
-        max_dyn = forward_pass.max_dynamic_batch_size
-        if max_dyn is not None:
-            click.echo(
-                "Batching: dynamic mode (no static batch size); "
-                f"maximum batch size is {max_dyn}."
-            )
-        else:
-            click.echo(
-                "Batching: dynamic mode (no static batch size); "
-                "max_dynamic_batch_size is not set in the model config."
-            )
+    use_batching = forward_pass.static_batch_size is None
+
+    if use_batching:
+        click.echo(f"Model exported as dynamic. Using image batch")
+    else:
+        click.echo(f"Model exported as static. Using single image inference")
+
+    image_paths = list(image_dir.glob("*.jpg"))
+    batched_image_paths = image_paths[:TEST_BATCH_SIZE] if use_batching else image_paths[:1]
+
+    images = []
+    for image_path in batched_image_paths:
+        image = cv2.imread(str(image_path))
+        if image is None:
+            raise click.ClickException(f"Could not load image from: {image_path}")
+        images.append(image)
+
+    inputs = images[:TEST_BATCH_SIZE] if use_batching else images[0]
 
     click.echo(f"Warmup: {warmup} untimed runs..." if warmup > 0 else "No warmup runs.")
 
     for _ in range(warmup):
-        predictions = model(batch_input, **nms_params)
+        predictions = model(inputs, **nms_params)
         _ = predictions[0].to_supervision()
 
     click.echo(f"Benchmarking: {benchmark_iters} timed runs...")
@@ -214,7 +205,7 @@ def main(
     latencies_ms: list[float] = []
     for _ in range(benchmark_iters):
         t0 = time.perf_counter()
-        predictions = model(batch_input, **nms_params)
+        predictions = model(inputs, **nms_params)
         _ = predictions[0].to_supervision()
         latencies_ms.append((time.perf_counter() - t0) * 1000.0)
 
@@ -234,11 +225,12 @@ def main(
             onnx_execution_providers_preset=onnx_ep_preset,
             onnx_execution_providers=list(onnx_providers),
             device=device_str,
-            batch_size=len(batch_input) if isinstance(batch_input, list) else 1,
+            batch_size=len(inputs) if isinstance(inputs, list) else 1,
+            images=batched_image_paths,
         ),
     )
-    _write_json(nms_params_path, nms_params)
     _write_json(inference_config_path, model._inference_config.model_dump_json())
+    _write_json(nms_params_path, nms_params)
 
     click.echo("Done!")
 

From 245a56b729fbf7652ee9c716e200ab7bb749764b Mon Sep 17 00:00:00 2001
From: Damian Kosowski <damian@roboflow.com>
Date: Thu, 2 Apr 2026 01:53:56 +0200
Subject: [PATCH 16/20] Refactor input handling in YOLOv8 ONNX object detection
 to use single-element tensors for confidence, IOU threshold, and max
 detections. This change improves compatibility with the input builders and
 streamlines the inference process.

---
 .../models/yolov8/yolov8_object_detection_onnx.py   | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/inference_models/inference_models/models/yolov8/yolov8_object_detection_onnx.py b/inference_models/inference_models/models/yolov8/yolov8_object_detection_onnx.py
index aebf2a3091..ba29eca8c1 100644
--- a/inference_models/inference_models/models/yolov8/yolov8_object_detection_onnx.py
+++ b/inference_models/inference_models/models/yolov8/yolov8_object_detection_onnx.py
@@ -246,7 +246,6 @@ def forward(
     ) -> torch.Tensor:
         with self._session_thread_lock:
             device = pre_processed_images.device
-            batch_size = pre_processed_images.shape[0]
 
             input_builders = {
                 INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_IMAGES_INPUT_NAME: lambda: pre_processed_images,
@@ -254,16 +253,16 @@ def forward(
 
             if self._inference_config.post_processing.fused:
                 if INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_CONFIDENCE_INPUT_NAME in self._input_names:
-                    input_builders[INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_CONFIDENCE_INPUT_NAME] = lambda: torch.full(
-                        (batch_size,), float(confidence), dtype=torch.float32, device=device
+                    input_builders[INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_CONFIDENCE_INPUT_NAME] = lambda: torch.tensor(
+                        [float(confidence)], dtype=torch.float32, device=device
                     )
                 if INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_IOU_THRESHOLD_INPUT_NAME in self._input_names:
-                    input_builders[INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_IOU_THRESHOLD_INPUT_NAME] = lambda: torch.full(
-                        (batch_size,), float(iou_threshold), dtype=torch.float32, device=device
+                    input_builders[INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_IOU_THRESHOLD_INPUT_NAME] = lambda: torch.tensor(
+                        [float(iou_threshold)], dtype=torch.float32, device=device
                     )
                 if INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_MAX_DETECTIONS_INPUT_NAME in self._input_names:
-                    input_builders[INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_MAX_DETECTIONS_INPUT_NAME] = lambda: torch.full(
-                        (batch_size,), int(max_detections), dtype=torch.int64, device=device
+                    input_builders[INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_MAX_DETECTIONS_INPUT_NAME] = lambda: torch.tensor(
+                        [int(max_detections)], dtype=torch.int64, device=device
                     )
 
             inputs = {name: builder_fn() for name, builder_fn in input_builders.items()}

From 8e0dd2dbde5ee7db25444c8dc5801969f66fa07f Mon Sep 17 00:00:00 2001
From: Damian Kosowski <damian@roboflow.com>
Date: Thu, 2 Apr 2026 01:54:58 +0200
Subject: [PATCH 17/20] Add new script for running fused NMS inference with
 YOLOv8 model. Includes command-line options for image and model paths, as
 well as parameters for confidence, IOU threshold, and maximum detections.
 Handles image loading, model initialization, and outputs detection results
 with bounding box coordinates and confidence scores.

---
 .../run_single_fused_nms_inference.py         | 97 +++++++++++++++++++
 1 file changed, 97 insertions(+)
 create mode 100644 inference_models/examples/fused-nms/run_single_fused_nms_inference.py

diff --git a/inference_models/examples/fused-nms/run_single_fused_nms_inference.py b/inference_models/examples/fused-nms/run_single_fused_nms_inference.py
new file mode 100644
index 0000000000..a36ba41b75
--- /dev/null
+++ b/inference_models/examples/fused-nms/run_single_fused_nms_inference.py
@@ -0,0 +1,97 @@
+from pathlib import Path
+
+import click
+import cv2
+from typing import Optional
+from inference_models import AutoModel
+
+
+@click.command()
+@click.option(
+    "--image-path",
+    type=click.Path(path_type=Path, exists=True, dir_okay=False, readable=True),
+    required=True,
+    help="Path to the input image.",
+)
+@click.option(
+    "--model-path",
+    type=click.Path(path_type=Path, exists=True, dir_okay=True, readable=True),
+    required=True,
+    help="Path to the model directory.",
+)
+@click.option(
+    "--confidence",
+    type=float,
+    help="Confidence threshold used by post-processing.",
+)
+@click.option(
+    "--iou-threshold",
+    type=float,
+    help="IOU threshold used by post-processing.",
+)
+@click.option(
+    "--max-detections",
+    type=int,
+    help="Maximum number of detections used by post-processing.",
+)
+def main(
+    image_path: Path,
+    model_path: Path,
+    confidence: Optional[float] = None,
+    iou_threshold: Optional[float] = None,
+    max_detections: Optional[int] = None,
+) -> None:
+    image = cv2.imread(str(image_path))
+    if image is None:
+        raise click.ClickException(f"Could not load image from: {image_path}")
+
+    nms_params = {
+        "confidence": confidence,
+        "iou_threshold": iou_threshold,
+        "max_detections": max_detections,
+    }
+
+    nms_params = {name: value for name, value in nms_params.items() if value is not None}
+    if nms_params:
+        click.echo(f"User provided NMS parameters: {nms_params}")
+
+    click.echo(f"Loading model: {model_path}")
+    model = AutoModel.from_pretrained(
+        model_path,
+        onnx_execution_providers=["CPUExecutionProvider"],
+        device="cpu",
+    )
+
+    click.echo(f"Fused NMS available: {model._inference_config.post_processing.fused}")
+
+    forward_pass = model._inference_config.forward_pass
+    if forward_pass.static_batch_size is None:
+        max_dyn = forward_pass.max_dynamic_batch_size
+        if max_dyn is not None:
+            click.echo(
+                "Batching: dynamic mode (no static batch size); "
+                f"maximum batch size is {max_dyn}."
+            )
+        else:
+            click.echo(
+                "Batching: dynamic mode (no static batch size); "
+                "max_dynamic_batch_size is not set in the model config."
+            )
+
+    click.echo("Running inference...")
+    predictions = model(image, **nms_params)
+    detections = predictions[0].to_supervision()
+
+    click.echo(f"Detected {len(detections)} objects")
+    for idx, (xyxy, class_id, conf) in enumerate(
+        zip(detections.xyxy, detections.class_id, detections.confidence), start=1
+    ):
+        x1, y1, x2, y2 = [int(v) for v in xyxy.tolist()]
+        click.echo(
+            f"[{idx}] class_id={int(class_id)} confidence={float(conf):.4f} "
+            f"bbox=({x1}, {y1}, {x2}, {y2})"
+        )
+
+
+if __name__ == "__main__":
+    main()

From 768077b86991250a82276be4ce3820b1df693f2e Mon Sep 17 00:00:00 2001
From: Damian Kosowski <damian@roboflow.com>
Date: Thu, 2 Apr 2026 01:55:27 +0200
Subject: [PATCH 18/20] Rename run_fused_nms_inference.py to
 run_fused_nms_inference_benchmarking.py (pure rename, no content changes).
 The script benchmarks fused NMS inference with a YOLOv8 model; it provides
 command-line options for run name, image directory, model path, target
 directory, and parameters for confidence, IOU threshold, and maximum
 detections, and implements latency reporting and JSON output for detailed
 performance metrics.

---
 ...d_nms_inference.py => run_fused_nms_inference_benchmarking.py} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename inference_models/examples/fused-nms/{run_fused_nms_inference.py => run_fused_nms_inference_benchmarking.py} (100%)

diff --git a/inference_models/examples/fused-nms/run_fused_nms_inference.py b/inference_models/examples/fused-nms/run_fused_nms_inference_benchmarking.py
similarity index 100%
rename from inference_models/examples/fused-nms/run_fused_nms_inference.py
rename to inference_models/examples/fused-nms/run_fused_nms_inference_benchmarking.py

From e8b26acbdcd5a310301f80556c16fad497b0be45 Mon Sep 17 00:00:00 2001
From: Damian Kosowski <damian@roboflow.com>
Date: Thu, 2 Apr 2026 16:47:36 +0200
Subject: [PATCH 19/20] Refactor ONNX input handling to support broadcast
 scalars and improve error messaging. Updated input validation to allow
 tensors with dimension 0 equal to 1 and enhanced error messages for
 incompatible batch sizes. Adjusted batch processing logic to ensure
 compatibility with dynamic input sizes in YOLOv8 model.

---
 .../inference_models/models/common/onnx.py    | 70 +++++++++++--------
 .../yolov8/yolov8_object_detection_onnx.py    | 30 ++++----
 2 files changed, 56 insertions(+), 44 deletions(-)

diff --git a/inference_models/inference_models/models/common/onnx.py b/inference_models/inference_models/models/common/onnx.py
index 510e1d1682..b99f5289b7 100644
--- a/inference_models/inference_models/models/common/onnx.py
+++ b/inference_models/inference_models/models/common/onnx.py
@@ -198,8 +198,10 @@ def run_onnx_session_with_batch_size_limit(
     Args:
         session: ONNX Runtime inference session.
 
-        inputs: Dictionary mapping input names to PyTorch tensors. All tensors
-            must have the same batch size (first dimension).
+        inputs: Dictionary mapping input names to PyTorch tensors. Tensors that
+            participate in the main batch must share the same size on dimension 0.
+            Tensors with dimension 0 equal to 1 are treated as broadcast scalars
+            and are not split across chunks.
 
         output_shape_mapping: Optional dictionary mapping output names to their
             expected shapes. Used for pre-allocating output buffers. If None,
@@ -217,7 +219,8 @@ def run_onnx_session_with_batch_size_limit(
         the model's output specification.
 
     Raises:
-        ModelRuntimeError: If input tensors have different batch sizes.
+        ModelInputError: If dimension-0 sizes are incompatible (for example two
+            different batch sizes both greater than 1).
 
     Examples:
         Run inference with batch size limit:
@@ -257,7 +260,7 @@ def run_onnx_session_with_batch_size_limit(
         - Automatically handles batch splitting and result concatenation
         - Pads the last chunk if min_batch_size is specified
         - Uses `run_onnx_session_via_iobinding()` internally for efficiency
-        - All input tensors must have the same batch size
+        - Broadcast inputs with batch dimension 1 are supported alongside batched tensors
 
     See Also:
         - `run_onnx_session_via_iobinding()`: Lower-level ONNX execution
@@ -269,20 +272,24 @@ def run_onnx_session_with_batch_size_limit(
             inputs=inputs,
             output_shape_mapping=output_shape_mapping,
         )
-    input_batch_sizes = set()
-    for input_tensor in inputs.values():
-        input_batch_sizes.add(input_tensor.shape[0])
-    if len(input_batch_sizes) != 1:
+
+    batch_input_sizes = [tensor.shape[0] for tensor in inputs.values() if tensor.numel() != 1]
+    batch_size = max(batch_input_sizes)
+    is_incompatible_batch_size_set = [
+        size for size in batch_input_sizes if size != batch_size
+    ]
+    if is_incompatible_batch_size_set:
         raise ModelInputError(
-            message="When running forward pass through ONNX model detected inputs with different batch sizes. "
-            "This is the error with the model you run. If the model was trained or exported "
-            "on Roboflow platform - contact us to get help. Otherwise, verify your model package or "
-            "implementation of the model class.",
+            message="When running forward pass through ONNX model detected inputs with incompatible sizes on "
+            "dimension 0. Expected each tensor to have either size 1 (scalar/broadcast inputs) or the same "
+            f"primary batch size ({batch_size}). Got distinct sizes: {sorted(set(batch_input_sizes))!r}. "
+            "If the model was trained or exported on Roboflow platform, contact us for help. Otherwise, "
+            "verify your model package or implementation of the model class.",
             help_url="https://inference-models.roboflow.com/errors/input-validation/#modelinputerror",
         )
-    input_batch_size = input_batch_sizes.pop()
-    if input_batch_size <= max_batch_size and (
-        min_batch_size is None or input_batch_size >= min_batch_size
+
+    if batch_size <= max_batch_size and (
+        min_batch_size is None or batch_size >= min_batch_size
     ):
         # no point iterating
         return run_onnx_session_via_iobinding(
@@ -293,25 +300,28 @@ def run_onnx_session_with_batch_size_limit(
     all_results = []
     for _ in session.get_outputs():
         all_results.append([])
-    for i in range(0, input_batch_size, max_batch_size):
+    for i in range(0, batch_size, max_batch_size):
         batch_inputs = {}
         reminder = 0
         for name, value in inputs.items():
-            batched_value = value[i : i + max_batch_size]
-            if min_batch_size is not None:
-                reminder = min_batch_size - batched_value.shape[0]
-            if reminder > 0:
-                batched_value = torch.cat(
-                    (
-                        batched_value,
-                        torch.zeros(
-                            (reminder,) + batched_value.shape[1:],
-                            dtype=batched_value.dtype,
-                            device=batched_value.device,
+            if value.shape[0] == batch_size:
+                batched_value = value[i : i + max_batch_size]
+                if min_batch_size is not None:
+                    reminder = min_batch_size - batched_value.shape[0]
+                if reminder > 0:
+                    batched_value = torch.cat(
+                        (
+                            batched_value,
+                            torch.zeros(
+                                (reminder,) + batched_value.shape[1:],
+                                dtype=batched_value.dtype,
+                                device=batched_value.device,
+                            ),
                         ),
-                    ),
-                    dim=0,
-                )
+                        dim=0,
+                    )
+            else:
+                batched_value = value
             batched_value = batched_value.contiguous()
             batch_inputs[name] = batched_value
         batch_output_shape_mapping = None
diff --git a/inference_models/inference_models/models/yolov8/yolov8_object_detection_onnx.py b/inference_models/inference_models/models/yolov8/yolov8_object_detection_onnx.py
index ba29eca8c1..d6ef1a65de 100644
--- a/inference_models/inference_models/models/yolov8/yolov8_object_detection_onnx.py
+++ b/inference_models/inference_models/models/yolov8/yolov8_object_detection_onnx.py
@@ -134,18 +134,13 @@ def from_pretrained(
             providers=onnx_execution_providers,
         )
         onnx_graph_inputs = session.get_inputs()
-
-        input_batch_size = onnx_graph_inputs[0].shape[0]
-        if isinstance(input_batch_size, str):
-            input_batch_size = None
-
         input_names = [input.name for input in onnx_graph_inputs]
 
         if inference_config.post_processing.fused:
-            expected_fused_inputs = (
+            expected_fused_nms_inputs = (
                 INFERENCE_MODELS_YOLO_ULTRALYTICS_DECLARED_FUSED_NMS_INPUT_NAMES
             )
-            expected_fused_input_set = set(expected_fused_inputs)
+            expected_fused_nms_input_set = set(expected_fused_nms_inputs)
             if (
                 INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_IMAGES_INPUT_NAME
                 not in input_names
@@ -160,28 +155,35 @@ def from_pretrained(
                 )
 
             unexpected_inputs = [
-                n for n in input_names if n not in expected_fused_input_set
+                name for name in input_names if name not in expected_fused_nms_input_set
             ]
             if unexpected_inputs:
                 raise CorruptedModelPackageError(
                     message=(
                         f"Fused NMS YOLOv8 ONNX model has unexpected inputs {unexpected_inputs}. "
-                        f"Expected each name to be one of: {expected_fused_inputs}"
+                        f"Expected each name to be one of: {expected_fused_nms_input_set}"
                     ),
                     help_url="https://inference-models.roboflow.com/errors/model-loading/#corruptedmodelpackageerror",
                 )
 
-            missing_fused_inputs = [
-                n for n in expected_fused_inputs if n not in input_names
+            missing_inputs = [
+                name for name in expected_fused_nms_inputs if name not in input_names
             ]
-            if missing_fused_inputs:
+            if missing_inputs:
                 LOGGER.warning(
                     "Fused NMS ONNX graph omits inputs %s; they will not be passed at "
                     "inference time and ONNX Runtime will use graph initializer defaults for those parameters. "
                     "Python arguments matching omitted inputs (e.g. confidence, iou_threshold, max_detections) "
                     "will not affect the fused NMS stage.",
-                    missing_fused_inputs,
+                    missing_inputs,
                 )
+        
+        images_input = [graph_input for graph_input in onnx_graph_inputs if graph_input.name == INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_IMAGES_INPUT_NAME][0]
+        input_batch_size = images_input.shape[0]
+        
+        # Dynamic batch size export results in "batch" string as dimension 0 representation
+        if isinstance(input_batch_size, str):
+            input_batch_size = None
 
         return cls(
             session=session,
@@ -254,7 +256,7 @@ def forward(
             if self._inference_config.post_processing.fused:
                 if INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_CONFIDENCE_INPUT_NAME in self._input_names:
                     input_builders[INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_CONFIDENCE_INPUT_NAME] = lambda: torch.tensor(
-                        [float(confidence)], dtype=torch.float32, device=device
+                        float(confidence), dtype=torch.float32, device=device
                     )
                 if INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_IOU_THRESHOLD_INPUT_NAME in self._input_names:
                     input_builders[INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_IOU_THRESHOLD_INPUT_NAME] = lambda: torch.tensor(

From c7b11a9da4489e8ae536f43f5a5940a675fdccb4 Mon Sep 17 00:00:00 2001
From: Damian Kosowski <damian@roboflow.com>
Date: Thu, 2 Apr 2026 17:35:57 +0200
Subject: [PATCH 20/20] Add ONNX execution provider options to the single-image
 fused NMS inference script (run_single_fused_nms_inference.py). Introduced a
 new --onnx-execution-providers command-line option for selecting an execution
 provider preset (cpu, cuda, tensorrt) and updated model loading to use the
 selected provider chain and device. The model-loading message now includes
 the provider list and device.

---
 .../run_single_fused_nms_inference.py         | 49 +++++++++++++++++--
 1 file changed, 45 insertions(+), 4 deletions(-)

diff --git a/inference_models/examples/fused-nms/run_single_fused_nms_inference.py b/inference_models/examples/fused-nms/run_single_fused_nms_inference.py
index a36ba41b75..6c5f0b48c5 100644
--- a/inference_models/examples/fused-nms/run_single_fused_nms_inference.py
+++ b/inference_models/examples/fused-nms/run_single_fused_nms_inference.py
@@ -1,11 +1,32 @@
 from pathlib import Path
+from typing import Optional
 
 import click
 import cv2
-from typing import Optional
+
 from inference_models import AutoModel
 
 
+def _onnx_ep_preset_to_providers_and_device(
+    preset: str,
+) -> tuple[list[str], str]:
+    """Map CLI preset to ONNX Runtime provider chain and PyTorch device string."""
+    if preset == "cpu":
+        return (["CPUExecutionProvider"], "cpu")
+    if preset == "cuda":
+        return (["CUDAExecutionProvider", "CPUExecutionProvider"], "cuda")
+    if preset == "tensorrt":
+        return (
+            [
+                "TensorrtExecutionProvider",
+                "CUDAExecutionProvider",
+                "CPUExecutionProvider",
+            ],
+            "cuda",
+        )
+    raise click.ClickException(f"Unknown onnx-execution-providers preset: {preset!r}")
+
+
 @click.command()
 @click.option(
     "--image-path",
@@ -34,12 +55,26 @@
     type=int,
     help="Maximum number of detections used by post-processing.",
 )
+@click.option(
+    "--onnx-execution-providers",
+    "onnx_ep_preset",
+    type=click.Choice(["cpu", "cuda", "tensorrt"], case_sensitive=False),
+    default="cpu",
+    show_default=True,
+    help=(
+        "ONNX Runtime execution provider chain: "
+        "cpu (CPUExecutionProvider); "
+        "cuda (CUDAExecutionProvider then CPUExecutionProvider); "
+        "tensorrt (TensorrtExecutionProvider, CUDA, then CPU fallbacks)."
+    ),
+)
 def main(
     image_path: Path,
     model_path: Path,
     confidence: Optional[float] = None,
     iou_threshold: Optional[float] = None,
     max_detections: Optional[int] = None,
+    onnx_ep_preset: str = "cpu",
 ) -> None:
     image = cv2.imread(str(image_path))
     if image is None:
@@ -55,11 +90,17 @@ def main(
     if nms_params:
         click.echo(f"User provided NMS parameters: {nms_params}")
 
-    click.echo(f"Loading model: {model_path}")
+    onnx_ep_preset = onnx_ep_preset.lower()
+    onnx_providers, device_str = _onnx_ep_preset_to_providers_and_device(onnx_ep_preset)
+
+    click.echo(
+        f"Loading model: {model_path} "
+        f"(onnx_execution_providers={onnx_providers!r}, device={device_str!r})"
+    )
     model = AutoModel.from_pretrained(
         model_path,
-        onnx_execution_providers=["CPUExecutionProvider"],
-        device="cpu",
+        onnx_execution_providers=list(onnx_providers),
+        device=device_str,
     )
 
     click.echo(f"Fused NMS available: {model._inference_config.post_processing.fused}")