diff --git a/inference_models/examples/fused-nms/run_fused_nms_inference_benchmarking.py b/inference_models/examples/fused-nms/run_fused_nms_inference_benchmarking.py new file mode 100644 index 0000000000..d59be4276b --- /dev/null +++ b/inference_models/examples/fused-nms/run_fused_nms_inference_benchmarking.py @@ -0,0 +1,238 @@ +import json +import time +from pathlib import Path +from typing import Any, Optional + +import click +import cv2 +import numpy as np + +from inference_models import AutoModel + +TEST_BATCH_SIZE = 4 + + +def _onnx_ep_preset_to_providers_and_device( + preset: str, +) -> tuple[list[str], str]: + """Map CLI preset to ONNX Runtime provider chain and PyTorch device string.""" + if preset == "cpu": + return (["CPUExecutionProvider"], "cpu") + if preset == "cuda": + return (["CUDAExecutionProvider", "CPUExecutionProvider"], "cuda") + if preset == "tensorrt": + return ( + [ + "TensorrtExecutionProvider", + "CUDAExecutionProvider", + "CPUExecutionProvider", + ], + "cuda", + ) + raise click.ClickException(f"Unknown onnx-execution-providers preset: {preset!r}") + + +def _latency_report_dict( + *, + model_path: Path, + warmup_runs: int, + latencies_ms: list[float], + onnx_execution_providers_preset: str, + onnx_execution_providers: list[str], + device: str, + batch_size: int, + images: list[Path], +) -> dict[str, Any]: + return { + "model_path": str(model_path.resolve()), + "images": [str(image.resolve()) for image in images], + "onnx_execution_providers_preset": onnx_execution_providers_preset, + "onnx_execution_providers": onnx_execution_providers, + "device": device, + "batch_size": batch_size, + "warmup_runs": warmup_runs, + "timed_runs": len(latencies_ms), + "mean_ms": np.mean(latencies_ms), + "p_50_ms": np.percentile(latencies_ms, 50), + "p_95_ms": np.percentile(latencies_ms, 95), + "p_99_ms": np.percentile(latencies_ms, 99), + "mean_per_image_ms": np.mean(latencies_ms) / batch_size, + "throughput_fps": (batch_size * len(latencies_ms)) / 
(np.sum(latencies_ms) / 1000), + } + + +def _write_json(path: Path, payload: dict[str, Any]) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(json.dumps(payload, indent=2) + "\n") + + +@click.command() +@click.option( + "--run-name", + type=str, + required=True, + help="Name of the run for reporting. Will be used as a subdirectory in the target directory.", +) +@click.option( + "--image-dir", + type=click.Path(path_type=Path, exists=True, dir_okay=True, readable=True), + required=True, + help="Path to the input image directory.", +) +@click.option( + "--model-path", + type=click.Path(path_type=Path, exists=True, dir_okay=True, readable=True), + required=True, + help="Path to the model directory.", +) +@click.option( + "--target-dir", + type=click.Path(path_type=Path, file_okay=False), + required=True, + help="Directory for latency.json (created if missing).", +) +@click.option( + "--confidence", + type=float, + help="Confidence threshold used by post-processing.", +) +@click.option( + "--iou-threshold", + type=float, + help="IOU threshold used by post-processing.", +) +@click.option( + "--max-detections", + type=int, + help="Maximum number of detections used by post-processing.", +) +@click.option( + "-n", + "--benchmark-iters", + type=click.IntRange(min=1), + default=200, + show_default=True, + help=( + "Number of timed inference runs for benchmarking (mean/median/std in ms). " + "0 runs inference once without benchmark stats." 
+ ), +) +@click.option( + "--warmup", + type=click.IntRange(min=0), + default=20, + show_default=True, + help="Untimed warmup runs before timed iterations.", +) +@click.option( + "--onnx-execution-providers", + "onnx_ep_preset", + type=click.Choice(["cpu", "cuda", "tensorrt"], case_sensitive=False), + default="cpu", + show_default=True, + help=( + "ONNX Runtime execution provider chain: " + "cpu (CPUExecutionProvider); " + "cuda (CUDAExecutionProvider then CPUExecutionProvider); " + "tensorrt (TensorrtExecutionProvider, CUDA, then CPU fallbacks)." + ), +) +def main( + run_name: str, + image_dir: Path, + model_path: Path, + target_dir: Path, + confidence: Optional[float] = None, + iou_threshold: Optional[float] = None, + max_detections: Optional[int] = None, + benchmark_iters: int = 200, + warmup: int = 20, + onnx_ep_preset: str = "cpu", +) -> None: + onnx_ep_preset = onnx_ep_preset.lower() + onnx_providers, device_str = _onnx_ep_preset_to_providers_and_device(onnx_ep_preset) + + click.echo( + f"Loading model: {model_path} " + f"(onnx_execution_providers={onnx_providers!r}, device={device_str!r})" + ) + model = AutoModel.from_pretrained( + model_path, + onnx_execution_providers=list(onnx_providers), + device=device_str, + ) + + click.echo(f"Fused NMS available: {model._inference_config.post_processing.fused}") + + nms_params = { + "confidence": confidence, + "iou_threshold": iou_threshold, + "max_detections": max_detections, + } + nms_params = {name: value for name, value in nms_params.items() if value is not None} + + if nms_params: + click.echo(f"User provided NMS parameters: {nms_params}") + + forward_pass = model._inference_config.forward_pass + use_batching = forward_pass.static_batch_size is None + + if use_batching: + click.echo(f"Model exported as dynamic. Using image batch") + else: + click.echo(f"Model exported as static. 
Using single image inference") + + image_paths = list(image_dir.glob("*.jpg")) + batched_image_paths = image_paths[:TEST_BATCH_SIZE] if use_batching else image_paths[:1] + + images = [] + for image_path in batched_image_paths: + image = cv2.imread(str(image_path)) + if image is None: + raise click.ClickException(f"Could not load image from: {image_path}") + images.append(image) + + inputs = images[:TEST_BATCH_SIZE] if use_batching else images[0] + + click.echo(f"Warmup: {warmup} untimed runs..." if warmup > 0 else "No warmup runs.") + + for _ in range(warmup): + predictions = model(inputs, **nms_params) + _ = predictions[0].to_supervision() + + click.echo(f"Benchmarking: {benchmark_iters} timed runs...") + + latencies_ms: list[float] = [] + for _ in range(benchmark_iters): + t0 = time.perf_counter() + predictions = model(inputs, **nms_params) + _ = predictions[0].to_supervision() + latencies_ms.append((time.perf_counter() - t0) * 1000.0) + + click.echo("Writing reports ...") + + target_dir.mkdir(parents=True, exist_ok=True) + latency_path = target_dir / run_name / "latency.json" + nms_params_path = target_dir / run_name / "nms_params.json" + inference_config_path = target_dir / run_name / "inference_config.json" + + _write_json( + latency_path, + _latency_report_dict( + model_path=model_path, + warmup_runs=warmup, + latencies_ms=latencies_ms, + onnx_execution_providers_preset=onnx_ep_preset, + onnx_execution_providers=list(onnx_providers), + device=device_str, + batch_size=len(inputs) if isinstance(inputs, list) else 1, + images=batched_image_paths, + ), + ) + _write_json(inference_config_path, model._inference_config.model_dump(mode="json")) + _write_json(nms_params_path, nms_params) + + click.echo("Done!") + +if __name__ == "__main__": + main() diff --git a/inference_models/examples/fused-nms/run_single_fused_nms_inference.py b/inference_models/examples/fused-nms/run_single_fused_nms_inference.py new file mode 100644 index 0000000000..6c5f0b48c5 --- /dev/null +++ 
b/inference_models/examples/fused-nms/run_single_fused_nms_inference.py @@ -0,0 +1,138 @@ +from pathlib import Path +from typing import Optional + +import click +import cv2 + +from inference_models import AutoModel + + +def _onnx_ep_preset_to_providers_and_device( + preset: str, +) -> tuple[list[str], str]: + """Map CLI preset to ONNX Runtime provider chain and PyTorch device string.""" + if preset == "cpu": + return (["CPUExecutionProvider"], "cpu") + if preset == "cuda": + return (["CUDAExecutionProvider", "CPUExecutionProvider"], "cuda") + if preset == "tensorrt": + return ( + [ + "TensorrtExecutionProvider", + "CUDAExecutionProvider", + "CPUExecutionProvider", + ], + "cuda", + ) + raise click.ClickException(f"Unknown onnx-execution-providers preset: {preset!r}") + + +@click.command() +@click.option( + "--image-path", + type=click.Path(path_type=Path, exists=True, dir_okay=False, readable=True), + required=True, + help="Path to the input image.", +) +@click.option( + "--model-path", + type=click.Path(path_type=Path, exists=True, dir_okay=True, readable=True), + required=True, + help="Path to the model directory.", +) +@click.option( + "--confidence", + type=float, + help="Confidence threshold used by post-processing.", +) +@click.option( + "--iou-threshold", + type=float, + help="IOU threshold used by post-processing.", +) +@click.option( + "--max-detections", + type=int, + help="Maximum number of detections used by post-processing.", +) +@click.option( + "--onnx-execution-providers", + "onnx_ep_preset", + type=click.Choice(["cpu", "cuda", "tensorrt"], case_sensitive=False), + default="cpu", + show_default=True, + help=( + "ONNX Runtime execution provider chain: " + "cpu (CPUExecutionProvider); " + "cuda (CUDAExecutionProvider then CPUExecutionProvider); " + "tensorrt (TensorrtExecutionProvider, CUDA, then CPU fallbacks)." 
+ ), +) +def main( + image_path: Path, + model_path: Path, + confidence: Optional[float] = None, + iou_threshold: Optional[float] = None, + max_detections: Optional[int] = None, + onnx_ep_preset: str = "cpu", +) -> None: + image = cv2.imread(str(image_path)) + if image is None: + raise click.ClickException(f"Could not load image from: {image_path}") + + nms_params = { + "confidence": confidence, + "iou_threshold": iou_threshold, + "max_detections": max_detections, + } + + nms_params = {name: value for name, value in nms_params.items() if value is not None} + if nms_params: + click.echo(f"User provided NMS parameters: {nms_params}") + + onnx_ep_preset = onnx_ep_preset.lower() + onnx_providers, device_str = _onnx_ep_preset_to_providers_and_device(onnx_ep_preset) + + click.echo( + f"Loading model: {model_path} " + f"(onnx_execution_providers={onnx_providers!r}, device={device_str!r})" + ) + model = AutoModel.from_pretrained( + model_path, + onnx_execution_providers=list(onnx_providers), + device=device_str, + ) + + click.echo(f"Fused NMS available: {model._inference_config.post_processing.fused}") + + forward_pass = model._inference_config.forward_pass + if forward_pass.static_batch_size is None: + max_dyn = forward_pass.max_dynamic_batch_size + if max_dyn is not None: + click.echo( + "Batching: dynamic mode (no static batch size); " + f"maximum batch size is {max_dyn}." + ) + else: + click.echo( + "Batching: dynamic mode (no static batch size); " + "max_dynamic_batch_size is not set in the model config." 
+ ) + + click.echo("Running inference...") + predictions = model(image, **nms_params) + detections = predictions[0].to_supervision() + + click.echo(f"Detected {len(detections)} objects") + for idx, (xyxy, class_id, conf) in enumerate( + zip(detections.xyxy, detections.class_id, detections.confidence), start=1 + ): + x1, y1, x2, y2 = [int(v) for v in xyxy.tolist()] + click.echo( + f"[{idx}] class_id={int(class_id)} confidence={float(conf):.4f} " + f"bbox=({x1}, {y1}, {x2}, {y2})" + ) + + +if __name__ == "__main__": + main() diff --git a/inference_models/inference_models/configuration.py b/inference_models/inference_models/configuration.py index a0a34b39fb..21d8b88b0d 100644 --- a/inference_models/inference_models/configuration.py +++ b/inference_models/inference_models/configuration.py @@ -5,8 +5,10 @@ from inference_models.utils.environment import ( get_boolean_from_env, get_comma_separated_list_of_integers_from_env, + get_comma_separated_list_of_strings_from_env, get_float_from_env, get_integer_from_env, + get_string_from_env, parse_comma_separated_values, ) @@ -350,6 +352,31 @@ variable_name="INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_CLASS_AGNOSTIC_NMS", default=INFERENCE_MODELS_DEFAULT_CLASS_AGNOSTIC_NMS, ) +INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_IMAGES_INPUT_NAME = get_string_from_env( + variable_name="INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_IMAGES_INPUT_NAME", + default="images", +) +INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_CONFIDENCE_INPUT_NAME = get_string_from_env( + variable_name="INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_CONFIDENCE_INPUT_NAME", + default="confidence", +) +INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_IOU_THRESHOLD_INPUT_NAME = get_string_from_env( + variable_name="INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_IOU_THRESHOLD_INPUT_NAME", + default="iou_threshold", +) +INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_MAX_DETECTIONS_INPUT_NAME = get_string_from_env( + variable_name="INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_MAX_DETECTIONS_INPUT_NAME", 
+ default="max_output_boxes_per_class", +) +INFERENCE_MODELS_YOLO_ULTRALYTICS_DECLARED_FUSED_NMS_INPUT_NAMES = get_comma_separated_list_of_strings_from_env( + variable_name="INFERENCE_MODELS_YOLO_ULTRALYTICS_DECLARED_FUSED_NMS_INPUT_NAMES", + default=[ + INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_IMAGES_INPUT_NAME, + INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_CONFIDENCE_INPUT_NAME, + INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_IOU_THRESHOLD_INPUT_NAME, + INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_MAX_DETECTIONS_INPUT_NAME, + ], +) INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_KEY_POINTS_THRESHOLD = get_float_from_env( variable_name="INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_KEY_POINTS_THRESHOLD", default=0.0, diff --git a/inference_models/inference_models/models/common/onnx.py b/inference_models/inference_models/models/common/onnx.py index 510e1d1682..b99f5289b7 100644 --- a/inference_models/inference_models/models/common/onnx.py +++ b/inference_models/inference_models/models/common/onnx.py @@ -198,8 +198,10 @@ def run_onnx_session_with_batch_size_limit( Args: session: ONNX Runtime inference session. - inputs: Dictionary mapping input names to PyTorch tensors. All tensors - must have the same batch size (first dimension). + inputs: Dictionary mapping input names to PyTorch tensors. Tensors that + participate in the main batch must share the same size on dimension 0. + Tensors with dimension 0 equal to 1 are treated as broadcast scalars + and are not split across chunks. output_shape_mapping: Optional dictionary mapping output names to their expected shapes. Used for pre-allocating output buffers. If None, @@ -217,7 +219,8 @@ def run_onnx_session_with_batch_size_limit( the model's output specification. Raises: - ModelRuntimeError: If input tensors have different batch sizes. + ModelInputError: If dimension-0 sizes are incompatible (for example two + different batch sizes both greater than 1). 
Examples: Run inference with batch size limit: @@ -257,7 +260,7 @@ - Automatically handles batch splitting and result concatenation - Pads the last chunk if min_batch_size is specified - Uses `run_onnx_session_via_iobinding()` internally for efficiency - - All input tensors must have the same batch size + - Broadcast inputs with batch dimension 1 are supported alongside batched tensors See Also: - `run_onnx_session_via_iobinding()`: Lower-level ONNX execution @@ -269,20 +272,24 @@ inputs=inputs, output_shape_mapping=output_shape_mapping, ) - input_batch_sizes = set() - for input_tensor in inputs.values(): - input_batch_sizes.add(input_tensor.shape[0]) - if len(input_batch_sizes) != 1: + + batch_input_sizes = [tensor.shape[0] for tensor in inputs.values() if tensor.numel() != 1] + batch_size = max(batch_input_sizes, default=1) + is_incompatible_batch_size_set = [ + size for size in batch_input_sizes if size not in (1, batch_size) + ] + if is_incompatible_batch_size_set: raise ModelInputError( - message="When running forward pass through ONNX model detected inputs with different batch sizes. " - "This is the error with the model you run. If the model was trained or exported " - "on Roboflow platform - contact us to get help. Otherwise, verify your model package or " - "implementation of the model class.", + message="When running forward pass through ONNX model detected inputs with incompatible sizes on " + "dimension 0. Expected each tensor to have either size 1 (scalar/broadcast inputs) or the same " + f"primary batch size ({batch_size}). Got distinct sizes: {sorted(set(batch_input_sizes))!r}. " + "If the model was trained or exported on Roboflow platform, contact us for help. 
Otherwise, " + "verify your model package or implementation of the model class.", help_url="https://inference-models.roboflow.com/errors/input-validation/#modelinputerror", ) - input_batch_size = input_batch_sizes.pop() - if input_batch_size <= max_batch_size and ( - min_batch_size is None or input_batch_size >= min_batch_size + + if batch_size <= max_batch_size and ( + min_batch_size is None or batch_size >= min_batch_size ): # no point iterating return run_onnx_session_via_iobinding( @@ -293,25 +300,28 @@ all_results = [] for _ in session.get_outputs(): all_results.append([]) - for i in range(0, input_batch_size, max_batch_size): + for i in range(0, batch_size, max_batch_size): batch_inputs = {} reminder = 0 for name, value in inputs.items(): - batched_value = value[i : i + max_batch_size] - if min_batch_size is not None: - reminder = min_batch_size - batched_value.shape[0] - if reminder > 0: - batched_value = torch.cat( - ( - batched_value, - torch.zeros( - (reminder,) + batched_value.shape[1:], - dtype=batched_value.dtype, - device=batched_value.device, + if value.ndim > 0 and value.shape[0] == batch_size: + batched_value = value[i : i + max_batch_size] + if min_batch_size is not None: + reminder = min_batch_size - batched_value.shape[0] + if reminder > 0: + batched_value = torch.cat( + ( + batched_value, + torch.zeros( + (reminder,) + batched_value.shape[1:], + dtype=batched_value.dtype, + device=batched_value.device, + ), ), - ), - dim=0, - ) + dim=0, + ) + else: + batched_value = value batched_value = batched_value.contiguous() batch_inputs[name] = batched_value batch_output_shape_mapping = None diff --git a/inference_models/inference_models/models/yolov8/yolov8_object_detection_onnx.py b/inference_models/inference_models/models/yolov8/yolov8_object_detection_onnx.py index e7607a7c12..d6ef1a65de 100644 --- a/inference_models/inference_models/models/yolov8/yolov8_object_detection_onnx.py +++ 
b/inference_models/inference_models/models/yolov8/yolov8_object_detection_onnx.py @@ -11,8 +11,14 @@ INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_CONFIDENCE, INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_IOU_THRESHOLD, INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_MAX_DETECTIONS, + INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_IMAGES_INPUT_NAME, + INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_CONFIDENCE_INPUT_NAME, + INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_IOU_THRESHOLD_INPUT_NAME, + INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_MAX_DETECTIONS_INPUT_NAME, + INFERENCE_MODELS_YOLO_ULTRALYTICS_DECLARED_FUSED_NMS_INPUT_NAMES, ) from inference_models.entities import ColorFormat +from inference_models.logger import LOGGER from inference_models.errors import ( CorruptedModelPackageError, EnvironmentConfigurationError, @@ -127,13 +133,61 @@ def from_pretrained( path_or_bytes=model_package_content["weights.onnx"], providers=onnx_execution_providers, ) - input_batch_size = session.get_inputs()[0].shape[0] + onnx_graph_inputs = session.get_inputs() + input_names = [input.name for input in onnx_graph_inputs] + + if inference_config.post_processing.fused: + expected_fused_nms_inputs = ( + INFERENCE_MODELS_YOLO_ULTRALYTICS_DECLARED_FUSED_NMS_INPUT_NAMES + ) + expected_fused_nms_input_set = set(expected_fused_nms_inputs) + if ( + INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_IMAGES_INPUT_NAME + not in input_names + ): + raise CorruptedModelPackageError( + message=( + f"Fused NMS YOLOv8 ONNX model must declare the images input " + f"({INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_IMAGES_INPUT_NAME!r}). " + f"Got: {input_names}" + ), + help_url="https://inference-models.roboflow.com/errors/model-loading/#corruptedmodelpackageerror", + ) + + unexpected_inputs = [ + name for name in input_names if name not in expected_fused_nms_input_set + ] + if unexpected_inputs: + raise CorruptedModelPackageError( + message=( + f"Fused NMS YOLOv8 ONNX model has unexpected inputs {unexpected_inputs}. 
" + f"Expected each name to be one of: {expected_fused_nms_input_set}" + ), + help_url="https://inference-models.roboflow.com/errors/model-loading/#corruptedmodelpackageerror", + ) + + missing_inputs = [ + name for name in expected_fused_nms_inputs if name not in input_names + ] + if missing_inputs: + LOGGER.warning( + "Fused NMS ONNX graph omits inputs %s; they will not be passed at " + "inference time and ONNX Runtime will use graph initializer defaults for those parameters. " + "Python arguments matching omitted inputs (e.g. confidence, iou_threshold, max_detections) " + "will not affect the fused NMS stage.", + missing_inputs, + ) + + images_input = [graph_input for graph_input in onnx_graph_inputs if graph_input.name == INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_IMAGES_INPUT_NAME][0] + input_batch_size = images_input.shape[0] + + # Dynamic batch size export results in "batch" string as dimension 0 representation if isinstance(input_batch_size, str): input_batch_size = None - input_name = session.get_inputs()[0].name + return cls( session=session, - input_name=input_name, + input_names=input_names, class_names=class_names, inference_config=inference_config, device=device, @@ -143,14 +197,14 @@ def from_pretrained( def __init__( self, session: onnxruntime.InferenceSession, - input_name: str, + input_names: List[str], inference_config: InferenceConfig, class_names: List[str], device: torch.device, input_batch_size: Optional[int], ): self._session = session - self._input_name = input_name + self._input_names = input_names self._inference_config = inference_config self._class_names = class_names self._device = device @@ -184,11 +238,40 @@ def pre_process( pre_processing_overrides=pre_processing_overrides, ) - def forward(self, pre_processed_images: torch.Tensor, **kwargs) -> torch.Tensor: + def forward( + self, + pre_processed_images: torch.Tensor, + confidence: Optional[float] = INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_CONFIDENCE, + iou_threshold: Optional[float] 
= INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_IOU_THRESHOLD, + max_detections: Optional[int] = INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_MAX_DETECTIONS, + **kwargs, + ) -> torch.Tensor: + with self._session_thread_lock: + device = pre_processed_images.device + + input_builders = { + INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_IMAGES_INPUT_NAME: lambda: pre_processed_images, + } + + if self._inference_config.post_processing.fused: + if INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_CONFIDENCE_INPUT_NAME in self._input_names: + input_builders[INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_CONFIDENCE_INPUT_NAME] = lambda: torch.tensor( + [float(confidence)], dtype=torch.float32, device=device + ) + if INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_IOU_THRESHOLD_INPUT_NAME in self._input_names: + input_builders[INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_IOU_THRESHOLD_INPUT_NAME] = lambda: torch.tensor( + [float(iou_threshold)], dtype=torch.float32, device=device + ) + if INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_MAX_DETECTIONS_INPUT_NAME in self._input_names: + input_builders[INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_MAX_DETECTIONS_INPUT_NAME] = lambda: torch.tensor( + [int(max_detections)], dtype=torch.int64, device=device + ) + + inputs = {name: builder_fn() for name, builder_fn in input_builders.items()} + + return run_onnx_session_with_batch_size_limit( session=self._session, - inputs={self._input_name: pre_processed_images}, + inputs=inputs, min_batch_size=self._min_batch_size, max_batch_size=self._max_batch_size, )[0] diff --git a/inference_models/inference_models/utils/environment.py b/inference_models/inference_models/utils/environment.py index e43fd35553..b23c007d17 100644 --- a/inference_models/inference_models/utils/environment.py +++ b/inference_models/inference_models/utils/environment.py @@ -72,6 +72,39 @@ def get_comma_separated_list_of_integers_from_env( ) + +def get_comma_separated_list_of_strings_from_env( + variable_name: str, + default: Optional[List[str]] = None, +) -> 
List[str]: + value = os.getenv(variable_name) + if value is None: + if default is None: + raise InvalidEnvVariable( + message=f"Environment variable {variable_name} is required", + help_url="https://inference-models.roboflow.com/errors/runtime-environment/#invalidenvvariable", + ) + return default + try: + return [v.strip() for v in parse_comma_separated_values(value)] + except ValueError: + raise InvalidEnvVariable( + message=f"Expected a environment variable `{variable_name}` to be comma separated list of strings but got '{value}'", + help_url="https://inference-models.roboflow.com/errors/runtime-environment/#invalidenvvariable", + ) + + +def get_string_from_env(variable_name: str, default: Optional[str] = None) -> str: + value = os.getenv(variable_name) + if value is None: + if default is None: + raise InvalidEnvVariable( + message=f"Environment variable {variable_name} is required", + help_url="https://inference-models.roboflow.com/errors/runtime-environment/#invalidenvvariable", + ) + return default + return value.strip() + + def parse_comma_separated_values(values: str) -> List[str]: if not values: return []