diff --git a/inference/core/interfaces/http/http_api.py b/inference/core/interfaces/http/http_api.py index a9e1a5e4a1..b90351b281 100644 --- a/inference/core/interfaces/http/http_api.py +++ b/inference/core/interfaces/http/http_api.py @@ -5,7 +5,6 @@ from concurrent.futures import CancelledError, Future, ThreadPoolExecutor from functools import partial from threading import Lock, Thread -from time import sleep from typing import Annotated, Any, Dict, List, Optional, Tuple, Union from uuid import uuid4 @@ -21,7 +20,7 @@ Query, Request, ) -from fastapi.responses import JSONResponse, RedirectResponse, Response +from fastapi.responses import JSONResponse, Response from fastapi.staticfiles import StaticFiles from fastapi_cprofile.profiler import CProfileMiddleware from pydantic import ValidationError @@ -44,10 +43,8 @@ ClassificationInferenceRequest, DepthEstimationRequest, InferenceRequest, - InferenceRequestImage, InstanceSegmentationInferenceRequest, KeypointsDetectionInferenceRequest, - LMMInferenceRequest, ObjectDetectionInferenceRequest, SemanticSegmentationInferenceRequest, ) @@ -92,13 +89,11 @@ InferenceResponse, InstanceSegmentationInferenceResponse, KeypointsDetectionInferenceResponse, - LMMInferenceResponse, MultiLabelClassificationInferenceResponse, ObjectDetectionInferenceResponse, SemanticSegmentationInferenceResponse, StubResponse, ) -from inference.core.entities.responses.notebooks import NotebookStartResponse from inference.core.entities.responses.ocr import OCRInferenceResponse from inference.core.entities.responses.perception_encoder import ( PerceptionEncoderCompareResponse, @@ -133,7 +128,6 @@ API_BASE_URL, API_LOGGING_ENABLED, BUILDER_ORIGIN, - CONFIDENCE_LOWER_BOUND_OOM_PREVENTION, CORE_MODEL_CLIP_ENABLED, CORE_MODEL_DOCTR_ENABLED, CORE_MODEL_EASYOCR_ENABLED, @@ -161,20 +155,14 @@ HTTP_API_SHARED_WORKFLOWS_THREAD_POOL_ENABLED, HTTP_API_SHARED_WORKFLOWS_THREAD_POOL_WORKERS, LAMBDA, - LEGACY_ROUTE_ENABLED, LMM_ENABLED, METRICS_ENABLED, - 
MOONDREAM2_ENABLED, - NOTEBOOK_ENABLED, - NOTEBOOK_PASSWORD, - NOTEBOOK_PORT, PINNED_MODELS, PRELOAD_API_KEY, PRELOAD_MODELS, PROFILE, ROBOFLOW_INTERNAL_SERVICE_NAME, ROBOFLOW_INTERNAL_SERVICE_SECRET, - ROBOFLOW_SERVICE_SECRET, SAM3_EXEC_MODE, SAM3_FINE_TUNED_MODELS_ENABLED, USE_INFERENCE_MODELS, @@ -185,24 +173,26 @@ WORKFLOWS_STEP_EXECUTION_MODE, ) from inference.core.exceptions import ( - ContentTypeInvalid, - ContentTypeMissing, InputImageLoadError, MissingApiKeyError, - MissingServiceSecretError, RoboflowAPINotAuthorizedError, RoboflowAPINotNotFoundError, WebRTCConfigurationError, WorkspaceLoadError, ) from inference.core.interfaces.base import BaseInterface -from inference.core.interfaces.http.dependencies import ( - parse_body_content_for_legacy_request_handler, -) from inference.core.interfaces.http.error_handlers import ( with_route_exceptions, - with_route_exceptions_async, ) +from inference.core.interfaces.http.routes.info import create_info_router +from inference.core.interfaces.http.routes.inference import create_inference_router +from inference.core.interfaces.http.routes.models import create_models_router +from inference.core.interfaces.http.routes.core_models import create_core_models_router +from inference.core.interfaces.http.routes.stream import create_stream_router +from inference.core.interfaces.http.routes.webrtc import create_webrtc_worker_router +from inference.core.interfaces.http.routes.notebook import create_notebook_router +from inference.core.interfaces.http.routes.workflows import create_workflows_router +from inference.core.interfaces.http.routes.legacy import create_legacy_router from inference.core.interfaces.http.handlers.workflows import ( filter_out_unwanted_workflow_outputs, handle_describe_workflows_blocks_request, @@ -215,12 +205,9 @@ orjson_response_keeping_parent_id, ) from inference.core.interfaces.stream_manager.api.entities import ( - CommandContext, CommandResponse, ConsumePipelineResponse, 
InferencePipelineStatusResponse, - InitializeWebRTCPipelineResponse, - InitializeWebRTCResponse, ListPipelinesResponse, ) from inference.core.interfaces.stream_manager.api.stream_manager_client import ( @@ -232,11 +219,6 @@ InitialiseWebRTCPipelinePayload, OperationStatus, ) -from inference.core.interfaces.webrtc_worker import start_worker -from inference.core.interfaces.webrtc_worker.entities import ( - WebRTCWorkerRequest, - WebRTCWorkerResult, -) from inference.core.managers.base import ModelManager from inference.core.managers.metrics import get_container_stats from inference.core.managers.prometheus import InferenceInstrumentator @@ -247,7 +229,6 @@ get_workflow_specification, ) from inference.core.utils.container import is_docker_socket_mounted -from inference.core.utils.notebooks import start_notebook from inference.core.workflows.core_steps.common.entities import StepExecutionMode from inference.core.workflows.errors import WorkflowError, WorkflowSyntaxError from inference.core.workflows.execution_engine.core import ( @@ -729,1055 +710,34 @@ def process_inference_request( ) return orjson_response(resp) - def process_workflow_inference_request( - workflow_request: WorkflowInferenceRequest, - workflow_specification: dict, - background_tasks: Optional[BackgroundTasks], - profiler: WorkflowsProfiler, - ) -> WorkflowInferenceResponse: - - workflow_init_parameters = { - "workflows_core.model_manager": model_manager, - "workflows_core.api_key": workflow_request.api_key, - "workflows_core.background_tasks": background_tasks, - } - execution_engine = ExecutionEngine.init( - workflow_definition=workflow_specification, - init_parameters=workflow_init_parameters, - max_concurrent_steps=WORKFLOWS_MAX_CONCURRENT_STEPS, - prevent_local_images_loading=True, - profiler=profiler, - executor=self.shared_thread_pool_executor, - workflow_id=workflow_request.workflow_id, - ) - is_preview = False - if hasattr(workflow_request, "is_preview"): - is_preview = 
workflow_request.is_preview - workflow_results = execution_engine.run( - runtime_parameters=workflow_request.inputs, - serialize_results=True, - _is_preview=is_preview, - ) - with profiler.profile_execution_phase( - name="workflow_results_filtering", - categories=["inference_package_operation"], - ): - outputs = filter_out_unwanted_workflow_outputs( - workflow_results=workflow_results, - excluded_fields=workflow_request.excluded_fields, - ) - profiler_trace = profiler.export_trace() - response = WorkflowInferenceResponse( - outputs=outputs, - profiler_trace=profiler_trace, - ) - return orjson_response(response=response) - - def load_core_model( - inference_request: InferenceRequest, - api_key: Optional[str] = None, - core_model: str = None, - countinference: Optional[bool] = None, - service_secret: Optional[str] = None, - ) -> None: - """Loads a core model (e.g., "clip" or "sam") into the model manager. - - Args: - inference_request (InferenceRequest): The request containing version and other details. - api_key (Optional[str]): The API key for the request. - core_model (str): The core model type, e.g., "clip" or "sam". - countinference (Optional[bool]): Whether to count inference or not. - service_secret (Optional[str]): The service secret for the request. - - Returns: - str: The core model ID. - """ - if api_key: - inference_request.api_key = api_key - version_id_field = f"{core_model}_version_id" - core_model_id = ( - f"{core_model}/{inference_request.__getattribute__(version_id_field)}" - ) - self.model_manager.add_model( - core_model_id, - inference_request.api_key, - endpoint_type=ModelEndpointType.CORE_MODEL, - countinference=countinference, - service_secret=service_secret, - ) - return core_model_id - - load_clip_model = partial(load_core_model, core_model="clip") - """Loads the CLIP model into the model manager. - - Args: - Same as `load_core_model`. - - Returns: - The CLIP model ID. 
- """ - - load_pe_model = partial(load_core_model, core_model="perception_encoder") - """Loads the Perception Encoder model into the model manager. - - Args: - Same as `load_core_model`. - - Returns: - The Perception Encoder model ID. - """ - - load_sam_model = partial(load_core_model, core_model="sam") - """Loads the SAM model into the model manager. - - Args: - Same as `load_core_model`. - - Returns: - The SAM model ID. - """ - load_sam2_model = partial(load_core_model, core_model="sam2") - """Loads the SAM2 model into the model manager. - - Args: - Same as `load_core_model`. - - Returns: - The SAM2 model ID. - """ - - load_gaze_model = partial(load_core_model, core_model="gaze") - """Loads the GAZE model into the model manager. - - Args: - Same as `load_core_model`. - - Returns: - The GAZE model ID. - """ - - load_doctr_model = partial(load_core_model, core_model="doctr") - """Loads the DocTR model into the model manager. - - Args: - Same as `load_core_model`. - - Returns: - The DocTR model ID. - """ - - load_easy_ocr_model = partial(load_core_model, core_model="easy_ocr") - """Loads the EasyOCR model into the model manager. - - Args: - Same as `load_core_model`. - - Returns: - The EasyOCR model ID. - """ - - load_paligemma_model = partial(load_core_model, core_model="paligemma") - - load_grounding_dino_model = partial( - load_core_model, core_model="grounding_dino" - ) - """Loads the Grounding DINO model into the model manager. - - Args: - Same as `load_core_model`. - - Returns: - The Grounding DINO model ID. - """ - - load_yolo_world_model = partial(load_core_model, core_model="yolo_world") - load_owlv2_model = partial(load_core_model, core_model="owlv2") - """Loads the YOLO World model into the model manager. - - Args: - Same as `load_core_model`. - - Returns: - The YOLO World model ID. - """ - - load_trocr_model = partial(load_core_model, core_model="trocr") - """Loads the TrOCR model into the model manager. - - Args: - Same as `load_core_model`. 
- - Returns: - The TrOCR model ID. - """ - - @app.get( - "/info", - response_model=ServerVersionInfo, - summary="Info", - description="Get the server name and version number", - ) - def root(): - """Endpoint to get the server name and version number. - - Returns: - ServerVersionInfo: The server version information. - """ - return ServerVersionInfo( - name="Roboflow Inference Server", - version=__version__, - uuid=GLOBAL_INFERENCE_SERVER_ID, - ) - - @app.get( - "/logs", - summary="Get Recent Logs", - description="Get recent application logs for debugging", - ) - def get_logs( - limit: Optional[int] = Query( - 100, description="Maximum number of log entries to return" - ), - level: Optional[str] = Query( - None, - description="Filter by log level (DEBUG, INFO, WARNING, ERROR, CRITICAL)", - ), - since: Optional[str] = Query( - None, description="Return logs since this ISO timestamp" - ), - ): - """Get recent application logs from memory. + app.include_router(create_info_router()) - Only available when ENABLE_IN_MEMORY_LOGS environment variable is set to 'true'. 
+ if (not LAMBDA and GET_MODEL_REGISTRY_ENABLED) or not (LAMBDA or GCP_SERVERLESS): - Args: - limit: Maximum number of log entries (default 100) - level: Filter by log level - since: ISO timestamp to filter logs since - - Returns: - List of log entries with timestamp, level, logger, and message - """ - # Check if in-memory logging is enabled - from inference.core.logging.memory_handler import ( - get_recent_logs, - is_memory_logging_enabled, - ) - - if not is_memory_logging_enabled(): - raise HTTPException( - status_code=404, detail="Logs endpoint not available" - ) - - try: - logs = get_recent_logs(limit=limit or 100, level=level, since=since) - return {"logs": logs, "total_count": len(logs)} - except (ImportError, ModuleNotFoundError): - raise HTTPException( - status_code=500, detail="Logging system not properly initialized" - ) - - if not LAMBDA and GET_MODEL_REGISTRY_ENABLED: - - @app.get( - "/model/registry", - response_model=ModelsDescriptions, - summary="Get model keys", - description="Get the ID of each loaded model", - ) - def registry(): - """Get the ID of each loaded model in the registry. - - Returns: - ModelsDescriptions: The object containing models descriptions - """ - logger.debug(f"Reached /model/registry") - models_descriptions = self.model_manager.describe_models() - return ModelsDescriptions.from_models_descriptions( - models_descriptions=models_descriptions - ) - - # The current AWS Lambda authorizer only supports path parameters, therefore we can only use the legacy infer route. This case statement excludes routes which won't work for the current Lambda authorizer. 
- if not (LAMBDA or GCP_SERVERLESS): - - @app.post( - "/model/add", - response_model=ModelsDescriptions, - summary="Load a model", - description="Load the model with the given model ID", - ) - @with_route_exceptions - def model_add( - request: AddModelRequest, - countinference: Optional[bool] = None, - service_secret: Optional[str] = None, - ): - """Load the model with the given model ID into the model manager. - - Args: - request (AddModelRequest): The request containing the model ID and optional API key. - countinference (Optional[bool]): Whether to count inference or not. - service_secret (Optional[str]): The service secret for the request. - - Returns: - ModelsDescriptions: The object containing models descriptions - """ - logger.debug(f"Reached /model/add") - de_aliased_model_id = resolve_roboflow_model_alias( - model_id=request.model_id - ) - logger.info(f"Loading model: {de_aliased_model_id}") - self.model_manager.add_model( - de_aliased_model_id, - request.api_key, - countinference=countinference, - service_secret=service_secret, - ) - models_descriptions = self.model_manager.describe_models() - return ModelsDescriptions.from_models_descriptions( - models_descriptions=models_descriptions - ) - - @app.post( - "/model/remove", - response_model=ModelsDescriptions, - summary="Remove a model", - description="Remove the model with the given model ID", - ) - @with_route_exceptions - def model_remove(request: ClearModelRequest): - """Remove the model with the given model ID from the model manager. - - Args: - request (ClearModelRequest): The request containing the model ID to be removed. 
- - Returns: - ModelsDescriptions: The object containing models descriptions - """ - logger.debug(f"Reached /model/remove") - de_aliased_model_id = resolve_roboflow_model_alias( - model_id=request.model_id - ) - self.model_manager.remove(de_aliased_model_id) - models_descriptions = self.model_manager.describe_models() - return ModelsDescriptions.from_models_descriptions( - models_descriptions=models_descriptions - ) - - @app.post( - "/model/clear", - response_model=ModelsDescriptions, - summary="Remove all models", - description="Remove all loaded models", - ) - @with_route_exceptions - def model_clear(): - """Remove all loaded models from the model manager. - - Returns: - ModelsDescriptions: The object containing models descriptions - """ - logger.debug(f"Reached /model/clear") - self.model_manager.clear() - models_descriptions = self.model_manager.describe_models() - return ModelsDescriptions.from_models_descriptions( - models_descriptions=models_descriptions + app.include_router( + create_models_router(model_manager=self.model_manager) ) # these NEW endpoints need authentication protection if not LAMBDA and not GCP_SERVERLESS: - @app.post( - "/infer/object_detection", - response_model=Union[ - ObjectDetectionInferenceResponse, - List[ObjectDetectionInferenceResponse], - StubResponse, - ], - summary="Object detection infer", - description="Run inference with the specified object detection model", - response_model_exclude_none=True, - ) - @with_route_exceptions - @usage_collector("request") - def infer_object_detection( - inference_request: ObjectDetectionInferenceRequest, - background_tasks: BackgroundTasks, - countinference: Optional[bool] = None, - service_secret: Optional[str] = None, - ): - """Run inference with the specified object detection model. - - Args: - inference_request (ObjectDetectionInferenceRequest): The request containing the necessary details for object detection. 
- background_tasks: (BackgroundTasks) pool of fastapi background tasks - - Returns: - Union[ObjectDetectionInferenceResponse, List[ObjectDetectionInferenceResponse]]: The response containing the inference results. - """ - logger.debug(f"Reached /infer/object_detection") - return process_inference_request( - inference_request, - active_learning_eligible=True, - background_tasks=background_tasks, - countinference=countinference, - service_secret=service_secret, - ) - - @app.post( - "/infer/instance_segmentation", - response_model=Union[ - InstanceSegmentationInferenceResponse, StubResponse - ], - summary="Instance segmentation infer", - description="Run inference with the specified instance segmentation model", - ) - @with_route_exceptions - @usage_collector("request") - def infer_instance_segmentation( - inference_request: InstanceSegmentationInferenceRequest, - background_tasks: BackgroundTasks, - countinference: Optional[bool] = None, - service_secret: Optional[str] = None, - ): - """Run inference with the specified instance segmentation model. - - Args: - inference_request (InstanceSegmentationInferenceRequest): The request containing the necessary details for instance segmentation. - background_tasks: (BackgroundTasks) pool of fastapi background tasks - - Returns: - InstanceSegmentationInferenceResponse: The response containing the inference results. 
- """ - logger.debug(f"Reached /infer/instance_segmentation") - return process_inference_request( - inference_request, - active_learning_eligible=True, - background_tasks=background_tasks, - countinference=countinference, - service_secret=service_secret, - ) - - @app.post( - "/infer/semantic_segmentation", - response_model=Union[ - SemanticSegmentationInferenceResponse, StubResponse - ], - summary="Semantic segmentation infer", - description="Run inference with the specified semantic segmentation model", - ) - @with_route_exceptions - @usage_collector("request") - def infer_semantic_segmentation( - inference_request: SemanticSegmentationInferenceRequest, - background_tasks: BackgroundTasks, - countinference: Optional[bool] = None, - service_secret: Optional[str] = None, - ): - """Run inference with the specified semantic segmentation model. - - Args: - inference_request (SemanticSegmentationInferenceRequest): The request containing the necessary details for semantic segmentation. - background_tasks: (BackgroundTasks) pool of fastapi background tasks - - Returns: - SemanticSegmentationInferenceResponse: The response containing the inference results. 
- """ - logger.debug(f"Reached /infer/semantic_segmentation") - return process_inference_request( - inference_request, - active_learning_eligible=True, - background_tasks=background_tasks, - countinference=countinference, - service_secret=service_secret, - ) - - @app.post( - "/infer/classification", - response_model=Union[ - ClassificationInferenceResponse, - MultiLabelClassificationInferenceResponse, - StubResponse, - ], - summary="Classification infer", - description="Run inference with the specified classification model", - ) - @with_route_exceptions - @usage_collector("request") - def infer_classification( - inference_request: ClassificationInferenceRequest, - background_tasks: BackgroundTasks, - countinference: Optional[bool] = None, - service_secret: Optional[str] = None, - ): - """Run inference with the specified classification model. - - Args: - inference_request (ClassificationInferenceRequest): The request containing the necessary details for classification. - background_tasks: (BackgroundTasks) pool of fastapi background tasks - - Returns: - Union[ClassificationInferenceResponse, MultiLabelClassificationInferenceResponse]: The response containing the inference results. - """ - logger.debug(f"Reached /infer/classification") - return process_inference_request( - inference_request, - active_learning_eligible=True, - background_tasks=background_tasks, - countinference=countinference, - service_secret=service_secret, - ) - - @app.post( - "/infer/keypoints_detection", - response_model=Union[KeypointsDetectionInferenceResponse, StubResponse], - summary="Keypoints detection infer", - description="Run inference with the specified keypoints detection model", - ) - @with_route_exceptions - @usage_collector("request") - def infer_keypoints( - inference_request: KeypointsDetectionInferenceRequest, - countinference: Optional[bool] = None, - service_secret: Optional[str] = None, - ): - """Run inference with the specified keypoints detection model. 
- - Args: - inference_request (KeypointsDetectionInferenceRequest): The request containing the necessary details for keypoints detection. - - Returns: - Union[ClassificationInferenceResponse, MultiLabelClassificationInferenceResponse]: The response containing the inference results. - """ - logger.debug(f"Reached /infer/keypoints_detection") - return process_inference_request( - inference_request, - countinference=countinference, - service_secret=service_secret, - ) - - if LMM_ENABLED or MOONDREAM2_ENABLED: - - @app.post( - "/infer/lmm", - response_model=Union[ - LMMInferenceResponse, - List[LMMInferenceResponse], - StubResponse, - ], - summary="Large multi-modal model infer", - description="Run inference with the specified large multi-modal model", - response_model_exclude_none=True, - ) - @with_route_exceptions - @usage_collector("request") - def infer_lmm( - inference_request: LMMInferenceRequest, - countinference: Optional[bool] = None, - service_secret: Optional[str] = None, - ): - """Run inference with the specified large multi-modal model. - - Args: - inference_request (LMMInferenceRequest): The request containing the necessary details for LMM inference. - - Returns: - Union[LMMInferenceResponse, List[LMMInferenceResponse]]: The response containing the inference results. - """ - logger.debug(f"Reached /infer/lmm") - return process_inference_request( - inference_request, - countinference=countinference, - service_secret=service_secret, - ) - - @app.post( - "/infer/lmm/{model_id:path}", - response_model=Union[ - LMMInferenceResponse, - List[LMMInferenceResponse], - StubResponse, - ], - summary="Large multi-modal model infer with model ID in path", - description="Run inference with the specified large multi-modal model. 
Model ID is specified in the URL path (can contain slashes).", - response_model_exclude_none=True, - ) - @with_route_exceptions - @usage_collector("request") - def infer_lmm_with_model_id( - model_id: str, - inference_request: LMMInferenceRequest, - countinference: Optional[bool] = None, - service_secret: Optional[str] = None, - ): - """Run inference with the specified large multi-modal model. - - The model_id can be specified in the URL path. If model_id is also provided - in the request body, it must match the path parameter. - - Args: - model_id (str): The model identifier from the URL path. - inference_request (LMMInferenceRequest): The request containing the necessary details for LMM inference. - - Returns: - Union[LMMInferenceResponse, List[LMMInferenceResponse]]: The response containing the inference results. - - Raises: - HTTPException: If model_id in path and request body don't match. - """ - logger.debug(f"Reached /infer/lmm/{model_id}") - - # Validate model_id consistency between path and request body - if ( - inference_request.model_id is not None - and inference_request.model_id != model_id - ): - raise HTTPException( - status_code=400, - detail=f"Model ID mismatch: path specifies '{model_id}' but request body specifies '{inference_request.model_id}'", - ) - - # Set the model_id from path if not in request body - inference_request.model_id = model_id - - return process_inference_request( - inference_request, - countinference=countinference, - service_secret=service_secret, - ) + app.include_router(create_inference_router(model_manager=self.model_manager)) if not DISABLE_WORKFLOW_ENDPOINTS: - - @app.post( - "/{workspace_name}/workflows/{workflow_id}/describe_interface", - response_model=DescribeInterfaceResponse, - summary="Endpoint to describe interface of predefined workflow", - description="Checks Roboflow API for workflow definition, once acquired - describes workflow inputs and outputs", - ) - @with_route_exceptions - def 
describe_predefined_workflow_interface( - workspace_name: str, - workflow_id: str, - workflow_request: PredefinedWorkflowDescribeInterfaceRequest, - ) -> DescribeInterfaceResponse: - workflow_specification = get_workflow_specification( - api_key=workflow_request.api_key, - workspace_id=workspace_name, - workflow_id=workflow_id, - use_cache=workflow_request.use_cache, - workflow_version_id=workflow_request.workflow_version_id, - ) - return handle_describe_workflows_interface( - definition=workflow_specification, - ) - - @app.post( - "/workflows/describe_interface", - response_model=DescribeInterfaceResponse, - summary="Endpoint to describe interface of workflow given in request", - description="Parses workflow definition and retrieves describes inputs and outputs", - ) - @with_route_exceptions - def describe_workflow_interface( - workflow_request: WorkflowSpecificationDescribeInterfaceRequest, - ) -> DescribeInterfaceResponse: - return handle_describe_workflows_interface( - definition=workflow_request.specification, - ) - - @app.post( - "/{workspace_name}/workflows/{workflow_id}", - response_model=WorkflowInferenceResponse, - summary="Endpoint to run predefined workflow", - description="Checks Roboflow API for workflow definition, once acquired - parses and executes injecting runtime parameters from request body", - ) - @app.post( - "/infer/workflows/{workspace_name}/{workflow_id}", - response_model=WorkflowInferenceResponse, - summary="[LEGACY] Endpoint to run predefined workflow", - description="Checks Roboflow API for workflow definition, once acquired - parses and executes injecting runtime parameters from request body. 
This endpoint is deprecated and will be removed end of Q2 2024", - deprecated=True, - ) - @with_route_exceptions - @usage_collector("request") - def infer_from_predefined_workflow( - workspace_name: str, - workflow_id: str, - workflow_request: PredefinedWorkflowInferenceRequest, - background_tasks: BackgroundTasks, - ) -> WorkflowInferenceResponse: - # TODO: get rid of async: https://github.com/roboflow/inference/issues/569 - if ENABLE_WORKFLOWS_PROFILING and workflow_request.enable_profiling: - profiler = BaseWorkflowsProfiler.init( - max_runs_in_buffer=WORKFLOWS_PROFILER_BUFFER_SIZE, - ) - else: - profiler = NullWorkflowsProfiler.init() - with profiler.profile_execution_phase( - name="workflow_definition_fetching", - categories=["inference_package_operation"], - ): - workflow_specification = get_workflow_specification( - api_key=workflow_request.api_key, - workspace_id=workspace_name, - workflow_id=workflow_id, - use_cache=workflow_request.use_cache, - workflow_version_id=workflow_request.workflow_version_id, - ) - if not workflow_request.workflow_id: - workflow_request.workflow_id = workflow_id - if not workflow_specification.get("id"): - logger.warning( - "Internal workflow ID missing in specification for '%s'", - workflow_id, - ) - return process_workflow_inference_request( - workflow_request=workflow_request, - workflow_specification=workflow_specification, - background_tasks=( - background_tasks if not (LAMBDA or GCP_SERVERLESS) else None - ), - profiler=profiler, - ) - - @app.post( - "/workflows/run", - response_model=WorkflowInferenceResponse, - summary="Endpoint to run workflow specification provided in payload", - description="Parses and executes workflow specification, injecting runtime parameters from request body.", - ) - @app.post( - "/infer/workflows", - response_model=WorkflowInferenceResponse, - summary="[LEGACY] Endpoint to run workflow specification provided in payload", - description="Parses and executes workflow specification, injecting 
runtime parameters from request body. This endpoint is deprecated and will be removed end of Q2 2024.", - deprecated=True, - ) - @with_route_exceptions - @usage_collector("request") - def infer_from_workflow( - workflow_request: WorkflowSpecificationInferenceRequest, - background_tasks: BackgroundTasks, - ) -> WorkflowInferenceResponse: - # TODO: get rid of async: https://github.com/roboflow/inference/issues/569 - if ENABLE_WORKFLOWS_PROFILING and workflow_request.enable_profiling: - profiler = BaseWorkflowsProfiler.init( - max_runs_in_buffer=WORKFLOWS_PROFILER_BUFFER_SIZE, - ) - else: - profiler = NullWorkflowsProfiler.init() - return process_workflow_inference_request( - workflow_request=workflow_request, - workflow_specification=workflow_request.specification, - background_tasks=( - background_tasks if not (LAMBDA or GCP_SERVERLESS) else None - ), - profiler=profiler, + app.include_router( + create_workflows_router( + model_manager=model_manager, + shared_thread_pool_executor=self.shared_thread_pool_executor, ) - - @app.get( - "/workflows/execution_engine/versions", - response_model=ExecutionEngineVersions, - summary="Returns available Execution Engine versions sorted from oldest to newest", - description="Returns available Execution Engine versions sorted from oldest to newest", - ) - @with_route_exceptions - def get_execution_engine_versions() -> ExecutionEngineVersions: - # TODO: get rid of async: https://github.com/roboflow/inference/issues/569 - versions = get_available_versions() - return ExecutionEngineVersions(versions=versions) - - @app.get( - "/workflows/blocks/describe", - response_model=WorkflowsBlocksDescription, - summary="[LEGACY] Endpoint to get definition of workflows blocks that are accessible", - description="Endpoint provides detailed information about workflows building blocks that are " - "accessible in the inference server. 
This information could be used to programmatically " - "build / display workflows.", - deprecated=True, ) - @with_route_exceptions - def describe_workflows_blocks( - request: Request, - ) -> Union[WorkflowsBlocksDescription, Response]: - result = handle_describe_workflows_blocks_request() - return gzip_response_if_requested(request=request, response=result) - - @app.post( - "/workflows/blocks/describe", - response_model=WorkflowsBlocksDescription, - summary="[EXPERIMENTAL] Endpoint to get definition of workflows blocks that are accessible", - description="Endpoint provides detailed information about workflows building blocks that are " - "accessible in the inference server. This information could be used to programmatically " - "build / display workflows. Additionally - in request body one can specify list of " - "dynamic blocks definitions which will be transformed into blocks and used to generate " - "schemas and definitions of connections", - ) - @with_route_exceptions - def describe_workflows_blocks( - request: Request, - request_payload: Optional[DescribeBlocksRequest] = None, - ) -> Union[WorkflowsBlocksDescription, Response]: - # TODO: get rid of async: https://github.com/roboflow/inference/issues/569 - dynamic_blocks_definitions = None - requested_execution_engine_version = None - api_key = None - if request_payload is not None: - dynamic_blocks_definitions = ( - request_payload.dynamic_blocks_definitions - ) - requested_execution_engine_version = ( - request_payload.execution_engine_version - ) - api_key = request_payload.api_key or request.query_params.get( - "api_key", None - ) - result = handle_describe_workflows_blocks_request( - dynamic_blocks_definitions=dynamic_blocks_definitions, - requested_execution_engine_version=requested_execution_engine_version, - api_key=api_key, - ) - return gzip_response_if_requested(request=request, response=result) - - @app.get( - "/workflows/definition/schema", - response_model=WorkflowsBlocksSchemaDescription, - 
summary="Endpoint to fetch the workflows block schema", - description="Endpoint to fetch the schema of all available blocks. This information can be " - "used to validate workflow definitions and suggest syntax in the JSON editor.", - ) - @with_route_exceptions - def get_workflow_schema( - request: Request, - ) -> WorkflowsBlocksSchemaDescription: - result = get_workflow_schema_description() - return gzip_response_if_requested(request, response=result) - - @app.post( - "/workflows/blocks/dynamic_outputs", - response_model=List[OutputDefinition], - summary="[EXPERIMENTAL] Endpoint to get definition of dynamic output for workflow step", - description="Endpoint to be used when step outputs can be discovered only after " - "filling manifest with data.", - ) - @with_route_exceptions - def get_dynamic_block_outputs( - step_manifest: Dict[str, Any], - ) -> List[OutputDefinition]: - # TODO: get rid of async: https://github.com/roboflow/inference/issues/569 - # Potentially TODO: dynamic blocks do not support dynamic outputs, but if it changes - # we need to provide dynamic blocks manifests here - dummy_workflow_definition = { - "version": "1.0", - "inputs": [], - "steps": [step_manifest], - "outputs": [], - } - available_blocks = load_workflow_blocks() - parsed_definition = parse_workflow_definition( - raw_workflow_definition=dummy_workflow_definition, - available_blocks=available_blocks, - ) - parsed_manifest = parsed_definition.steps[0] - return parsed_manifest.get_actual_outputs() - - @app.post( - "/workflows/validate", - response_model=WorkflowValidationStatus, - summary="[EXPERIMENTAL] Endpoint to validate", - description="Endpoint provides a way to check validity of JSON workflow definition.", - ) - @with_route_exceptions - def validate_workflow( - specification: dict, - api_key: Optional[str] = Query( - None, - description="Roboflow API Key that will be passed to the model during initialization for artifact retrieval", - ), - ) -> WorkflowValidationStatus: - # TODO: 
get rid of async: https://github.com/roboflow/inference/issues/569 - step_execution_mode = StepExecutionMode(WORKFLOWS_STEP_EXECUTION_MODE) - workflow_init_parameters = { - "workflows_core.model_manager": model_manager, - "workflows_core.api_key": api_key, - "workflows_core.background_tasks": None, - "workflows_core.step_execution_mode": step_execution_mode, - } - _ = ExecutionEngine.init( - workflow_definition=specification, - init_parameters=workflow_init_parameters, - max_concurrent_steps=WORKFLOWS_MAX_CONCURRENT_STEPS, - prevent_local_images_loading=True, - ) - return WorkflowValidationStatus(status="ok") if WEBRTC_WORKER_ENABLED: - @app.post( - "/initialise_webrtc_worker", - response_model=InitializeWebRTCResponse, - summary="[EXPERIMENTAL] Establishes WebRTC peer connection and processes video stream in spawned process or modal function", - description="[EXPERIMENTAL] Establishes WebRTC peer connection and processes video stream in spawned process or modal function", - ) - @with_route_exceptions_async - async def initialise_webrtc_worker( - request: WebRTCWorkerRequest, - r: Request, - ) -> InitializeWebRTCResponse: - if str(r.headers.get("origin")).lower() == BUILDER_ORIGIN.lower(): - if re.search( - r"^https://[^.]+\.roboflow\.[^./]+/", str(r.url).lower() - ): - request.is_preview = True - - logger.debug("Received initialise_webrtc_worker request") - worker_result: WebRTCWorkerResult = await start_worker( - webrtc_request=request, - ) - if worker_result.exception_type is not None: - if worker_result.exception_type == "WorkflowSyntaxError": - raise WorkflowSyntaxError( - public_message=worker_result.error_message, - context=worker_result.error_context, - inner_error=worker_result.inner_error, - ) - if worker_result.exception_type == "WorkflowError": - raise WorkflowError( - public_message=worker_result.error_message, - context=worker_result.error_context, - ) - expected_exceptions = { - "Exception": Exception, - "KeyError": KeyError, - "MissingApiKeyError": 
MissingApiKeyError, - "NotImplementedError": NotImplementedError, - "RoboflowAPINotAuthorizedError": RoboflowAPINotAuthorizedError, - "RoboflowAPINotNotFoundError": RoboflowAPINotNotFoundError, - "ValidationError": ValidationError, - "WebRTCConfigurationError": WebRTCConfigurationError, - } - exc = expected_exceptions.get( - worker_result.exception_type, Exception - )(worker_result.error_message) - logger.error( - f"Initialise webrtc worker failed with %s: %s", - worker_result.exception_type, - worker_result.error_message, - ) - raise exc - logger.debug("Returning initialise_webrtc_worker response") - return InitializeWebRTCResponse( - context=CommandContext(), - status=OperationStatus.SUCCESS, - sdp=worker_result.answer.sdp, - type=worker_result.answer.type, - ) + app.include_router(create_webrtc_worker_router()) if ENABLE_STREAM_API: - - @app.get( - "/inference_pipelines/list", - response_model=ListPipelinesResponse, - summary="[EXPERIMENTAL] List active InferencePipelines", - description="[EXPERIMENTAL] Listing all active InferencePipelines processing videos", - ) - @with_route_exceptions_async - async def list_pipelines(_: Request) -> ListPipelinesResponse: - return await self.stream_manager_client.list_pipelines() - - @app.get( - "/inference_pipelines/{pipeline_id}/status", - response_model=InferencePipelineStatusResponse, - summary="[EXPERIMENTAL] Get status of InferencePipeline", - description="[EXPERIMENTAL] Get status of InferencePipeline", - ) - @with_route_exceptions_async - async def get_status(pipeline_id: str) -> InferencePipelineStatusResponse: - return await self.stream_manager_client.get_status( - pipeline_id=pipeline_id - ) - - @app.post( - "/inference_pipelines/initialise", - response_model=CommandResponse, - summary="[EXPERIMENTAL] Starts new InferencePipeline", - description="[EXPERIMENTAL] Starts new InferencePipeline", - ) - @with_route_exceptions_async - async def initialise(request: InitialisePipelinePayload) -> CommandResponse: - return 
await self.stream_manager_client.initialise_pipeline( - initialisation_request=request - ) - - @app.post( - "/inference_pipelines/initialise_webrtc", - response_model=InitializeWebRTCPipelineResponse, - summary="[EXPERIMENTAL] Establishes WebRTC peer connection and starts new InferencePipeline consuming video track", - description="[EXPERIMENTAL] Establishes WebRTC peer connection and starts new InferencePipeline consuming video track", - ) - @with_route_exceptions_async - async def initialise_webrtc_inference_pipeline( - request: InitialiseWebRTCPipelinePayload, - ) -> CommandResponse: - logger.debug("Received initialise webrtc inference pipeline request") - resp = await self.stream_manager_client.initialise_webrtc_pipeline( - initialisation_request=request - ) - logger.debug("Returning initialise webrtc inference pipeline response") - return resp - - @app.post( - "/inference_pipelines/{pipeline_id}/pause", - response_model=CommandResponse, - summary="[EXPERIMENTAL] Pauses the InferencePipeline", - description="[EXPERIMENTAL] Pauses the InferencePipeline", - ) - @with_route_exceptions_async - async def pause(pipeline_id: str) -> CommandResponse: - return await self.stream_manager_client.pause_pipeline( - pipeline_id=pipeline_id - ) - - @app.post( - "/inference_pipelines/{pipeline_id}/resume", - response_model=CommandResponse, - summary="[EXPERIMENTAL] Resumes the InferencePipeline", - description="[EXPERIMENTAL] Resumes the InferencePipeline", - ) - @with_route_exceptions_async - async def resume(pipeline_id: str) -> CommandResponse: - return await self.stream_manager_client.resume_pipeline( - pipeline_id=pipeline_id - ) - - @app.post( - "/inference_pipelines/{pipeline_id}/terminate", - response_model=CommandResponse, - summary="[EXPERIMENTAL] Terminates the InferencePipeline", - description="[EXPERIMENTAL] Terminates the InferencePipeline", - ) - @with_route_exceptions_async - async def terminate(pipeline_id: str) -> CommandResponse: - return await 
self.stream_manager_client.terminate_pipeline( - pipeline_id=pipeline_id - ) - - @app.get( - "/inference_pipelines/{pipeline_id}/consume", - response_model=ConsumePipelineResponse, - summary="[EXPERIMENTAL] Consumes InferencePipeline result", - description="[EXPERIMENTAL] Consumes InferencePipeline result", - ) - @with_route_exceptions_async - async def consume( - pipeline_id: str, - request: Optional[ConsumeResultsPayload] = None, - ) -> ConsumePipelineResponse: - if request is None: - request = ConsumeResultsPayload() - return await self.stream_manager_client.consume_pipeline_result( - pipeline_id=pipeline_id, - excluded_fields=request.excluded_fields, + app.include_router( + create_stream_router(stream_manager_client=self.stream_manager_client) ) # Enable preloading models at startup @@ -1892,1265 +852,13 @@ def healthz(): return {"status": "healthy"} if CORE_MODELS_ENABLED: - if CORE_MODEL_CLIP_ENABLED: - - @app.post( - "/clip/embed_image", - response_model=ClipEmbeddingResponse, - summary="CLIP Image Embeddings", - description="Run the Open AI CLIP model to embed image data.", - ) - @with_route_exceptions - @usage_collector("request") - def clip_embed_image( - inference_request: ClipImageEmbeddingRequest, - request: Request, - api_key: Optional[str] = Query( - None, - description="Roboflow API Key that will be passed to the model during initialization for artifact retrieval", - ), - countinference: Optional[bool] = None, - service_secret: Optional[str] = None, - ): - """ - Embeds image data using the OpenAI CLIP model. - - Args: - inference_request (ClipImageEmbeddingRequest): The request containing the image to be embedded. - api_key (Optional[str], default None): Roboflow API Key passed to the model during initialization for artifact retrieval. - request (Request, default Body()): The HTTP request. - - Returns: - ClipEmbeddingResponse: The response containing the embedded image. 
- """ - logger.debug(f"Reached /clip/embed_image") - clip_model_id = load_clip_model( - inference_request, - api_key=api_key, - countinference=countinference, - service_secret=service_secret, - ) - response = self.model_manager.infer_from_request_sync( - clip_model_id, inference_request - ) - if LAMBDA: - actor = request.scope["aws.event"]["requestContext"][ - "authorizer" - ]["lambda"]["actor"] - trackUsage(clip_model_id, actor) - return response - - @app.post( - "/clip/embed_text", - response_model=ClipEmbeddingResponse, - summary="CLIP Text Embeddings", - description="Run the Open AI CLIP model to embed text data.", - ) - @with_route_exceptions - @usage_collector("request") - def clip_embed_text( - inference_request: ClipTextEmbeddingRequest, - request: Request, - api_key: Optional[str] = Query( - None, - description="Roboflow API Key that will be passed to the model during initialization for artifact retrieval", - ), - countinference: Optional[bool] = None, - service_secret: Optional[str] = None, - ): - """ - Embeds text data using the OpenAI CLIP model. - - Args: - inference_request (ClipTextEmbeddingRequest): The request containing the text to be embedded. - api_key (Optional[str], default None): Roboflow API Key passed to the model during initialization for artifact retrieval. - request (Request, default Body()): The HTTP request. - - Returns: - ClipEmbeddingResponse: The response containing the embedded text. 
- """ - logger.debug(f"Reached /clip/embed_text") - clip_model_id = load_clip_model( - inference_request, - api_key=api_key, - countinference=countinference, - service_secret=service_secret, - ) - response = self.model_manager.infer_from_request_sync( - clip_model_id, inference_request - ) - if LAMBDA: - actor = request.scope["aws.event"]["requestContext"][ - "authorizer" - ]["lambda"]["actor"] - trackUsage(clip_model_id, actor) - return response - - @app.post( - "/clip/compare", - response_model=ClipCompareResponse, - summary="CLIP Compare", - description="Run the Open AI CLIP model to compute similarity scores.", - ) - @with_route_exceptions - @usage_collector("request") - def clip_compare( - inference_request: ClipCompareRequest, - request: Request, - api_key: Optional[str] = Query( - None, - description="Roboflow API Key that will be passed to the model during initialization for artifact retrieval", - ), - countinference: Optional[bool] = None, - service_secret: Optional[str] = None, - ): - """ - Computes similarity scores using the OpenAI CLIP model. - - Args: - inference_request (ClipCompareRequest): The request containing the data to be compared. - api_key (Optional[str], default None): Roboflow API Key passed to the model during initialization for artifact retrieval. - request (Request, default Body()): The HTTP request. - - Returns: - ClipCompareResponse: The response containing the similarity scores. 
- """ - logger.debug(f"Reached /clip/compare") - clip_model_id = load_clip_model( - inference_request, - api_key=api_key, - countinference=countinference, - service_secret=service_secret, - ) - response = self.model_manager.infer_from_request_sync( - clip_model_id, inference_request - ) - if LAMBDA: - actor = request.scope["aws.event"]["requestContext"][ - "authorizer" - ]["lambda"]["actor"] - trackUsage(clip_model_id, actor, n=2) - return response - - if CORE_MODEL_PE_ENABLED: - - @app.post( - "/perception_encoder/embed_image", - response_model=PerceptionEncoderEmbeddingResponse, - summary="PE Image Embeddings", - description="Run the Meta Perception Encoder model to embed image data.", - ) - @with_route_exceptions - @usage_collector("request") - def pe_embed_image( - inference_request: PerceptionEncoderImageEmbeddingRequest, - request: Request, - api_key: Optional[str] = Query( - None, - description="Roboflow API Key that will be passed to the model during initialization for artifact retrieval", - ), - countinference: Optional[bool] = None, - service_secret: Optional[str] = None, - ): - """ - Embeds image data using the Perception Encoder PE model. - - Args: - inference_request (PerceptionEncoderImageEmbeddingRequest): The request containing the image to be embedded. - api_key (Optional[str], default None): Roboflow API Key passed to the model during initialization for artifact retrieval. - request (Request, default Body()): The HTTP request. - - Returns: - PerceptionEncoderEmbeddingResponse: The response containing the embedded image. 
- """ - logger.debug(f"Reached /perception_encoder/embed_image") - pe_model_id = load_pe_model( - inference_request, - api_key=api_key, - countinference=countinference, - service_secret=service_secret, - ) - response = self.model_manager.infer_from_request_sync( - pe_model_id, inference_request - ) - if LAMBDA: - actor = request.scope["aws.event"]["requestContext"][ - "authorizer" - ]["lambda"]["actor"] - trackUsage(pe_model_id, actor) - return response - - @app.post( - "/perception_encoder/embed_text", - response_model=PerceptionEncoderEmbeddingResponse, - summary="Perception Encoder Text Embeddings", - description="Run the Meta Perception Encoder model to embed text data.", - ) - @with_route_exceptions - @usage_collector("request") - def pe_embed_text( - inference_request: PerceptionEncoderTextEmbeddingRequest, - request: Request, - api_key: Optional[str] = Query( - None, - description="Roboflow API Key that will be passed to the model during initialization for artifact retrieval", - ), - countinference: Optional[bool] = None, - service_secret: Optional[str] = None, - ): - """ - Embeds text data using the Meta Perception Encoder model. - - Args: - inference_request (PerceptionEncoderTextEmbeddingRequest): The request containing the text to be embedded. - api_key (Optional[str], default None): Roboflow API Key passed to the model during initialization for artifact retrieval. - request (Request, default Body()): The HTTP request. - - Returns: - PerceptionEncoderEmbeddingResponse: The response containing the embedded text. 
- """ - logger.debug(f"Reached /perception_encoder/embed_text") - pe_model_id = load_pe_model( - inference_request, - api_key=api_key, - countinference=countinference, - service_secret=service_secret, - ) - response = self.model_manager.infer_from_request_sync( - pe_model_id, inference_request - ) - if LAMBDA: - actor = request.scope["aws.event"]["requestContext"][ - "authorizer" - ]["lambda"]["actor"] - trackUsage(pe_model_id, actor) - return response - - @app.post( - "/perception_encoder/compare", - response_model=PerceptionEncoderCompareResponse, - summary="Perception Encoder Compare", - description="Run the Meta Perception Encoder model to compute similarity scores.", - ) - @with_route_exceptions - @usage_collector("request") - def pe_compare( - inference_request: PerceptionEncoderCompareRequest, - request: Request, - api_key: Optional[str] = Query( - None, - description="Roboflow API Key that will be passed to the model during initialization for artifact retrieval", - ), - countinference: Optional[bool] = None, - service_secret: Optional[str] = None, - ): - """ - Computes similarity scores using the Meta Perception Encoder model. - - Args: - inference_request (PerceptionEncoderCompareRequest): The request containing the data to be compared. - api_key (Optional[str], default None): Roboflow API Key passed to the model during initialization for artifact retrieval. - request (Request, default Body()): The HTTP request. - - Returns: - PerceptionEncoderCompareResponse: The response containing the similarity scores. 
- """ - logger.debug(f"Reached /perception_encoder/compare") - pe_model_id = load_pe_model( - inference_request, - api_key=api_key, - countinference=countinference, - service_secret=service_secret, - ) - response = self.model_manager.infer_from_request_sync( - pe_model_id, inference_request - ) - if LAMBDA: - actor = request.scope["aws.event"]["requestContext"][ - "authorizer" - ]["lambda"]["actor"] - trackUsage(pe_model_id, actor, n=2) - return response - - if CORE_MODEL_GROUNDINGDINO_ENABLED: - - @app.post( - "/grounding_dino/infer", - response_model=ObjectDetectionInferenceResponse, - summary="Grounding DINO inference.", - description="Run the Grounding DINO zero-shot object detection model.", - ) - @with_route_exceptions - @usage_collector("request") - def grounding_dino_infer( - inference_request: GroundingDINOInferenceRequest, - request: Request, - api_key: Optional[str] = Query( - None, - description="Roboflow API Key that will be passed to the model during initialization for artifact retrieval", - ), - countinference: Optional[bool] = None, - service_secret: Optional[str] = None, - ): - """ - Embeds image data using the Grounding DINO model. - - Args: - inference_request GroundingDINOInferenceRequest): The request containing the image on which to run object detection. - api_key (Optional[str], default None): Roboflow API Key passed to the model during initialization for artifact retrieval. - request (Request, default Body()): The HTTP request. - - Returns: - ObjectDetectionInferenceResponse: The object detection response. 
- """ - logger.debug(f"Reached /grounding_dino/infer") - grounding_dino_model_id = load_grounding_dino_model( - inference_request, - api_key=api_key, - countinference=countinference, - service_secret=service_secret, - ) - response = self.model_manager.infer_from_request_sync( - grounding_dino_model_id, inference_request - ) - if LAMBDA: - actor = request.scope["aws.event"]["requestContext"][ - "authorizer" - ]["lambda"]["actor"] - trackUsage(grounding_dino_model_id, actor) - return response - - if CORE_MODEL_YOLO_WORLD_ENABLED: - - @app.post( - "/yolo_world/infer", - response_model=ObjectDetectionInferenceResponse, - summary="YOLO-World inference.", - description="Run the YOLO-World zero-shot object detection model.", - response_model_exclude_none=True, - ) - @with_route_exceptions - @usage_collector("request") - def yolo_world_infer( - inference_request: YOLOWorldInferenceRequest, - request: Request, - api_key: Optional[str] = Query( - None, - description="Roboflow API Key that will be passed to the model during initialization for artifact retrieval", - ), - countinference: Optional[bool] = None, - service_secret: Optional[str] = None, - ): - """ - Runs the YOLO-World zero-shot object detection model. - - Args: - inference_request (YOLOWorldInferenceRequest): The request containing the image on which to run object detection. - api_key (Optional[str], default None): Roboflow API Key passed to the model during initialization for artifact retrieval. - request (Request, default Body()): The HTTP request. - - Returns: - ObjectDetectionInferenceResponse: The object detection response. - """ - logger.debug(f"Reached /yolo_world/infer. Loading model") - yolo_world_model_id = load_yolo_world_model( - inference_request, - api_key=api_key, - countinference=countinference, - service_secret=service_secret, - ) - logger.debug("YOLOWorld model loaded. 
Staring the inference.") - response = self.model_manager.infer_from_request_sync( - yolo_world_model_id, inference_request - ) - logger.debug("YOLOWorld prediction available.") - if LAMBDA: - actor = request.scope["aws.event"]["requestContext"][ - "authorizer" - ]["lambda"]["actor"] - trackUsage(yolo_world_model_id, actor) - logger.debug("Usage of YOLOWorld denoted.") - return response - - if CORE_MODEL_DOCTR_ENABLED: - - @app.post( - "/doctr/ocr", - response_model=Union[ - OCRInferenceResponse, List[OCRInferenceResponse] - ], - summary="DocTR OCR response", - description="Run the DocTR OCR model to retrieve text in an image.", - ) - @with_route_exceptions - @usage_collector("request") - def doctr_retrieve_text( - inference_request: DoctrOCRInferenceRequest, - request: Request, - api_key: Optional[str] = Query( - None, - description="Roboflow API Key that will be passed to the model during initialization for artifact retrieval", - ), - countinference: Optional[bool] = None, - service_secret: Optional[str] = None, - ): - """ - Embeds image data using the DocTR model. - - Args: - inference_request (M.DoctrOCRInferenceRequest): The request containing the image from which to retrieve text. - api_key (Optional[str], default None): Roboflow API Key passed to the model during initialization for artifact retrieval. - request (Request, default Body()): The HTTP request. - - Returns: - OCRInferenceResponse: The response containing the embedded image. 
- """ - logger.debug(f"Reached /doctr/ocr") - doctr_model_id = load_doctr_model( - inference_request, - api_key=api_key, - countinference=countinference, - service_secret=service_secret, - ) - response = self.model_manager.infer_from_request_sync( - doctr_model_id, inference_request - ) - if LAMBDA: - actor = request.scope["aws.event"]["requestContext"][ - "authorizer" - ]["lambda"]["actor"] - trackUsage(doctr_model_id, actor) - return orjson_response_keeping_parent_id(response) - - if CORE_MODEL_EASYOCR_ENABLED: - - @app.post( - "/easy_ocr/ocr", - response_model=Union[ - OCRInferenceResponse, List[OCRInferenceResponse] - ], - summary="EasyOCR OCR response", - description="Run the EasyOCR model to retrieve text in an image.", - ) - @with_route_exceptions - @usage_collector("request") - def easy_ocr_retrieve_text( - inference_request: EasyOCRInferenceRequest, - request: Request, - api_key: Optional[str] = Query( - None, - description="Roboflow API Key that will be passed to the model during initialization for artifact retrieval", - ), - countinference: Optional[bool] = None, - service_secret: Optional[str] = None, - ): - """ - Embeds image data using the EasyOCR model. - - Args: - inference_request (EasyOCRInferenceRequest): The request containing the image from which to retrieve text. - api_key (Optional[str], default None): Roboflow API Key passed to the model during initialization for artifact retrieval. - request (Request, default Body()): The HTTP request. - - Returns: - OCRInferenceResponse: The response containing the embedded image. 
- """ - logger.debug(f"Reached /easy_ocr/ocr") - easy_ocr_model_id = load_easy_ocr_model( - inference_request, - api_key=api_key, - countinference=countinference, - service_secret=service_secret, - ) - response = self.model_manager.infer_from_request_sync( - easy_ocr_model_id, inference_request - ) - if LAMBDA: - actor = request.scope["aws.event"]["requestContext"][ - "authorizer" - ]["lambda"]["actor"] - trackUsage(easy_ocr_model_id, actor) - return orjson_response_keeping_parent_id(response) - - if CORE_MODEL_SAM_ENABLED: - - @app.post( - "/sam/embed_image", - response_model=SamEmbeddingResponse, - summary="SAM Image Embeddings", - description="Run the Meta AI Segmant Anything Model to embed image data.", - ) - @with_route_exceptions - @usage_collector("request") - def sam_embed_image( - inference_request: SamEmbeddingRequest, - request: Request, - api_key: Optional[str] = Query( - None, - description="Roboflow API Key that will be passed to the model during initialization for artifact retrieval", - ), - countinference: Optional[bool] = None, - service_secret: Optional[str] = None, - ): - """ - Embeds image data using the Meta AI Segmant Anything Model (SAM). - - Args: - inference_request (SamEmbeddingRequest): The request containing the image to be embedded. - api_key (Optional[str], default None): Roboflow API Key passed to the model during initialization for artifact retrieval. - request (Request, default Body()): The HTTP request. - - Returns: - M.SamEmbeddingResponse or Response: The response containing the embedded image. 
- """ - logger.debug(f"Reached /sam/embed_image") - sam_model_id = load_sam_model( - inference_request, - api_key=api_key, - countinference=countinference, - service_secret=service_secret, - ) - model_response = self.model_manager.infer_from_request_sync( - sam_model_id, inference_request - ) - if LAMBDA: - actor = request.scope["aws.event"]["requestContext"][ - "authorizer" - ]["lambda"]["actor"] - trackUsage(sam_model_id, actor) - if inference_request.format == "binary": - return Response( - content=model_response.embeddings, - headers={"Content-Type": "application/octet-stream"}, - ) - return model_response - - @app.post( - "/sam/segment_image", - response_model=SamSegmentationResponse, - summary="SAM Image Segmentation", - description="Run the Meta AI Segmant Anything Model to generate segmenations for image data.", - ) - @with_route_exceptions - @usage_collector("request") - def sam_segment_image( - inference_request: SamSegmentationRequest, - request: Request, - api_key: Optional[str] = Query( - None, - description="Roboflow API Key that will be passed to the model during initialization for artifact retrieval", - ), - countinference: Optional[bool] = None, - service_secret: Optional[str] = None, - ): - """ - Generates segmentations for image data using the Meta AI Segmant Anything Model (SAM). - - Args: - inference_request (SamSegmentationRequest): The request containing the image to be segmented. - api_key (Optional[str], default None): Roboflow API Key passed to the model during initialization for artifact retrieval. - request (Request, default Body()): The HTTP request. - - Returns: - M.SamSegmentationResponse or Response: The response containing the segmented image. 
- """ - logger.debug(f"Reached /sam/segment_image") - sam_model_id = load_sam_model( - inference_request, - api_key=api_key, - countinference=countinference, - service_secret=service_secret, - ) - model_response = self.model_manager.infer_from_request_sync( - sam_model_id, inference_request - ) - if LAMBDA: - actor = request.scope["aws.event"]["requestContext"][ - "authorizer" - ]["lambda"]["actor"] - trackUsage(sam_model_id, actor) - if inference_request.format == "binary": - return Response( - content=model_response, - headers={"Content-Type": "application/octet-stream"}, - ) - return model_response - - if CORE_MODEL_SAM2_ENABLED: - - @app.post( - "/sam2/embed_image", - response_model=Sam2EmbeddingResponse, - summary="SAM2 Image Embeddings", - description="Run the Meta AI Segment Anything 2 Model to embed image data.", - ) - @with_route_exceptions - @usage_collector("request") - def sam2_embed_image( - inference_request: Sam2EmbeddingRequest, - request: Request, - api_key: Optional[str] = Query( - None, - description="Roboflow API Key that will be passed to the model during initialization for artifact retrieval", - ), - countinference: Optional[bool] = None, - service_secret: Optional[str] = None, - ): - """ - Embeds image data using the Meta AI Segment Anything Model (SAM). - - Args: - inference_request (SamEmbeddingRequest): The request containing the image to be embedded. - api_key (Optional[str], default None): Roboflow API Key passed to the model during initialization for artifact retrieval. - request (Request, default Body()): The HTTP request. 
- - Returns: - M.Sam2EmbeddingResponse or Response: The response affirming the image has been embedded - """ - logger.debug(f"Reached /sam2/embed_image") - sam2_model_id = load_sam2_model( - inference_request, - api_key=api_key, - countinference=countinference, - service_secret=service_secret, - ) - model_response = self.model_manager.infer_from_request_sync( - sam2_model_id, inference_request - ) - return model_response - - @app.post( - "/sam2/segment_image", - response_model=Sam2SegmentationResponse, - summary="SAM2 Image Segmentation", - description="Run the Meta AI Segment Anything 2 Model to generate segmenations for image data.", - ) - @with_route_exceptions - @usage_collector("request") - def sam2_segment_image( - inference_request: Sam2SegmentationRequest, - request: Request, - api_key: Optional[str] = Query( - None, - description="Roboflow API Key that will be passed to the model during initialization for artifact retrieval", - ), - countinference: Optional[bool] = None, - service_secret: Optional[str] = None, - ): - """ - Generates segmentations for image data using the Meta AI Segment Anything Model (SAM). - - Args: - inference_request (Sam2SegmentationRequest): The request containing the image to be segmented. - api_key (Optional[str], default None): Roboflow API Key passed to the model during initialization for artifact retrieval. - request (Request, default Body()): The HTTP request. - - Returns: - M.SamSegmentationResponse or Response: The response containing the segmented image. 
- """ - logger.debug(f"Reached /sam2/segment_image") - sam2_model_id = load_sam2_model( - inference_request, - api_key=api_key, - countinference=countinference, - service_secret=service_secret, - ) - model_response = self.model_manager.infer_from_request_sync( - sam2_model_id, inference_request - ) - if inference_request.format == "binary": - return Response( - content=model_response, - headers={"Content-Type": "application/octet-stream"}, - ) - return model_response - - if CORE_MODEL_SAM3_ENABLED and not GCP_SERVERLESS: - - @app.post( - "/sam3/embed_image", - response_model=Sam3EmbeddingResponse, - summary="Seg preview Image Embeddings", - description="Run the Model to embed image data.", - ) - @with_route_exceptions - @usage_collector("request") - def sam3_embed_image( - inference_request: Sam2EmbeddingRequest, - request: Request, - api_key: Optional[str] = Query( - None, - description="Roboflow API Key that will be passed to the model during initialization for artifact retrieval", - ), - countinference: Optional[bool] = None, - service_secret: Optional[str] = None, - ): - logger.debug(f"Reached /sam3/embed_image") - - if SAM3_EXEC_MODE == "remote": - raise HTTPException( - status_code=501, - detail="SAM3 embedding is not supported in remote execution mode.", - ) - - self.model_manager.add_model( - "sam3/sam3_interactive", - api_key=api_key, - endpoint_type=ModelEndpointType.CORE_MODEL, - countinference=countinference, - service_secret=service_secret, - ) - - model_response = self.model_manager.infer_from_request_sync( - "sam3/sam3_interactive", inference_request - ) - return model_response - - if CORE_MODEL_SAM3_ENABLED: - - @app.post( - "/sam3/concept_segment", - response_model=Sam3SegmentationResponse, - summary="SAM3 PCS (promptable concept segmentation)", - description="Run the SAM3 PCS (promptable concept segmentation) to generate segmentations for image data.", - ) - @with_route_exceptions - @usage_collector("request") - def sam3_segment_image( - 
inference_request: Sam3SegmentationRequest, - request: Request, - api_key: Optional[str] = Query( - None, - description="Roboflow API Key that will be passed to the model during initialization for artifact retrieval", - ), - countinference: Optional[bool] = None, - service_secret: Optional[str] = None, - ): - if not SAM3_FINE_TUNED_MODELS_ENABLED: - if not inference_request.model_id.startswith("sam3/"): - raise HTTPException( - status_code=501, - detail="Fine-tuned SAM3 models are not supported on this deployment. Please use a workflow or self-host the server.", - ) - - if SAM3_EXEC_MODE == "remote": - endpoint = f"{API_BASE_URL}/inferenceproxy/seg-preview" - - # Construct payload for remote API - # The remote API expects: - # { - # "image": {"type": "base64", "value": ...}, - # "prompts": [{"type": "text", "text": ...}, ...], - # "output_prob_thresh": ... - # } - - # Extract prompts from request - http_prompts = [] - for prompt in inference_request.prompts: - p_dict = prompt.dict(exclude_none=True) - # Ensure type is set if missing (default to text if text is present) - if "type" not in p_dict: - if "text" in p_dict: - p_dict["type"] = "text" - http_prompts.append(p_dict) - - # Prepare image - # inference_request.image is InferenceRequestImage - if inference_request.image.type == "base64": - http_image = { - "type": "base64", - "value": inference_request.image.value, - } - elif inference_request.image.type == "url": - http_image = { - "type": "url", - "value": inference_request.image.value, - } - elif inference_request.image.type == "numpy": - # Numpy not supported for remote proxy easily without serialization, - # but InferenceRequestImage usually comes as base64/url in HTTP API. - # If it is numpy, we might need to handle it, but for now assume base64/url. - # If it's numpy, it's likely from internal call, but this is HTTP API. 
- http_image = { - "type": "numpy", - "value": inference_request.image.value, - } - else: - http_image = { - "type": inference_request.image.type, - "value": inference_request.image.value, - } - - payload = { - "image": http_image, - "prompts": http_prompts, - "output_prob_thresh": inference_request.output_prob_thresh, - } - - try: - headers = {"Content-Type": "application/json"} - if ROBOFLOW_INTERNAL_SERVICE_NAME: - headers["X-Roboflow-Internal-Service-Name"] = ( - ROBOFLOW_INTERNAL_SERVICE_NAME - ) - if ROBOFLOW_INTERNAL_SERVICE_SECRET: - headers["X-Roboflow-Internal-Service-Secret"] = ( - ROBOFLOW_INTERNAL_SERVICE_SECRET - ) - - headers = build_roboflow_api_headers( - explicit_headers=headers - ) - - response = requests.post( - f"{endpoint}?api_key={api_key}", - json=payload, - headers=headers, - timeout=60, - ) - response.raise_for_status() - resp_json = response.json() - - # The remote API returns the same structure as Sam3SegmentationResponse - return Sam3SegmentationResponse(**resp_json) - - except Exception as e: - logger.error(f"SAM3 remote request failed: {e}") - raise HTTPException( - status_code=500, - detail=f"SAM3 remote request failed: {str(e)}", - ) - - if inference_request.model_id.startswith("sam3/"): - self.model_manager.add_model( - inference_request.model_id, - api_key=api_key, - endpoint_type=ModelEndpointType.CORE_MODEL, - countinference=countinference, - service_secret=service_secret, - ) - else: - self.model_manager.add_model( - inference_request.model_id, - api_key=api_key, - endpoint_type=ModelEndpointType.ORT, - countinference=countinference, - service_secret=service_secret, - ) - - model_response = self.model_manager.infer_from_request_sync( - inference_request.model_id, inference_request - ) - if inference_request.format == "binary": - return Response( - content=model_response, - headers={"Content-Type": "application/octet-stream"}, - ) - return model_response - - @app.post( - "/sam3/visual_segment", - 
response_model=Sam2SegmentationResponse, - summary="SAM3 PVS (promptable visual segmentation)", - description="Run the SAM3 PVS (promptable visual segmentation) to generate segmentations for image data.", - ) - @with_route_exceptions - @usage_collector("request") - def sam3_visual_segment( - inference_request: Sam2SegmentationRequest, - request: Request, - api_key: Optional[str] = Query( - None, - description="Roboflow API Key that will be passed to the model during initialization for artifact retrieval", - ), - countinference: Optional[bool] = None, - service_secret: Optional[str] = None, - ): - logger.debug(f"Reached /sam3/visual_segment") - - if SAM3_EXEC_MODE == "remote": - endpoint = f"{API_BASE_URL}/inferenceproxy/sam3-pvs" - - http_image = { - "type": inference_request.image.type, - "value": inference_request.image.value, - } - - prompts_data = ( - inference_request.prompts.dict(exclude_none=True) - if inference_request.prompts - else None - ) - - payload = { - "image": http_image, - "prompts": prompts_data, - "multimask_output": inference_request.multimask_output, - } - - try: - headers = {"Content-Type": "application/json"} - if ROBOFLOW_INTERNAL_SERVICE_NAME: - headers["X-Roboflow-Internal-Service-Name"] = ( - ROBOFLOW_INTERNAL_SERVICE_NAME - ) - if ROBOFLOW_INTERNAL_SERVICE_SECRET: - headers["X-Roboflow-Internal-Service-Secret"] = ( - ROBOFLOW_INTERNAL_SERVICE_SECRET - ) - - headers = build_roboflow_api_headers( - explicit_headers=headers - ) - - response = requests.post( - f"{endpoint}?api_key={api_key}", - json=payload, - headers=headers, - timeout=60, - ) - response.raise_for_status() - resp_json = response.json() - - return Sam2SegmentationResponse(**resp_json) - - except Exception as e: - logger.error( - f"SAM3 visual_segment remote request failed: {e}" - ) - raise HTTPException( - status_code=500, - detail=f"SAM3 visual_segment remote request failed: {str(e)}", - ) - - self.model_manager.add_model( - "sam3/sam3_interactive", - api_key=api_key, - 
endpoint_type=ModelEndpointType.CORE_MODEL, - countinference=countinference, - service_secret=service_secret, - ) - - model_response = self.model_manager.infer_from_request_sync( - "sam3/sam3_interactive", inference_request - ) - return model_response - - if CORE_MODEL_SAM3_ENABLED and not GCP_SERVERLESS: - - @app.post( - "/sam3_3d/infer", - summary="SAM3 3D Object Generation", - description="Generate 3D meshes and Gaussian splatting from 2D images with mask prompts.", - ) - @with_route_exceptions - @usage_collector("request") - def sam3_3d_infer( - inference_request: Sam3_3D_Objects_InferenceRequest, - request: Request, - api_key: Optional[str] = Query( - None, - description="Roboflow API Key that will be passed to the model during initialization for artifact retrieval", - ), - countinference: Optional[bool] = None, - service_secret: Optional[str] = None, - ): - """Generate 3D meshes and Gaussian splatting from 2D images with mask prompts. - - Args: - inference_request (Sam3_3D_Objects_InferenceRequest): The request containing - the image and mask input for 3D generation. - api_key (Optional[str]): Roboflow API Key for artifact retrieval. 
- - Returns: - dict: Response containing base64-encoded 3D outputs: - - mesh_glb: Scene mesh in GLB format (base64) - - gaussian_ply: Combined Gaussian splatting in PLY format (base64) - - objects: List of individual objects with their 3D data - - time: Inference time in seconds - """ - logger.debug("Reached /sam3_3d/infer") - model_id = inference_request.model_id or "sam3-3d-objects" - - self.model_manager.add_model( - model_id, - api_key=api_key, - endpoint_type=ModelEndpointType.CORE_MODEL, - countinference=countinference, - service_secret=service_secret, - ) - - model_response = self.model_manager.infer_from_request_sync( - model_id, inference_request - ) - - if LAMBDA: - actor = request.scope["aws.event"]["requestContext"][ - "authorizer" - ]["lambda"]["actor"] - trackUsage(model_id, actor) - - # Convert bytes to base64 for JSON serialization - def encode_bytes(data): - if data is None: - return None - return base64.b64encode(data).decode("utf-8") - - objects_list = [] - for obj in model_response.objects: - objects_list.append( - { - "mesh_glb": encode_bytes(obj.mesh_glb), - "gaussian_ply": encode_bytes(obj.gaussian_ply), - "metadata": { - "rotation": obj.metadata.rotation, - "translation": obj.metadata.translation, - "scale": obj.metadata.scale, - }, - } - ) - - return { - "mesh_glb": encode_bytes(model_response.mesh_glb), - "gaussian_ply": encode_bytes(model_response.gaussian_ply), - "objects": objects_list, - "time": model_response.time, - } - - if CORE_MODEL_OWLV2_ENABLED: - - @app.post( - "/owlv2/infer", - response_model=ObjectDetectionInferenceResponse, - summary="Owlv2 image prompting", - description="Run the google owlv2 model to few-shot object detect", - ) - @with_route_exceptions - @usage_collector("request") - def owlv2_infer( - inference_request: OwlV2InferenceRequest, - request: Request, - api_key: Optional[str] = Query( - None, - description="Roboflow API Key that will be passed to the model during initialization for artifact retrieval", - ), - 
countinference: Optional[bool] = None, - service_secret: Optional[str] = None, - ): - """ - Embeds image data using the Meta AI Segmant Anything Model (SAM). - - Args: - inference_request (SamEmbeddingRequest): The request containing the image to be embedded. - api_key (Optional[str], default None): Roboflow API Key passed to the model during initialization for artifact retrieval. - request (Request, default Body()): The HTTP request. - - Returns: - M.Sam2EmbeddingResponse or Response: The response affirming the image has been embedded - """ - logger.debug(f"Reached /owlv2/infer") - owl2_model_id = load_owlv2_model( - inference_request, - api_key=api_key, - countinference=countinference, - service_secret=service_secret, - ) - model_response = self.model_manager.infer_from_request_sync( - owl2_model_id, inference_request - ) - return model_response - - if CORE_MODEL_GAZE_ENABLED: - - @app.post( - "/gaze/gaze_detection", - response_model=List[GazeDetectionInferenceResponse], - summary="Gaze Detection", - description="Run the gaze detection model to detect gaze.", - ) - @with_route_exceptions - @usage_collector("request") - def gaze_detection( - inference_request: GazeDetectionInferenceRequest, - request: Request, - api_key: Optional[str] = Query( - None, - description="Roboflow API Key that will be passed to the model during initialization for artifact retrieval", - ), - countinference: Optional[bool] = None, - service_secret: Optional[str] = None, - ): - """ - Detect gaze using the gaze detection model. - - Args: - inference_request (M.GazeDetectionRequest): The request containing the image to be detected. - api_key (Optional[str], default None): Roboflow API Key passed to the model during initialization for artifact retrieval. - request (Request, default Body()): The HTTP request. - - Returns: - M.GazeDetectionResponse: The response containing all the detected faces and the corresponding gazes. 
- """ - logger.debug(f"Reached /gaze/gaze_detection") - gaze_model_id = load_gaze_model( - inference_request, - api_key=api_key, - countinference=countinference, - service_secret=service_secret, - ) - response = self.model_manager.infer_from_request_sync( - gaze_model_id, inference_request - ) - if LAMBDA: - actor = request.scope["aws.event"]["requestContext"][ - "authorizer" - ]["lambda"]["actor"] - trackUsage(gaze_model_id, actor) - return response - - if DEPTH_ESTIMATION_ENABLED: - - @app.post( - "/infer/depth-estimation", - response_model=DepthEstimationResponse, - summary="Depth Estimation", - description="Run the depth estimation model to generate a depth map.", - ) - @with_route_exceptions - @usage_collector("request") - def depth_estimation( - inference_request: DepthEstimationRequest, - request: Request, - api_key: Optional[str] = Query( - None, - description="Roboflow API Key that will be passed to the model during initialization for artifact retrieval", - ), - countinference: Optional[bool] = None, - service_secret: Optional[str] = None, - ): - """ - Generate a depth map using the depth estimation model. - - Args: - inference_request (DepthEstimationRequest): The request containing the image to estimate depth for. - api_key (Optional[str], default None): Roboflow API Key passed to the model during initialization for artifact retrieval. - request (Request, default Body()): The HTTP request. - - Returns: - DepthEstimationResponse: The response containing the normalized depth map and optional visualization. 
- """ - logger.debug(f"Reached /infer/depth-estimation") - depth_model_id = inference_request.model_id - self.model_manager.add_model( - depth_model_id, - inference_request.api_key, - countinference=countinference, - service_secret=service_secret, - ) - response = self.model_manager.infer_from_request_sync( - depth_model_id, inference_request - ) - if LAMBDA: - actor = request.scope["aws.event"]["requestContext"][ - "authorizer" - ]["lambda"]["actor"] - trackUsage(depth_model_id, actor) - - # Extract data from nested response structure - depth_data = response.response - depth_response = DepthEstimationResponse( - normalized_depth=depth_data["normalized_depth"].tolist(), - image=depth_data["image"].base64_image, - ) - return depth_response - - if CORE_MODEL_TROCR_ENABLED: - - @app.post( - "/ocr/trocr", - response_model=OCRInferenceResponse, - summary="TrOCR OCR response", - description="Run the TrOCR model to retrieve text in an image.", - ) - @with_route_exceptions - @usage_collector("request") - def trocr_retrieve_text( - inference_request: TrOCRInferenceRequest, - request: Request, - api_key: Optional[str] = Query( - None, - description="Roboflow API Key that will be passed to the model during initialization for artifact retrieval", - ), - countinference: Optional[bool] = None, - service_secret: Optional[str] = None, - ): - """ - Retrieves text from image data using the TrOCR model. - - Args: - inference_request (TrOCRInferenceRequest): The request containing the image from which to retrieve text. - api_key (Optional[str], default None): Roboflow API Key passed to the model during initialization for artifact retrieval. - request (Request, default Body()): The HTTP request. - - Returns: - OCRInferenceResponse: The response containing the retrieved text. 
- """ - logger.debug(f"Reached /trocr/ocr") - trocr_model_id = load_trocr_model( - inference_request, - api_key=api_key, - countinference=countinference, - service_secret=service_secret, - ) - response = self.model_manager.infer_from_request_sync( - trocr_model_id, inference_request - ) - if LAMBDA: - actor = request.scope["aws.event"]["requestContext"][ - "authorizer" - ]["lambda"]["actor"] - trackUsage(trocr_model_id, actor) - return orjson_response_keeping_parent_id(response) + app.include_router( + create_core_models_router(model_manager=self.model_manager) + ) if not (LAMBDA or GCP_SERVERLESS): - @app.get( - "/notebook/start", - summary="Jupyter Lab Server Start", - description="Starts a jupyter lab server for running development code", - ) - @with_route_exceptions - def notebook_start(browserless: bool = False): - """Starts a jupyter lab server for running development code. - - Args: - inference_request (NotebookStartRequest): The request containing the necessary details for starting a jupyter lab server. - background_tasks: (BackgroundTasks) pool of fastapi background tasks - - Returns: - NotebookStartResponse: The response containing the URL of the jupyter lab server. - """ - logger.debug(f"Reached /notebook/start") - if NOTEBOOK_ENABLED: - start_notebook() - if browserless: - return { - "success": True, - "message": f"Jupyter Lab server started at http://localhost:{NOTEBOOK_PORT}?token={NOTEBOOK_PASSWORD}", - } - else: - sleep(2) - return RedirectResponse( - f"http://localhost:{NOTEBOOK_PORT}/lab/tree/quickstart.ipynb?token={NOTEBOOK_PASSWORD}" - ) - else: - if browserless: - return { - "success": False, - "message": "Notebook server is not enabled. 
Enable notebooks via the NOTEBOOK_ENABLED environment variable.", - } - else: - return RedirectResponse(f"/notebook-instructions.html") + app.include_router(create_notebook_router()) if ENABLE_BUILDER: from inference.core.interfaces.http.builder.routes import ( @@ -3172,346 +880,9 @@ def notebook_start(browserless: bool = False): # Attach all routes from builder to the /build prefix app.include_router(builder_router, prefix="/build", tags=["builder"]) - if LEGACY_ROUTE_ENABLED: - # Legacy object detection inference path for backwards compatibility - @app.get( - "/{dataset_id}/{version_id:str}", - # Order matters in this response model Union. It will use the first matching model. For example, Object Detection Inference Response is a subset of Instance segmentation inference response, so instance segmentation must come first in order for the matching logic to work. - response_model=Union[ - InstanceSegmentationInferenceResponse, - KeypointsDetectionInferenceResponse, - ObjectDetectionInferenceResponse, - ClassificationInferenceResponse, - MultiLabelClassificationInferenceResponse, - SemanticSegmentationInferenceResponse, - StubResponse, - Any, - ], - response_model_exclude_none=True, - ) - @app.post( - "/{dataset_id}/{version_id:str}", - # Order matters in this response model Union. It will use the first matching model. For example, Object Detection Inference Response is a subset of Instance segmentation inference response, so instance segmentation must come first in order for the matching logic to work. 
- response_model=Union[ - InstanceSegmentationInferenceResponse, - KeypointsDetectionInferenceResponse, - ObjectDetectionInferenceResponse, - ClassificationInferenceResponse, - MultiLabelClassificationInferenceResponse, - SemanticSegmentationInferenceResponse, - StubResponse, - Any, - ], - response_model_exclude_none=True, - ) - @with_route_exceptions - @usage_collector("request") - def legacy_infer_from_request( - background_tasks: BackgroundTasks, - request: Request, - request_body: Annotated[ - Optional[Union[bytes, UploadFile]], - Depends(parse_body_content_for_legacy_request_handler), - ], - dataset_id: str = Path( - description="ID of a Roboflow dataset corresponding to the model to use for inference OR workspace ID" - ), - version_id: str = Path( - description="ID of a Roboflow dataset version corresponding to the model to use for inference OR model ID" - ), - api_key: Optional[str] = Query( - None, - description="Roboflow API Key that will be passed to the model during initialization for artifact retrieval", - ), - confidence: float = Query( - 0.4, - description="The confidence threshold used to filter out predictions", - ), - keypoint_confidence: float = Query( - 0.0, - description="The confidence threshold used to filter out keypoints that are not visible based on model confidence", - ), - format: str = Query( - "json", - description="One of 'json' or 'image'. If 'json' prediction data is return as a JSON string. If 'image' prediction data is visualized and overlayed on the original input image.", - ), - image: Optional[str] = Query( - None, - description="The publically accessible URL of an image to use for inference.", - ), - image_type: Optional[str] = Query( - "base64", - description="One of base64 or numpy. 
Note, numpy input is not supported for Roboflow Hosted Inference.", - ), - labels: Optional[bool] = Query( - False, - description="If true, labels will be include in any inference visualization.", - ), - mask_decode_mode: Optional[str] = Query( - "accurate", - description="One of 'accurate' or 'fast'. If 'accurate' the mask will be decoded using the original image size. If 'fast' the mask will be decoded using the original mask size. 'accurate' is slower but more accurate.", - ), - tradeoff_factor: Optional[float] = Query( - 0.0, - description="The amount to tradeoff between 0='fast' and 1='accurate'", - ), - max_detections: int = Query( - 300, - description="The maximum number of detections to return. This is used to limit the number of predictions returned by the model. The model may return more predictions than this number, but only the top `max_detections` predictions will be returned.", - ), - overlap: float = Query( - 0.3, - description="The IoU threhsold that must be met for a box pair to be considered duplicate during NMS", - ), - stroke: int = Query( - 1, description="The stroke width used when visualizing predictions" - ), - countinference: Optional[bool] = Query( - True, - description="If false, does not track inference against usage.", - include_in_schema=False, - ), - service_secret: Optional[str] = Query( - None, - description="Shared secret used to authenticate requests to the inference server from internal services (e.g. 
to allow disabling inference usage tracking via the `countinference` query parameter)", - include_in_schema=False, - ), - disable_preproc_auto_orient: Optional[bool] = Query( - False, description="If true, disables automatic image orientation" - ), - disable_preproc_contrast: Optional[bool] = Query( - False, description="If true, disables automatic contrast adjustment" - ), - disable_preproc_grayscale: Optional[bool] = Query( - False, - description="If true, disables automatic grayscale conversion", - ), - disable_preproc_static_crop: Optional[bool] = Query( - False, description="If true, disables automatic static crop" - ), - disable_active_learning: Optional[bool] = Query( - default=False, - description="If true, the predictions will be prevented from registration by Active Learning (if the functionality is enabled)", - ), - active_learning_target_dataset: Optional[str] = Query( - default=None, - description="Parameter to be used when Active Learning data registration should happen against different dataset than the one pointed by model_id", - ), - source: Optional[str] = Query( - "external", - description="The source of the inference request", - ), - source_info: Optional[str] = Query( - "external", - description="The detailed source information of the inference request", - ), - disable_model_monitoring: Optional[bool] = Query( - False, - description="If true, disables model monitoring for this request", - include_in_schema=False, - ), - ): - """ - Legacy inference endpoint for object detection, instance segmentation, and classification. - - Args: - background_tasks: (BackgroundTasks) pool of fastapi background tasks - dataset_id (str): ID of a Roboflow dataset corresponding to the model to use for inference OR workspace ID - version_id (str): ID of a Roboflow dataset version corresponding to the model to use for inference OR model ID - api_key (Optional[str], default None): Roboflow API Key passed to the model during initialization for artifact retrieval. 
- # Other parameters described in the function signature... - - Returns: - Union[InstanceSegmentationInferenceResponse, KeypointsDetectionInferenceRequest, ObjectDetectionInferenceResponse, ClassificationInferenceResponse, MultiLabelClassificationInferenceResponse, SemanticSegmentationInferenceResponse, Any]: The response containing the inference results. - """ - logger.debug( - f"Reached legacy route /:dataset_id/:version_id with {dataset_id}/{version_id}" - ) - model_id = f"{dataset_id}/{version_id}" - if confidence >= 1: - confidence /= 100 - if confidence < CONFIDENCE_LOWER_BOUND_OOM_PREVENTION: - # allowing lower confidence results in RAM usage explosion - confidence = CONFIDENCE_LOWER_BOUND_OOM_PREVENTION - - if overlap >= 1: - overlap /= 100 - if image is not None: - request_image = InferenceRequestImage(type="url", value=image) - else: - if "Content-Type" not in request.headers: - raise ContentTypeMissing( - f"Request must include a Content-Type header" - ) - if isinstance(request_body, UploadFile): - base64_image_str = request_body.file.read() - base64_image_str = base64.b64encode(base64_image_str) - request_image = InferenceRequestImage( - type="base64", value=base64_image_str.decode("ascii") - ) - elif isinstance(request_body, bytes): - request_image = InferenceRequestImage( - type=image_type, value=request_body - ) - elif request_body is None: - raise InputImageLoadError( - message="Image not found in request body.", - public_message="Image not found in request body.", - ) - else: - raise ContentTypeInvalid( - f"Invalid Content-Type: {request.headers['Content-Type']}" - ) - - if not countinference and service_secret != ROBOFLOW_SERVICE_SECRET: - raise MissingServiceSecretError( - "Service secret is required to disable inference usage tracking" - ) - if LAMBDA: - logger.debug("request.scope: %s", request.scope) - request_model_id = ( - request.scope["aws.event"]["requestContext"]["authorizer"][ - "lambda" - ]["model"]["endpoint"] - .replace("--", "/") - 
.replace("rf-", "") - .replace("nu-", "") - ) - actor = request.scope["aws.event"]["requestContext"]["authorizer"][ - "lambda" - ]["actor"] - if countinference: - trackUsage(request_model_id, actor) - else: - if service_secret != ROBOFLOW_SERVICE_SECRET: - raise MissingServiceSecretError( - "Service secret is required to disable inference usage tracking" - ) - logger.info("Not counting inference for usage") - else: - request_model_id = model_id - logger.debug( - f"State of model registry: {self.model_manager.describe_models()}" - ) - self.model_manager.add_model( - request_model_id, - api_key, - model_id_alias=model_id, - countinference=countinference, - service_secret=service_secret, - ) - - task_type = self.model_manager.get_task_type(model_id, api_key=api_key) - inference_request_type = ObjectDetectionInferenceRequest - args = dict() - if task_type == "instance-segmentation": - inference_request_type = InstanceSegmentationInferenceRequest - args = { - "mask_decode_mode": mask_decode_mode, - "tradeoff_factor": tradeoff_factor, - } - elif task_type == "classification": - inference_request_type = ClassificationInferenceRequest - elif task_type == "keypoint-detection": - inference_request_type = KeypointsDetectionInferenceRequest - args = {"keypoint_confidence": keypoint_confidence} - elif task_type == "semantic-segmentation": - inference_request_type = SemanticSegmentationInferenceRequest - inference_request = inference_request_type( - api_key=api_key, - model_id=model_id, - image=request_image, - confidence=confidence, - iou_threshold=overlap, - max_detections=max_detections, - visualization_labels=labels, - visualization_stroke_width=stroke, - visualize_predictions=( - format == "image" or format == "image_and_json" - ), - disable_preproc_auto_orient=disable_preproc_auto_orient, - disable_preproc_contrast=disable_preproc_contrast, - disable_preproc_grayscale=disable_preproc_grayscale, - disable_preproc_static_crop=disable_preproc_static_crop, - 
disable_active_learning=disable_active_learning, - active_learning_target_dataset=active_learning_target_dataset, - source=source, - source_info=source_info, - usage_billable=countinference, - disable_model_monitoring=disable_model_monitoring, - **args, - ) - inference_response = self.model_manager.infer_from_request_sync( - inference_request.model_id, - inference_request, - active_learning_eligible=True, - background_tasks=background_tasks, - ) - logger.debug("Response ready.") - if format == "image": - return Response( - content=inference_response.visualization, - media_type="image/jpeg", - ) - else: - return orjson_response(inference_response) - - if not (LAMBDA or GCP_SERVERLESS): - # Legacy clear cache endpoint for backwards compatibility - @app.get("/clear_cache", response_model=str) - def legacy_clear_cache(): - """ - Clears the model cache. - - This endpoint provides a way to clear the cache of loaded models. - - Returns: - str: A string indicating that the cache has been cleared. - """ - logger.debug(f"Reached /clear_cache") - model_clear() - return "Cache Cleared" - - # Legacy add model endpoint for backwards compatibility - @app.get("/start/{dataset_id}/{version_id}") - def model_add_legacy( - dataset_id: str, - version_id: str, - api_key: str = None, - countinference: Optional[bool] = None, - service_secret: Optional[str] = None, - ): - """ - Starts a model inference session. - - This endpoint initializes and starts an inference session for the specified model version. - - Args: - dataset_id (str): ID of a Roboflow dataset corresponding to the model. - version_id (str): ID of a Roboflow dataset version corresponding to the model. - api_key (str, optional): Roboflow API Key for artifact retrieval. - countinference (Optional[bool]): Whether to count inference or not. - service_secret (Optional[str]): The service secret for the request. - - Returns: - JSONResponse: A response object containing the status and a success message. 
- """ - logger.debug( - f"Reached /start/{dataset_id}/{version_id} with {dataset_id}/{version_id}" - ) - model_id = f"{dataset_id}/{version_id}" - self.model_manager.add_model( - model_id, - api_key, - countinference=countinference, - service_secret=service_secret, - ) - - return JSONResponse( - { - "status": 200, - "message": "inference session started from local memory.", - } + # Legacy router: infer route when LEGACY_ROUTE_ENABLED; clear_cache/start when not (LAMBDA or GCP_SERVERLESS) + app.include_router( + create_legacy_router(model_manager=self.model_manager) ) if not ENABLE_DASHBOARD: diff --git a/inference/core/interfaces/http/routes/__init__.py b/inference/core/interfaces/http/routes/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/inference/core/interfaces/http/routes/core_models.py b/inference/core/interfaces/http/routes/core_models.py new file mode 100644 index 0000000000..d41766094d --- /dev/null +++ b/inference/core/interfaces/http/routes/core_models.py @@ -0,0 +1,1350 @@ +"""Core model HTTP routes (CLIP, PE, Gaze, Grounding DINO, YOLO World, SAM, etc.).""" + +from functools import partial +from typing import List, Optional, Union + +from fastapi import APIRouter, HTTPException, Query, Request +from fastapi.responses import Response + +from inference.core import logger +from inference.core.entities.requests.doctr import DoctrOCRInferenceRequest +from inference.core.entities.requests.easy_ocr import EasyOCRInferenceRequest +from inference.core.entities.requests.groundingdino import GroundingDINOInferenceRequest +from inference.core.entities.requests.inference import InferenceRequest +from inference.core.entities.requests.owlv2 import OwlV2InferenceRequest +from inference.core.entities.requests.perception_encoder import ( + PerceptionEncoderCompareRequest, + PerceptionEncoderImageEmbeddingRequest, + PerceptionEncoderTextEmbeddingRequest, +) +from inference.core.entities.requests.sam import SamEmbeddingRequest, 
SamSegmentationRequest +from inference.core.entities.requests.sam2 import ( + Sam2EmbeddingRequest, + Sam2SegmentationRequest, +) +from inference.core.entities.requests.sam3 import Sam3SegmentationRequest +from inference.core.entities.requests.sam3_3d import Sam3_3D_Objects_InferenceRequest +from inference.core.entities.requests.trocr import TrOCRInferenceRequest +from inference.core.entities.requests.yolo_world import YOLOWorldInferenceRequest +from inference.core.entities.requests.clip import ( + ClipCompareRequest, + ClipImageEmbeddingRequest, + ClipTextEmbeddingRequest, +) +from inference.core.entities.requests.gaze import GazeDetectionInferenceRequest +from inference.core.entities.responses.clip import ClipCompareResponse, ClipEmbeddingResponse +from inference.core.entities.responses.gaze import GazeDetectionInferenceResponse +from inference.core.entities.responses.inference import ObjectDetectionInferenceResponse +from inference.core.entities.responses.ocr import OCRInferenceResponse +from inference.core.entities.responses.perception_encoder import ( + PerceptionEncoderCompareResponse, + PerceptionEncoderEmbeddingResponse, +) +from inference.core.entities.responses.sam import ( + SamEmbeddingResponse, + SamSegmentationResponse, +) +from inference.core.entities.responses.sam2 import ( + Sam2EmbeddingResponse, + Sam2SegmentationResponse, +) +from inference.core.entities.responses.sam3 import ( + Sam3EmbeddingResponse, + Sam3SegmentationResponse, +) +from inference.core.env import ( + API_BASE_URL, + CORE_MODEL_CLIP_ENABLED, + CORE_MODEL_DOCTR_ENABLED, + CORE_MODEL_EASYOCR_ENABLED, + CORE_MODEL_GAZE_ENABLED, + CORE_MODEL_GROUNDINGDINO_ENABLED, + CORE_MODEL_OWLV2_ENABLED, + CORE_MODEL_PE_ENABLED, + CORE_MODEL_SAM2_ENABLED, + CORE_MODEL_SAM3_ENABLED, + CORE_MODEL_SAM_ENABLED, + CORE_MODEL_TROCR_ENABLED, + CORE_MODEL_YOLO_WORLD_ENABLED, + GCP_SERVERLESS, + LAMBDA, + SAM3_EXEC_MODE, + SAM3_FINE_TUNED_MODELS_ENABLED, +) +from 
inference.core.interfaces.http.error_handlers import with_route_exceptions +from inference.core.interfaces.http.orjson_utils import ( + orjson_response, + orjson_response_keeping_parent_id, +) +from inference.core.managers.base import ModelManager +from inference.core.roboflow_api import ModelEndpointType +from inference.usage_tracking.collector import usage_collector + +if LAMBDA: + from inference.core.usage import trackUsage + + +def create_core_models_router(model_manager: ModelManager) -> APIRouter: + router = APIRouter() + + def load_core_model( + inference_request: InferenceRequest, + api_key: Optional[str] = None, + core_model: str = None, + countinference: Optional[bool] = None, + service_secret: Optional[str] = None, + ) -> str: + if api_key: + inference_request.api_key = api_key + version_id_field = f"{core_model}_version_id" + core_model_id = ( + f"{core_model}/{inference_request.__getattribute__(version_id_field)}" + ) + model_manager.add_model( + core_model_id, + inference_request.api_key, + endpoint_type=ModelEndpointType.CORE_MODEL, + countinference=countinference, + service_secret=service_secret, + ) + return core_model_id + + load_clip_model = partial(load_core_model, core_model="clip") + load_pe_model = partial(load_core_model, core_model="perception_encoder") + load_sam_model = partial(load_core_model, core_model="sam") + load_sam2_model = partial(load_core_model, core_model="sam2") + load_gaze_model = partial(load_core_model, core_model="gaze") + load_doctr_model = partial(load_core_model, core_model="doctr") + load_easy_ocr_model = partial(load_core_model, core_model="easy_ocr") + load_grounding_dino_model = partial(load_core_model, core_model="grounding_dino") + load_yolo_world_model = partial(load_core_model, core_model="yolo_world") + load_owlv2_model = partial(load_core_model, core_model="owlv2") + load_trocr_model = partial(load_core_model, core_model="trocr") + load_paligemma_model = partial(load_core_model, core_model="paligemma") + + if 
CORE_MODEL_CLIP_ENABLED: + + @router.post( + "/clip/embed_image", + response_model=ClipEmbeddingResponse, + summary="CLIP Image Embeddings", + description="Run the Open AI CLIP model to embed image data.", + ) + @with_route_exceptions + @usage_collector("request") + def clip_embed_image( + inference_request: ClipImageEmbeddingRequest, + request: Request, + api_key: Optional[str] = Query( + None, + description="Roboflow API Key that will be passed to the model during initialization for artifact retrieval", + ), + countinference: Optional[bool] = None, + service_secret: Optional[str] = None, + ): + """ + Embeds image data using the OpenAI CLIP model. + + Args: + inference_request (ClipImageEmbeddingRequest): The request containing the image to be embedded. + api_key (Optional[str], default None): Roboflow API Key passed to the model during initialization for artifact retrieval. + request (Request, default Body()): The HTTP request. + + Returns: + ClipEmbeddingResponse: The response containing the embedded image. 
+ """ + logger.debug(f"Reached /clip/embed_image") + clip_model_id = load_clip_model( + inference_request, + api_key=api_key, + countinference=countinference, + service_secret=service_secret, + ) + response = model_manager.infer_from_request_sync( + clip_model_id, inference_request + ) + if LAMBDA: + actor = request.scope["aws.event"]["requestContext"][ + "authorizer" + ]["lambda"]["actor"] + trackUsage(clip_model_id, actor) + return response + + @router.post( + "/clip/embed_text", + response_model=ClipEmbeddingResponse, + summary="CLIP Text Embeddings", + description="Run the Open AI CLIP model to embed text data.", + ) + @with_route_exceptions + @usage_collector("request") + def clip_embed_text( + inference_request: ClipTextEmbeddingRequest, + request: Request, + api_key: Optional[str] = Query( + None, + description="Roboflow API Key that will be passed to the model during initialization for artifact retrieval", + ), + countinference: Optional[bool] = None, + service_secret: Optional[str] = None, + ): + """ + Embeds text data using the OpenAI CLIP model. + + Args: + inference_request (ClipTextEmbeddingRequest): The request containing the text to be embedded. + api_key (Optional[str], default None): Roboflow API Key passed to the model during initialization for artifact retrieval. + request (Request, default Body()): The HTTP request. + + Returns: + ClipEmbeddingResponse: The response containing the embedded text. 
+ """ + logger.debug(f"Reached /clip/embed_text") + clip_model_id = load_clip_model( + inference_request, + api_key=api_key, + countinference=countinference, + service_secret=service_secret, + ) + response = model_manager.infer_from_request_sync( + clip_model_id, inference_request + ) + if LAMBDA: + actor = request.scope["aws.event"]["requestContext"][ + "authorizer" + ]["lambda"]["actor"] + trackUsage(clip_model_id, actor) + return response + + @router.post( + "/clip/compare", + response_model=ClipCompareResponse, + summary="CLIP Compare", + description="Run the Open AI CLIP model to compute similarity scores.", + ) + @with_route_exceptions + @usage_collector("request") + def clip_compare( + inference_request: ClipCompareRequest, + request: Request, + api_key: Optional[str] = Query( + None, + description="Roboflow API Key that will be passed to the model during initialization for artifact retrieval", + ), + countinference: Optional[bool] = None, + service_secret: Optional[str] = None, + ): + """ + Computes similarity scores using the OpenAI CLIP model. + + Args: + inference_request (ClipCompareRequest): The request containing the data to be compared. + api_key (Optional[str], default None): Roboflow API Key passed to the model during initialization for artifact retrieval. + request (Request, default Body()): The HTTP request. + + Returns: + ClipCompareResponse: The response containing the similarity scores. 
+ """ + logger.debug(f"Reached /clip/compare") + clip_model_id = load_clip_model( + inference_request, + api_key=api_key, + countinference=countinference, + service_secret=service_secret, + ) + response = model_manager.infer_from_request_sync( + clip_model_id, inference_request + ) + if LAMBDA: + actor = request.scope["aws.event"]["requestContext"][ + "authorizer" + ]["lambda"]["actor"] + trackUsage(clip_model_id, actor, n=2) + return response + + if CORE_MODEL_PE_ENABLED: + + @router.post( + "/perception_encoder/embed_image", + response_model=PerceptionEncoderEmbeddingResponse, + summary="PE Image Embeddings", + description="Run the Meta Perception Encoder model to embed image data.", + ) + @with_route_exceptions + @usage_collector("request") + def pe_embed_image( + inference_request: PerceptionEncoderImageEmbeddingRequest, + request: Request, + api_key: Optional[str] = Query( + None, + description="Roboflow API Key that will be passed to the model during initialization for artifact retrieval", + ), + countinference: Optional[bool] = None, + service_secret: Optional[str] = None, + ): + """ + Embeds image data using the Perception Encoder PE model. + + Args: + inference_request (PerceptionEncoderImageEmbeddingRequest): The request containing the image to be embedded. + api_key (Optional[str], default None): Roboflow API Key passed to the model during initialization for artifact retrieval. + request (Request, default Body()): The HTTP request. + + Returns: + PerceptionEncoderEmbeddingResponse: The response containing the embedded image. 
+ """ + logger.debug(f"Reached /perception_encoder/embed_image") + pe_model_id = load_pe_model( + inference_request, + api_key=api_key, + countinference=countinference, + service_secret=service_secret, + ) + response = model_manager.infer_from_request_sync( + pe_model_id, inference_request + ) + if LAMBDA: + actor = request.scope["aws.event"]["requestContext"][ + "authorizer" + ]["lambda"]["actor"] + trackUsage(pe_model_id, actor) + return response + + @router.post( + "/perception_encoder/embed_text", + response_model=PerceptionEncoderEmbeddingResponse, + summary="Perception Encoder Text Embeddings", + description="Run the Meta Perception Encoder model to embed text data.", + ) + @with_route_exceptions + @usage_collector("request") + def pe_embed_text( + inference_request: PerceptionEncoderTextEmbeddingRequest, + request: Request, + api_key: Optional[str] = Query( + None, + description="Roboflow API Key that will be passed to the model during initialization for artifact retrieval", + ), + countinference: Optional[bool] = None, + service_secret: Optional[str] = None, + ): + """ + Embeds text data using the Meta Perception Encoder model. + + Args: + inference_request (PerceptionEncoderTextEmbeddingRequest): The request containing the text to be embedded. + api_key (Optional[str], default None): Roboflow API Key passed to the model during initialization for artifact retrieval. + request (Request, default Body()): The HTTP request. + + Returns: + PerceptionEncoderEmbeddingResponse: The response containing the embedded text. 
+ """ + logger.debug(f"Reached /perception_encoder/embed_text") + pe_model_id = load_pe_model( + inference_request, + api_key=api_key, + countinference=countinference, + service_secret=service_secret, + ) + response = model_manager.infer_from_request_sync( + pe_model_id, inference_request + ) + if LAMBDA: + actor = request.scope["aws.event"]["requestContext"][ + "authorizer" + ]["lambda"]["actor"] + trackUsage(pe_model_id, actor) + return response + + @router.post( + "/perception_encoder/compare", + response_model=PerceptionEncoderCompareResponse, + summary="Perception Encoder Compare", + description="Run the Meta Perception Encoder model to compute similarity scores.", + ) + @with_route_exceptions + @usage_collector("request") + def pe_compare( + inference_request: PerceptionEncoderCompareRequest, + request: Request, + api_key: Optional[str] = Query( + None, + description="Roboflow API Key that will be passed to the model during initialization for artifact retrieval", + ), + countinference: Optional[bool] = None, + service_secret: Optional[str] = None, + ): + """ + Computes similarity scores using the Meta Perception Encoder model. + + Args: + inference_request (PerceptionEncoderCompareRequest): The request containing the data to be compared. + api_key (Optional[str], default None): Roboflow API Key passed to the model during initialization for artifact retrieval. + request (Request, default Body()): The HTTP request. + + Returns: + PerceptionEncoderCompareResponse: The response containing the similarity scores. 
+ """ + logger.debug(f"Reached /perception_encoder/compare") + pe_model_id = load_pe_model( + inference_request, + api_key=api_key, + countinference=countinference, + service_secret=service_secret, + ) + response = model_manager.infer_from_request_sync( + pe_model_id, inference_request + ) + if LAMBDA: + actor = request.scope["aws.event"]["requestContext"][ + "authorizer" + ]["lambda"]["actor"] + trackUsage(pe_model_id, actor, n=2) + return response + + if CORE_MODEL_GROUNDINGDINO_ENABLED: + + @router.post( + "/grounding_dino/infer", + response_model=ObjectDetectionInferenceResponse, + summary="Grounding DINO inference.", + description="Run the Grounding DINO zero-shot object detection model.", + ) + @with_route_exceptions + @usage_collector("request") + def grounding_dino_infer( + inference_request: GroundingDINOInferenceRequest, + request: Request, + api_key: Optional[str] = Query( + None, + description="Roboflow API Key that will be passed to the model during initialization for artifact retrieval", + ), + countinference: Optional[bool] = None, + service_secret: Optional[str] = None, + ): + """ + Embeds image data using the Grounding DINO model. + + Args: + inference_request GroundingDINOInferenceRequest): The request containing the image on which to run object detection. + api_key (Optional[str], default None): Roboflow API Key passed to the model during initialization for artifact retrieval. + request (Request, default Body()): The HTTP request. + + Returns: + ObjectDetectionInferenceResponse: The object detection response. 
+ """ + logger.debug(f"Reached /grounding_dino/infer") + grounding_dino_model_id = load_grounding_dino_model( + inference_request, + api_key=api_key, + countinference=countinference, + service_secret=service_secret, + ) + response = model_manager.infer_from_request_sync( + grounding_dino_model_id, inference_request + ) + if LAMBDA: + actor = request.scope["aws.event"]["requestContext"][ + "authorizer" + ]["lambda"]["actor"] + trackUsage(grounding_dino_model_id, actor) + return response + + if CORE_MODEL_YOLO_WORLD_ENABLED: + + @router.post( + "/yolo_world/infer", + response_model=ObjectDetectionInferenceResponse, + summary="YOLO-World inference.", + description="Run the YOLO-World zero-shot object detection model.", + response_model_exclude_none=True, + ) + @with_route_exceptions + @usage_collector("request") + def yolo_world_infer( + inference_request: YOLOWorldInferenceRequest, + request: Request, + api_key: Optional[str] = Query( + None, + description="Roboflow API Key that will be passed to the model during initialization for artifact retrieval", + ), + countinference: Optional[bool] = None, + service_secret: Optional[str] = None, + ): + """ + Runs the YOLO-World zero-shot object detection model. + + Args: + inference_request (YOLOWorldInferenceRequest): The request containing the image on which to run object detection. + api_key (Optional[str], default None): Roboflow API Key passed to the model during initialization for artifact retrieval. + request (Request, default Body()): The HTTP request. + + Returns: + ObjectDetectionInferenceResponse: The object detection response. + """ + logger.debug(f"Reached /yolo_world/infer. Loading model") + yolo_world_model_id = load_yolo_world_model( + inference_request, + api_key=api_key, + countinference=countinference, + service_secret=service_secret, + ) + logger.debug("YOLOWorld model loaded. 
Staring the inference.") + response = model_manager.infer_from_request_sync( + yolo_world_model_id, inference_request + ) + logger.debug("YOLOWorld prediction available.") + if LAMBDA: + actor = request.scope["aws.event"]["requestContext"][ + "authorizer" + ]["lambda"]["actor"] + trackUsage(yolo_world_model_id, actor) + logger.debug("Usage of YOLOWorld denoted.") + return response + + if CORE_MODEL_DOCTR_ENABLED: + + @router.post( + "/doctr/ocr", + response_model=Union[ + OCRInferenceResponse, List[OCRInferenceResponse] + ], + summary="DocTR OCR response", + description="Run the DocTR OCR model to retrieve text in an image.", + ) + @with_route_exceptions + @usage_collector("request") + def doctr_retrieve_text( + inference_request: DoctrOCRInferenceRequest, + request: Request, + api_key: Optional[str] = Query( + None, + description="Roboflow API Key that will be passed to the model during initialization for artifact retrieval", + ), + countinference: Optional[bool] = None, + service_secret: Optional[str] = None, + ): + """ + Embeds image data using the DocTR model. + + Args: + inference_request (M.DoctrOCRInferenceRequest): The request containing the image from which to retrieve text. + api_key (Optional[str], default None): Roboflow API Key passed to the model during initialization for artifact retrieval. + request (Request, default Body()): The HTTP request. + + Returns: + OCRInferenceResponse: The response containing the embedded image. 
+ """ + logger.debug(f"Reached /doctr/ocr") + doctr_model_id = load_doctr_model( + inference_request, + api_key=api_key, + countinference=countinference, + service_secret=service_secret, + ) + response = model_manager.infer_from_request_sync( + doctr_model_id, inference_request + ) + if LAMBDA: + actor = request.scope["aws.event"]["requestContext"][ + "authorizer" + ]["lambda"]["actor"] + trackUsage(doctr_model_id, actor) + return orjson_response_keeping_parent_id(response) + + if CORE_MODEL_EASYOCR_ENABLED: + + @router.post( + "/easy_ocr/ocr", + response_model=Union[ + OCRInferenceResponse, List[OCRInferenceResponse] + ], + summary="EasyOCR OCR response", + description="Run the EasyOCR model to retrieve text in an image.", + ) + @with_route_exceptions + @usage_collector("request") + def easy_ocr_retrieve_text( + inference_request: EasyOCRInferenceRequest, + request: Request, + api_key: Optional[str] = Query( + None, + description="Roboflow API Key that will be passed to the model during initialization for artifact retrieval", + ), + countinference: Optional[bool] = None, + service_secret: Optional[str] = None, + ): + """ + Embeds image data using the EasyOCR model. + + Args: + inference_request (EasyOCRInferenceRequest): The request containing the image from which to retrieve text. + api_key (Optional[str], default None): Roboflow API Key passed to the model during initialization for artifact retrieval. + request (Request, default Body()): The HTTP request. + + Returns: + OCRInferenceResponse: The response containing the embedded image. 
+ """ + logger.debug(f"Reached /easy_ocr/ocr") + easy_ocr_model_id = load_easy_ocr_model( + inference_request, + api_key=api_key, + countinference=countinference, + service_secret=service_secret, + ) + response = model_manager.infer_from_request_sync( + easy_ocr_model_id, inference_request + ) + if LAMBDA: + actor = request.scope["aws.event"]["requestContext"][ + "authorizer" + ]["lambda"]["actor"] + trackUsage(easy_ocr_model_id, actor) + return orjson_response_keeping_parent_id(response) + + if CORE_MODEL_SAM_ENABLED: + + @router.post( + "/sam/embed_image", + response_model=SamEmbeddingResponse, + summary="SAM Image Embeddings", + description="Run the Meta AI Segmant Anything Model to embed image data.", + ) + @with_route_exceptions + @usage_collector("request") + def sam_embed_image( + inference_request: SamEmbeddingRequest, + request: Request, + api_key: Optional[str] = Query( + None, + description="Roboflow API Key that will be passed to the model during initialization for artifact retrieval", + ), + countinference: Optional[bool] = None, + service_secret: Optional[str] = None, + ): + """ + Embeds image data using the Meta AI Segmant Anything Model (SAM). + + Args: + inference_request (SamEmbeddingRequest): The request containing the image to be embedded. + api_key (Optional[str], default None): Roboflow API Key passed to the model during initialization for artifact retrieval. + request (Request, default Body()): The HTTP request. + + Returns: + M.SamEmbeddingResponse or Response: The response containing the embedded image. 
+ """ + logger.debug(f"Reached /sam/embed_image") + sam_model_id = load_sam_model( + inference_request, + api_key=api_key, + countinference=countinference, + service_secret=service_secret, + ) + model_response = model_manager.infer_from_request_sync( + sam_model_id, inference_request + ) + if LAMBDA: + actor = request.scope["aws.event"]["requestContext"][ + "authorizer" + ]["lambda"]["actor"] + trackUsage(sam_model_id, actor) + if inference_request.format == "binary": + return Response( + content=model_response.embeddings, + headers={"Content-Type": "application/octet-stream"}, + ) + return model_response + + @router.post( + "/sam/segment_image", + response_model=SamSegmentationResponse, + summary="SAM Image Segmentation", + description="Run the Meta AI Segmant Anything Model to generate segmenations for image data.", + ) + @with_route_exceptions + @usage_collector("request") + def sam_segment_image( + inference_request: SamSegmentationRequest, + request: Request, + api_key: Optional[str] = Query( + None, + description="Roboflow API Key that will be passed to the model during initialization for artifact retrieval", + ), + countinference: Optional[bool] = None, + service_secret: Optional[str] = None, + ): + """ + Generates segmentations for image data using the Meta AI Segmant Anything Model (SAM). + + Args: + inference_request (SamSegmentationRequest): The request containing the image to be segmented. + api_key (Optional[str], default None): Roboflow API Key passed to the model during initialization for artifact retrieval. + request (Request, default Body()): The HTTP request. + + Returns: + M.SamSegmentationResponse or Response: The response containing the segmented image. 
+ """ + logger.debug(f"Reached /sam/segment_image") + sam_model_id = load_sam_model( + inference_request, + api_key=api_key, + countinference=countinference, + service_secret=service_secret, + ) + model_response = model_manager.infer_from_request_sync( + sam_model_id, inference_request + ) + if LAMBDA: + actor = request.scope["aws.event"]["requestContext"][ + "authorizer" + ]["lambda"]["actor"] + trackUsage(sam_model_id, actor) + if inference_request.format == "binary": + return Response( + content=model_response, + headers={"Content-Type": "application/octet-stream"}, + ) + return model_response + + if CORE_MODEL_SAM2_ENABLED: + + @router.post( + "/sam2/embed_image", + response_model=Sam2EmbeddingResponse, + summary="SAM2 Image Embeddings", + description="Run the Meta AI Segment Anything 2 Model to embed image data.", + ) + @with_route_exceptions + @usage_collector("request") + def sam2_embed_image( + inference_request: Sam2EmbeddingRequest, + request: Request, + api_key: Optional[str] = Query( + None, + description="Roboflow API Key that will be passed to the model during initialization for artifact retrieval", + ), + countinference: Optional[bool] = None, + service_secret: Optional[str] = None, + ): + """ + Embeds image data using the Meta AI Segment Anything Model (SAM). + + Args: + inference_request (SamEmbeddingRequest): The request containing the image to be embedded. + api_key (Optional[str], default None): Roboflow API Key passed to the model during initialization for artifact retrieval. + request (Request, default Body()): The HTTP request. 
+ + Returns: + M.Sam2EmbeddingResponse or Response: The response affirming the image has been embedded + """ + logger.debug(f"Reached /sam2/embed_image") + sam2_model_id = load_sam2_model( + inference_request, + api_key=api_key, + countinference=countinference, + service_secret=service_secret, + ) + model_response = model_manager.infer_from_request_sync( + sam2_model_id, inference_request + ) + return model_response + + @router.post( + "/sam2/segment_image", + response_model=Sam2SegmentationResponse, + summary="SAM2 Image Segmentation", + description="Run the Meta AI Segment Anything 2 Model to generate segmenations for image data.", + ) + @with_route_exceptions + @usage_collector("request") + def sam2_segment_image( + inference_request: Sam2SegmentationRequest, + request: Request, + api_key: Optional[str] = Query( + None, + description="Roboflow API Key that will be passed to the model during initialization for artifact retrieval", + ), + countinference: Optional[bool] = None, + service_secret: Optional[str] = None, + ): + """ + Generates segmentations for image data using the Meta AI Segment Anything Model (SAM). + + Args: + inference_request (Sam2SegmentationRequest): The request containing the image to be segmented. + api_key (Optional[str], default None): Roboflow API Key passed to the model during initialization for artifact retrieval. + request (Request, default Body()): The HTTP request. + + Returns: + M.SamSegmentationResponse or Response: The response containing the segmented image. 
+ """ + logger.debug(f"Reached /sam2/segment_image") + sam2_model_id = load_sam2_model( + inference_request, + api_key=api_key, + countinference=countinference, + service_secret=service_secret, + ) + model_response = model_manager.infer_from_request_sync( + sam2_model_id, inference_request + ) + if inference_request.format == "binary": + return Response( + content=model_response, + headers={"Content-Type": "application/octet-stream"}, + ) + return model_response + + if CORE_MODEL_SAM3_ENABLED and not GCP_SERVERLESS: + + @router.post( + "/sam3/embed_image", + response_model=Sam3EmbeddingResponse, + summary="Seg preview Image Embeddings", + description="Run the Model to embed image data.", + ) + @with_route_exceptions + @usage_collector("request") + def sam3_embed_image( + inference_request: Sam2EmbeddingRequest, + request: Request, + api_key: Optional[str] = Query( + None, + description="Roboflow API Key that will be passed to the model during initialization for artifact retrieval", + ), + countinference: Optional[bool] = None, + service_secret: Optional[str] = None, + ): + logger.debug(f"Reached /sam3/embed_image") + + if SAM3_EXEC_MODE == "remote": + raise HTTPException( + status_code=501, + detail="SAM3 embedding is not supported in remote execution mode.", + ) + + model_manager.add_model( + "sam3/sam3_interactive", + api_key=api_key, + endpoint_type=ModelEndpointType.CORE_MODEL, + countinference=countinference, + service_secret=service_secret, + ) + + model_response = model_manager.infer_from_request_sync( + "sam3/sam3_interactive", inference_request + ) + return model_response + + if CORE_MODEL_SAM3_ENABLED: + + @router.post( + "/sam3/concept_segment", + response_model=Sam3SegmentationResponse, + summary="SAM3 PCS (promptable concept segmentation)", + description="Run the SAM3 PCS (promptable concept segmentation) to generate segmentations for image data.", + ) + @with_route_exceptions + @usage_collector("request") + def sam3_segment_image( + inference_request: 
Sam3SegmentationRequest, + request: Request, + api_key: Optional[str] = Query( + None, + description="Roboflow API Key that will be passed to the model during initialization for artifact retrieval", + ), + countinference: Optional[bool] = None, + service_secret: Optional[str] = None, + ): + if not SAM3_FINE_TUNED_MODELS_ENABLED: + if not inference_request.model_id.startswith("sam3/"): + raise HTTPException( + status_code=501, + detail="Fine-tuned SAM3 models are not supported on this deployment. Please use a workflow or self-host the server.", + ) + + if SAM3_EXEC_MODE == "remote": + if not inference_request.model_id.startswith("sam3/"): + raise HTTPException( + status_code=501, + detail="Fine-tuned SAM3 models are not supported in remote execution mode yet. Please use a workflow or self-host the server.", + ) + endpoint = f"{API_BASE_URL}/inferenceproxy/seg-preview" + + # Construct payload for remote API + # The remote API expects: + # { + # "image": {"type": "base64", "value": ...}, + # "prompts": [{"type": "text", "text": ...}, ...], + # "output_prob_thresh": ... + # } + + # Extract prompts from request + http_prompts = [] + for prompt in inference_request.prompts: + p_dict = prompt.dict(exclude_none=True) + # Ensure type is set if missing (default to text if text is present) + if "type" not in p_dict: + if "text" in p_dict: + p_dict["type"] = "text" + http_prompts.append(p_dict) + + # Prepare image + # inference_request.image is InferenceRequestImage + if inference_request.image.type == "base64": + http_image = { + "type": "base64", + "value": inference_request.image.value, + } + elif inference_request.image.type == "url": + http_image = { + "type": "url", + "value": inference_request.image.value, + } + elif inference_request.image.type == "numpy": + # Numpy not supported for remote proxy easily without serialization, + # but InferenceRequestImage usually comes as base64/url in HTTP API. 
+ # If it is numpy, we might need to handle it, but for now assume base64/url. + # If it's numpy, it's likely from internal call, but this is HTTP API. + http_image = { + "type": "numpy", + "value": inference_request.image.value, + } + else: + http_image = { + "type": inference_request.image.type, + "value": inference_request.image.value, + } + + payload = { + "image": http_image, + "prompts": http_prompts, + "output_prob_thresh": inference_request.output_prob_thresh, + } + + try: + headers = {"Content-Type": "application/json"} + if ROBOFLOW_INTERNAL_SERVICE_NAME: + headers["X-Roboflow-Internal-Service-Name"] = ( + ROBOFLOW_INTERNAL_SERVICE_NAME + ) + if ROBOFLOW_INTERNAL_SERVICE_SECRET: + headers["X-Roboflow-Internal-Service-Secret"] = ( + ROBOFLOW_INTERNAL_SERVICE_SECRET + ) + + headers = build_roboflow_api_headers( + explicit_headers=headers + ) + + response = requests.post( + f"{endpoint}?api_key={api_key}", + json=payload, + headers=headers, + timeout=60, + ) + response.raise_for_status() + resp_json = response.json() + + # The remote API returns the same structure as Sam3SegmentationResponse + return Sam3SegmentationResponse(**resp_json) + + except Exception as e: + logger.error(f"SAM3 remote request failed: {e}") + raise HTTPException( + status_code=500, + detail=f"SAM3 remote request failed: {str(e)}", + ) + + if inference_request.model_id.startswith("sam3/"): + model_manager.add_model( + inference_request.model_id, + api_key=api_key, + endpoint_type=ModelEndpointType.CORE_MODEL, + countinference=countinference, + service_secret=service_secret, + ) + else: + model_manager.add_model( + inference_request.model_id, + api_key=api_key, + endpoint_type=ModelEndpointType.ORT, + countinference=countinference, + service_secret=service_secret, + ) + + model_response = model_manager.infer_from_request_sync( + inference_request.model_id, inference_request + ) + if inference_request.format == "binary": + return Response( + content=model_response, + 
headers={"Content-Type": "application/octet-stream"}, + ) + return model_response + + @router.post( + "/sam3/visual_segment", + response_model=Sam2SegmentationResponse, + summary="SAM3 PVS (promptable visual segmentation)", + description="Run the SAM3 PVS (promptable visual segmentation) to generate segmentations for image data.", + ) + @with_route_exceptions + @usage_collector("request") + def sam3_visual_segment( + inference_request: Sam2SegmentationRequest, + request: Request, + api_key: Optional[str] = Query( + None, + description="Roboflow API Key that will be passed to the model during initialization for artifact retrieval", + ), + countinference: Optional[bool] = None, + service_secret: Optional[str] = None, + ): + logger.debug(f"Reached /sam3/visual_segment") + + if SAM3_EXEC_MODE == "remote": + endpoint = f"{API_BASE_URL}/inferenceproxy/sam3-pvs" + + http_image = { + "type": inference_request.image.type, + "value": inference_request.image.value, + } + + prompts_data = ( + inference_request.prompts.dict(exclude_none=True) + if inference_request.prompts + else None + ) + + payload = { + "image": http_image, + "prompts": prompts_data, + "multimask_output": inference_request.multimask_output, + } + + try: + headers = {"Content-Type": "application/json"} + if ROBOFLOW_INTERNAL_SERVICE_NAME: + headers["X-Roboflow-Internal-Service-Name"] = ( + ROBOFLOW_INTERNAL_SERVICE_NAME + ) + if ROBOFLOW_INTERNAL_SERVICE_SECRET: + headers["X-Roboflow-Internal-Service-Secret"] = ( + ROBOFLOW_INTERNAL_SERVICE_SECRET + ) + + headers = build_roboflow_api_headers( + explicit_headers=headers + ) + + response = requests.post( + f"{endpoint}?api_key={api_key}", + json=payload, + headers=headers, + timeout=60, + ) + response.raise_for_status() + resp_json = response.json() + + return Sam2SegmentationResponse(**resp_json) + + except Exception as e: + logger.error( + f"SAM3 visual_segment remote request failed: {e}" + ) + raise HTTPException( + status_code=500, + detail=f"SAM3 
visual_segment remote request failed: {str(e)}", + ) + + model_manager.add_model( + "sam3/sam3_interactive", + api_key=api_key, + endpoint_type=ModelEndpointType.CORE_MODEL, + countinference=countinference, + service_secret=service_secret, + ) + + model_response = model_manager.infer_from_request_sync( + "sam3/sam3_interactive", inference_request + ) + return model_response + + if CORE_MODEL_SAM3_ENABLED and not GCP_SERVERLESS: + + @router.post( + "/sam3_3d/infer", + summary="SAM3 3D Object Generation", + description="Generate 3D meshes and Gaussian splatting from 2D images with mask prompts.", + ) + @with_route_exceptions + @usage_collector("request") + def sam3_3d_infer( + inference_request: Sam3_3D_Objects_InferenceRequest, + request: Request, + api_key: Optional[str] = Query( + None, + description="Roboflow API Key that will be passed to the model during initialization for artifact retrieval", + ), + countinference: Optional[bool] = None, + service_secret: Optional[str] = None, + ): + """Generate 3D meshes and Gaussian splatting from 2D images with mask prompts. + + Args: + inference_request (Sam3_3D_Objects_InferenceRequest): The request containing + the image and mask input for 3D generation. + api_key (Optional[str]): Roboflow API Key for artifact retrieval. 
+ + Returns: + dict: Response containing base64-encoded 3D outputs: + - mesh_glb: Scene mesh in GLB format (base64) + - gaussian_ply: Combined Gaussian splatting in PLY format (base64) + - objects: List of individual objects with their 3D data + - time: Inference time in seconds + """ + logger.debug("Reached /sam3_3d/infer") + model_id = inference_request.model_id or "sam3-3d-objects" + + model_manager.add_model( + model_id, + api_key=api_key, + endpoint_type=ModelEndpointType.CORE_MODEL, + countinference=countinference, + service_secret=service_secret, + ) + + model_response = model_manager.infer_from_request_sync( + model_id, inference_request + ) + + if LAMBDA: + actor = request.scope["aws.event"]["requestContext"][ + "authorizer" + ]["lambda"]["actor"] + trackUsage(model_id, actor) + + # Convert bytes to base64 for JSON serialization + def encode_bytes(data): + if data is None: + return None + return base64.b64encode(data).decode("utf-8") + + objects_list = [] + for obj in model_response.objects: + objects_list.append( + { + "mesh_glb": encode_bytes(obj.mesh_glb), + "gaussian_ply": encode_bytes(obj.gaussian_ply), + "metadata": { + "rotation": obj.metadata.rotation, + "translation": obj.metadata.translation, + "scale": obj.metadata.scale, + }, + } + ) + + return { + "mesh_glb": encode_bytes(model_response.mesh_glb), + "gaussian_ply": encode_bytes(model_response.gaussian_ply), + "objects": objects_list, + "time": model_response.time, + } + + if CORE_MODEL_OWLV2_ENABLED: + + @router.post( + "/owlv2/infer", + response_model=ObjectDetectionInferenceResponse, + summary="Owlv2 image prompting", + description="Run the google owlv2 model to few-shot object detect", + ) + @with_route_exceptions + @usage_collector("request") + def owlv2_infer( + inference_request: OwlV2InferenceRequest, + request: Request, + api_key: Optional[str] = Query( + None, + description="Roboflow API Key that will be passed to the model during initialization for artifact retrieval", + ), + 
countinference: Optional[bool] = None, + service_secret: Optional[str] = None, + ): + """ + Embeds image data using the Meta AI Segmant Anything Model (SAM). + + Args: + inference_request (SamEmbeddingRequest): The request containing the image to be embedded. + api_key (Optional[str], default None): Roboflow API Key passed to the model during initialization for artifact retrieval. + request (Request, default Body()): The HTTP request. + + Returns: + M.Sam2EmbeddingResponse or Response: The response affirming the image has been embedded + """ + logger.debug(f"Reached /owlv2/infer") + owl2_model_id = load_owlv2_model( + inference_request, + api_key=api_key, + countinference=countinference, + service_secret=service_secret, + ) + model_response = model_manager.infer_from_request_sync( + owl2_model_id, inference_request + ) + return model_response + + if CORE_MODEL_GAZE_ENABLED: + + @router.post( + "/gaze/gaze_detection", + response_model=List[GazeDetectionInferenceResponse], + summary="Gaze Detection", + description="Run the gaze detection model to detect gaze.", + ) + @with_route_exceptions + @usage_collector("request") + def gaze_detection( + inference_request: GazeDetectionInferenceRequest, + request: Request, + api_key: Optional[str] = Query( + None, + description="Roboflow API Key that will be passed to the model during initialization for artifact retrieval", + ), + countinference: Optional[bool] = None, + service_secret: Optional[str] = None, + ): + """ + Detect gaze using the gaze detection model. + + Args: + inference_request (M.GazeDetectionRequest): The request containing the image to be detected. + api_key (Optional[str], default None): Roboflow API Key passed to the model during initialization for artifact retrieval. + request (Request, default Body()): The HTTP request. + + Returns: + M.GazeDetectionResponse: The response containing all the detected faces and the corresponding gazes. 
+ """ + logger.debug(f"Reached /gaze/gaze_detection") + gaze_model_id = load_gaze_model( + inference_request, + api_key=api_key, + countinference=countinference, + service_secret=service_secret, + ) + response = model_manager.infer_from_request_sync( + gaze_model_id, inference_request + ) + if LAMBDA: + actor = request.scope["aws.event"]["requestContext"][ + "authorizer" + ]["lambda"]["actor"] + trackUsage(gaze_model_id, actor) + return response + + if DEPTH_ESTIMATION_ENABLED: + + @router.post( + "/core/depth-estimation", + response_model=DepthEstimationResponse, + summary="Depth Estimation", + description="Run the depth estimation model to generate a depth map.", + ) + @with_route_exceptions + @usage_collector("request") + def depth_estimation( + inference_request: DepthEstimationRequest, + request: Request, + api_key: Optional[str] = Query( + None, + description="Roboflow API Key that will be passed to the model during initialization for artifact retrieval", + ), + countinference: Optional[bool] = None, + service_secret: Optional[str] = None, + ): + """ + Generate a depth map using the depth estimation model. + + Args: + inference_request (DepthEstimationRequest): The request containing the image to estimate depth for. + api_key (Optional[str], default None): Roboflow API Key passed to the model during initialization for artifact retrieval. + request (Request, default Body()): The HTTP request. + + Returns: + DepthEstimationResponse: The response containing the normalized depth map and optional visualization. 
+ """ + logger.debug(f"Reached /infer/depth-estimation") + depth_model_id = inference_request.model_id + model_manager.add_model( + depth_model_id, + inference_request.api_key, + countinference=countinference, + service_secret=service_secret, + ) + response = model_manager.infer_from_request_sync( + depth_model_id, inference_request + ) + if LAMBDA: + actor = request.scope["aws.event"]["requestContext"][ + "authorizer" + ]["lambda"]["actor"] + trackUsage(depth_model_id, actor) + + # Extract data from nested response structure + depth_data = response.response + depth_response = DepthEstimationResponse( + normalized_depth=depth_data["normalized_depth"].tolist(), + image=depth_data["image"].base64_image, + ) + return depth_response + + if CORE_MODEL_TROCR_ENABLED: + + @router.post( + "/ocr/trocr", + response_model=OCRInferenceResponse, + summary="TrOCR OCR response", + description="Run the TrOCR model to retrieve text in an image.", + ) + @with_route_exceptions + @usage_collector("request") + def trocr_retrieve_text( + inference_request: TrOCRInferenceRequest, + request: Request, + api_key: Optional[str] = Query( + None, + description="Roboflow API Key that will be passed to the model during initialization for artifact retrieval", + ), + countinference: Optional[bool] = None, + service_secret: Optional[str] = None, + ): + """ + Retrieves text from image data using the TrOCR model. + + Args: + inference_request (TrOCRInferenceRequest): The request containing the image from which to retrieve text. + api_key (Optional[str], default None): Roboflow API Key passed to the model during initialization for artifact retrieval. + request (Request, default Body()): The HTTP request. + + Returns: + OCRInferenceResponse: The response containing the retrieved text. 
+ """ + logger.debug(f"Reached /trocr/ocr") + trocr_model_id = load_trocr_model( + inference_request, + api_key=api_key, + countinference=countinference, + service_secret=service_secret, + ) + response = model_manager.infer_from_request_sync( + trocr_model_id, inference_request + ) + if LAMBDA: + actor = request.scope["aws.event"]["requestContext"][ + "authorizer" + ]["lambda"]["actor"] + trackUsage(trocr_model_id, actor) + return orjson_response_keeping_parent_id(response) + + return router diff --git a/inference/core/interfaces/http/routes/health.py b/inference/core/interfaces/http/routes/health.py new file mode 100644 index 0000000000..4a95258252 --- /dev/null +++ b/inference/core/interfaces/http/routes/health.py @@ -0,0 +1,58 @@ +"""Health, readiness, and device stats HTTP routes.""" + +from fastapi import APIRouter, Depends +from typing import Any, Optional +from starlette.responses import JSONResponse + +from inference.core.env import DOCKER_SOCKET_PATH +from inference.core.managers.metrics import get_container_stats +from inference.core.utils.container import is_docker_socket_mounted + + +def create_health_router(model_init_state: Optional[Any] = None) -> APIRouter: + router = APIRouter() + + @router.get("/device/stats", summary="Device/container statistics") + def device_stats(): + not_configured_error_message = { + "error": "Device statistics endpoint is not enabled.", + "hint": ( + "Mount the Docker socket and point its location when running the docker " + "container to collect device stats " + "(i.e. `docker run ... -v /var/run/docker.sock:/var/run/docker.sock " + "-e DOCKER_SOCKET_PATH=/var/run/docker.sock ...`)." 
+ ), + } + if not DOCKER_SOCKET_PATH: + return JSONResponse( + status_code=404, + content=not_configured_error_message, + ) + if not is_docker_socket_mounted(docker_socket_path=DOCKER_SOCKET_PATH): + return JSONResponse( + status_code=500, + content=not_configured_error_message, + ) + + container_stats = get_container_stats(docker_socket_path=DOCKER_SOCKET_PATH) + return JSONResponse(status_code=200, content=container_stats) + + @router.get("/readiness", status_code=200) + def readiness(state: Any = Depends(lambda: model_init_state)): + """Readiness endpoint for Kubernetes readiness probe.""" + if state is None: + return {"status": "ready"} + with state.lock: + if state.is_ready: + return {"status": "ready"} + return JSONResponse( + content={"status": "not ready"}, status_code=503 + ) + + @router.get("/healthz", status_code=200) + def healthz(): + """Health endpoint for Kubernetes liveness probe.""" + return {"status": "healthy"} + + return router + \ No newline at end of file diff --git a/inference/core/interfaces/http/routes/inference.py b/inference/core/interfaces/http/routes/inference.py new file mode 100644 index 0000000000..9cc6496331 --- /dev/null +++ b/inference/core/interfaces/http/routes/inference.py @@ -0,0 +1,303 @@ +"""Roboflow trained-model inference HTTP routes (/infer/*).""" + +from typing import List, Optional, Union + +from fastapi import APIRouter, BackgroundTasks, Query, Request, HTTPException + +from inference.core import logger +from inference.core.entities.requests.inference import ( + ClassificationInferenceRequest, + DepthEstimationRequest, + InferenceRequest, + InstanceSegmentationInferenceRequest, + KeypointsDetectionInferenceRequest, + ObjectDetectionInferenceRequest, + LMMInferenceRequest, + SemanticSegmentationInferenceRequest, +) +from inference.core.entities.responses.inference import ( + ClassificationInferenceResponse, + DepthEstimationResponse, + InferenceResponse, + InstanceSegmentationInferenceResponse, + 
KeypointsDetectionInferenceResponse, + ObjectDetectionInferenceResponse, + MultiLabelClassificationInferenceResponse, + StubResponse, + LMMInferenceResponse, + SemanticSegmentationInferenceResponse, +) +from inference.core.env import DEPTH_ESTIMATION_ENABLED, LMM_ENABLED, MOONDREAM2_ENABLED +from inference.core.interfaces.http.error_handlers import with_route_exceptions +from inference.core.interfaces.http.orjson_utils import orjson_response +from inference.core.managers.base import ModelManager +from inference.models.aliases import resolve_roboflow_model_alias +from inference.usage_tracking.collector import usage_collector + + +def create_inference_router( + model_manager: ModelManager, +) -> APIRouter: + router = APIRouter() + + def process_inference_request( + inference_request: InferenceRequest, + countinference: Optional[bool] = None, + service_secret: Optional[str] = None, + **kwargs, + ) -> InferenceResponse: + de_aliased_model_id = resolve_roboflow_model_alias( + model_id=inference_request.model_id + ) + model_manager.add_model( + de_aliased_model_id, + inference_request.api_key, + countinference=countinference, + service_secret=service_secret, + ) + resp = model_manager.infer_from_request_sync( + de_aliased_model_id, + inference_request, + **kwargs, + ) + return orjson_response(resp) + + @router.post( + "/infer/object_detection", + response_model=Union[ + ObjectDetectionInferenceResponse, + List[ObjectDetectionInferenceResponse], + StubResponse, + ], + summary="Object detection infer", + description="Run inference with the specified object detection model", + response_model_exclude_none=True, + ) + @with_route_exceptions + @usage_collector("request") + def infer_object_detection( + inference_request: ObjectDetectionInferenceRequest, + background_tasks: BackgroundTasks, + countinference: Optional[bool] = None, + service_secret: Optional[str] = None, + ): + logger.debug("Reached /infer/object_detection") + return process_inference_request( + 
inference_request, + active_learning_eligible=True, + background_tasks=background_tasks, + countinference=countinference, + service_secret=service_secret, + ) + + @router.post( + "/infer/instance_segmentation", + response_model=Union[InstanceSegmentationInferenceResponse, StubResponse], + summary="Instance segmentation infer", + description="Run inference with the specified instance segmentation model", + ) + @with_route_exceptions + @usage_collector("request") + def infer_instance_segmentation( + inference_request: InstanceSegmentationInferenceRequest, + background_tasks: BackgroundTasks, + countinference: Optional[bool] = None, + service_secret: Optional[str] = None, + ): + logger.debug("Reached /infer/instance_segmentation") + return process_inference_request( + inference_request, + active_learning_eligible=True, + background_tasks=background_tasks, + countinference=countinference, + service_secret=service_secret, + ) + + @router.post( + "/infer/semantic_segmentation", + response_model=Union[SemanticSegmentationInferenceResponse, StubResponse], + summary="Semantic segmentation infer", + description="Run inference with the specified semantic segmentation model", + ) + @with_route_exceptions + @usage_collector("request") + def infer_semantic_segmentation( + inference_request: SemanticSegmentationInferenceRequest, + background_tasks: BackgroundTasks, + countinference: Optional[bool] = None, + service_secret: Optional[str] = None, + ): + logger.debug("Reached /infer/semantic_segmentation") + return process_inference_request( + inference_request, + active_learning_eligible=True, + background_tasks=background_tasks, + countinference=countinference, + service_secret=service_secret, + ) + + @router.post( + "/infer/classification", + response_model=Union[ + ClassificationInferenceResponse, + MultiLabelClassificationInferenceResponse, + StubResponse, + ], + summary="Classification infer", + description="Run inference with the specified classification model", + ) + 
@with_route_exceptions + @usage_collector("request") + def infer_classification( + inference_request: ClassificationInferenceRequest, + background_tasks: BackgroundTasks, + countinference: Optional[bool] = None, + service_secret: Optional[str] = None, + ): + logger.debug("Reached /infer/classification") + return process_inference_request( + inference_request, + active_learning_eligible=True, + background_tasks=background_tasks, + countinference=countinference, + service_secret=service_secret, + ) + + @router.post( + "/infer/keypoints_detection", + response_model=Union[KeypointsDetectionInferenceResponse, StubResponse], + summary="Keypoints detection infer", + description="Run inference with the specified keypoints detection model", + ) + @with_route_exceptions + @usage_collector("request") + def infer_keypoints( + inference_request: KeypointsDetectionInferenceRequest, + countinference: Optional[bool] = None, + service_secret: Optional[str] = None, + ): + logger.debug("Reached /infer/keypoints_detection") + return process_inference_request( + inference_request, + countinference=countinference, + service_secret=service_secret, + ) + + if LMM_ENABLED or MOONDREAM2_ENABLED: + @router.post( + "/infer/lmm", + response_model=Union[ + LMMInferenceResponse, + List[LMMInferenceResponse], + StubResponse, + ], + summary="Large multi-modal model infer", + description="Run inference with the specified large multi-modal model", + response_model_exclude_none=True, + ) + @with_route_exceptions + @usage_collector("request") + def infer_lmm( + inference_request: LMMInferenceRequest, + countinference: Optional[bool] = None, + service_secret: Optional[str] = None, + ): + """Run inference with the specified large multi-modal model. + + Args: + inference_request (LMMInferenceRequest): The request containing the necessary details for LMM inference. + + Returns: + Union[LMMInferenceResponse, List[LMMInferenceResponse]]: The response containing the inference results. 
+ """ + logger.debug(f"Reached /infer/lmm") + return process_inference_request( + inference_request, + countinference=countinference, + service_secret=service_secret, + ) + + @router.post( + "/infer/lmm/{model_id:path}", + response_model=Union[ + LMMInferenceResponse, + List[LMMInferenceResponse], + StubResponse, + ], + summary="Large multi-modal model infer with model ID in path", + description="Run inference with the specified large multi-modal model. Model ID is specified in the URL path (can contain slashes).", + response_model_exclude_none=True, + ) + @with_route_exceptions + @usage_collector("request") + def infer_lmm_with_model_id( + model_id: str, + inference_request: LMMInferenceRequest, + countinference: Optional[bool] = None, + service_secret: Optional[str] = None, + ): + """Run inference with the specified large multi-modal model. + + The model_id can be specified in the URL path. If model_id is also provided + in the request body, it must match the path parameter. + + Args: + model_id (str): The model identifier from the URL path. + inference_request (LMMInferenceRequest): The request containing the necessary details for LMM inference. + + Returns: + Union[LMMInferenceResponse, List[LMMInferenceResponse]]: The response containing the inference results. + + Raises: + HTTPException: If model_id in path and request body don't match. 
+ """ + logger.debug(f"Reached /infer/lmm/{model_id}") + + # Validate model_id consistency between path and request body + if ( + inference_request.model_id is not None + and inference_request.model_id != model_id + ): + raise HTTPException( + status_code=400, + detail=f"Model ID mismatch: path specifies '{model_id}' but request body specifies '{inference_request.model_id}'", + ) + + # Set the model_id from path if not in request body + inference_request.model_id = model_id + + return process_inference_request( + inference_request, + countinference=countinference, + service_secret=service_secret, + ) + + if DEPTH_ESTIMATION_ENABLED: + + @router.post( + "/infer/depth-estimation", + response_model=DepthEstimationResponse, + summary="Depth Estimation", + description="Run the depth estimation model to generate a depth map.", + ) + @with_route_exceptions + def depth_estimation( + inference_request: DepthEstimationRequest, + countinference: Optional[bool] = None, + service_secret: Optional[str] = None, + ): + logger.debug("Reached /infer/depth-estimation") + depth_model_id = inference_request.model_id + model_manager.add_model( + depth_model_id, + inference_request.api_key, + countinference=countinference, + service_secret=service_secret, + ) + response = model_manager.infer_from_request_sync( + depth_model_id, inference_request + ) + return response + + return router + diff --git a/inference/core/interfaces/http/routes/info.py b/inference/core/interfaces/http/routes/info.py new file mode 100644 index 0000000000..ef978ad430 --- /dev/null +++ b/inference/core/interfaces/http/routes/info.py @@ -0,0 +1,70 @@ +"""Server info and version HTTP routes (/info, server identity).""" + +from typing import Optional + +from fastapi import APIRouter, HTTPException, Query + +from inference.core.version import __version__ +from inference.core.devices.utils import GLOBAL_INFERENCE_SERVER_ID +from inference.core.entities.responses.server_state import ServerVersionInfo + + +def 
create_info_router() -> APIRouter: + router = APIRouter() + + @router.get( + "/info", + response_model=ServerVersionInfo, + summary="Info", + description="Get the server name and version number", + ) + def root(): + """Endpoint to get the server name and version number. + + Returns: + ServerVersionInfo: The server version information. + """ + return ServerVersionInfo( + name="Roboflow Inference Server", + version=__version__, + uuid=GLOBAL_INFERENCE_SERVER_ID, + ) + + @router.get( + "/logs", + summary="Get Recent Logs", + description="Get recent application logs for debugging", + ) + def get_logs( + limit: Optional[int] = Query( + 100, description="Maximum number of log entries to return" + ), + level: Optional[str] = Query( + None, + description="Filter by log level (DEBUG, INFO, WARNING, ERROR, CRITICAL)", + ), + since: Optional[str] = Query( + None, description="Return logs since this ISO timestamp" + ), + ): + """Only available when ENABLE_IN_MEMORY_LOGS is set to 'true'.""" + from inference.core.logging.memory_handler import ( + get_recent_logs, + is_memory_logging_enabled, + ) + + if not is_memory_logging_enabled(): + raise HTTPException( + status_code=404, detail="Logs endpoint not available" + ) + + try: + logs = get_recent_logs(limit=limit or 100, level=level, since=since) + return {"logs": logs, "total_count": len(logs)} + except (ImportError, ModuleNotFoundError): + raise HTTPException( + status_code=500, detail="Logging system not properly initialized" + ) + + return router + diff --git a/inference/core/interfaces/http/routes/legacy.py b/inference/core/interfaces/http/routes/legacy.py new file mode 100644 index 0000000000..6c6f31ea2c --- /dev/null +++ b/inference/core/interfaces/http/routes/legacy.py @@ -0,0 +1,402 @@ +"""Legacy inference and upload-style HTTP routes.""" + +import base64 +from typing import Annotated, Any, Optional, Union + +from fastapi import APIRouter, BackgroundTasks, Depends, Path, Query, Request +from fastapi.responses import 
JSONResponse, Response +from starlette.datastructures import UploadFile + +from inference.core import logger +from inference.core.entities.requests.inference import ( + ClassificationInferenceRequest, + InstanceSegmentationInferenceRequest, + InferenceRequestImage, + KeypointsDetectionInferenceRequest, + ObjectDetectionInferenceRequest, + SemanticSegmentationInferenceRequest, +) +from inference.core.entities.responses.inference import ( + ClassificationInferenceResponse, + InstanceSegmentationInferenceResponse, + KeypointsDetectionInferenceResponse, + MultiLabelClassificationInferenceResponse, + ObjectDetectionInferenceResponse, + SemanticSegmentationInferenceResponse, + StubResponse, +) +from inference.core.env import ( + CONFIDENCE_LOWER_BOUND_OOM_PREVENTION, + GCP_SERVERLESS, + LAMBDA, + LEGACY_ROUTE_ENABLED, + ROBOFLOW_SERVICE_SECRET, +) +from inference.core.exceptions import ( + ContentTypeInvalid, + ContentTypeMissing, + InputImageLoadError, + MissingServiceSecretError, +) +from inference.core.interfaces.http.dependencies import ( + parse_body_content_for_legacy_request_handler, +) +from inference.core.interfaces.http.error_handlers import with_route_exceptions +from inference.core.interfaces.http.orjson_utils import orjson_response +from inference.core.managers.base import ModelManager +from inference.usage_tracking.collector import usage_collector + +if LAMBDA: + from inference.core.usage import trackUsage + + +def create_legacy_router(model_manager: ModelManager) -> APIRouter: + """Create router for legacy inference and cache endpoints. + Infer route is added when LEGACY_ROUTE_ENABLED; clear_cache/start when not (LAMBDA or GCP_SERVERLESS). + """ + router = APIRouter() + + if LEGACY_ROUTE_ENABLED: + # Legacy object detection inference path for backwards compatibility + @router.get( + "/{dataset_id}/{version_id:str}", + # Order matters in this response model Union. It will use the first matching model. 
For example, Object Detection Inference Response is a subset of Instance segmentation inference response, so instance segmentation must come first in order for the matching logic to work. + response_model=Union[ + InstanceSegmentationInferenceResponse, + KeypointsDetectionInferenceResponse, + ObjectDetectionInferenceResponse, + ClassificationInferenceResponse, + MultiLabelClassificationInferenceResponse, + SemanticSegmentationInferenceResponse, + StubResponse, + Any, + ], + response_model_exclude_none=True, + ) + @router.post( + "/{dataset_id}/{version_id:str}", + # Order matters in this response model Union. It will use the first matching model. For example, Object Detection Inference Response is a subset of Instance segmentation inference response, so instance segmentation must come first in order for the matching logic to work. + response_model=Union[ + InstanceSegmentationInferenceResponse, + KeypointsDetectionInferenceResponse, + ObjectDetectionInferenceResponse, + ClassificationInferenceResponse, + MultiLabelClassificationInferenceResponse, + SemanticSegmentationInferenceResponse, + StubResponse, + Any, + ], + response_model_exclude_none=True, + ) + @with_route_exceptions + @usage_collector("request") + def legacy_infer_from_request( + background_tasks: BackgroundTasks, + request: Request, + request_body: Annotated[ + Optional[Union[bytes, UploadFile]], + Depends(parse_body_content_for_legacy_request_handler), + ], + dataset_id: str = Path( + description="ID of a Roboflow dataset corresponding to the model to use for inference OR workspace ID" + ), + version_id: str = Path( + description="ID of a Roboflow dataset version corresponding to the model to use for inference OR model ID" + ), + api_key: Optional[str] = Query( + None, + description="Roboflow API Key that will be passed to the model during initialization for artifact retrieval", + ), + confidence: float = Query( + 0.4, + description="The confidence threshold used to filter out predictions", + ), + 
keypoint_confidence: float = Query( + 0.0, + description="The confidence threshold used to filter out keypoints that are not visible based on model confidence", + ), + format: str = Query( + "json", + description="One of 'json' or 'image'. If 'json' prediction data is return as a JSON string. If 'image' prediction data is visualized and overlayed on the original input image.", + ), + image: Optional[str] = Query( + None, + description="The publically accessible URL of an image to use for inference.", + ), + image_type: Optional[str] = Query( + "base64", + description="One of base64 or numpy. Note, numpy input is not supported for Roboflow Hosted Inference.", + ), + labels: Optional[bool] = Query( + False, + description="If true, labels will be include in any inference visualization.", + ), + mask_decode_mode: Optional[str] = Query( + "accurate", + description="One of 'accurate' or 'fast'. If 'accurate' the mask will be decoded using the original image size. If 'fast' the mask will be decoded using the original mask size. 'accurate' is slower but more accurate.", + ), + tradeoff_factor: Optional[float] = Query( + 0.0, + description="The amount to tradeoff between 0='fast' and 1='accurate'", + ), + max_detections: int = Query( + 300, + description="The maximum number of detections to return. This is used to limit the number of predictions returned by the model. 
The model may return more predictions than this number, but only the top `max_detections` predictions will be returned.", + ), + overlap: float = Query( + 0.3, + description="The IoU threhsold that must be met for a box pair to be considered duplicate during NMS", + ), + stroke: int = Query( + 1, description="The stroke width used when visualizing predictions" + ), + countinference: Optional[bool] = Query( + True, + description="If false, does not track inference against usage.", + include_in_schema=False, + ), + service_secret: Optional[str] = Query( + None, + description="Shared secret used to authenticate requests to the inference server from internal services (e.g. to allow disabling inference usage tracking via the `countinference` query parameter)", + include_in_schema=False, + ), + disable_preproc_auto_orient: Optional[bool] = Query( + False, description="If true, disables automatic image orientation" + ), + disable_preproc_contrast: Optional[bool] = Query( + False, description="If true, disables automatic contrast adjustment" + ), + disable_preproc_grayscale: Optional[bool] = Query( + False, + description="If true, disables automatic grayscale conversion", + ), + disable_preproc_static_crop: Optional[bool] = Query( + False, description="If true, disables automatic static crop" + ), + disable_active_learning: Optional[bool] = Query( + default=False, + description="If true, the predictions will be prevented from registration by Active Learning (if the functionality is enabled)", + ), + active_learning_target_dataset: Optional[str] = Query( + default=None, + description="Parameter to be used when Active Learning data registration should happen against different dataset than the one pointed by model_id", + ), + source: Optional[str] = Query( + "external", + description="The source of the inference request", + ), + source_info: Optional[str] = Query( + "external", + description="The detailed source information of the inference request", + ), + 
disable_model_monitoring: Optional[bool] = Query( + False, + description="If true, disables model monitoring for this request", + include_in_schema=False, + ), + ): + """ + Legacy inference endpoint for object detection, instance segmentation, and classification. + + Args: + background_tasks: FastAPI background tasks. + dataset_id: Roboflow dataset ID or workspace ID. + version_id: Dataset version ID or model ID. + api_key: Optional API key for artifact retrieval. + + Returns: + Inference result (type varies by model: detection, segmentation, classification, etc.). + """ + logger.debug( + f"Reached legacy route /:dataset_id/:version_id with {dataset_id}/{version_id}" + ) + model_id = f"{dataset_id}/{version_id}" + if confidence >= 1: + confidence /= 100 + if confidence < CONFIDENCE_LOWER_BOUND_OOM_PREVENTION: + # allowing lower confidence results in RAM usage explosion + confidence = CONFIDENCE_LOWER_BOUND_OOM_PREVENTION + + if overlap >= 1: + overlap /= 100 + if image is not None: + request_image = InferenceRequestImage(type="url", value=image) + else: + if "Content-Type" not in request.headers: + raise ContentTypeMissing( + f"Request must include a Content-Type header" + ) + if isinstance(request_body, UploadFile): + base64_image_str = request_body.file.read() + base64_image_str = base64.b64encode(base64_image_str) + request_image = InferenceRequestImage( + type="base64", value=base64_image_str.decode("ascii") + ) + elif isinstance(request_body, bytes): + request_image = InferenceRequestImage( + type=image_type, value=request_body + ) + elif request_body is None: + raise InputImageLoadError( + message="Image not found in request body.", + public_message="Image not found in request body.", + ) + else: + raise ContentTypeInvalid( + f"Invalid Content-Type: {request.headers['Content-Type']}" + ) + + if not countinference and service_secret != ROBOFLOW_SERVICE_SECRET: + raise MissingServiceSecretError( + "Service secret is required to disable inference usage tracking" 
+ ) + if LAMBDA: + logger.debug("request.scope: %s", request.scope) + request_model_id = ( + request.scope["aws.event"]["requestContext"]["authorizer"][ + "lambda" + ]["model"]["endpoint"] + .replace("--", "/") + .replace("rf-", "") + .replace("nu-", "") + ) + actor = request.scope["aws.event"]["requestContext"]["authorizer"][ + "lambda" + ]["actor"] + if countinference: + trackUsage(request_model_id, actor) + else: + if service_secret != ROBOFLOW_SERVICE_SECRET: + raise MissingServiceSecretError( + "Service secret is required to disable inference usage tracking" + ) + logger.info("Not counting inference for usage") + else: + request_model_id = model_id + logger.debug( + f"State of model registry: {model_manager.describe_models()}" + ) + model_manager.add_model( + request_model_id, + api_key, + model_id_alias=model_id, + countinference=countinference, + service_secret=service_secret, + ) + + task_type = model_manager.get_task_type(model_id, api_key=api_key) + inference_request_type = ObjectDetectionInferenceRequest + args = dict() + if task_type == "instance-segmentation": + inference_request_type = InstanceSegmentationInferenceRequest + args = { + "mask_decode_mode": mask_decode_mode, + "tradeoff_factor": tradeoff_factor, + } + elif task_type == "classification": + inference_request_type = ClassificationInferenceRequest + elif task_type == "keypoint-detection": + inference_request_type = KeypointsDetectionInferenceRequest + args = {"keypoint_confidence": keypoint_confidence} + elif task_type == "semantic-segmentation": + inference_request_type = SemanticSegmentationInferenceRequest + inference_request = inference_request_type( + api_key=api_key, + model_id=model_id, + image=request_image, + confidence=confidence, + iou_threshold=overlap, + max_detections=max_detections, + visualization_labels=labels, + visualization_stroke_width=stroke, + visualize_predictions=( + format == "image" or format == "image_and_json" + ), + 
disable_preproc_auto_orient=disable_preproc_auto_orient, + disable_preproc_contrast=disable_preproc_contrast, + disable_preproc_grayscale=disable_preproc_grayscale, + disable_preproc_static_crop=disable_preproc_static_crop, + disable_active_learning=disable_active_learning, + active_learning_target_dataset=active_learning_target_dataset, + source=source, + source_info=source_info, + usage_billable=countinference, + disable_model_monitoring=disable_model_monitoring, + **args, + ) + inference_response = model_manager.infer_from_request_sync( + inference_request.model_id, + inference_request, + active_learning_eligible=True, + background_tasks=background_tasks, + ) + logger.debug("Response ready.") + if format == "image": + return Response( + content=inference_response.visualization, + media_type="image/jpeg", + ) + else: + return orjson_response(inference_response) + + if not (LAMBDA or GCP_SERVERLESS): + # Legacy clear cache endpoint for backwards compatibility + @router.get("/clear_cache", response_model=str) + @with_route_exceptions + def legacy_clear_cache(): + """ + Clears the model cache. + + This endpoint provides a way to clear the cache of loaded models. + + Returns: + str: A string indicating that the cache has been cleared. + """ + logger.debug(f"Reached /clear_cache") + model_manager.clear() + return "Cache Cleared" + + # Legacy add model endpoint for backwards compatibility + @router.get("/start/{dataset_id}/{version_id}") + @with_route_exceptions + def model_add_legacy( + dataset_id: str, + version_id: str, + api_key: str = None, + countinference: Optional[bool] = None, + service_secret: Optional[str] = None, + ): + """ + Starts a model inference session. + + This endpoint initializes and starts an inference session for the specified model version. + + Args: + dataset_id (str): ID of a Roboflow dataset corresponding to the model. + version_id (str): ID of a Roboflow dataset version corresponding to the model. 
+ api_key (str, optional): Roboflow API Key for artifact retrieval. + countinference (Optional[bool]): Whether to count inference or not. + service_secret (Optional[str]): The service secret for the request. + + Returns: + JSONResponse: A response object containing the status and a success message. + """ + logger.debug( + f"Reached /start/{dataset_id}/{version_id} with {dataset_id}/{version_id}" + ) + model_id = f"{dataset_id}/{version_id}" + model_manager.add_model( + model_id, + api_key, + countinference=countinference, + service_secret=service_secret, + ) + + return JSONResponse( + { + "status": 200, + "message": "inference session started from local memory.", + } + ) + + return router diff --git a/inference/core/interfaces/http/routes/models.py b/inference/core/interfaces/http/routes/models.py new file mode 100644 index 0000000000..d75032b938 --- /dev/null +++ b/inference/core/interfaces/http/routes/models.py @@ -0,0 +1,132 @@ +"""Model registry and server-state HTTP routes (list models, add/clear).""" + +from typing import Optional + +from fastapi import APIRouter + +from inference.core import logger +from inference.core.entities.requests.server_state import ( + AddModelRequest, + ClearModelRequest, +) +from inference.core.entities.responses.server_state import ModelsDescriptions +from inference.core.env import ( + GET_MODEL_REGISTRY_ENABLED, + GCP_SERVERLESS, + LAMBDA, +) +from inference.core.interfaces.http.error_handlers import with_route_exceptions +from inference.core.managers.base import ModelManager +from inference.models.aliases import resolve_roboflow_model_alias + +def create_models_router(model_manager: ModelManager) -> APIRouter: + router = APIRouter() + + if not LAMBDA and GET_MODEL_REGISTRY_ENABLED: + + @router.get( + "/model/registry", + response_model=ModelsDescriptions, + summary="Get model keys", + description="Get the ID of each loaded model", + ) + def registry(): + """Get the ID of each loaded model in the registry. 
+ + Returns: + ModelsDescriptions: The object containing models descriptions + """ + logger.debug("Reached /model/registry") + models_descriptions = model_manager.describe_models() + return ModelsDescriptions.from_models_descriptions( + models_descriptions=models_descriptions + ) + + if not (LAMBDA or GCP_SERVERLESS): + + @router.post( + "/model/add", + response_model=ModelsDescriptions, + summary="Load a model", + description="Load the model with the given model ID", + ) + @with_route_exceptions + def model_add( + request: AddModelRequest, + countinference: Optional[bool] = None, + service_secret: Optional[str] = None, + ): + """Load the model with the given model ID into the model manager. + + Args: + request (AddModelRequest): The request containing the model ID and optional API key. + countinference (Optional[bool]): Whether to count inference or not. + service_secret (Optional[str]): The service secret for the request. + + Returns: + ModelsDescriptions: The object containing models descriptions + """ + logger.debug("Reached /model/add") + de_aliased_model_id = resolve_roboflow_model_alias( + model_id=request.model_id + ) + logger.info(f"Loading model: {de_aliased_model_id}") + model_manager.add_model( + de_aliased_model_id, + request.api_key, + countinference=countinference, + service_secret=service_secret, + ) + models_descriptions = model_manager.describe_models() + return ModelsDescriptions.from_models_descriptions( + models_descriptions=models_descriptions + ) + + @router.post( + "/model/remove", + response_model=ModelsDescriptions, + summary="Remove a model", + description="Remove the model with the given model ID", + ) + @with_route_exceptions + def model_remove(request: ClearModelRequest): + """Remove the model with the given model ID from the model manager. + + Args: + request (ClearModelRequest): The request containing the model ID to be removed. 
+ + Returns: + ModelsDescriptions: The object containing models descriptions + """ + logger.debug("Reached /model/remove") + de_aliased_model_id = resolve_roboflow_model_alias( + model_id=request.model_id + ) + model_manager.remove(de_aliased_model_id) + models_descriptions = model_manager.describe_models() + return ModelsDescriptions.from_models_descriptions( + models_descriptions=models_descriptions + ) + + @router.post( + "/model/clear", + response_model=ModelsDescriptions, + summary="Remove all models", + description="Remove all loaded models", + ) + @with_route_exceptions + def model_clear(): + """Remove all loaded models from the model manager. + + Returns: + ModelsDescriptions: The object containing models descriptions + """ + logger.debug("Reached /model/clear") + model_manager.clear() + models_descriptions = model_manager.describe_models() + return ModelsDescriptions.from_models_descriptions( + models_descriptions=models_descriptions + ) + + return router + diff --git a/inference/core/interfaces/http/routes/notebook.py b/inference/core/interfaces/http/routes/notebook.py new file mode 100644 index 0000000000..378ebd2d7f --- /dev/null +++ b/inference/core/interfaces/http/routes/notebook.py @@ -0,0 +1,52 @@ +"""Jupyter notebook server HTTP routes.""" + +from fastapi import APIRouter +from inference.core.env import NOTEBOOK_ENABLED, NOTEBOOK_PASSWORD, NOTEBOOK_PORT +from inference.core.interfaces.http.error_handlers import with_route_exceptions +from inference.core.utils.notebooks import start_notebook +from time import sleep +from starlette.responses import RedirectResponse +from inference.core import logger + + +def create_notebook_router() -> APIRouter: + router = APIRouter() + @router.get( + "/notebook/start", + summary="Jupyter Lab Server Start", + description="Starts a jupyter lab server for running development code", + ) + @with_route_exceptions + def notebook_start(browserless: bool = False): + """Starts a jupyter lab server for running development 
code. + + Args: + browserless (bool): When True, return a JSON status payload with the server URL instead of redirecting the browser. + Defaults to False, which redirects to the Jupyter Lab quickstart notebook. + + Returns: + Union[dict, RedirectResponse]: A status payload (browserless) or a redirect to the jupyter lab server. + """ + logger.debug(f"Reached /notebook/start") + if NOTEBOOK_ENABLED: + start_notebook() + if browserless: + return { + "success": True, + "message": f"Jupyter Lab server started at http://localhost:{NOTEBOOK_PORT}?token={NOTEBOOK_PASSWORD}", + } + else: + sleep(2) + return RedirectResponse( + f"http://localhost:{NOTEBOOK_PORT}/lab/tree/quickstart.ipynb?token={NOTEBOOK_PASSWORD}" + ) + else: + if browserless: + return { + "success": False, + "message": "Notebook server is not enabled. Enable notebooks via the NOTEBOOK_ENABLED environment variable.", + } + else: + return RedirectResponse(f"/notebook-instructions.html") + + return router diff --git a/inference/core/interfaces/http/routes/stream.py b/inference/core/interfaces/http/routes/stream.py new file mode 100644 index 0000000000..4b5e7f659b --- /dev/null +++ b/inference/core/interfaces/http/routes/stream.py @@ -0,0 +1,134 @@ +"""Stream / inference pipelines HTTP routes.""" + +from typing import Optional + +from fastapi import APIRouter, Request + +from inference.core import logger +from inference.core.interfaces.stream_manager.api.entities import ( + CommandResponse, + ConsumePipelineResponse, + InferencePipelineStatusResponse, + InitializeWebRTCPipelineResponse, + ListPipelinesResponse, +) +from inference.core.interfaces.stream_manager.api.stream_manager_client import ( + StreamManagerClient, +) +from inference.core.interfaces.stream_manager.manager_app.entities import ( + ConsumeResultsPayload, + InitialisePipelinePayload, + InitialiseWebRTCPipelinePayload, +) +from inference.core.interfaces.http.error_handlers import with_route_exceptions_async + + +def create_stream_router(stream_manager_client:
StreamManagerClient) -> APIRouter: + router = APIRouter() + + @router.get( + "/inference_pipelines/list", + response_model=ListPipelinesResponse, + summary="[EXPERIMENTAL] List active InferencePipelines", + description="[EXPERIMENTAL] Listing all active InferencePipelines processing videos", + ) + @with_route_exceptions_async + async def list_pipelines(_: Request) -> ListPipelinesResponse: + return await stream_manager_client.list_pipelines() + + @router.get( + "/inference_pipelines/{pipeline_id}/status", + response_model=InferencePipelineStatusResponse, + summary="[EXPERIMENTAL] Get status of InferencePipeline", + description="[EXPERIMENTAL] Get status of InferencePipeline", + ) + @with_route_exceptions_async + async def get_status(pipeline_id: str) -> InferencePipelineStatusResponse: + return await stream_manager_client.get_status( + pipeline_id=pipeline_id + ) + + @router.post( + "/inference_pipelines/initialise", + response_model=CommandResponse, + summary="[EXPERIMENTAL] Starts new InferencePipeline", + description="[EXPERIMENTAL] Starts new InferencePipeline", + ) + @with_route_exceptions_async + async def initialise(request: InitialisePipelinePayload) -> CommandResponse: + return await stream_manager_client.initialise_pipeline( + initialisation_request=request + ) + + @router.post( + "/inference_pipelines/initialise_webrtc", + response_model=InitializeWebRTCPipelineResponse, + summary="[EXPERIMENTAL] Establishes WebRTC peer connection and starts new InferencePipeline consuming video track", + description="[EXPERIMENTAL] Establishes WebRTC peer connection and starts new InferencePipeline consuming video track", + ) + @with_route_exceptions_async + async def initialise_webrtc_inference_pipeline( + request: InitialiseWebRTCPipelinePayload, + ) -> InitializeWebRTCPipelineResponse: + logger.debug("Received initialise webrtc inference pipeline request") + resp = await stream_manager_client.initialise_webrtc_pipeline( + initialisation_request=request + ) + 
logger.debug("Returning initialise webrtc inference pipeline response") + return resp + + @router.post( + "/inference_pipelines/{pipeline_id}/pause", + response_model=CommandResponse, + summary="[EXPERIMENTAL] Pauses the InferencePipeline", + description="[EXPERIMENTAL] Pauses the InferencePipeline", + ) + @with_route_exceptions_async + async def pause(pipeline_id: str) -> CommandResponse: + return await stream_manager_client.pause_pipeline( + pipeline_id=pipeline_id + ) + + @router.post( + "/inference_pipelines/{pipeline_id}/resume", + response_model=CommandResponse, + summary="[EXPERIMENTAL] Resumes the InferencePipeline", + description="[EXPERIMENTAL] Resumes the InferencePipeline", + ) + @with_route_exceptions_async + async def resume(pipeline_id: str) -> CommandResponse: + return await stream_manager_client.resume_pipeline( + pipeline_id=pipeline_id + ) + + @router.post( + "/inference_pipelines/{pipeline_id}/terminate", + response_model=CommandResponse, + summary="[EXPERIMENTAL] Terminates the InferencePipeline", + description="[EXPERIMENTAL] Terminates the InferencePipeline", + ) + @with_route_exceptions_async + async def terminate(pipeline_id: str) -> CommandResponse: + return await stream_manager_client.terminate_pipeline( + pipeline_id=pipeline_id + ) + + @router.get( + "/inference_pipelines/{pipeline_id}/consume", + response_model=ConsumePipelineResponse, + summary="[EXPERIMENTAL] Consumes InferencePipeline result", + description="[EXPERIMENTAL] Consumes InferencePipeline result", + ) + @with_route_exceptions_async + async def consume( + pipeline_id: str, + request: Optional[ConsumeResultsPayload] = None, + ) -> ConsumePipelineResponse: + if request is None: + request = ConsumeResultsPayload() + return await stream_manager_client.consume_pipeline_result( + pipeline_id=pipeline_id, + excluded_fields=request.excluded_fields, + ) + + return router diff --git a/inference/core/interfaces/http/routes/webrtc.py b/inference/core/interfaces/http/routes/webrtc.py 
new file mode 100644 index 0000000000..6b94373b7f --- /dev/null +++ b/inference/core/interfaces/http/routes/webrtc.py @@ -0,0 +1,89 @@ +"""WebRTC worker initialization HTTP routes.""" + +import re +from fastapi import APIRouter, Request +from pydantic import ValidationError +from inference.core import logger +from inference.core.env import BUILDER_ORIGIN +from inference.core.exceptions import ( + MissingApiKeyError, + RoboflowAPINotAuthorizedError, + RoboflowAPINotNotFoundError, + WebRTCConfigurationError, +) +from inference.core.interfaces.http.error_handlers import with_route_exceptions_async +from inference.core.interfaces.stream_manager.api.entities import ( + CommandContext, + InitializeWebRTCResponse, +) +from inference.core.interfaces.stream_manager.manager_app.entities import OperationStatus +from inference.core.interfaces.webrtc_worker import start_worker +from inference.core.interfaces.webrtc_worker.entities import ( + WebRTCWorkerRequest, + WebRTCWorkerResult, +) +from inference.core.workflows.errors import WorkflowError, WorkflowSyntaxError + + +def create_webrtc_worker_router() -> APIRouter: + router = APIRouter() + @router.post( + "/initialise_webrtc_worker", + response_model=InitializeWebRTCResponse, + summary="[EXPERIMENTAL] Establishes WebRTC peer connection and processes video stream in spawned process or modal function", + description="[EXPERIMENTAL] Establishes WebRTC peer connection and processes video stream in spawned process or modal function", + ) + @with_route_exceptions_async + async def initialise_webrtc_worker( + request: WebRTCWorkerRequest, + r: Request, + ) -> InitializeWebRTCResponse: + if str(r.headers.get("origin")).lower() == BUILDER_ORIGIN.lower(): + if re.search( + r"^https://[^.]+\.roboflow\.[^./]+/", str(r.url).lower() + ): + request.is_preview = True + + logger.debug("Received initialise_webrtc_worker request") + worker_result: WebRTCWorkerResult = await start_worker( + webrtc_request=request, + ) + if 
worker_result.exception_type is not None: + if worker_result.exception_type == "WorkflowSyntaxError": + raise WorkflowSyntaxError( + public_message=worker_result.error_message, + context=worker_result.error_context, + inner_error=worker_result.inner_error, + ) + if worker_result.exception_type == "WorkflowError": + raise WorkflowError( + public_message=worker_result.error_message, + context=worker_result.error_context, + ) + expected_exceptions = { + "Exception": Exception, + "KeyError": KeyError, + "MissingApiKeyError": MissingApiKeyError, + "NotImplementedError": NotImplementedError, + "RoboflowAPINotAuthorizedError": RoboflowAPINotAuthorizedError, + "RoboflowAPINotNotFoundError": RoboflowAPINotNotFoundError, + "ValidationError": ValidationError, + "WebRTCConfigurationError": WebRTCConfigurationError, + } + exc = expected_exceptions.get( + worker_result.exception_type, Exception + )(worker_result.error_message) + logger.error( + f"Initialise webrtc worker failed with %s: %s", + worker_result.exception_type, + worker_result.error_message, + ) + raise exc + logger.debug("Returning initialise_webrtc_worker response") + return InitializeWebRTCResponse( + context=CommandContext(), + status=OperationStatus.SUCCESS, + sdp=worker_result.answer.sdp, + type=worker_result.answer.type, + ) + return router diff --git a/inference/core/interfaces/http/routes/workflows.py b/inference/core/interfaces/http/routes/workflows.py new file mode 100644 index 0000000000..b481292b7c --- /dev/null +++ b/inference/core/interfaces/http/routes/workflows.py @@ -0,0 +1,369 @@ +"""Workflow-related HTTP routes (describe, run, blocks, validate).""" + +from typing import Any, Dict, List, Optional, Union + +from fastapi import APIRouter, BackgroundTasks, Query, Request +from fastapi.responses import Response +from concurrent.futures import ThreadPoolExecutor + +from inference.core import logger +from inference.core.entities.requests.workflows import ( + DescribeBlocksRequest, + 
PredefinedWorkflowDescribeInterfaceRequest, + PredefinedWorkflowInferenceRequest, + WorkflowInferenceRequest, + WorkflowSpecificationDescribeInterfaceRequest, + WorkflowSpecificationInferenceRequest, +) +from inference.core.entities.responses.workflows import ( + DescribeInterfaceResponse, + ExecutionEngineVersions, + WorkflowInferenceResponse, + WorkflowsBlocksDescription, + WorkflowsBlocksSchemaDescription, + WorkflowValidationStatus, +) +from inference.core.env import ( + ENABLE_WORKFLOWS_PROFILING, + GCP_SERVERLESS, + LAMBDA, + WORKFLOWS_MAX_CONCURRENT_STEPS, + WORKFLOWS_PROFILER_BUFFER_SIZE, + WORKFLOWS_STEP_EXECUTION_MODE, +) +from inference.core.interfaces.http.error_handlers import with_route_exceptions +from inference.core.interfaces.http.handlers.workflows import ( + filter_out_unwanted_workflow_outputs, + handle_describe_workflows_blocks_request, + handle_describe_workflows_interface, +) +from inference.core.interfaces.http.middlewares.gzip import gzip_response_if_requested +from inference.core.interfaces.http.orjson_utils import orjson_response +from inference.core.managers.base import ModelManager +from inference.core.roboflow_api import get_workflow_specification +from inference.core.workflows.core_steps.common.entities import StepExecutionMode +from inference.core.workflows.execution_engine.core import ( + ExecutionEngine, + get_available_versions, +) +from inference.core.workflows.execution_engine.entities.base import OutputDefinition +from inference.core.workflows.execution_engine.introspection.blocks_loader import ( + load_workflow_blocks, +) +from inference.core.workflows.execution_engine.profiling.core import ( + BaseWorkflowsProfiler, + NullWorkflowsProfiler, + WorkflowsProfiler, +) +from inference.core.workflows.execution_engine.v1.compiler.syntactic_parser import ( + get_workflow_schema_description, + parse_workflow_definition, +) +from inference.usage_tracking.collector import usage_collector + + +def create_workflows_router( + 
model_manager: ModelManager, + shared_thread_pool_executor: Optional[ThreadPoolExecutor], +) -> APIRouter: + router = APIRouter() + + def process_workflow_inference_request( + workflow_request: WorkflowInferenceRequest, + workflow_specification: dict, + background_tasks: Optional[BackgroundTasks], + profiler: WorkflowsProfiler, + ) -> WorkflowInferenceResponse: + workflow_init_parameters = { + "workflows_core.model_manager": model_manager, + "workflows_core.api_key": workflow_request.api_key, + "workflows_core.background_tasks": background_tasks, + } + execution_engine = ExecutionEngine.init( + workflow_definition=workflow_specification, + init_parameters=workflow_init_parameters, + max_concurrent_steps=WORKFLOWS_MAX_CONCURRENT_STEPS, + prevent_local_images_loading=True, + profiler=profiler, + executor=shared_thread_pool_executor, + workflow_id=workflow_request.workflow_id, + ) + is_preview = False + if hasattr(workflow_request, "is_preview"): + is_preview = workflow_request.is_preview + workflow_results = execution_engine.run( + runtime_parameters=workflow_request.inputs, + serialize_results=True, + _is_preview=is_preview, + ) + with profiler.profile_execution_phase( + name="workflow_results_filtering", + categories=["inference_package_operation"], + ): + outputs = filter_out_unwanted_workflow_outputs( + workflow_results=workflow_results, + excluded_fields=workflow_request.excluded_fields, + ) + profiler_trace = profiler.export_trace() + response = WorkflowInferenceResponse( + outputs=outputs, + profiler_trace=profiler_trace, + ) + return orjson_response(response=response) + + @router.post( + "/{workspace_name}/workflows/{workflow_id}/describe_interface", + response_model=DescribeInterfaceResponse, + summary="Endpoint to describe interface of predefined workflow", + description="Checks Roboflow API for workflow definition, once acquired - describes workflow inputs and outputs", + ) + @with_route_exceptions + def describe_predefined_workflow_interface( + 
workspace_name: str, + workflow_id: str, + workflow_request: PredefinedWorkflowDescribeInterfaceRequest, + ) -> DescribeInterfaceResponse: + workflow_specification = get_workflow_specification( + api_key=workflow_request.api_key, + workspace_id=workspace_name, + workflow_id=workflow_id, + use_cache=workflow_request.use_cache, + workflow_version_id=workflow_request.workflow_version_id, + ) + return handle_describe_workflows_interface( + definition=workflow_specification, + ) + + @router.post( + "/workflows/describe_interface", + response_model=DescribeInterfaceResponse, + summary="Endpoint to describe interface of workflow given in request", + description="Parses workflow definition and describes inputs and outputs", + ) + @with_route_exceptions + def describe_workflow_interface( + workflow_request: WorkflowSpecificationDescribeInterfaceRequest, + ) -> DescribeInterfaceResponse: + return handle_describe_workflows_interface( + definition=workflow_request.specification, + ) + + @router.post( + "/{workspace_name}/workflows/{workflow_id}", + response_model=WorkflowInferenceResponse, + summary="Endpoint to run predefined workflow", + description="Checks Roboflow API for workflow definition, once acquired - parses and executes injecting runtime parameters from request body", + ) + @router.post( + "/infer/workflows/{workspace_name}/{workflow_id}", + response_model=WorkflowInferenceResponse, + summary="[LEGACY] Endpoint to run predefined workflow", + description="Checks Roboflow API for workflow definition, once acquired - parses and executes injecting runtime parameters from request body.
This endpoint is deprecated and will be removed end of Q2 2024", + deprecated=True, + ) + @with_route_exceptions + @usage_collector("request") + def infer_from_predefined_workflow( + workspace_name: str, + workflow_id: str, + workflow_request: PredefinedWorkflowInferenceRequest, + background_tasks: BackgroundTasks, + ) -> WorkflowInferenceResponse: + if ENABLE_WORKFLOWS_PROFILING and workflow_request.enable_profiling: + profiler = BaseWorkflowsProfiler.init( + max_runs_in_buffer=WORKFLOWS_PROFILER_BUFFER_SIZE, + ) + else: + profiler = NullWorkflowsProfiler.init() + with profiler.profile_execution_phase( + name="workflow_definition_fetching", + categories=["inference_package_operation"], + ): + workflow_specification = get_workflow_specification( + api_key=workflow_request.api_key, + workspace_id=workspace_name, + workflow_id=workflow_id, + use_cache=workflow_request.use_cache, + workflow_version_id=workflow_request.workflow_version_id, + ) + if not workflow_request.workflow_id: + workflow_request.workflow_id = workflow_id + if not workflow_specification.get("id"): + logger.warning( + "Internal workflow ID missing in specification for '%s'", + workflow_id, + ) + return process_workflow_inference_request( + workflow_request=workflow_request, + workflow_specification=workflow_specification, + background_tasks=( + background_tasks if not (LAMBDA or GCP_SERVERLESS) else None + ), + profiler=profiler, + ) + + @router.post( + "/workflows/run", + response_model=WorkflowInferenceResponse, + summary="Endpoint to run workflow specification provided in payload", + description="Parses and executes workflow specification, injecting runtime parameters from request body.", + ) + @router.post( + "/infer/workflows", + response_model=WorkflowInferenceResponse, + summary="[LEGACY] Endpoint to run workflow specification provided in payload", + description="Parses and executes workflow specification, injecting runtime parameters from request body. 
This endpoint is deprecated and will be removed end of Q2 2024.", + deprecated=True, + ) + @with_route_exceptions + @usage_collector("request") + def infer_from_workflow( + workflow_request: WorkflowSpecificationInferenceRequest, + background_tasks: BackgroundTasks, + ) -> WorkflowInferenceResponse: + if ENABLE_WORKFLOWS_PROFILING and workflow_request.enable_profiling: + profiler = BaseWorkflowsProfiler.init( + max_runs_in_buffer=WORKFLOWS_PROFILER_BUFFER_SIZE, + ) + else: + profiler = NullWorkflowsProfiler.init() + return process_workflow_inference_request( + workflow_request=workflow_request, + workflow_specification=workflow_request.specification, + background_tasks=( + background_tasks if not (LAMBDA or GCP_SERVERLESS) else None + ), + profiler=profiler, + ) + + @router.get( + "/workflows/execution_engine/versions", + response_model=ExecutionEngineVersions, + summary="Returns available Execution Engine versions sorted from oldest to newest", + description="Returns available Execution Engine versions sorted from oldest to newest", + ) + @with_route_exceptions + def get_execution_engine_versions() -> ExecutionEngineVersions: + versions = get_available_versions() + return ExecutionEngineVersions(versions=versions) + + @router.get( + "/workflows/blocks/describe", + response_model=WorkflowsBlocksDescription, + summary="[LEGACY] Endpoint to get definition of workflows blocks that are accessible", + description="Endpoint provides detailed information about workflows building blocks that are " + "accessible in the inference server. 
This information could be used to programmatically " + "build / display workflows.", + deprecated=True, + ) + @with_route_exceptions + def describe_workflows_blocks_get( + request: Request, + ) -> Union[WorkflowsBlocksDescription, Response]: + result = handle_describe_workflows_blocks_request() + return gzip_response_if_requested(request=request, response=result) + + @router.post( + "/workflows/blocks/describe", + response_model=WorkflowsBlocksDescription, + summary="[EXPERIMENTAL] Endpoint to get definition of workflows blocks that are accessible", + description="Endpoint provides detailed information about workflows building blocks that are " + "accessible in the inference server. This information could be used to programmatically " + "build / display workflows. Additionally - in request body one can specify list of " + "dynamic blocks definitions which will be transformed into blocks and used to generate " + "schemas and definitions of connections", + ) + @with_route_exceptions + def describe_workflows_blocks_post( + request: Request, + request_payload: Optional[DescribeBlocksRequest] = None, + ) -> Union[WorkflowsBlocksDescription, Response]: + dynamic_blocks_definitions = None + requested_execution_engine_version = None + api_key = None + if request_payload is not None: + dynamic_blocks_definitions = ( + request_payload.dynamic_blocks_definitions + ) + requested_execution_engine_version = ( + request_payload.execution_engine_version + ) + api_key = request_payload.api_key or request.query_params.get( + "api_key", None + ) + result = handle_describe_workflows_blocks_request( + dynamic_blocks_definitions=dynamic_blocks_definitions, + requested_execution_engine_version=requested_execution_engine_version, + api_key=api_key, + ) + return gzip_response_if_requested(request=request, response=result) + + @router.get( + "/workflows/definition/schema", + response_model=WorkflowsBlocksSchemaDescription, + summary="Endpoint to fetch the workflows block schema", + 
description="Endpoint to fetch the schema of all available blocks. This information can be " + "used to validate workflow definitions and suggest syntax in the JSON editor.", + ) + @with_route_exceptions + def get_workflow_schema( + request: Request, + ) -> WorkflowsBlocksSchemaDescription: + result = get_workflow_schema_description() + return gzip_response_if_requested(request, response=result) + + @router.post( + "/workflows/blocks/dynamic_outputs", + response_model=List[OutputDefinition], + summary="[EXPERIMENTAL] Endpoint to get definition of dynamic output for workflow step", + description="Endpoint to be used when step outputs can be discovered only after " + "filling manifest with data.", + ) + @with_route_exceptions + def get_dynamic_block_outputs( + step_manifest: Dict[str, Any], + ) -> List[OutputDefinition]: + dummy_workflow_definition = { + "version": "1.0", + "inputs": [], + "steps": [step_manifest], + "outputs": [], + } + available_blocks = load_workflow_blocks() + parsed_definition = parse_workflow_definition( + raw_workflow_definition=dummy_workflow_definition, + available_blocks=available_blocks, + ) + parsed_manifest = parsed_definition.steps[0] + return parsed_manifest.get_actual_outputs() + + @router.post( + "/workflows/validate", + response_model=WorkflowValidationStatus, + summary="[EXPERIMENTAL] Endpoint to validate", + description="Endpoint provides a way to check validity of JSON workflow definition.", + ) + @with_route_exceptions + def validate_workflow( + specification: dict, + api_key: Optional[str] = Query( + None, + description="Roboflow API Key that will be passed to the model during initialization for artifact retrieval", + ), + ) -> WorkflowValidationStatus: + step_execution_mode = StepExecutionMode(WORKFLOWS_STEP_EXECUTION_MODE) + workflow_init_parameters = { + "workflows_core.model_manager": model_manager, + "workflows_core.api_key": api_key, + "workflows_core.background_tasks": None, + "workflows_core.step_execution_mode": 
step_execution_mode, + } + _ = ExecutionEngine.init( + workflow_definition=specification, + init_parameters=workflow_init_parameters, + max_concurrent_steps=WORKFLOWS_MAX_CONCURRENT_STEPS, + prevent_local_images_loading=True, + ) + return WorkflowValidationStatus(status="ok") + + return router