# ======================================================================
# Reconstructed from a whitespace-mangled `git diff` (all newlines were
# collapsed). Patch file 1: stop_time_diagnostics/diagnostics/__init__.py
# (diff header: new file mode 100644, index 0000000..0e9e075)
# ======================================================================

"""Stop-time diagnostic module for fault-tolerant training."""

from .actor import DiagnosticActor
from .runner import (
    StopReason,
    DiagnosticConfig,
    DiagnosticResult,
    DiagnosticRunner,
)

# Public API of the package; keep in sync with the imports above.
__all__ = [
    "DiagnosticActor",
    "StopReason",
    "DiagnosticConfig",
    "DiagnosticResult",
    "DiagnosticRunner",
]

# ======================================================================
# Patch file 2: stop_time_diagnostics/diagnostics/actor.py
# (diff header: new file mode 100644, index 0000000..64e6678)
# ======================================================================

"""
Independent diagnostic Ray actor for GPU and communication tests.

This can be spawned on any GPU to run diagnostic tests and manages its own
NCCL process group.
"""

import os
import socket
import subprocess
import traceback
import logging
from typing import Dict, Any, List, Optional

import torch
import torch.distributed as dist
import ray

# NOTE(review): basicConfig at import time is surprising in a library
# module (it mutates the root logger) — confirm this is intended.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


@ray.remote(num_gpus=1)
class DiagnosticActor:
    """
    Independent Ray actor for running diagnostic tests.

    Can be spawned on any GPU without depending on application actors.
    Manages its own NCCL process group for communication tests.
    """

    def __init__(
        self, rank: int, world_size: int, master_addr: str, master_port: int
    ):
        """
        Args:
            rank: Global rank of this actor in the diagnostic NCCL group.
            world_size: Total number of diagnostic actors.
            master_addr: Rendezvous host for NCCL (may be "" at creation
                time and set later via set_master_info()).
            master_port: Rendezvous port for NCCL (may be 0 initially).
        """
        self._rank = rank
        self._world_size = world_size
        self._master_addr = master_addr
        self._master_port = master_port
        self._process_group_initialized = False
        # Optional sub-group covering only the ranks co-located on this node.
        self._intra_node_group: Optional[dist.ProcessGroup] = None
        self._intra_node_ranks: Optional[List[int]] = None

    def set_master_info(self, master_addr: str, master_port: int):
        """Update master address and port (used when first actor discovers these values)."""
        self._master_addr = master_addr
        self._master_port = master_port

    def init_process_group(self):
        """Initialize NCCL process group for communication tests."""
        os.environ["MASTER_ADDR"] = self._master_addr
        os.environ["MASTER_PORT"] = str(self._master_port)
        os.environ["WORLD_SIZE"] = str(self._world_size)
        os.environ["RANK"] = str(self._rank)
        # Each actor owns exactly one GPU (num_gpus=1), so the local rank is 0.
        os.environ["LOCAL_RANK"] = "0"

        if not dist.is_initialized():
            dist.init_process_group(backend="nccl")
        self._process_group_initialized = True
        logger.info(f"DiagnosticActor rank {self._rank}: process group initialized")

    def destroy_process_group(self):
        """Clean up process group."""
        # Destroy intra-node group first if it exists
        if self._intra_node_group is not None:
            dist.destroy_process_group(self._intra_node_group)
            self._intra_node_group = None
            self._intra_node_ranks = None
        if dist.is_initialized():
            dist.destroy_process_group()
        self._process_group_initialized = False

    def init_intra_node_group(self, ranks: List[int]):
        """
        Create a sub-group for intra-node communication.

        Args:
            ranks: List of global ranks that are on the same node as this actor
        """
        self._intra_node_ranks = ranks
        # All ranks must call new_group, but only ranks in the list will be
        # part of the group (torch.distributed collective-creation contract).
        self._intra_node_group = dist.new_group(ranks=ranks)
        logger.info(f"DiagnosticActor rank {self._rank}: intra-node group created with ranks {ranks}")

    def get_node_id(self) -> str:
        """Get Ray node ID."""
        return ray.get_runtime_context().get_node_id()

    def get_gpu_id(self):
        """Get assigned GPU ID, or -1 if no GPU was assigned by Ray."""
        return ray.get_gpu_ids()[0] if ray.get_gpu_ids() else -1

    @staticmethod
    def get_node_ip() -> str:
        """Get current node IP address."""
        # NOTE(review): relies on a Ray private API (ray._private.services);
        # may break across Ray versions. strip("[]") normalizes IPv6 brackets.
        return ray._private.services.get_node_ip_address().strip("[]")

    @staticmethod
    def get_free_port() -> int:
        """Get a free port on the current node.

        NOTE(review): the port is released when the socket closes, so another
        process could grab it before NCCL binds — acceptable best-effort here.
        """
        with socket.socket() as sock:
            sock.bind(("", 0))
            return sock.getsockname()[1]

    def run_gpu_health_check(self) -> Dict[str, Any]:
        """
        Phase 1: Check GPU health via nvidia-smi and simple CUDA operation.

        Returns:
            Dict with 'passed', 'phase', 'rank', and optional 'metrics' or 'error'
        """
        # Collect base info for all results
        base_info = {
            "phase": "gpu_health",
            "rank": self._rank,
            "node_id": self.get_node_id(),
            "node_ip": self.get_node_ip(),
            "ray_gpu_ids": ray.get_gpu_ids(),
        }

        metrics = {}
        errors = []

        # Step 1: Query GPU info via nvidia-smi
        try:
            result = subprocess.run(
                [
                    "nvidia-smi",
                    "--query-gpu=index,name,temperature.gpu,memory.used,memory.total,utilization.gpu,ecc.errors.uncorrected.volatile.total",
                    "--format=csv,noheader,nounits",
                ],
                capture_output=True,
                text=True,
                timeout=30,
            )

            metrics["nvidia_smi_stdout"] = result.stdout.strip()
            metrics["nvidia_smi_stderr"] = result.stderr.strip()
            metrics["nvidia_smi_returncode"] = result.returncode

            # Parse nvidia-smi output
            if result.returncode == 0 and result.stdout.strip():
                # nvidia-smi returns info for all GPUs, parse each line
                gpu_infos = []
                for line in result.stdout.strip().split("\n"):
                    parts = [p.strip() for p in line.split(", ")]
                    if len(parts) >= 5:
                        gpu_info = {
                            "index": parts[0],
                            "name": parts[1],
                            "temperature_c": parts[2],
                            "memory_used_mb": parts[3],
                            "memory_total_mb": parts[4],
                        }
                        if len(parts) >= 6:
                            gpu_info["utilization_percent"] = parts[5]
                        # ECC column reports "[N/A]" on GPUs without ECC support.
                        if len(parts) >= 7 and parts[6] not in ["[N/A]", "N/A"]:
                            gpu_info["ecc_errors"] = parts[6]
                        gpu_infos.append(gpu_info)
                metrics["all_gpus"] = gpu_infos
            elif result.returncode != 0:
                errors.append(f"nvidia-smi failed with code {result.returncode}: {result.stderr}")
        except subprocess.TimeoutExpired:
            errors.append("nvidia-smi timed out after 30 seconds")
        except FileNotFoundError:
            errors.append("nvidia-smi not found in PATH")
        except Exception as e:
            errors.append(f"nvidia-smi error: {str(e)}")

        # Step 2: Check CUDA visibility
        try:
            metrics["cuda_visible_devices"] = os.environ.get("CUDA_VISIBLE_DEVICES", "not set")
            metrics["cuda_available"] = torch.cuda.is_available()
            metrics["cuda_device_count"] = torch.cuda.device_count()
            if torch.cuda.is_available():
                metrics["cuda_current_device"] = torch.cuda.current_device()
                metrics["cuda_device_name"] = torch.cuda.get_device_name()
                metrics["cuda_device_capability"] = torch.cuda.get_device_capability()
        except Exception as e:
            errors.append(f"CUDA info error: {str(e)}")

        # Step 3: Simple CUDA compute test
        try:
            if not torch.cuda.is_available():
                errors.append("CUDA not available for compute test")
            else:
                device = torch.cuda.current_device()
                test_tensor = torch.zeros(1000, 1000, device=device)
                test_tensor = test_tensor + 1
                # Force the kernel to actually run (CUDA ops are async).
                torch.cuda.synchronize(device)
                del test_tensor
                metrics["cuda_compute_test"] = "passed"
        except Exception as e:
            errors.append(f"CUDA compute test failed: {str(e)}\n{traceback.format_exc()}")
            metrics["cuda_compute_test"] = "failed"

        # Determine overall pass/fail
        passed = len(errors) == 0

        result = {
            **base_info,
            "passed": passed,
            "metrics": metrics,
        }

        if errors:
            result["errors"] = errors

        return result

    def run_intra_node_comm_test(self) -> Dict[str, Any]:
        """
        Phase 2: Test intra-node GPU communication via all-to-all.

        Uses the intra-node sub-group (GPUs on same node only) for communication test.

        Returns:
            Dict with 'passed', 'phase', 'rank', and optional 'error'
        """
        base_info = {
            "phase": "intra_node_comm",
            "rank": self._rank,
            "node_id": self.get_node_id(),
            "node_ip": self.get_node_ip(),
            "ray_gpu_ids": ray.get_gpu_ids(),
        }

        if not self._process_group_initialized:
            return {
                **base_info,
                "passed": False,
                "errors": ["Process group not initialized"],
            }

        # If no intra-node group, this is a single GPU on the node - skip test
        if self._intra_node_group is None or self._intra_node_ranks is None:
            return {
                **base_info,
                "passed": True,
                "metrics": {"skipped": "single_gpu_on_node", "intra_node_ranks": [self._rank]},
            }

        try:
            device = torch.cuda.current_device()
            group_size = len(self._intra_node_ranks)

            # Create test tensors sized for the intra-node group
            input_tensor = torch.randn(group_size, 1024, device=device)
            output_tensor = torch.empty_like(input_tensor)

            # Run all-to-all within intra-node sub-group only
            dist.all_to_all_single(output_tensor, input_tensor, group=self._intra_node_group)
            torch.cuda.synchronize(device)

            return {
                **base_info,
                "passed": True,
                "metrics": {
                    "group_size": group_size,
                    "intra_node_ranks": self._intra_node_ranks,
                    "data_elements": group_size * 1024,
                },
            }

        except Exception as e:
            return {
                **base_info,
                "passed": False,
                "errors": [f"{str(e)}\n{traceback.format_exc()}"],
            }

    def run_inter_node_comm_test(self) -> Dict[str, Any]:
        """
        Phase 3: Test inter-node communication via all-gather.

        Uses the diagnostic actor's NCCL process group to test cross-node communication.

        Returns:
            Dict with 'passed', 'phase', 'rank', and optional 'error'
        """
        base_info = {
            "phase": "inter_node_comm",
            "rank": self._rank,
            "node_id": self.get_node_id(),
            "node_ip": self.get_node_ip(),
            "ray_gpu_ids": ray.get_gpu_ids(),
        }

        if not self._process_group_initialized:
            return {
                **base_info,
                "passed": False,
                "errors": ["Process group not initialized"],
            }

        try:
            device = torch.cuda.current_device()
            world_size = dist.get_world_size()

            # Create test tensor for all-gather
            input_tensor = torch.randn(1024, device=device)
            output_tensors = [torch.empty_like(input_tensor) for _ in range(world_size)]

            # Run all-gather across all nodes (default/global process group)
            dist.all_gather(output_tensors, input_tensor)
            torch.cuda.synchronize(device)

            return {
                **base_info,
                "passed": True,
                "metrics": {"world_size": world_size, "data_elements": 1024},
            }

        except Exception as e:
            return {
                **base_info,
                "passed": False,
                "errors": [f"{str(e)}\n{traceback.format_exc()}"],
            }

# ======================================================================
# Patch file 3: stop_time_diagnostics/diagnostics/runner.py
# (diff header: new file mode 100644, index 0000000..72154fa)
# — reconstructed in the following segment.
# ======================================================================
# ======================================================================
# Patch file 3 (continued): stop_time_diagnostics/diagnostics/runner.py
# Reconstructed from the mangled diff; imports regrouped at the top of
# the module per PEP 8 (the originals appeared *after* logger setup).
# ======================================================================

"""
Stop-time diagnostic runner for hierarchical failure diagnosis.

This module provides:
- Failure classification (StopReason)
- DiagnosticRunner to orchestrate independent DiagnosticActors
- Hierarchical test execution (GPU health, intra-node comm, inter-node comm)
"""

from dataclasses import dataclass, field
from enum import Enum
from typing import Optional, Dict, Any, List
import logging

import ray
from ray.util.placement_group import PlacementGroup
from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy

from .actor import DiagnosticActor

# NOTE(review): basicConfig at import time mutates the root logger;
# surprising in a library module — confirm this is intended.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class StopReason(Enum):
    """Classification of stop/failure reasons."""

    UNKNOWN = "unknown"
    GRACEFUL_STOP = "graceful_stop"  # User-initiated or planned stop
    GPU_ERROR = "gpu_error"  # GPU hardware error
    NCCL_ERROR = "nccl_error"  # NCCL communication error


@dataclass
class DiagnosticConfig:
    """Configuration for the diagnostic module."""

    # Timeout (in seconds) applied to each test phase and to process-group init.
    timeout: float = 120.0


@dataclass
class DiagnosticResult:
    """Aggregated diagnostic results."""

    stop_reason: StopReason
    is_graceful_stop: bool = False
    # Per-actor, per-phase result dicts as returned by DiagnosticActor methods.
    test_results: List[Dict[str, Any]] = field(default_factory=list)
    faulty_gpus: List[int] = field(default_factory=list)
    faulty_nodes: List[str] = field(default_factory=list)

    def summary(self) -> str:
        """Return a summary string of the diagnostic result."""
        total = len(self.test_results)
        passed = sum(1 for r in self.test_results if r.get("passed"))
        return (
            f"StopReason: {self.stop_reason.value}, "
            f"Tests: {passed}/{total} passed, "
            f"GracefulStop: {self.is_graceful_stop}"
        )


class DiagnosticRunner:
    """
    Orchestrates diagnostic tests using independent DiagnosticActors.

    This runner spawns DiagnosticActors on specified GPU bundles and runs
    hierarchical diagnostic tests. It is completely independent of any
    application-specific actors (like MegatronActor).

    Usage:
        runner = DiagnosticRunner()
        result = runner.run_on_placement_group(pg, bundle_indices)
    """

    def __init__(self, config: Optional[DiagnosticConfig] = None):
        self.config = config or DiagnosticConfig()

    def run_on_placement_group(
        self,
        pg: PlacementGroup,
        bundle_indices: List[int],
    ) -> DiagnosticResult:
        """
        Spawn DiagnosticActors on a placement group and run all tests.

        This method:
        1. Spawns independent DiagnosticActors on the specified bundles
        2. Initializes NCCL process groups for communication tests
        3. Runs hierarchical tests (GPU -> intra-node -> inter-node)
        4. Cleans up actors after tests complete
        5. Returns aggregated results

        Args:
            pg: Ray PlacementGroup to schedule actors on
            bundle_indices: List of bundle indices to use for diagnostic actors

        Returns:
            DiagnosticResult with test outcomes and failure classification
        """
        world_size = len(bundle_indices)
        if world_size == 0:
            return DiagnosticResult(
                stop_reason=StopReason.UNKNOWN,
                is_graceful_stop=False,
            )

        logger.info(f"Starting diagnostics on {world_size} GPUs")

        # Spawn first actor to get master addr/port from a GPU worker node
        # (Ray head node doesn't have GPUs, so we need a worker node's IP for NCCL)
        first_actor = DiagnosticActor.options(
            num_gpus=1,
            scheduling_strategy=PlacementGroupSchedulingStrategy(
                placement_group=pg,
                placement_group_bundle_index=bundle_indices[0],
            ),
        ).remote(rank=0, world_size=world_size, master_addr="", master_port=0)

        master_addr = ray.get(first_actor.get_node_ip.remote())
        master_port = ray.get(first_actor.get_free_port.remote())

        # Update first actor with correct master info
        ray.get(first_actor.set_master_info.remote(master_addr, master_port))

        logger.info(f"Using master_addr={master_addr}, master_port={master_port}")

        # Spawn remaining diagnostic actors
        actors = [first_actor]
        for rank, bundle_idx in enumerate(bundle_indices[1:], start=1):
            actor = DiagnosticActor.options(
                num_gpus=1,
                scheduling_strategy=PlacementGroupSchedulingStrategy(
                    placement_group=pg,
                    placement_group_bundle_index=bundle_idx,
                ),
            ).remote(rank, world_size, master_addr, master_port)
            actors.append(actor)

        try:
            # Collect node topology from all actors
            logger.info("Collecting node topology...")
            node_ids = ray.get(
                [a.get_node_id.remote() for a in actors],
                timeout=30,
            )

            # Group ranks by node
            node_to_ranks: Dict[str, List[int]] = {}
            for rank, node_id in enumerate(node_ids):
                node_to_ranks.setdefault(node_id, []).append(rank)

            logger.info(f"Node topology: {node_to_ranks}")

            # Initialize global process groups on all actors
            logger.info("Initializing process groups...")
            ray.get(
                [a.init_process_group.remote() for a in actors],
                timeout=self.config.timeout,
            )

            # Create intra-node sub-groups for nodes with multiple GPUs
            # IMPORTANT: dist.new_group() must be called by ALL ranks, even those not in the group
            for node_id, ranks in node_to_ranks.items():
                if len(ranks) > 1:
                    logger.info(f"Creating intra-node group for node {node_id[:8]}... with ranks {ranks}")
                    # All actors must call init_intra_node_group with the same ranks list
                    ray.get(
                        [a.init_intra_node_group.remote(ranks) for a in actors],
                        timeout=30,
                    )

            # Run hierarchical tests
            result = self._run_all_phases(actors)

            # Cleanup process groups
            logger.info("Cleaning up process groups...")
            ray.get(
                [a.destroy_process_group.remote() for a in actors],
                timeout=30,
            )

        except Exception as e:
            logger.error(f"Diagnostic run failed: {e}")
            result = DiagnosticResult(
                stop_reason=StopReason.UNKNOWN,
                is_graceful_stop=False,
                test_results=[{"passed": False, "phase": "setup", "error": str(e)}],
            )

        finally:
            # Kill diagnostic actors (best-effort; errors during kill are ignored)
            for actor in actors:
                try:
                    ray.kill(actor)
                except Exception:
                    pass

        logger.info(f"Diagnostics complete: {result.summary()}")
        return result

    def _run_all_phases(self, actors: List) -> DiagnosticResult:
        """Run all diagnostic phases hierarchically.

        Stops at the first failing phase and classifies the failure;
        if every phase passes, the stop is classified as graceful.
        """
        result = DiagnosticResult(stop_reason=StopReason.UNKNOWN)

        # Phase 1: GPU health
        logger.info("Phase 1: Running GPU health checks...")
        gpu_results = ray.get(
            [a.run_gpu_health_check.remote() for a in actors],
            timeout=self.config.timeout,
        )
        result.test_results.extend(gpu_results)

        gpu_failures = [r for r in gpu_results if not r.get("passed")]
        if gpu_failures:
            logger.warning(f"GPU health check failed on {len(gpu_failures)} actors")
            result.faulty_gpus = [r.get("rank", -1) for r in gpu_failures]
            result.stop_reason = StopReason.GPU_ERROR
            return result

        logger.info("Phase 1 passed: All GPUs healthy")

        # Phase 2: Intra-node communication
        logger.info("Phase 2: Running intra-node communication tests...")
        intra_results = ray.get(
            [a.run_intra_node_comm_test.remote() for a in actors],
            timeout=self.config.timeout,
        )
        result.test_results.extend(intra_results)

        intra_failures = [r for r in intra_results if not r.get("passed")]
        if intra_failures:
            logger.warning(
                f"Intra-node comm test failed on {len(intra_failures)} actors"
            )
            result.stop_reason = StopReason.NCCL_ERROR
            return result

        logger.info("Phase 2 passed: Intra-node communication healthy")

        # Phase 3: Inter-node communication
        logger.info("Phase 3: Running inter-node communication tests...")
        inter_results = ray.get(
            [a.run_inter_node_comm_test.remote() for a in actors],
            timeout=self.config.timeout,
        )
        result.test_results.extend(inter_results)

        inter_failures = [r for r in inter_results if not r.get("passed")]
        if inter_failures:
            logger.warning(
                f"Inter-node comm test failed on {len(inter_failures)} actors"
            )
            result.stop_reason = StopReason.NCCL_ERROR
            return result

        logger.info("Phase 3 passed: Inter-node communication healthy")

        # All tests passed - classify as graceful stop
        result.is_graceful_stop = True
        result.stop_reason = StopReason.GRACEFUL_STOP
        logger.info("All diagnostic phases passed - classified as graceful stop")

        return result

# ======================================================================
# Patch file 4: stop_time_diagnostics/job.yaml
# (diff header: new file mode 100644, index 0000000..09b8a12)
# Preserved below as comments; continues in the next segment.
#
#   # GPU diagnostics job configuration
#   # Run GPU health and NCCL communication tests on a Ray cluster
#   name: stop-time-diagnostics
#   # Use a base image with CUDA support
#   image_uri: anyscale/ray:2.52.0-slim-py312-cu128
#   compute_config:
#     worker_nodes:
#       - instance_type: g5.12xlarge
#         min_nodes: 3
#         max_nodes: 3
#         min_resources:
#           CPU: 0
#           GPU: 0
#         max_resources:
#           CPU: 144
#           GPU: 12
#   working_dir: .
# ======================================================================
# ======================================================================
# Patch file 4 (continued): stop_time_diagnostics/job.yaml
# Preserved as comments (mangled-diff reconstruction):
#
#   env_vars:
#     NCCL_P2P_DISABLE: "1"
#     NCCL_SHM_DISABLE: "1"
#   # Run diagnostics on 8 GPUs (triggers auto-scaling)
#   entrypoint: uv run --isolated main.py --num-gpus 8
#   max_retries: 0
# ======================================================================

# ======================================================================
# Patch file 5: stop_time_diagnostics/main.py
# (diff header: new file mode 100644, index 0000000..d173a0e)
# Fix applied: use sys.exit(main()) instead of the site-injected exit()
# builtin, which is not guaranteed outside the interactive interpreter.
# ======================================================================

"""
Standalone entry point for running stop-time diagnostics as a Ray job.

This script runs GPU health checks and communication tests on specified GPUs
to diagnose failures in distributed training systems.
"""

import argparse
import sys
from dataclasses import dataclass

import ray
from ray.util.placement_group import placement_group

from diagnostics import DiagnosticRunner, DiagnosticConfig


@dataclass
class Config:
    """Configuration for the diagnostics job."""

    num_gpus: int = 0  # Number of GPUs (from max_resources or specified)
    timeout: float = 120.0  # Timeout for each test phase


def get_max_gpus_from_cluster() -> int:
    """
    Get the maximum number of GPUs that can be provisioned in the cluster.

    This looks at max_resources which defines the cluster's scaling limits.
    For auto-scaling clusters, this tells us how many GPUs we can request.

    Returns:
        Maximum number of GPUs available, or 0 if none configured
    """
    # Check cluster resources first (already provisioned)
    resources = ray.cluster_resources()
    current_gpus = int(resources.get("GPU", 0))

    if current_gpus > 0:
        print(f"Found {current_gpus} GPUs already provisioned")
        return current_gpus

    # For auto-scaling clusters, we need to look at available node types
    # The cluster will scale up when we create a placement group
    # Default to a reasonable number that matches job.yaml max_resources
    print("No GPUs currently provisioned - cluster will auto-scale when placement group is created")
    return 0


def main():
    config = Config()

    # Parse command line arguments
    parser = argparse.ArgumentParser(description="Run stop-time GPU diagnostics")
    parser.add_argument("--num-gpus", type=int, default=None,
                        help="Number of GPUs to request (required for auto-scaling clusters)")
    parser.add_argument("--timeout", type=float, default=config.timeout,
                        help="Timeout in seconds for each test phase")
    args = parser.parse_args()

    config.timeout = args.timeout

    # Initialize Ray (connects to existing cluster if running as a job)
    ray.init()

    # Determine number of GPUs
    if args.num_gpus is not None:
        config.num_gpus = args.num_gpus
    else:
        # Try to get from already-provisioned resources
        config.num_gpus = get_max_gpus_from_cluster()

    if config.num_gpus == 0:
        raise RuntimeError(
            "No GPUs found and --num-gpus not specified. "
            "For auto-scaling clusters, you must specify --num-gpus to trigger scaling."
        )

    print(f"Starting stop-time diagnostics on {config.num_gpus} GPUs...")

    # Create placement group for the diagnostic actors
    # This triggers auto-scaling in clusters with min_nodes=0
    print("Creating placement group (this will trigger auto-scaling if needed)...")
    pg = placement_group(
        [{"GPU": 1, "CPU": 1}] * config.num_gpus,
        strategy="PACK",
    )
    ray.get(pg.ready(), timeout=600)  # 10 min timeout for auto-scaling
    print("Placement group ready")

    # Run diagnostics
    runner = DiagnosticRunner(DiagnosticConfig(timeout=config.timeout))
    bundle_indices = list(range(config.num_gpus))
    result = runner.run_on_placement_group(pg, bundle_indices)

    # Print results
    print("\n" + "=" * 60)
    print("DIAGNOSTIC RESULTS")
    print("=" * 60)
    print(f"Summary: {result.summary()}")
    print(f"Stop Reason: {result.stop_reason.value}")
    print(f"Is Graceful Stop: {result.is_graceful_stop}")

    if result.faulty_gpus:
        print(f"Faulty GPUs: {result.faulty_gpus}")
    if result.faulty_nodes:
        print(f"Faulty Nodes: {result.faulty_nodes}")

    print("\nDetailed Test Results:")
    print("-" * 60)
    for test_result in result.test_results:
        phase = test_result.get("phase", "unknown")
        rank = test_result.get("rank", "?")
        passed = test_result.get("passed", False)
        status = "PASS" if passed else "FAIL"

        print(f"[{status}] Phase: {phase}, Rank: {rank}")

        if test_result.get("node_ip"):
            print(f"  Node IP: {test_result['node_ip']}")
        if test_result.get("ray_gpu_ids"):
            print(f"  GPU IDs: {test_result['ray_gpu_ids']}")

        if test_result.get("errors"):
            print("  Errors:")
            for error in test_result["errors"]:
                for line in error.split("\n")[:5]:  # Limit traceback lines
                    print(f"    {line}")

        if test_result.get("metrics"):
            metrics = test_result["metrics"]
            if metrics.get("cuda_device_name"):
                print(f"  Device: {metrics['cuda_device_name']}")
            if metrics.get("cuda_compute_test"):
                print(f"  CUDA Compute: {metrics['cuda_compute_test']}")

    print("=" * 60)

    # Exit with appropriate code
    if result.is_graceful_stop:
        print("\nAll diagnostics passed - system is healthy")
        return 0
    else:
        print(f"\nDiagnostics detected issues - stop reason: {result.stop_reason.value}")
        return 1


if __name__ == "__main__":
    sys.exit(main())

# ======================================================================
# Patch file 6: stop_time_diagnostics/pyproject.toml
# (diff header: new file mode 100644, index 0000000..02c2fbf)
# Preserved as comments (mangled-diff reconstruction):
#
#   [project]
#   name = "stop-time-diagnostics"
#   version = "0.1.0"
#   description = "GPU health and NCCL communication diagnostics for Ray clusters"
#   requires-python = ">=3.10"
#   dependencies = [
#       "ray[default]==2.52.0",
#       "torch>=2.0.0",
#   ]
#
#   [build-system]
#   requires = ["setuptools"]
#   build-backend = "setuptools.build_meta"
#
#   [tool.setuptools]
#   include-package-data = true
#
#   [tool.setuptools.packages.find]
#   where = ["."]
# ======================================================================