# ======================================================================
# Reconstructed from a whitespace-mangled `git diff` (all newlines were
# collapsed). Patch file 1: stop_time_diagnostics/diagnostics/__init__.py
# (diff header: new file mode 100644, index 0000000..0e9e075)
# ======================================================================

"""Stop-time diagnostic module for fault-tolerant training."""

from .actor import DiagnosticActor
from .runner import (
    StopReason,
    DiagnosticConfig,
    DiagnosticResult,
    DiagnosticRunner,
)

# Public API of the package; keep in sync with the imports above.
__all__ = [
    "DiagnosticActor",
    "StopReason",
    "DiagnosticConfig",
    "DiagnosticResult",
    "DiagnosticRunner",
]

# ======================================================================
# Patch file 2: stop_time_diagnostics/diagnostics/actor.py
# (diff header: new file mode 100644, index 0000000..64e6678)
# ======================================================================

"""
Independent diagnostic Ray actor for GPU and communication tests.

This can be spawned on any GPU to run diagnostic tests and manages its own
NCCL process group.
"""

import os
import socket
import subprocess
import traceback
import logging
from typing import Dict, Any, List, Optional

import torch
import torch.distributed as dist
import ray

# NOTE(review): basicConfig at import time is surprising in a library
# module (it mutates the root logger) — confirm this is intended.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


@ray.remote(num_gpus=1)
class DiagnosticActor:
    """
    Independent Ray actor for running diagnostic tests.

    Can be spawned on any GPU without depending on application actors.
    Manages its own NCCL process group for communication tests.
    """

    def __init__(
        self, rank: int, world_size: int, master_addr: str, master_port: int
    ):
        """
        Args:
            rank: Global rank of this actor in the diagnostic NCCL group.
            world_size: Total number of diagnostic actors.
            master_addr: Rendezvous host for NCCL (may be "" at creation
                time and set later via set_master_info()).
            master_port: Rendezvous port for NCCL (may be 0 initially).
        """
        self._rank = rank
        self._world_size = world_size
        self._master_addr = master_addr
        self._master_port = master_port
        self._process_group_initialized = False
        # Optional sub-group covering only the ranks co-located on this node.
        self._intra_node_group: Optional[dist.ProcessGroup] = None
        self._intra_node_ranks: Optional[List[int]] = None

    def set_master_info(self, master_addr: str, master_port: int):
        """Update master address and port (used when first actor discovers these values)."""
        self._master_addr = master_addr
        self._master_port = master_port

    def init_process_group(self):
        """Initialize NCCL process group for communication tests."""
        os.environ["MASTER_ADDR"] = self._master_addr
        os.environ["MASTER_PORT"] = str(self._master_port)
        os.environ["WORLD_SIZE"] = str(self._world_size)
        os.environ["RANK"] = str(self._rank)
        # Each actor owns exactly one GPU (num_gpus=1), so the local rank is 0.
        os.environ["LOCAL_RANK"] = "0"

        if not dist.is_initialized():
            dist.init_process_group(backend="nccl")
        self._process_group_initialized = True
        logger.info(f"DiagnosticActor rank {self._rank}: process group initialized")

    def destroy_process_group(self):
        """Clean up process group."""
        # Destroy intra-node group first if it exists
        if self._intra_node_group is not None:
            dist.destroy_process_group(self._intra_node_group)
            self._intra_node_group = None
            self._intra_node_ranks = None
        if dist.is_initialized():
            dist.destroy_process_group()
        self._process_group_initialized = False

    def init_intra_node_group(self, ranks: List[int]):
        """
        Create a sub-group for intra-node communication.

        Args:
            ranks: List of global ranks that are on the same node as this actor
        """
        self._intra_node_ranks = ranks
        # All ranks must call new_group, but only ranks in the list will be
        # part of the group (torch.distributed collective-creation contract).
        self._intra_node_group = dist.new_group(ranks=ranks)
        logger.info(f"DiagnosticActor rank {self._rank}: intra-node group created with ranks {ranks}")

    def get_node_id(self) -> str:
        """Get Ray node ID."""
        return ray.get_runtime_context().get_node_id()

    def get_gpu_id(self):
        """Get assigned GPU ID, or -1 if no GPU was assigned by Ray."""
        return ray.get_gpu_ids()[0] if ray.get_gpu_ids() else -1

    @staticmethod
    def get_node_ip() -> str:
        """Get current node IP address."""
        # NOTE(review): relies on a Ray private API (ray._private.services);
        # may break across Ray versions. strip("[]") normalizes IPv6 brackets.
        return ray._private.services.get_node_ip_address().strip("[]")

    @staticmethod
    def get_free_port() -> int:
        """Get a free port on the current node.

        NOTE(review): the port is released when the socket closes, so another
        process could grab it before NCCL binds — acceptable best-effort here.
        """
        with socket.socket() as sock:
            sock.bind(("", 0))
            return sock.getsockname()[1]

    def run_gpu_health_check(self) -> Dict[str, Any]:
        """
        Phase 1: Check GPU health via nvidia-smi and simple CUDA operation.

        Returns:
            Dict with 'passed', 'phase', 'rank', and optional 'metrics' or 'error'
        """
        # Collect base info for all results
        base_info = {
            "phase": "gpu_health",
            "rank": self._rank,
            "node_id": self.get_node_id(),
            "node_ip": self.get_node_ip(),
            "ray_gpu_ids": ray.get_gpu_ids(),
        }

        metrics = {}
        errors = []

        # Step 1: Query GPU info via nvidia-smi
        try:
            result = subprocess.run(
                [
                    "nvidia-smi",
                    "--query-gpu=index,name,temperature.gpu,memory.used,memory.total,utilization.gpu,ecc.errors.uncorrected.volatile.total",
                    "--format=csv,noheader,nounits",
                ],
                capture_output=True,
                text=True,
                timeout=30,
            )

            metrics["nvidia_smi_stdout"] = result.stdout.strip()
            metrics["nvidia_smi_stderr"] = result.stderr.strip()
            metrics["nvidia_smi_returncode"] = result.returncode

            # Parse nvidia-smi output
            if result.returncode == 0 and result.stdout.strip():
                # nvidia-smi returns info for all GPUs, parse each line
                gpu_infos = []
                for line in result.stdout.strip().split("\n"):
                    parts = [p.strip() for p in line.split(", ")]
                    if len(parts) >= 5:
                        gpu_info = {
                            "index": parts[0],
                            "name": parts[1],
                            "temperature_c": parts[2],
                            "memory_used_mb": parts[3],
                            "memory_total_mb": parts[4],
                        }
                        if len(parts) >= 6:
                            gpu_info["utilization_percent"] = parts[5]
                        # ECC column reports "[N/A]" on GPUs without ECC support.
                        if len(parts) >= 7 and parts[6] not in ["[N/A]", "N/A"]:
                            gpu_info["ecc_errors"] = parts[6]
                        gpu_infos.append(gpu_info)
                metrics["all_gpus"] = gpu_infos
            elif result.returncode != 0:
                errors.append(f"nvidia-smi failed with code {result.returncode}: {result.stderr}")
        except subprocess.TimeoutExpired:
            errors.append("nvidia-smi timed out after 30 seconds")
        except FileNotFoundError:
            errors.append("nvidia-smi not found in PATH")
        except Exception as e:
            errors.append(f"nvidia-smi error: {str(e)}")

        # Step 2: Check CUDA visibility
        try:
            metrics["cuda_visible_devices"] = os.environ.get("CUDA_VISIBLE_DEVICES", "not set")
            metrics["cuda_available"] = torch.cuda.is_available()
            metrics["cuda_device_count"] = torch.cuda.device_count()
            if torch.cuda.is_available():
                metrics["cuda_current_device"] = torch.cuda.current_device()
                metrics["cuda_device_name"] = torch.cuda.get_device_name()
                metrics["cuda_device_capability"] = torch.cuda.get_device_capability()
        except Exception as e:
            errors.append(f"CUDA info error: {str(e)}")

        # Step 3: Simple CUDA compute test
        try:
            if not torch.cuda.is_available():
                errors.append("CUDA not available for compute test")
            else:
                device = torch.cuda.current_device()
                test_tensor = torch.zeros(1000, 1000, device=device)
                test_tensor = test_tensor + 1
                # Force the kernel to actually run (CUDA ops are async).
                torch.cuda.synchronize(device)
                del test_tensor
                metrics["cuda_compute_test"] = "passed"
        except Exception as e:
            errors.append(f"CUDA compute test failed: {str(e)}\n{traceback.format_exc()}")
            metrics["cuda_compute_test"] = "failed"

        # Determine overall pass/fail
        passed = len(errors) == 0

        result = {
            **base_info,
            "passed": passed,
            "metrics": metrics,
        }

        if errors:
            result["errors"] = errors

        return result

    def run_intra_node_comm_test(self) -> Dict[str, Any]:
        """
        Phase 2: Test intra-node GPU communication via all-to-all.

        Uses the intra-node sub-group (GPUs on same node only) for communication test.

        Returns:
            Dict with 'passed', 'phase', 'rank', and optional 'error'
        """
        base_info = {
            "phase": "intra_node_comm",
            "rank": self._rank,
            "node_id": self.get_node_id(),
            "node_ip": self.get_node_ip(),
            "ray_gpu_ids": ray.get_gpu_ids(),
        }

        if not self._process_group_initialized:
            return {
                **base_info,
                "passed": False,
                "errors": ["Process group not initialized"],
            }

        # If no intra-node group, this is a single GPU on the node - skip test
        if self._intra_node_group is None or self._intra_node_ranks is None:
            return {
                **base_info,
                "passed": True,
                "metrics": {"skipped": "single_gpu_on_node", "intra_node_ranks": [self._rank]},
            }

        try:
            device = torch.cuda.current_device()
            group_size = len(self._intra_node_ranks)

            # Create test tensors sized for the intra-node group
            input_tensor = torch.randn(group_size, 1024, device=device)
            output_tensor = torch.empty_like(input_tensor)

            # Run all-to-all within intra-node sub-group only
            dist.all_to_all_single(output_tensor, input_tensor, group=self._intra_node_group)
            torch.cuda.synchronize(device)

            return {
                **base_info,
                "passed": True,
                "metrics": {
                    "group_size": group_size,
                    "intra_node_ranks": self._intra_node_ranks,
                    "data_elements": group_size * 1024,
                },
            }

        except Exception as e:
            return {
                **base_info,
                "passed": False,
                "errors": [f"{str(e)}\n{traceback.format_exc()}"],
            }

    def run_inter_node_comm_test(self) -> Dict[str, Any]:
        """
        Phase 3: Test inter-node communication via all-gather.

        Uses the diagnostic actor's NCCL process group to test cross-node communication.

        Returns:
            Dict with 'passed', 'phase', 'rank', and optional 'error'
        """
        base_info = {
            "phase": "inter_node_comm",
            "rank": self._rank,
            "node_id": self.get_node_id(),
            "node_ip": self.get_node_ip(),
            "ray_gpu_ids": ray.get_gpu_ids(),
        }

        if not self._process_group_initialized:
            return {
                **base_info,
                "passed": False,
                "errors": ["Process group not initialized"],
            }

        try:
            device = torch.cuda.current_device()
            world_size = dist.get_world_size()

            # Create test tensor for all-gather
            input_tensor = torch.randn(1024, device=device)
            output_tensors = [torch.empty_like(input_tensor) for _ in range(world_size)]

            # Run all-gather across all nodes (default/global process group)
            dist.all_gather(output_tensors, input_tensor)
            torch.cuda.synchronize(device)

            return {
                **base_info,
                "passed": True,
                "metrics": {"world_size": world_size, "data_elements": 1024},
            }

        except Exception as e:
            return {
                **base_info,
                "passed": False,
                "errors": [f"{str(e)}\n{traceback.format_exc()}"],
            }

# ======================================================================
# Patch file 3: stop_time_diagnostics/diagnostics/runner.py
# (diff header: new file mode 100644, index 0000000..72154fa)
# — reconstructed in the following segment.
# ======================================================================
# ======================================================================
# Patch file 3 (continued): stop_time_diagnostics/diagnostics/runner.py
# Reconstructed from the mangled diff; imports regrouped at the top of
# the module per PEP 8 (the originals appeared *after* logger setup).
# ======================================================================

"""
Stop-time diagnostic runner for hierarchical failure diagnosis.

This module provides:
- Failure classification (StopReason)
- DiagnosticRunner to orchestrate independent DiagnosticActors
- Hierarchical test execution (GPU health, intra-node comm, inter-node comm)
"""

from dataclasses import dataclass, field
from enum import Enum
from typing import Optional, Dict, Any, List
import logging

import ray
from ray.util.placement_group import PlacementGroup
from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy

from .actor import DiagnosticActor

# NOTE(review): basicConfig at import time mutates the root logger;
# surprising in a library module — confirm this is intended.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class StopReason(Enum):
    """Classification of stop/failure reasons."""

    UNKNOWN = "unknown"
    GRACEFUL_STOP = "graceful_stop"  # User-initiated or planned stop
    GPU_ERROR = "gpu_error"  # GPU hardware error
    NCCL_ERROR = "nccl_error"  # NCCL communication error


@dataclass
class DiagnosticConfig:
    """Configuration for the diagnostic module."""

    # Timeout (in seconds) applied to each test phase and to process-group init.
    timeout: float = 120.0


@dataclass
class DiagnosticResult:
    """Aggregated diagnostic results."""

    stop_reason: StopReason
    is_graceful_stop: bool = False
    # Per-actor, per-phase result dicts as returned by DiagnosticActor methods.
    test_results: List[Dict[str, Any]] = field(default_factory=list)
    faulty_gpus: List[int] = field(default_factory=list)
    faulty_nodes: List[str] = field(default_factory=list)

    def summary(self) -> str:
        """Return a summary string of the diagnostic result."""
        total = len(self.test_results)
        passed = sum(1 for r in self.test_results if r.get("passed"))
        return (
            f"StopReason: {self.stop_reason.value}, "
            f"Tests: {passed}/{total} passed, "
            f"GracefulStop: {self.is_graceful_stop}"
        )


class DiagnosticRunner:
    """
    Orchestrates diagnostic tests using independent DiagnosticActors.

    This runner spawns DiagnosticActors on specified GPU bundles and runs
    hierarchical diagnostic tests. It is completely independent of any
    application-specific actors (like MegatronActor).

    Usage:
        runner = DiagnosticRunner()
        result = runner.run_on_placement_group(pg, bundle_indices)
    """

    def __init__(self, config: Optional[DiagnosticConfig] = None):
        self.config = config or DiagnosticConfig()

    def run_on_placement_group(
        self,
        pg: PlacementGroup,
        bundle_indices: List[int],
    ) -> DiagnosticResult:
        """
        Spawn DiagnosticActors on a placement group and run all tests.

        This method:
        1. Spawns independent DiagnosticActors on the specified bundles
        2. Initializes NCCL process groups for communication tests
        3. Runs hierarchical tests (GPU -> intra-node -> inter-node)
        4. Cleans up actors after tests complete
        5. Returns aggregated results

        Args:
            pg: Ray PlacementGroup to schedule actors on
            bundle_indices: List of bundle indices to use for diagnostic actors

        Returns:
            DiagnosticResult with test outcomes and failure classification
        """
        world_size = len(bundle_indices)
        if world_size == 0:
            return DiagnosticResult(
                stop_reason=StopReason.UNKNOWN,
                is_graceful_stop=False,
            )

        logger.info(f"Starting diagnostics on {world_size} GPUs")

        # Spawn first actor to get master addr/port from a GPU worker node
        # (Ray head node doesn't have GPUs, so we need a worker node's IP for NCCL)
        first_actor = DiagnosticActor.options(
            num_gpus=1,
            scheduling_strategy=PlacementGroupSchedulingStrategy(
                placement_group=pg,
                placement_group_bundle_index=bundle_indices[0],
            ),
        ).remote(rank=0, world_size=world_size, master_addr="", master_port=0)

        master_addr = ray.get(first_actor.get_node_ip.remote())
        master_port = ray.get(first_actor.get_free_port.remote())

        # Update first actor with correct master info
        ray.get(first_actor.set_master_info.remote(master_addr, master_port))

        logger.info(f"Using master_addr={master_addr}, master_port={master_port}")

        # Spawn remaining diagnostic actors
        actors = [first_actor]
        for rank, bundle_idx in enumerate(bundle_indices[1:], start=1):
            actor = DiagnosticActor.options(
                num_gpus=1,
                scheduling_strategy=PlacementGroupSchedulingStrategy(
                    placement_group=pg,
                    placement_group_bundle_index=bundle_idx,
                ),
            ).remote(rank, world_size, master_addr, master_port)
            actors.append(actor)

        try:
            # Collect node topology from all actors
            logger.info("Collecting node topology...")
            node_ids = ray.get(
                [a.get_node_id.remote() for a in actors],
                timeout=30,
            )

            # Group ranks by node
            node_to_ranks: Dict[str, List[int]] = {}
            for rank, node_id in enumerate(node_ids):
                node_to_ranks.setdefault(node_id, []).append(rank)

            logger.info(f"Node topology: {node_to_ranks}")

            # Initialize global process groups on all actors
            logger.info("Initializing process groups...")
            ray.get(
                [a.init_process_group.remote() for a in actors],
                timeout=self.config.timeout,
            )

            # Create intra-node sub-groups for nodes with multiple GPUs
            # IMPORTANT: dist.new_group() must be called by ALL ranks, even those not in the group
            for node_id, ranks in node_to_ranks.items():
                if len(ranks) > 1:
                    logger.info(f"Creating intra-node group for node {node_id[:8]}... with ranks {ranks}")
                    # All actors must call init_intra_node_group with the same ranks list
                    ray.get(
                        [a.init_intra_node_group.remote(ranks) for a in actors],
                        timeout=30,
                    )

            # Run hierarchical tests
            result = self._run_all_phases(actors)

            # Cleanup process groups
            logger.info("Cleaning up process groups...")
            ray.get(
                [a.destroy_process_group.remote() for a in actors],
                timeout=30,
            )

        except Exception as e:
            logger.error(f"Diagnostic run failed: {e}")
            result = DiagnosticResult(
                stop_reason=StopReason.UNKNOWN,
                is_graceful_stop=False,
                test_results=[{"passed": False, "phase": "setup", "error": str(e)}],
            )

        finally:
            # Kill diagnostic actors (best-effort; errors during kill are ignored)
            for actor in actors:
                try:
                    ray.kill(actor)
                except Exception:
                    pass

        logger.info(f"Diagnostics complete: {result.summary()}")
        return result

    def _run_all_phases(self, actors: List) -> DiagnosticResult:
        """Run all diagnostic phases hierarchically.

        Stops at the first failing phase and classifies the failure;
        if every phase passes, the stop is classified as graceful.
        """
        result = DiagnosticResult(stop_reason=StopReason.UNKNOWN)

        # Phase 1: GPU health
        logger.info("Phase 1: Running GPU health checks...")
        gpu_results = ray.get(
            [a.run_gpu_health_check.remote() for a in actors],
            timeout=self.config.timeout,
        )
        result.test_results.extend(gpu_results)

        gpu_failures = [r for r in gpu_results if not r.get("passed")]
        if gpu_failures:
            logger.warning(f"GPU health check failed on {len(gpu_failures)} actors")
            result.faulty_gpus = [r.get("rank", -1) for r in gpu_failures]
            result.stop_reason = StopReason.GPU_ERROR
            return result

        logger.info("Phase 1 passed: All GPUs healthy")

        # Phase 2: Intra-node communication
        logger.info("Phase 2: Running intra-node communication tests...")
        intra_results = ray.get(
            [a.run_intra_node_comm_test.remote() for a in actors],
            timeout=self.config.timeout,
        )
        result.test_results.extend(intra_results)

        intra_failures = [r for r in intra_results if not r.get("passed")]
        if intra_failures:
            logger.warning(
                f"Intra-node comm test failed on {len(intra_failures)} actors"
            )
            result.stop_reason = StopReason.NCCL_ERROR
            return result

        logger.info("Phase 2 passed: Intra-node communication healthy")

        # Phase 3: Inter-node communication
        logger.info("Phase 3: Running inter-node communication tests...")
        inter_results = ray.get(
            [a.run_inter_node_comm_test.remote() for a in actors],
            timeout=self.config.timeout,
        )
        result.test_results.extend(inter_results)

        inter_failures = [r for r in inter_results if not r.get("passed")]
        if inter_failures:
            logger.warning(
                f"Inter-node comm test failed on {len(inter_failures)} actors"
            )
            result.stop_reason = StopReason.NCCL_ERROR
            return result

        logger.info("Phase 3 passed: Inter-node communication healthy")

        # All tests passed - classify as graceful stop
        result.is_graceful_stop = True
        result.stop_reason = StopReason.GRACEFUL_STOP
        logger.info("All diagnostic phases passed - classified as graceful stop")

        return result

# ======================================================================
# Patch file 4: stop_time_diagnostics/job.yaml
# (diff header: new file mode 100644, index 0000000..09b8a12)
# Preserved below as comments; continues in the next segment.
#
#   # GPU diagnostics job configuration
#   # Run GPU health and NCCL communication tests on a Ray cluster
#   name: stop-time-diagnostics
#   # Use a base image with CUDA support
#   image_uri: anyscale/ray:2.52.0-slim-py312-cu128
#   compute_config:
#     worker_nodes:
#       - instance_type: g5.12xlarge
#         min_nodes: 3
#         max_nodes: 3
#         min_resources:
#           CPU: 0
#           GPU: 0
#         max_resources:
#           CPU: 144
#           GPU: 12
#   working_dir: .
# ======================================================================
# ======================================================================
# Patch file 4 (continued): stop_time_diagnostics/job.yaml
# Preserved as comments (mangled-diff reconstruction):
#
#   env_vars:
#     NCCL_P2P_DISABLE: "1"
#     NCCL_SHM_DISABLE: "1"
#   # Run diagnostics on 8 GPUs (triggers auto-scaling)
#   entrypoint: uv run --isolated main.py --num-gpus 8
#   max_retries: 0
# ======================================================================

# ======================================================================
# Patch file 5: stop_time_diagnostics/main.py
# (diff header: new file mode 100644, index 0000000..d173a0e)
# Fix applied: use sys.exit(main()) instead of the site-injected exit()
# builtin, which is not guaranteed outside the interactive interpreter.
# ======================================================================

"""
Standalone entry point for running stop-time diagnostics as a Ray job.

This script runs GPU health checks and communication tests on specified GPUs
to diagnose failures in distributed training systems.
"""

import argparse
import sys
from dataclasses import dataclass

import ray
from ray.util.placement_group import placement_group

from diagnostics import DiagnosticRunner, DiagnosticConfig


@dataclass
class Config:
    """Configuration for the diagnostics job."""

    num_gpus: int = 0  # Number of GPUs (from max_resources or specified)
    timeout: float = 120.0  # Timeout for each test phase


def get_max_gpus_from_cluster() -> int:
    """
    Get the maximum number of GPUs that can be provisioned in the cluster.

    This looks at max_resources which defines the cluster's scaling limits.
    For auto-scaling clusters, this tells us how many GPUs we can request.

    Returns:
        Maximum number of GPUs available, or 0 if none configured
    """
    # Check cluster resources first (already provisioned)
    resources = ray.cluster_resources()
    current_gpus = int(resources.get("GPU", 0))

    if current_gpus > 0:
        print(f"Found {current_gpus} GPUs already provisioned")
        return current_gpus

    # For auto-scaling clusters, we need to look at available node types
    # The cluster will scale up when we create a placement group
    # Default to a reasonable number that matches job.yaml max_resources
    print("No GPUs currently provisioned - cluster will auto-scale when placement group is created")
    return 0


def main():
    config = Config()

    # Parse command line arguments
    parser = argparse.ArgumentParser(description="Run stop-time GPU diagnostics")
    parser.add_argument("--num-gpus", type=int, default=None,
                        help="Number of GPUs to request (required for auto-scaling clusters)")
    parser.add_argument("--timeout", type=float, default=config.timeout,
                        help="Timeout in seconds for each test phase")
    args = parser.parse_args()

    config.timeout = args.timeout

    # Initialize Ray (connects to existing cluster if running as a job)
    ray.init()

    # Determine number of GPUs
    if args.num_gpus is not None:
        config.num_gpus = args.num_gpus
    else:
        # Try to get from already-provisioned resources
        config.num_gpus = get_max_gpus_from_cluster()

    if config.num_gpus == 0:
        raise RuntimeError(
            "No GPUs found and --num-gpus not specified. "
            "For auto-scaling clusters, you must specify --num-gpus to trigger scaling."
        )

    print(f"Starting stop-time diagnostics on {config.num_gpus} GPUs...")

    # Create placement group for the diagnostic actors
    # This triggers auto-scaling in clusters with min_nodes=0
    print("Creating placement group (this will trigger auto-scaling if needed)...")
    pg = placement_group(
        [{"GPU": 1, "CPU": 1}] * config.num_gpus,
        strategy="PACK",
    )
    ray.get(pg.ready(), timeout=600)  # 10 min timeout for auto-scaling
    print("Placement group ready")

    # Run diagnostics
    runner = DiagnosticRunner(DiagnosticConfig(timeout=config.timeout))
    bundle_indices = list(range(config.num_gpus))
    result = runner.run_on_placement_group(pg, bundle_indices)

    # Print results
    print("\n" + "=" * 60)
    print("DIAGNOSTIC RESULTS")
    print("=" * 60)
    print(f"Summary: {result.summary()}")
    print(f"Stop Reason: {result.stop_reason.value}")
    print(f"Is Graceful Stop: {result.is_graceful_stop}")

    if result.faulty_gpus:
        print(f"Faulty GPUs: {result.faulty_gpus}")
    if result.faulty_nodes:
        print(f"Faulty Nodes: {result.faulty_nodes}")

    print("\nDetailed Test Results:")
    print("-" * 60)
    for test_result in result.test_results:
        phase = test_result.get("phase", "unknown")
        rank = test_result.get("rank", "?")
        passed = test_result.get("passed", False)
        status = "PASS" if passed else "FAIL"

        print(f"[{status}] Phase: {phase}, Rank: {rank}")

        if test_result.get("node_ip"):
            print(f"  Node IP: {test_result['node_ip']}")
        if test_result.get("ray_gpu_ids"):
            print(f"  GPU IDs: {test_result['ray_gpu_ids']}")

        if test_result.get("errors"):
            print("  Errors:")
            for error in test_result["errors"]:
                for line in error.split("\n")[:5]:  # Limit traceback lines
                    print(f"    {line}")

        if test_result.get("metrics"):
            metrics = test_result["metrics"]
            if metrics.get("cuda_device_name"):
                print(f"  Device: {metrics['cuda_device_name']}")
            if metrics.get("cuda_compute_test"):
                print(f"  CUDA Compute: {metrics['cuda_compute_test']}")

    print("=" * 60)

    # Exit with appropriate code
    if result.is_graceful_stop:
        print("\nAll diagnostics passed - system is healthy")
        return 0
    else:
        print(f"\nDiagnostics detected issues - stop reason: {result.stop_reason.value}")
        return 1


if __name__ == "__main__":
    sys.exit(main())

# ======================================================================
# Patch file 6: stop_time_diagnostics/pyproject.toml
# (diff header: new file mode 100644, index 0000000..02c2fbf)
# Preserved as comments (mangled-diff reconstruction):
#
#   [project]
#   name = "stop-time-diagnostics"
#   version = "0.1.0"
#   description = "GPU health and NCCL communication diagnostics for Ray clusters"
#   requires-python = ">=3.10"
#   dependencies = [
#       "ray[default]==2.52.0",
#       "torch>=2.0.0",
#   ]
#
#   [build-system]
#   requires = ["setuptools"]
#   build-backend = "setuptools.build_meta"
#
#   [tool.setuptools]
#   include-package-data = true
#
#   [tool.setuptools.packages.find]
#   where = ["."]
# ======================================================================