diff --git a/docs/trouble_shooting.md b/docs/trouble_shooting.md index adc2b953..da300651 100644 --- a/docs/trouble_shooting.md +++ b/docs/trouble_shooting.md @@ -50,7 +50,7 @@ The resource dictionary parameter `resource_dict` can contain one or more of the * `exclusive` (bool): boolean flag to reserve exclusive access to selected compute nodes - do not allow other tasks to use the same compute node. * `error_log_file` (str): path to the error log file, primarily used to merge the log of multiple tasks in one file. -* `run_time_limit` (int): the maximum time the execution of the submitted Python function is allowed to take in seconds. +* `run_time_max` (int): the maximum time the execution of the submitted Python function is allowed to take in seconds. * `priority` (int): the queuing system priority assigned to a given Python function to influence the scheduling. * `slurm_cmd_args` (list): Additional command line arguments for the srun call (SLURM only) diff --git a/src/executorlib/executor/flux.py b/src/executorlib/executor/flux.py index 14112ebf..c7de8987 100644 --- a/src/executorlib/executor/flux.py +++ b/src/executorlib/executor/flux.py @@ -51,8 +51,8 @@ class FluxJobExecutor(BaseExecutor): do not allow other tasks to use the same compute node. * error_log_file (str): path to the error log file, primarily used to merge the log of multiple tasks in one file. - * run_time_limit (int): the maximum time the execution of the submitted Python function is - allowed to take in seconds. + * run_time_max (int): the maximum time the execution of the submitted Python function is + allowed to take in seconds. * priority (int): the queuing system priority assigned to a given Python function to influence the scheduling. * slurm_cmd_args (list): Additional command line arguments for the srun call (SLURM only) @@ -154,7 +154,7 @@ def __init__( - do not allow other tasks to use the same compute node. * error_log_file (str): path to the error log file, primarily used to merge the log of multiple tasks in one file. - * run_time_limit (int): the maximum time the execution of the submitted Python + * run_time_max (int): the maximum time the execution of the submitted Python function is allowed to take in seconds. * priority (int): the queuing system priority assigned to a given Python function to influence the scheduling. @@ -283,8 +283,8 @@ class FluxClusterExecutor(BaseExecutor): do not allow other tasks to use the same compute node. * error_log_file (str): path to the error log file, primarily used to merge the log of multiple tasks in one file. - * run_time_limit (int): the maximum time the execution of the submitted Python function is - allowed to take in seconds. + * run_time_max (int): the maximum time the execution of the submitted Python function is + allowed to take in seconds. * priority (int): the queuing system priority assigned to a given Python function to influence the scheduling. * slurm_cmd_args (list): Additional command line arguments for the srun call (SLURM only) @@ -380,7 +380,7 @@ def __init__( - do not allow other tasks to use the same compute node. * error_log_file (str): path to the error log file, primarily used to merge the log of multiple tasks in one file. - * run_time_limit (int): the maximum time the execution of the submitted Python + * run_time_max (int): the maximum time the execution of the submitted Python function is allowed to take in seconds. * priority (int): the queuing system priority assigned to a given Python function to influence the scheduling. @@ -417,7 +417,7 @@ def __init__( "cwd": None, "openmpi_oversubscribe": openmpi_oversubscribe, "slurm_cmd_args": [], - "run_time_limit": None, + "run_time_max": None, } if resource_dict is None: resource_dict = {} @@ -540,8 +540,8 @@ def create_flux_executor( do not allow other tasks to use the same compute node. * error_log_file (str): path to the error log file, primarily used to merge the log of multiple tasks in one file. - * run_time_limit (int): the maximum time the execution of the submitted Python function is - allowed to take in seconds. + * run_time_max (int): the maximum time the execution of the submitted Python function is + allowed to take in seconds. * priority (int): the queuing system priority assigned to a given Python function to influence the scheduling. * slurm_cmd_args (list): Additional command line arguments for the srun call (SLURM only) diff --git a/src/executorlib/executor/single.py b/src/executorlib/executor/single.py index 913d1b82..1aa52605 100644 --- a/src/executorlib/executor/single.py +++ b/src/executorlib/executor/single.py @@ -50,8 +50,8 @@ class SingleNodeExecutor(BaseExecutor): do not allow other tasks to use the same compute node. * error_log_file (str): path to the error log file, primarily used to merge the log of multiple tasks in one file. - * run_time_limit (int): the maximum time the execution of the submitted Python function is - allowed to take in seconds. + * run_time_max (int): the maximum time the execution of the submitted Python function is + allowed to take in seconds. * priority (int): the queuing system priority assigned to a given Python function to influence the scheduling. * slurm_cmd_args (list): Additional command line arguments for the srun call (SLURM only) @@ -145,7 +145,7 @@ def __init__( - do not allow other tasks to use the same compute node. * error_log_file (str): path to the error log file, primarily used to merge the log of multiple tasks in one file. - * run_time_limit (int): the maximum time the execution of the submitted Python + * run_time_max (int): the maximum time the execution of the submitted Python function is allowed to take in seconds. * priority (int): the queuing system priority assigned to a given Python function to influence the scheduling. @@ -259,8 +259,8 @@ class TestClusterExecutor(BaseExecutor): do not allow other tasks to use the same compute node. * error_log_file (str): path to the error log file, primarily used to merge the log of multiple tasks in one file. - * run_time_limit (int): the maximum time the execution of the submitted Python function is - allowed to take in seconds. + * run_time_max (int): the maximum time the execution of the submitted Python function is + allowed to take in seconds. * priority (int): the queuing system priority assigned to a given Python function to influence the scheduling. * slurm_cmd_args (list): Additional command line arguments for the srun call (SLURM only) @@ -350,7 +350,7 @@ def __init__( - do not allow other tasks to use the same compute node. * error_log_file (str): path to the error log file, primarily used to merge the log of multiple tasks in one file. - * run_time_limit (int): the maximum time the execution of the submitted Python + * run_time_max (int): the maximum time the execution of the submitted Python function is allowed to take in seconds. * priority (int): the queuing system priority assigned to a given Python function to influence the scheduling. @@ -479,8 +479,8 @@ def create_single_node_executor( do not allow other tasks to use the same compute node. * error_log_file (str): path to the error log file, primarily used to merge the log of multiple tasks in one file. - * run_time_limit (int): the maximum time the execution of the submitted Python function is - allowed to take in seconds. + * run_time_max (int): the maximum time the execution of the submitted Python function is + allowed to take in seconds. * priority (int): the queuing system priority assigned to a given Python function to influence the scheduling. * slurm_cmd_args (list): Additional command line arguments for the srun call (SLURM only) diff --git a/src/executorlib/executor/slurm.py b/src/executorlib/executor/slurm.py index 46370e51..49a7a840 100644 --- a/src/executorlib/executor/slurm.py +++ b/src/executorlib/executor/slurm.py @@ -52,8 +52,8 @@ class SlurmClusterExecutor(BaseExecutor): do not allow other tasks to use the same compute node. * error_log_file (str): path to the error log file, primarily used to merge the log of multiple tasks in one file. - * run_time_limit (int): the maximum time the execution of the submitted Python function is - allowed to take in seconds. + * run_time_max (int): the maximum time the execution of the submitted Python function is + allowed to take in seconds. * priority (int): the queuing system priority assigned to a given Python function to influence the scheduling. * slurm_cmd_args (list): Additional command line arguments for the srun call (SLURM only) @@ -149,7 +149,7 @@ def __init__( - do not allow other tasks to use the same compute node. * error_log_file (str): path to the error log file, primarily used to merge the log of multiple tasks in one file. - * run_time_limit (int): the maximum time the execution of the submitted Python + * run_time_max (int): the maximum time the execution of the submitted Python function is allowed to take in seconds. * priority (int): the queuing system priority assigned to a given Python function to influence the scheduling. @@ -293,8 +293,8 @@ class SlurmJobExecutor(BaseExecutor): do not allow other tasks to use the same compute node. * error_log_file (str): path to the error log file, primarily used to merge the log of multiple tasks in one file. - * run_time_limit (int): the maximum time the execution of the submitted Python function is - allowed to take in seconds. + * run_time_max (int): the maximum time the execution of the submitted Python function is + allowed to take in seconds. * priority (int): the queuing system priority assigned to a given Python function to influence the scheduling. * slurm_cmd_args (list): Additional command line arguments for the srun call (SLURM only) @@ -390,7 +390,7 @@ def __init__( - do not allow other tasks to use the same compute node. * error_log_file (str): path to the error log file, primarily used to merge the log of multiple tasks in one file. - * run_time_limit (int): the maximum time the execution of the submitted Python + * run_time_max (int): the maximum time the execution of the submitted Python function is allowed to take in seconds. * priority (int): the queuing system priority assigned to a given Python function to influence the scheduling. @@ -518,8 +518,8 @@ def create_slurm_executor( do not allow other tasks to use the same compute node. * error_log_file (str): path to the error log file, primarily used to merge the log of multiple tasks in one file. - * run_time_limit (int): the maximum time the execution of the submitted Python function is - allowed to take in seconds. + * run_time_max (int): the maximum time the execution of the submitted Python function is + allowed to take in seconds. * priority (int): the queuing system priority assigned to a given Python function to influence the scheduling. * slurm_cmd_args (list): Additional command line arguments for the srun call (SLURM only) diff --git a/src/executorlib/standalone/command.py b/src/executorlib/standalone/command.py index 926de511..4557eeee 100644 --- a/src/executorlib/standalone/command.py +++ b/src/executorlib/standalone/command.py @@ -126,7 +126,7 @@ def generate_slurm_command( openmpi_oversubscribe: bool = False, slurm_cmd_args: Optional[list[str]] = None, pmi_mode: Optional[str] = None, - run_time_limit: Optional[int] = None, + run_time_max: Optional[int] = None, ) -> list[str]: """ Generate the command list for the SLURM interface. @@ -141,7 +141,7 @@ def generate_slurm_command( openmpi_oversubscribe (bool, optional): Whether to oversubscribe the cores. Defaults to False. slurm_cmd_args (list[str], optional): Additional command line arguments. Defaults to []. pmi_mode (str): PMI interface to use (OpenMPI v5 requires pmix) default is None - run_time_limit (int): The maximum runtime in seconds for each task. Default: None + run_time_max (int): The maximum runtime in seconds for each task. Default: None Returns: list[str]: The generated command list. @@ -161,8 +161,8 @@ def generate_slurm_command( command_prepend_lst += ["--exact"] if openmpi_oversubscribe: command_prepend_lst += ["--oversubscribe"] - if run_time_limit is not None: - command_prepend_lst += ["--time=" + str(run_time_limit // 60 + 1)] + if run_time_max is not None: + command_prepend_lst += ["--time=" + str(run_time_max // 60 + 1)] if slurm_cmd_args is not None and len(slurm_cmd_args) > 0: command_prepend_lst += slurm_cmd_args return command_prepend_lst diff --git a/src/executorlib/standalone/validate.py b/src/executorlib/standalone/validate.py index ec4251be..6b455272 100644 --- a/src/executorlib/standalone/validate.py +++ b/src/executorlib/standalone/validate.py @@ -23,7 +23,7 @@ class ResourceDictValidation(BaseModel): num_nodes: Optional[int] = None exclusive: Optional[bool] = None error_log_file: Optional[str] = None - run_time_limit: Optional[int] = None + run_time_max: Optional[int] = None priority: Optional[int] = None slurm_cmd_args: Optional[list[str]] = None diff --git a/src/executorlib/task_scheduler/file/spawner_pysqa.py b/src/executorlib/task_scheduler/file/spawner_pysqa.py index 37993f25..ecf88a01 100644 --- a/src/executorlib/task_scheduler/file/spawner_pysqa.py +++ b/src/executorlib/task_scheduler/file/spawner_pysqa.py @@ -69,7 +69,7 @@ def execute_with_pysqa( "command": " ".join(command), "dependency_list": [str(qid) for qid in task_dependent_lst], "working_directory": os.path.abspath(cwd), - "run_time_max": resource_dict.get("run_time_limit"), + "run_time_max": resource_dict.get("run_time_max"), } if "cwd" in resource_dict: del resource_dict["cwd"] diff --git a/src/executorlib/task_scheduler/interactive/spawner_flux.py b/src/executorlib/task_scheduler/interactive/spawner_flux.py index 07c61c0a..5989c624 100644 --- a/src/executorlib/task_scheduler/interactive/spawner_flux.py +++ b/src/executorlib/task_scheduler/interactive/spawner_flux.py @@ -44,7 +44,7 @@ class FluxPythonSpawner(BaseSpawner): flux_executor (flux.job.FluxExecutor, optional): The FluxExecutor instance. Defaults to None. flux_executor_nesting (bool, optional): Whether to use nested FluxExecutor. Defaults to False. flux_log_files (bool, optional): Write flux stdout and stderr files. Defaults to False. - run_time_limit (int): The maximum runtime in seconds for each task. Default: None + run_time_max (int): The maximum runtime in seconds for each task. Default: None """ def __init__( @@ -62,7 +62,7 @@ def __init__( flux_executor: Optional[flux.job.FluxExecutor] = None, flux_executor_nesting: bool = False, flux_log_files: bool = False, - run_time_limit: Optional[int] = None, + run_time_max: Optional[int] = None, ): super().__init__( cwd=cwd, @@ -80,7 +80,7 @@ def __init__( self._flux_log_files = flux_log_files self._priority = priority self._future = None - self._run_time_limit = run_time_limit + self._run_time_max = run_time_max def bootup( self, @@ -131,8 +131,8 @@ def bootup( if self._cwd is not None: jobspec.cwd = self._cwd os.makedirs(self._cwd, exist_ok=True) - if self._run_time_limit is not None: - jobspec.duration = self._run_time_limit + if self._run_time_max is not None: + jobspec.duration = self._run_time_max file_prefix = "flux_" + str(self._worker_id) if self._flux_log_files and self._cwd is not None: jobspec.stderr = os.path.join(self._cwd, file_prefix + ".err") diff --git a/src/executorlib/task_scheduler/interactive/spawner_pysqa.py b/src/executorlib/task_scheduler/interactive/spawner_pysqa.py index 3cdfc709..3d5b9db5 100644 --- a/src/executorlib/task_scheduler/interactive/spawner_pysqa.py +++ b/src/executorlib/task_scheduler/interactive/spawner_pysqa.py @@ -32,7 +32,7 @@ def __init__( pmi_mode: Optional[str] = None, config_directory: Optional[str] = None, backend: Optional[str] = None, - run_time_limit: Optional[int] = None, + run_time_max: Optional[int] = None, **kwargs, ): """ @@ -52,7 +52,7 @@ def __init__( pmi_mode (str, optional): PMI interface to use (OpenMPI v5 requires pmix) default is None config_directory (str, optional): path to the pysqa config directory (only for pysqa based backend). backend (str): name of the backend used to spawn tasks. - run_time_limit (int): The maximum runtime in seconds for each task. Default: None + run_time_max (int): The maximum runtime in seconds for each task. Default: None """ super().__init__( cwd=cwd, @@ -71,7 +71,7 @@ def __init__( self._pysqa_submission_kwargs = kwargs self._process: Optional[int] = None self._queue_adapter: Optional[QueueAdapter] = None - self._run_time_limit = run_time_limit + self._run_time_max = run_time_max def bootup( self, @@ -195,7 +195,7 @@ def _start_process_helper( command=" ".join(self.generate_command(command_lst=command_lst)), working_directory=working_directory, cores=int(self._cores * self._threads_per_core), - run_time_max=self._run_time_limit, + run_time_max=self._run_time_max, **self._pysqa_submission_kwargs, ) diff --git a/src/executorlib/task_scheduler/interactive/spawner_slurm.py b/src/executorlib/task_scheduler/interactive/spawner_slurm.py index 6a549e28..34262b29 100644 --- a/src/executorlib/task_scheduler/interactive/spawner_slurm.py +++ b/src/executorlib/task_scheduler/interactive/spawner_slurm.py @@ -32,7 +32,7 @@ def __init__( openmpi_oversubscribe: bool = False, slurm_cmd_args: Optional[list[str]] = None, pmi_mode: Optional[str] = None, - run_time_limit: Optional[int] = None, + run_time_max: Optional[int] = None, ): """ Srun interface implementation. @@ -48,7 +48,7 @@ def __init__( openmpi_oversubscribe (bool, optional): Whether to oversubscribe the cores. Defaults to False. slurm_cmd_args (list[str], optional): Additional command line arguments. Defaults to []. pmi_mode (str): PMI interface to use (OpenMPI v5 requires pmix) default is None - run_time_limit (int): The maximum runtime in seconds for each task. Default: None + run_time_max (int): The maximum runtime in seconds for each task. Default: None """ super().__init__( cwd=cwd, @@ -62,7 +62,7 @@ def __init__( self._num_nodes = num_nodes self._exclusive = exclusive self._pmi_mode = pmi_mode - self._run_time_limit = run_time_limit + self._run_time_max = run_time_max def generate_command(self, command_lst: list[str]) -> list[str]: """ @@ -84,7 +84,7 @@ def generate_command(self, command_lst: list[str]) -> list[str]: openmpi_oversubscribe=self._openmpi_oversubscribe, slurm_cmd_args=self._slurm_cmd_args, pmi_mode=self._pmi_mode, - run_time_limit=self._run_time_limit, + run_time_max=self._run_time_max, ) return super().generate_command( command_lst=command_prepend_lst + command_lst, diff --git a/tests/unit/executor/test_flux_job.py b/tests/unit/executor/test_flux_job.py index 07c385f6..990d7ac0 100644 --- a/tests/unit/executor/test_flux_job.py +++ b/tests/unit/executor/test_flux_job.py @@ -117,7 +117,7 @@ def test_single_task(self): [[(1, 2, 0), (1, 2, 1)], [(2, 2, 0), (2, 2, 1)], [(3, 2, 0), (3, 2, 1)]], ) - def test_run_time_limit(self): + def test_run_time_max(self): with FluxJobExecutor( max_cores=1, resource_dict={"cores": 1}, @@ -125,8 +125,8 @@ def test_run_time_limit(self): block_allocation=False, pmi_mode=pmi, ) as p: - f1 = p.submit(delayed_calc, 1, resource_dict={"run_time_limit": 1}) - f2 = p.submit(delayed_calc, 2, resource_dict={"run_time_limit": 5}) + f1 = p.submit(delayed_calc, 1, resource_dict={"run_time_max": 1}) + f2 = p.submit(delayed_calc, 2, resource_dict={"run_time_max": 5}) self.assertFalse(f1.done()) self.assertFalse(f2.done()) self.assertEqual(f2.result(), 2) diff --git a/tests/unit/standalone/test_slurm_command.py b/tests/unit/standalone/test_slurm_command.py index 98c5d58d..140a45d0 100644 --- a/tests/unit/standalone/test_slurm_command.py +++ b/tests/unit/standalone/test_slurm_command.py @@ -55,7 +55,7 @@ def test_generate_slurm_command(self): exclusive=True, openmpi_oversubscribe=True, slurm_cmd_args=["--help"], - run_time_limit=250, + run_time_max=250, ) self.assertEqual(len(command_lst), 13) reply_lst = ['srun', '-n', '1', '-D', '/tmp/test', '-N', '1', '--cpus-per-task=2', '--gpus-per-task=1', '--exact', '--oversubscribe', '--time=5', '--help'] diff --git a/tests/unit/standalone/test_validate.py b/tests/unit/standalone/test_validate.py index 7b6c4904..f23db388 100644 --- a/tests/unit/standalone/test_validate.py +++ b/tests/unit/standalone/test_validate.py @@ -55,7 +55,7 @@ def test_get_accepted_keys(self): "num_nodes", "exclusive", "error_log_file", - "run_time_limit", + "run_time_max", "priority", "slurm_cmd_args" ]