diff --git a/arc/job/adapter.py b/arc/job/adapter.py index de8c747718..79a3ac8f97 100644 --- a/arc/job/adapter.py +++ b/arc/job/adapter.py @@ -49,6 +49,18 @@ constraint_type_dict = {2: 'B', 3: 'A', 4: 'D'} +# ARC keeps job-memory math in base-2 units internally. In other words, ARC's +# "GB" behaves as GiB and is converted using 1 GiB = 1024 MiB. This is +# deliberate: many chemistry codes and cluster templates ultimately consume an +# integer "MB"-style value, and using a consistent binary convention avoids +# mixing 1000- and 1024-based conversions in different parts of the pipeline. +# Human-facing decimal capacities are smaller when expressed in base-2, e.g. +# 10 GB (decimal) ~= 9.31 GiB. ARC therefore interprets job_memory_gb=10 as +# 10 GiB, not 10 decimal GB. +MEMORY_GB_TO_MIB = 1024 +DEFAULT_JOB_MEMORY_OVERHEAD = 1.10 +CAPPED_JOB_MEMORY_OVERHEAD = 1.05 + class JobEnum(str, Enum): """ @@ -581,21 +593,25 @@ def set_cpu_and_mem(self): f'exceeds {100 * job_max_server_node_memory_allocation}% of the the maximum node memory on ' f'{self.server}. Setting it to {job_max_server_node_memory_allocation * max_mem:.2f} GB.') self.job_memory_gb = job_max_server_node_memory_allocation * max_mem - total_submit_script_memory = self.job_memory_gb * 1024 * 1.05 # MB + total_submit_script_memory_mib = math.ceil(self.job_memory_gb * MEMORY_GB_TO_MIB * CAPPED_JOB_MEMORY_OVERHEAD) self.job_status[1]['keywords'].append('max_total_job_memory') # Useful info when troubleshooting. else: - total_submit_script_memory = self.job_memory_gb * 1024 * 1.1 # MB + total_submit_script_memory_mib = math.ceil(self.job_memory_gb * MEMORY_GB_TO_MIB * DEFAULT_JOB_MEMORY_OVERHEAD) + self.submit_script_memory_mib = total_submit_script_memory_mib # Determine amount of memory in submit script based on cluster job scheduling system. 
cluster_software = servers[self.server].get('cluster_soft').lower() if self.server is not None else None if cluster_software in ['oge', 'sge', 'htcondor']: - # In SGE, "-l h_vmem=5000M" specifies the memory for all cores to be 5000 MB. - self.submit_script_memory = math.ceil(total_submit_script_memory) # in MB + # ARC uses MiB internally and passes that integer consistently to scheduler templates. + self.submit_script_memory = total_submit_script_memory_mib if cluster_software in ['pbs']: - # In PBS, "#PBS -l select=1:ncpus=8:mem=12000000" specifies the memory for all cores to be 12 MB. - self.submit_script_memory = math.ceil(total_submit_script_memory) * 1E6 # in Bytes + # ARC keeps the PBS request in MiB as well. The template still uses + # an "mb" suffix, but the integer is derived from the same base-2 + # MiB count used everywhere else in ARC. + self.submit_script_memory = total_submit_script_memory_mib elif cluster_software in ['slurm']: - # In Slurm, "#SBATCH --mem-per-cpu=2000" specifies the memory **per cpu/thread** to be 2000 MB. - self.submit_script_memory = math.ceil(total_submit_script_memory / self.cpu_cores) # in MB + # In Slurm, "#SBATCH --mem-per-cpu=2000" is a per-core request, so + # we divide ARC's total MiB budget across the requested cores. 
+ self.submit_script_memory = math.ceil(total_submit_script_memory_mib / self.cpu_cores) self.set_input_file_memory() def as_dict(self) -> dict: diff --git a/arc/job/adapter_test.py b/arc/job/adapter_test.py index 9657f9a62a..dd1a520620 100644 --- a/arc/job/adapter_test.py +++ b/arc/job/adapter_test.py @@ -244,7 +244,7 @@ def test_set_cpu_and_mem(self): self.job_4.server = 'server3' self.job_4.cpu_cores = None self.job_4.set_cpu_and_mem() - expected_memory = math.ceil(14 * 1024 * 1.1) * 1E6 + expected_memory = math.ceil(14 * 1024 * 1.1) self.assertEqual(self.job_4.submit_script_memory, expected_memory) self.job_4.server = 'local' diff --git a/arc/job/adapters/common.py b/arc/job/adapters/common.py index 82a8db0c40..1acdd8b607 100644 --- a/arc/job/adapters/common.py +++ b/arc/job/adapters/common.py @@ -183,6 +183,7 @@ def _initialize_adapter(obj: 'JobAdapter', obj.server_nodes = server_nodes or list() obj.species = [species] if species is not None and not isinstance(species, list) else species obj.submit_script_memory = None + obj.submit_script_memory_mib = None obj.testing = testing obj.times_rerun = times_rerun obj.torsions = [torsions] if torsions is not None and not isinstance(torsions[0], list) else torsions diff --git a/arc/job/adapters/gaussian.py b/arc/job/adapters/gaussian.py index 9321d454f2..40736d6d8b 100644 --- a/arc/job/adapters/gaussian.py +++ b/arc/job/adapters/gaussian.py @@ -36,6 +36,12 @@ settings['default_job_settings'], settings['global_ess_settings'], settings['input_filenames'], \ settings['output_filenames'], settings['servers'], settings['submit_filenames'] +# Gaussian should not consume the entire scheduler allocation. ARC reserves a +# fixed fraction of the submit-script memory for non-Gaussian overhead such as +# the scheduler, runtime, scratch bookkeeping, and Gaussian allocations outside +# the explicit %mem budget. 
+GAUSSIAN_MEMORY_HEADROOM_FRACTION = 0.90 + # job_type_1: '' for sp, irc, or composite methods, 'opt=calcfc', 'opt=(calcfc,ts,noeigen)', # job_type_2: '' or 'freq iop(7/33=1)' (cannot be combined with CBS-QB3) @@ -493,8 +499,14 @@ def set_input_file_memory(self) -> None: """ Set the input_file_memory attribute. """ - # Gaussian's memory is in MB, total for all cpu cores - self.input_file_memory = math.ceil(self.job_memory_gb * 1024) + # Gaussian's %mem is the total memory budget for the process. ARC keeps + # scheduler memory in MiB and intentionally gives Gaussian only part of + # that total so the queue allocation retains headroom. This matters most + # on capped nodes: e.g., a human "10 GB" node is only ~9.31 GiB, and if + # ARC already requests ~95% of a node, passing the entire allocation to + # %mem leaves too little room for runtime overhead and can trigger galloc. + submit_script_memory_mib = self.submit_script_memory_mib or math.ceil(self.job_memory_gb * 1024) + self.input_file_memory = max(1, math.floor(submit_script_memory_mib * GAUSSIAN_MEMORY_HEADROOM_FRACTION)) def execute_incore(self): """ diff --git a/arc/job/adapters/gaussian_test.py b/arc/job/adapters/gaussian_test.py index c81e8669b9..0a84d19372 100644 --- a/arc/job/adapters/gaussian_test.py +++ b/arc/job/adapters/gaussian_test.py @@ -515,15 +515,18 @@ def test_set_cpu_and_mem(self): """Test assigning number of cpu's and memory""" self.job_8.input_file_memory = None self.job_8.submit_script_memory = None + self.job_8.submit_script_memory_mib = None self.job_8.server = 'server2' self.job_8.set_cpu_and_mem() self.assertEqual(self.job_8.cpu_cores, 8) + self.assertEqual(self.job_8.submit_script_memory_mib, math.ceil(14 * 1024 * 1.1)) + self.assertLess(self.job_8.input_file_memory, self.job_8.submit_script_memory_mib) def test_set_input_file_memory(self): """Test setting the input_file_memory argument""" - expected_memory = math.ceil(14 * 1024) + expected_memory = math.floor(math.ceil(14 * 1024 * 1.1) 
* 0.9) self.assertEqual(self.job_1.input_file_memory, expected_memory) - self.assertEqual(self.job_2.input_file_memory, 14336) + self.assertEqual(self.job_2.input_file_memory, expected_memory) def test_write_input_file_multi(self): """Test writing Gaussian input files""" @@ -531,7 +534,7 @@ def test_write_input_file_multi(self): with open(os.path.join(self.job_multi.local_path, input_filenames[self.job_multi.job_adapter]), 'r') as f: content_multi = f.read() job_multi_expected_input_file = """%chk=check.chk -%mem=14336mb +%mem=14193mb %NProcShared=8 #P opt=(calcfc) SCRF=(smd, Solvent=water) uwb97xd/def2tzvp IOp(2/9=2000) @@ -545,7 +548,7 @@ def test_write_input_file_multi(self): --link1-- %chk=check.chk -%mem=14336mb +%mem=14193mb %NProcShared=8 #P opt=(calcfc) SCRF=(smd, Solvent=water) uwb97xd/def2tzvp IOp(2/9=2000) @@ -559,7 +562,7 @@ def test_write_input_file_multi(self): --link1-- %chk=check.chk -%mem=14336mb +%mem=14193mb %NProcShared=8 #P opt=(calcfc) SCRF=(smd, Solvent=water) wb97xd/def2tzvp IOp(2/9=2000) @@ -588,7 +591,7 @@ def test_write_input_file(self): with open(os.path.join(self.job_1.local_path, input_filenames[self.job_1.job_adapter]), 'r') as f: content_1 = f.read() job_1_expected_input_file = """%chk=check.chk -%mem=14336mb +%mem=14193mb %NProcShared=8 #P opt=(calcfc) cbs-qb3 IOp(2/9=2000) IOp(1/12=5,3/44=0) @@ -606,7 +609,7 @@ def test_write_input_file(self): with open(os.path.join(self.job_3.local_path, input_filenames[self.job_3.job_adapter]), 'r') as f: content_3 = f.read() job_3_expected_input_file = """%chk=check.chk -%mem=14336mb +%mem=14193mb %NProcShared=8 #P opt=(calcfc) SCRF=(smd, Solvent=water) uwb97xd/def2tzvp IOp(2/9=2000) @@ -624,7 +627,7 @@ def test_write_input_file(self): with open(os.path.join(self.job_4.local_path, input_filenames[self.job_4.job_adapter]), 'r') as f: content_4 = f.read() job_4_expected_input_file = """%chk=check.chk -%mem=14336mb +%mem=14193mb %NProcShared=8 #P opt=(calcfc,maxStep=5,modredundant,noeigentest) 
integral=(grid=ultrafine, Acc2E=12) guess=mix wb97xd/def2tzvp IOp(2/9=2000) scf=(direct,tight) @@ -657,7 +660,7 @@ def test_write_input_file(self): with open(os.path.join(self.job_5.local_path, input_filenames[self.job_5.job_adapter]), 'r') as f: content_5 = f.read() job_5_expected_input_file = """%chk=check.chk -%mem=14336mb +%mem=14193mb %NProcShared=8 #P uwb97xd/def2tzvp freq IOp(7/33=1) integral=(grid=ultrafine, Acc2E=12) IOp(2/9=2000) scf=(direct,tight) @@ -675,7 +678,7 @@ def test_write_input_file(self): with open(os.path.join(self.job_6.local_path, input_filenames[self.job_6.job_adapter]), 'r') as f: content_6 = f.read() job_6_expected_input_file = """%chk=check.chk -%mem=14336mb +%mem=14193mb %NProcShared=8 #P opt=(calcfc) uwb97xd/def2tzvp IOp(2/9=2000) @@ -693,7 +696,7 @@ def test_write_input_file(self): with open(os.path.join(self.job_7.local_path, input_filenames[self.job_7.job_adapter]), 'r') as f: content_7 = f.read() job_7_expected_input_file = """%chk=check.chk -%mem=14336mb +%mem=14193mb %NProcShared=8 #P irc=(CalcAll,maxpoints=50,reverse,stepsize=7) uwb97xd/def2tzvp IOp(2/9=2000) @@ -711,7 +714,7 @@ def test_write_input_file(self): with open(os.path.join(self.job_opt_uff.local_path, input_filenames[self.job_opt_uff.job_adapter]), 'r') as f: content_opt_uff = f.read() job_opt_uff_expected_input_file = """%chk=check.chk -%mem=14336mb +%mem=14193mb %NProcShared=8 #P opt uff IOp(2/9=2000) @@ -776,7 +779,7 @@ def test_trsh_write_input_file(self): with open(os.path.join(self.job_10.local_path, input_filenames[self.job_10.job_adapter]), 'r') as f: content_10 = f.read() job_10_expected_input_file = """%chk=check.chk -%mem=14336mb +%mem=14193mb %NProcShared=8 #P opt=(calcfc,maxcycle=100,maxstep=5,tight) uwb97xd integral=(grid=ultrafine, Acc2E=14) IOp(2/9=2000) scf=(direct,tight) @@ -794,7 +797,7 @@ def test_trsh_write_input_file(self): with open(os.path.join(self.job_11.local_path, input_filenames[self.job_11.job_adapter]), 'r') as f: content_11 = f.read() 
job_11_expected_input_file = """%chk=check.chk -%mem=14336mb +%mem=14193mb %NProcShared=8 #P opt=(calcfc,maxcycle=100,maxstep=5,tight) guess=mix wb97xd integral=(grid=ultrafine, Acc2E=14) IOp(2/9=2000) scf=(direct,tight) @@ -820,7 +823,7 @@ def test_trsh_write_input_file(self): with open(os.path.join(self.job_12.local_path, input_filenames[self.job_12.job_adapter]), 'r') as f: content_12 = f.read() job_12_expected_input_file = """%chk=check.chk -%mem=14336mb +%mem=14193mb %NProcShared=8 #P opt=(calcfc,maxcycle=100,maxstep=5,tight) guess=mix wb97xd integral=(grid=ultrafine, Acc2E=14) IOp(2/9=2000) nosymm scf=(direct,tight,xqc) @@ -846,7 +849,7 @@ def test_trsh_write_input_file(self): with open(os.path.join(self.job_13.local_path, input_filenames[self.job_13.job_adapter]), 'r') as f: content_13 = f.read() job_13_expected_input_file = """%chk=check.chk -%mem=14336mb +%mem=14193mb %NProcShared=8 #P opt=(calcfc,maxcycle=100,maxstep=5,tight) guess=mix wb97xd integral=(grid=ultrafine, Acc2E=14) IOp(2/9=2000) nosymm scf=(NDamp=30,direct,tight,xqc) @@ -872,7 +875,7 @@ def test_trsh_write_input_file(self): with open(os.path.join(self.job_14.local_path, input_filenames[self.job_14.job_adapter]), 'r') as f: content_14 = f.read() job_14_expected_input_file = """%chk=check.chk -%mem=14336mb +%mem=14193mb %NProcShared=8 #P opt=(calcfc,maxcycle=100,maxstep=5,tight) guess=mix wb97xd integral=(grid=ultrafine, Acc2E=14) IOp(2/9=2000) nosymm scf=(NDamp=30,NoDIIS,direct,tight,xqc) @@ -898,7 +901,7 @@ def test_trsh_write_input_file(self): with open(os.path.join(self.job_15.local_path, input_filenames[self.job_15.job_adapter]), 'r') as f: content_15 = f.read() job_15_expected_input_file = """%chk=check.chk -%mem=14336mb +%mem=14193mb %NProcShared=8 #P opt=(calcfc,cartesian,maxcycle=100,maxstep=5,tight) guess=mix wb97xd integral=(grid=ultrafine, Acc2E=14) IOp(2/9=2000) nosymm scf=(NDamp=30,NoDIIS,direct,tight,xqc) @@ -925,7 +928,7 @@ def test_trsh_write_input_file(self): content_16 = 
f.read() job_16_expected_input_file = """%chk=check.chk -%mem=14336mb +%mem=14193mb %NProcShared=8 #P opt=(cartesian) integral=(grid=ultrafine, Acc2E=14) guess=INDO wb97xd IOp(2/9=2000) nosymm scf=(Fermi,NDamp=30,NoDIIS,NoVarAcc,Noincfock,direct,tight,xqc) @@ -952,7 +955,7 @@ def test_trsh_write_input_file(self): with open(os.path.join(self.job_17.local_path, input_filenames[self.job_17.job_adapter]), 'r') as f: content_17 = f.read() job_17_expected_input_file = """%chk=check.chk -%mem=14336mb +%mem=14193mb %NProcShared=8 #P opt=(calcfc,maxcycle=200,maxstep=5,tight) guess=mix wb97xd integral=(grid=ultrafine, Acc2E=14) IOp(2/9=2000) scf=(direct,tight,xqc) @@ -978,7 +981,7 @@ def test_trsh_write_input_file(self): with open(os.path.join(self.job_18.local_path, input_filenames[self.job_18.job_adapter]), 'r') as f: content_18 = f.read() job_18_expected_input_file = """%chk=check.chk -%mem=14336mb +%mem=14193mb %NProcShared=8 #P opt=(calcfc,maxcycle=100,maxstep=5,tight) guess=mix wb97xd integral=(grid=ultrafine, Acc2E=14) IOp(2/9=2000) int=grid=300590 scf=(direct,tight) @@ -1004,7 +1007,7 @@ def test_trsh_write_input_file(self): with open(os.path.join(self.job_19.local_path, input_filenames[self.job_19.job_adapter]), 'r') as f: content_19 = f.read() job_19_expected_input_file = """%chk=check.chk -%mem=14336mb +%mem=14193mb %NProcShared=8 #P opt=(calcfc,maxcycle=100,maxstep=5,tight) guess=mix wb97xd integral=(grid=ultrafine, Acc2E=14) IOp(2/9=2000) nosymm scf=(Fermi,NDamp=30,NoDIIS,NoVarAcc,Noincfock,direct,tight,xqc) @@ -1030,7 +1033,7 @@ def test_trsh_write_input_file(self): with open(os.path.join(self.job_20.local_path, input_filenames[self.job_20.job_adapter]), 'r') as f: content_20 = f.read() job_20_expected_input_file = """%chk=check.chk -%mem=14336mb +%mem=14193mb %NProcShared=8 #P opt=(calcfc,maxcycle=100,maxstep=5,tight) guess=mix wb97xd integral=(grid=ultrafine, Acc2E=14) IOp(2/9=2000) scf=(NDamp=30,NoDIIS,NoVarAcc,direct,tight,xqc) @@ -1057,7 +1060,7 @@ def 
test_trsh_write_input_file(self): with open(os.path.join(self.job_21.local_path, input_filenames[self.job_21.job_adapter]), 'r') as f: content_21 = f.read() job_21_expected_input_file = """%chk=check.chk -%mem=14336mb +%mem=14193mb %NProcShared=8 #P opt=(calcfc,maxcycle=100,maxstep=5,tight) guess=INDO wb97xd integral=(grid=ultrafine, Acc2E=14) IOp(2/9=2000) int=grid=300590 scf=(NDamp=30,NoDIIS,NoVarAcc,direct,tight,xqc) @@ -1084,7 +1087,7 @@ def test_trsh_write_input_file(self): with open(os.path.join(self.job_22.local_path, input_filenames[self.job_22.job_adapter]), 'r') as f: content_22 = f.read() job_22_expected_input_file = """%chk=check.chk -%mem=14336mb +%mem=14193mb %NProcShared=8 #P opt=(calcfc,maxcycle=200,maxstep=5,tight) guess=mix wb97xd integral=(grid=ultrafine, Acc2E=14) IOp(2/9=2000) scf=(direct,tight) @@ -1111,7 +1114,7 @@ def test_trsh_write_input_file(self): with open(os.path.join(self.job_23.local_path, input_filenames[self.job_23.job_adapter]), 'r') as f: content_23 = f.read() job_23_expected_input_file = """%chk=check.chk -%mem=14336mb +%mem=14193mb %NProcShared=8 #P opt=(RFO,calcfc,maxcycle=200,maxstep=5,tight) guess=mix wb97xd integral=(grid=ultrafine, Acc2E=14) IOp(2/9=2000) scf=(direct,tight) @@ -1138,7 +1141,7 @@ def test_trsh_write_input_file(self): with open(os.path.join(self.job_24.local_path, input_filenames[self.job_24.job_adapter]), 'r') as f: content_24 = f.read() job_24_expected_input_file = """%chk=check.chk -%mem=14336mb +%mem=14193mb %NProcShared=8 #P opt=(GDIIS,calcfc,maxcycle=200,maxstep=5,tight) guess=mix wb97xd integral=(grid=ultrafine, Acc2E=14) IOp(2/9=2000) scf=(direct,tight) diff --git a/arc/job/trsh.py b/arc/job/trsh.py index f1878a7011..61b77a8e1c 100644 --- a/arc/job/trsh.py +++ b/arc/job/trsh.py @@ -42,9 +42,11 @@ delete_command, inconsistency_ab, inconsistency_az, maximum_barrier, preserve_params_in_scan, rotor_scan_resolution, \ - servers, submit_filenames = settings['delete_command'], settings['inconsistency_ab'], 
settings['inconsistency_az'], \ - settings['maximum_barrier'], settings['preserve_params_in_scan'], \ - settings['rotor_scan_resolution'], settings['servers'], settings['submit_filenames'] + servers, submit_filenames, default_job_settings = settings['delete_command'], settings['inconsistency_ab'], \ + settings['inconsistency_az'], settings['maximum_barrier'], \ + settings['preserve_params_in_scan'], \ + settings['rotor_scan_resolution'], settings['servers'], \ + settings['submit_filenames'], settings['default_job_settings'] def determine_ess_status(output_path: str, @@ -980,11 +982,24 @@ def trsh_ess_job(label: str, # Increase memory allocation couldnt_trsh = False max_mem = servers[server].get('memory', 128) # Node memory in GB, defaults to 128 if not specified - memory = min(memory_gb * 2, max_mem * 0.95) + max_mem_allocation = max_mem * default_job_settings.get('job_max_server_node_memory_allocation', 0.95) + memory = min(memory_gb * 2, max_mem_allocation) if memory > memory_gb: logger.info(f'Troubleshooting {job_type} job in {software} for {label} using more memory: {memory} GB ' f'instead of {memory_gb} GB') ess_trsh_methods.append('memory') + else: + couldnt_trsh = True + output_errors.append( + f'Error: Could not troubleshoot {job_type} for {label}! Gaussian exhausted memory even after ARC ' + f'reached the configured node-memory cap ({max_mem_allocation:.2f} GB total allocation) while ' + f'still reserving scheduler headroom. Use a higher-memory node or lower the job cost; ' + ) + logger.error( + f'Could not troubleshoot {job_type} job in {software} for {label}. ARC already reached the ' + f'configured node-memory cap ({max_mem_allocation:.2f} GB total allocation) and still preserved ' + f'Gaussian headroom.' 
+ ) if attempted_ess_trsh_methods: if attempted_ess_trsh_methods == ess_trsh_methods: diff --git a/arc/job/trsh_test.py b/arc/job/trsh_test.py index d974874e9c..c4c1d8b709 100644 --- a/arc/job/trsh_test.py +++ b/arc/job/trsh_test.py @@ -432,6 +432,20 @@ def test_trsh_ess_job(self): self.assertIn('Error: Could not troubleshoot opt for ethanol! The job ran out of disc space on server1; ', output_errors) # Gaussian: test 6 + job_status = {'keywords': ['Memory', 'max_total_job_memory'], + 'error': 'Memory allocation failed (did you ask for too much?)'} + capped_memory_gb = settings['default_job_settings']['job_max_server_node_memory_allocation'] * \ + settings['servers']['server2']['memory'] + output_errors, ess_trsh_methods, remove_checkfile, level_of_theory, software, job_type, fine, trsh_keyword, \ + memory, shift, cpu_cores, couldnt_trsh = trsh.trsh_ess_job(label, level_of_theory, 'server2', job_status, + job_type, software, fine, capped_memory_gb, + num_heavy_atoms, cpu_cores, []) + + self.assertTrue(couldnt_trsh) + self.assertEqual(memory, capped_memory_gb) + self.assertIn('Use a higher-memory node or lower the job cost', output_errors[0]) + + # Gaussian: test 7 job_status = {'keywords': ['SCF', 'GL502', 'NoSymm']} ess_trsh_methods = ['scf=(NoDIIS)', 'int=(Acc2E=14)', 'checkfile=None', 'scf=(qc)', 'NoSymm','scf=(NDamp=30)', 'guess=INDO', 'scf=(Fermi)', 'scf=(Noincfock)', 'scf=(NoVarAcc)'] @@ -445,7 +459,7 @@ def test_trsh_ess_job(self): output_errors, ) - # Gaussian: test 7 + # Gaussian: test 8 job_status = {'keywords': ['MaxOptCycles', 'GL9999','SCF']} ess_trsh_methods = ['int=(Acc2E=14)'] output_errors, ess_trsh_methods, remove_checkfile, level_of_theory, software, job_type, fine, trsh_keyword, \ @@ -455,7 +469,7 @@ def test_trsh_ess_job(self): self.assertFalse(couldnt_trsh) self.assertIn('opt=(maxcycle=200)', ess_trsh_methods) - # Gaussian: test 8 - part 1 + # Gaussian: test 9 - part 1 # 'InaccurateQuadrature', 'GL502' job_status = {'keywords': 
['InaccurateQuadrature', 'GL502']} ess_trsh_methods = ['int=(Acc2E=14)'] @@ -467,7 +481,7 @@ def test_trsh_ess_job(self): self.assertIn('int=(Acc2E=14)', ess_trsh_methods) self.assertIn('int=grid=300590', ess_trsh_methods) - # Gaussian: test 8 - part 2 + # Gaussian: test 9 - part 2 # 'InaccurateQuadrature', 'GL502' job_status = {'keywords': ['InaccurateQuadrature', 'GL502']} ess_trsh_methods = ['int=(Acc2E=14)', 'int=grid=300590'] @@ -480,7 +494,7 @@ def test_trsh_ess_job(self): self.assertIn('int=grid=300590', ess_trsh_methods) self.assertIn('scf=(NoVarAcc)', ess_trsh_methods) - # Gaussian: test 8 - part 3 + # Gaussian: test 9 - part 3 # 'InaccurateQuadrature', 'GL502' job_status = {'keywords': ['InaccurateQuadrature', 'GL502']} ess_trsh_methods = ['int=(Acc2E=14)', 'int=grid=300590', 'scf=(NoVarAcc)'] @@ -494,7 +508,7 @@ def test_trsh_ess_job(self): self.assertIn('scf=(NoVarAcc)', ess_trsh_methods) self.assertIn('guess=INDO', ess_trsh_methods) - # Gaussian: test 9 - part 1 + # Gaussian: test 10 - part 1 # 'MaxOptCycles', 'GL9999' # Adding maxcycle=200 to opt job_status = {'keywords': ['MaxOptCycles', 'GL9999']} @@ -506,7 +520,7 @@ def test_trsh_ess_job(self): self.assertFalse(couldnt_trsh) self.assertIn('opt=(maxcycle=200)', ess_trsh_methods) - # Gaussian: test 9 - part 2 + # Gaussian: test 10 - part 2 # 'MaxOptCycles', 'GL9999' # Adding RFO to opt job_status = {'keywords': ['MaxOptCycles', 'GL9999']} @@ -520,7 +534,7 @@ def test_trsh_ess_job(self): self.assertIn('opt=(RFO)', ess_trsh_methods) self.assertIn('opt=(maxcycle=200,RFO)', trsh_keyword) - # Gaussian: test 9 - part 3 + # Gaussian: test 10 - part 3 # 'MaxOptCycles', 'GL9999' # Adding GDIIS to opt # Removing RFO from opt @@ -536,7 +550,7 @@ def test_trsh_ess_job(self): self.assertIn('opt=(GDIIS)', ess_trsh_methods) self.assertIn('opt=(maxcycle=200,GDIIS)', trsh_keyword) - # Gaussian: test 9 - part 4 + # Gaussian: test 10 - part 4 # 'MaxOptCycles', 'GL9999' # Adding GEDIIS to opt # Removing RFO from opt @@ 
-554,7 +568,7 @@ def test_trsh_ess_job(self): self.assertIn('opt=(GEDIIS)', ess_trsh_methods) self.assertIn('opt=(maxcycle=200,GEDIIS)', trsh_keyword) - # Gaussian: test 9 - part 5 + # Gaussian: test 10 - part 5 # 'MaxOptCycles', 'GL9999' # Final test to ensure that it cannot troubleshoot the job further job_status = {'keywords': ['MaxOptCycles', 'GL9999']} diff --git a/arc/settings/submit.py b/arc/settings/submit.py index 993681319a..bc1e23ec5b 100644 --- a/arc/settings/submit.py +++ b/arc/settings/submit.py @@ -63,6 +63,10 @@ {python_exe} -m arc.scripts.pipe_worker --pipe_root {pipe_root} --worker_id $WORKER_ID """, + # ARC passes a base-2-derived MiB integer into the PBS template. On our PBS + # systems, the `mb` suffix is interpreted accordingly, so the directive + # remains `mem={memory}mb` even though ARC's internal conversion uses + # 1 GiB = 1024 MiB. 'pbs': """#!/bin/bash -l #PBS -N {name} #PBS -q {queue} @@ -104,6 +108,9 @@ # Submission scripts stored as a dictionary with server and software as primary and secondary keys submit_scripts = { 'local': { + # ARC passes a base-2-derived MiB integer into this sample template. + # NOTE(review): this 'local' Gaussian template uses Slurm (#SBATCH) + # directives, not PBS, so the PBS `mem={memory}mb` note does not apply here. 'gaussian': """#!/bin/bash -l #SBATCH -p normal #SBATCH -J {name}