Skip to content

Commit d6759d0

Browse files
fix: account lookup / test in multicluster environment (#350)
We need to perform the account testing with `sacctmgr`, first, then with `sshare`. `sshare` might not provide all accounts, as SLURM in a multicluster environment is only queried on one cluster. This is an observation from my cluster, where `sshare` did only show accounts for cluster _A_, whereas `sacctmgr` displayed for clusters `A` and `B`. The reason is, that `sshare` queries the `slurmctld` of a given cluster, whereas `sacctmgr` queries the overlooking SLURM database of all clusters in a multi cluster environment. <!-- This is an auto-generated comment: release notes by coderabbit.ai --> ## Summary by CodeRabbit * **New Features** * Accept multiple SLURM accounts (comma/space-separated) and handle each as a distinct account entry; when multiple provided, the first is used for submission. * Safer quoting for account and partition values to prevent shell injection. * **Bug Fixes** * Prefer sacctmgr for account validation in multicluster setups, falling back to sshare when needed. * Deduplicate reported accounts and provide clearer combined error reports when validation fails. * **Documentation** * Clarified messages and comments about multicluster validation and deduplication. <!-- end of auto-generated comment: release notes by coderabbit.ai --> --------- Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com>
1 parent 660c800 commit d6759d0

File tree

1 file changed

+33
-17
lines changed

1 file changed

+33
-17
lines changed

snakemake_executor_plugin_slurm/__init__.py

Lines changed: 33 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -307,7 +307,7 @@ def run_job(self, job: JobExecutorInterface):
307307
"run_uuid": self.run_uuid,
308308
"slurm_logfile": slurm_logfile,
309309
"comment_str": comment_str,
310-
"account": self.get_account_arg(job),
310+
"account": next(self.get_account_arg(job)),
311311
"partition": self.get_partition_arg(job),
312312
"workdir": self.workflow.workdir_init,
313313
}
@@ -668,26 +668,37 @@ def get_account_arg(self, job: JobExecutorInterface):
668668
else raises an error - implicetly.
669669
"""
670670
if job.resources.get("slurm_account"):
671-
# here, we check whether the given or guessed account is valid
672-
# if not, a WorkflowError is raised
673-
self.test_account(job.resources.slurm_account)
674-
return f" -A '{job.resources.slurm_account}'"
671+
# split the account upon ',' and whitespace, to allow
672+
# multiple accounts being given
673+
accounts = [
674+
a for a in re.split(r"[,\s]+", job.resources.slurm_account) if a
675+
]
676+
for account in accounts:
677+
# here, we check whether the given or guessed account is valid
678+
# if not, a WorkflowError is raised
679+
self.test_account(account)
680+
# sbatch only allows one account per submission
681+
# yield one after the other, if multiple were given
682+
# we have to quote the account, because it might
683+
# contain build-in shell commands - see issue #354
684+
for account in accounts:
685+
yield f" -A {shlex.quote(account)}"
675686
else:
676687
if self._fallback_account_arg is None:
677688
self.logger.warning("No SLURM account given, trying to guess.")
678689
account = self.get_account()
679690
if account:
680691
self.logger.warning(f"Guessed SLURM account: {account}")
681692
self.test_account(f"{account}")
682-
self._fallback_account_arg = f" -A {account}"
693+
self._fallback_account_arg = f" -A {shlex.quote(account)}"
683694
else:
684695
self.logger.warning(
685696
"Unable to guess SLURM account. Trying to proceed without."
686697
)
687698
self._fallback_account_arg = (
688699
"" # no account specific args for sbatch
689700
)
690-
return self._fallback_account_arg
701+
yield self._fallback_account_arg
691702

692703
def get_partition_arg(self, job: JobExecutorInterface):
693704
"""
@@ -702,7 +713,9 @@ def get_partition_arg(self, job: JobExecutorInterface):
702713
self._fallback_partition = self.get_default_partition(job)
703714
partition = self._fallback_partition
704715
if partition:
705-
return f" -p {partition}"
716+
# we have to quote the partition, because it might
717+
# contain build-in shell commands
718+
return f" -p {shlex.quote(partition)}"
706719
else:
707720
return ""
708721

@@ -730,32 +743,35 @@ def test_account(self, account):
730743
"""
731744
tests whether the given account is registered, raises an error, if not
732745
"""
733-
cmd = "sshare -U --format Account%256 --noheader"
746+
# first we need to test with sacctmgr because sshare might not
747+
# work in a multicluster environment
748+
cmd = f'sacctmgr -n -s list user "{os.environ["USER"]}" format=account%256'
749+
sacctmgr_report = sshare_report = ""
734750
try:
735751
accounts = subprocess.check_output(
736752
cmd, shell=True, text=True, stderr=subprocess.PIPE
737753
)
738754
except subprocess.CalledProcessError as e:
739-
sshare_report = (
755+
sacctmgr_report = (
740756
"Unable to test the validity of the given or guessed"
741-
f" SLURM account '{account}' with sshare: {e.stderr}."
757+
f" SLURM account '{account}' with sacctmgr: {e.stderr}."
742758
)
743759
accounts = ""
744760

745761
if not accounts.strip():
746-
cmd = f'sacctmgr -n -s list user "{os.environ["USER"]}" format=account%256'
762+
cmd = "sshare -U --format Account%256 --noheader"
747763
try:
748764
accounts = subprocess.check_output(
749765
cmd, shell=True, text=True, stderr=subprocess.PIPE
750766
)
751767
except subprocess.CalledProcessError as e:
752-
sacctmgr_report = (
768+
sshare_report = (
753769
"Unable to test the validity of the given or guessed "
754-
f"SLURM account '{account}' with sacctmgr: {e.stderr}."
770+
f"SLURM account '{account}' with sshare: {e.stderr}."
755771
)
756772
raise WorkflowError(
757-
f"The 'sshare' reported: '{sshare_report}' "
758-
f"and likewise 'sacctmgr' reported: '{sacctmgr_report}'."
773+
f"The 'sacctmgr' reported: '{sacctmgr_report}' "
774+
f"and likewise 'sshare' reported: '{sshare_report}'."
759775
)
760776

761777
# The set() has been introduced during review to eliminate
@@ -764,7 +780,7 @@ def test_account(self, account):
764780

765781
if not accounts:
766782
self.logger.warning(
767-
f"Both 'sshare' and 'sacctmgr' returned empty results for account "
783+
f"Both 'sacctmgr' and 'sshare' returned empty results for account "
768784
f"'{account}'. Proceeding without account validation."
769785
)
770786
return ""

0 commit comments

Comments
 (0)