Source code for galaxy.jobs.runners.slurm

"""
SLURM job control via the DRMAA API.
"""

import os
import time

from galaxy import model
from galaxy.jobs.runners.drmaa import DRMAAJobRunner
from galaxy.util import (
    commands,
    unicodify,
)
from galaxy.util.custom_logging import get_logger

log = get_logger(__name__)

__all__ = ("SlurmJobRunner",)

# Error message printed to job stderr when SLURM itself kills a job.
# See src/common/slurm_jobacct_gather.c and src/slurmd/slurmd/req.c in
# https://github.com/SchedMD/slurm/
SLURM_MEMORY_LIMIT_EXCEEDED_MSG = "slurmstepd: error: Exceeded job memory limit"
# Warning messages which may be printed to job stderr by SLURM after termination
# of a job step when using the cgroup task plugin. Exceeding the memory limit is
# not necessarily the cause of the step termination, and the step may in fact
# have completed successfully.
# See src/plugins/task/cgroup/task_cgroup_memory.c in
# https://github.com/SchedMD/slurm/
SLURM_MEMORY_LIMIT_EXCEEDED_PARTIAL_WARNINGS = [
    ": Exceeded job memory limit at some point.",
    ": Exceeded step memory limit at some point.",
]

# These messages are returned to the user
OUT_OF_MEMORY_MSG = "This job was terminated because it used more memory than it was allocated."
PROBABLY_OUT_OF_MEMORY_MSG = "This job was cancelled probably because it used more memory than it was allocated."
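

# Illustrative sketch (not part of the upstream Galaxy module) of how the stderr
# markers above map to the user-facing messages: the slurmstepd error is matched
# as an exact line, while the cgroup task plugin warnings are matched as
# substrings because SLURM prefixes them with a step identifier. The helper name
# below is hypothetical; the real check is SlurmJobRunner.__check_memory_limit().
def _classify_memory_stderr_line(stripped_line):
    """Return the user-facing message for a stderr line, or None if it does not match."""
    if stripped_line == SLURM_MEMORY_LIMIT_EXCEEDED_MSG:
        return OUT_OF_MEMORY_MSG
    if any(warning in stripped_line for warning in SLURM_MEMORY_LIMIT_EXCEEDED_PARTIAL_WARNINGS):
        return PROBABLY_OUT_OF_MEMORY_MSG
    return None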


class SlurmJobRunner(DRMAAJobRunner):
    runner_name = "SlurmRunner"
    restrict_job_name_length = False

    def _complete_terminal_job(self, ajs, drmaa_state, **kwargs):
        def _get_slurm_state_with_sacct(job_id, cluster):
            cmd = ["sacct", "-n", "-o", "state%-32"]
            if cluster:
                cmd.extend(["-M", cluster])
            cmd.extend(["-j", job_id])
            try:
                stdout = commands.execute(cmd)
            except commands.CommandLineException as e:
                if e.stderr.strip() == "SLURM accounting storage is disabled":
                    log.warning("SLURM accounting storage is not properly configured, unable to run sacct")
                    return
                raise e
            # First line is for 'job_id'
            # Second line is for 'job_id.batch' (only available after the batch job is complete)
            # Following lines are for the steps 'job_id.0', 'job_id.1', ... (but Galaxy does not use steps)
            first_line = stdout.splitlines()[0]
            # Strip whitespaces and the final '+' (if present), only return the first word
            return first_line.strip().rstrip("+").split()[0]

        def _get_slurm_state():
            cmd = ["scontrol", "-o"]
            if "." in ajs.job_id:
                # custom slurm-drmaa-with-cluster-support job id syntax
                job_id, cluster = ajs.job_id.split(".", 1)
                cmd.extend(["-M", cluster])
            else:
                job_id = ajs.job_id
                cluster = None
            cmd.extend(["show", "job", job_id])
            try:
                stdout = commands.execute(cmd).strip()
            except commands.CommandLineException as e:
                if e.stderr == "slurm_load_jobs error: Invalid job id specified\n":
                    # The job may be old, try to get its state with sacct
                    job_state = _get_slurm_state_with_sacct(job_id, cluster)
                    if job_state:
                        return job_state
                    return "NOT_FOUND"
                raise e
            # stdout is a single line in format "key1=value1 key2=value2 ..."
            job_info_keys = []
            job_info_values = []
            for job_info in stdout.split():
                try:
                    # Some value may contain `=` (e.g. `StdIn=StdIn=/dev/null`)
                    k, v = job_info.split("=", 1)
                    job_info_keys.append(k)
                    job_info_values.append(v)
                except ValueError:
                    # Some value may contain spaces (e.g. `Comment=** time_limit (60m) min_nodes (1) **`)
                    job_info_values[-1] += f" {job_info}"
            job_info_dict = dict(zip(job_info_keys, job_info_values))
            return job_info_dict["JobState"]

        try:
            if drmaa_state == self.drmaa_job_states.FAILED:
                slurm_state = _get_slurm_state()
                sleep = 1
                while slurm_state == "COMPLETING":
                    log.debug(
                        "(%s/%s) Waiting %s seconds for failed job to exit COMPLETING state for post-mortem",
                        ajs.job_wrapper.get_id_tag(),
                        ajs.job_id,
                        sleep,
                    )
                    time.sleep(sleep)
                    sleep *= 2
                    if sleep > 64:
                        ajs.fail_message = "This job failed and the system timed out while trying to determine the cause of the failure."
                        break
                    slurm_state = _get_slurm_state()
                if slurm_state == "NOT_FOUND":
                    log.warning(
                        "(%s/%s) Job not found, assuming job check exceeded MinJobAge and completing as successful",
                        ajs.job_wrapper.get_id_tag(),
                        ajs.job_id,
                    )
                    drmaa_state = self.drmaa_job_states.DONE
                elif slurm_state == "COMPLETED":
                    log.debug(
                        "(%s/%s) SLURM reported job success, assuming job check exceeded MinJobAge and completing as successful",
                        ajs.job_wrapper.get_id_tag(),
                        ajs.job_id,
                    )
                    drmaa_state = self.drmaa_job_states.DONE
                elif slurm_state == "TIMEOUT":
                    log.info("(%s/%s) Job hit walltime", ajs.job_wrapper.get_id_tag(), ajs.job_id)
                    ajs.fail_message = (
                        "This job was terminated because it ran longer than the maximum allowed job run time."
                    )
                    ajs.runner_state = ajs.runner_states.WALLTIME_REACHED
                elif slurm_state == "NODE_FAIL":
                    log.warning(
                        "(%s/%s) Job failed due to node failure, attempting resubmission",
                        ajs.job_wrapper.get_id_tag(),
                        ajs.job_id,
                    )
                    ajs.job_wrapper.change_state(
                        model.Job.states.QUEUED, info="Job was resubmitted due to node failure"
                    )
                    try:
                        self.queue_job(ajs.job_wrapper)
                        return
                    except Exception:
                        ajs.fail_message = (
                            "This job failed due to a cluster node failure, and an attempt to resubmit the job failed."
                        )
                elif slurm_state == "OUT_OF_MEMORY":
                    log.info(
                        "(%s/%s) Job hit memory limit (SLURM state: OUT_OF_MEMORY)",
                        ajs.job_wrapper.get_id_tag(),
                        ajs.job_id,
                    )
                    ajs.fail_message = OUT_OF_MEMORY_MSG
                    ajs.runner_state = ajs.runner_states.MEMORY_LIMIT_REACHED
                elif slurm_state == "CANCELLED":
                    if ajs.job_wrapper.get_state() == model.Job.states.STOPPED:
                        # User requested to stop job, this isn't an error, just finish as normal
                        return super()._complete_terminal_job(ajs, drmaa_state=drmaa_state)
                    # Check to see if the job was killed for exceeding memory consumption
                    check_memory_limit_msg = self.__check_memory_limit(ajs.error_file)
                    if check_memory_limit_msg:
                        log.info(
                            "(%s/%s) Job hit memory limit (SLURM state: CANCELLED)",
                            ajs.job_wrapper.get_id_tag(),
                            ajs.job_id,
                        )
                        ajs.fail_message = check_memory_limit_msg
                        ajs.runner_state = ajs.runner_states.MEMORY_LIMIT_REACHED
                    else:
                        log.info(
                            "(%s/%s) Job was cancelled via SLURM (e.g. with scancel(1))",
                            ajs.job_wrapper.get_id_tag(),
                            ajs.job_id,
                        )
                        ajs.fail_message = "This job failed because it was cancelled by an administrator."
                elif slurm_state in ("PENDING", "RUNNING"):
                    log.warning(
                        "(%s/%s) Job was reported by drmaa as terminal but job state in SLURM is: %s, returning to monitor queue",
                        ajs.job_wrapper.get_id_tag(),
                        ajs.job_id,
                        slurm_state,
                    )
                    return True
                else:
                    log.warning(
                        "(%s/%s) Job failed due to unknown reasons, job state in SLURM was: %s",
                        ajs.job_wrapper.get_id_tag(),
                        ajs.job_id,
                        slurm_state,
                    )
                    ajs.fail_message = "This job failed for reasons that could not be determined."
                if drmaa_state == self.drmaa_job_states.FAILED:
                    ajs.fail_message += "\nPlease click the bug icon to report this problem if you need help."
                    ajs.stop_job = False
                    self.work_queue.put((self.fail_job, ajs))
                    return
        except Exception:
            log.exception(
                "(%s/%s) Failure in SLURM _complete_terminal_job(), job final state will be: %s",
                ajs.job_wrapper.get_id_tag(),
                ajs.job_id,
                drmaa_state,
            )
        # by default, finish the job with the state from drmaa
        return super()._complete_terminal_job(ajs, drmaa_state=drmaa_state)

    def __check_memory_limit(self, efile_path):
        """
        A very poor implementation of tail, but it doesn't need to be fancy
        since we are only searching the last 2K
        """
        try:
            log.debug("Checking %s for exceeded memory message from SLURM", efile_path)
            with open(efile_path, "rb") as f:
                if os.path.getsize(efile_path) > 2048:
                    f.seek(-2048, os.SEEK_END)
                    f.readline()

                for line in f.readlines():
                    stripped_line = unicodify(line.strip())
                    if stripped_line == SLURM_MEMORY_LIMIT_EXCEEDED_MSG:
                        return OUT_OF_MEMORY_MSG
                    elif any(_ in stripped_line for _ in SLURM_MEMORY_LIMIT_EXCEEDED_PARTIAL_WARNINGS):
                        return PROBABLY_OUT_OF_MEMORY_MSG
        except FileNotFoundError:
            # Entirely expected, as __check_memory_limit is only called if the job state is CANCELLED
            return False
        except Exception:
            log.exception("Error reading end of %s:", efile_path)

        return False
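

# Illustrative sketch (not part of the upstream Galaxy module) of the parsing done
# in _get_slurm_state() above: `scontrol -o show job <id>` prints one line of
# space-separated `key=value` pairs, where a value may itself contain `=` or
# spaces, so a token without `=` is treated as a continuation of the previous
# value. The helper name and the sample line are hypothetical.
def _parse_scontrol_oneliner(stdout):
    job_info_keys = []
    job_info_values = []
    for job_info in stdout.split():
        try:
            k, v = job_info.split("=", 1)
            job_info_keys.append(k)
            job_info_values.append(v)
        except ValueError:
            # No '=', so this token belongs to the previous (space-containing) value
            job_info_values[-1] += f" {job_info}"
    return dict(zip(job_info_keys, job_info_values))


# Example (hypothetical values):
#   _parse_scontrol_oneliner("JobId=123 JobState=TIMEOUT Comment=** time_limit (60m) **")
#   == {"JobId": "123", "JobState": "TIMEOUT", "Comment": "** time_limit (60m) **"}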
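

# Illustrative sketch (not part of the upstream Galaxy module) of the capped
# exponential backoff used in _complete_terminal_job() while a failed job is
# still COMPLETING: double the delay after each poll and give up once the next
# delay would exceed 64 seconds (1 + 2 + ... + 64, roughly two minutes in total).
# `poll_state` is a hypothetical callable standing in for _get_slurm_state().
def _wait_while_completing(poll_state, max_sleep=64):
    sleep = 1
    state = poll_state()
    while state == "COMPLETING":
        time.sleep(sleep)
        sleep *= 2
        if sleep > max_sleep:
            return None  # timed out; the caller reports an indeterminate failure
        state = poll_state()
    return state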