Warning

This document is for an old release of Galaxy. You can alternatively view this page in the latest release if it exists or view the top of the latest release's documentation.

Source code for galaxy.jobs.runners.state_handlers.resubmit

import logging
from datetime import datetime

from galaxy import model
from galaxy.jobs.runners import JobState
from ._safe_eval import safe_eval

__all__ = ('failure', )

log = logging.getLogger(__name__)

MESSAGES = dict(
    walltime_reached='it reached the walltime',
    memory_limit_reached='it exceeded the amount of allocated memory',
    unknown_error='it encountered an unknown error',
    tool_detected='it encountered a tool detected error condition',
)


def eval_condition(condition, job_state):
    runner_state = getattr(job_state, 'runner_state', None) or JobState.runner_states.UNKNOWN_ERROR

    attempt = 1
    now = datetime.utcnow()
    last_running_state = None
    last_queued_state = None
    for state in job_state.job_wrapper.get_job().state_history:
        if state.state == model.Job.states.RUNNING:
            last_running_state = state
        elif state.state == model.Job.states.QUEUED:
            last_queued_state = state
        elif state.state == model.Job.states.RESUBMITTED:
            attempt = attempt + 1

    seconds_running = 0
    seconds_since_queued = 0
    if last_running_state:
        seconds_running = (now - last_running_state.create_time).total_seconds()
    if last_queued_state:
        seconds_since_queued = (now - last_queued_state.create_time).total_seconds()

    condition_locals = {
        "walltime_reached": runner_state == JobState.runner_states.WALLTIME_REACHED,
        "memory_limit_reached": runner_state == JobState.runner_states.MEMORY_LIMIT_REACHED,
        "tool_detected_failure": runner_state == JobState.runner_states.TOOL_DETECT_ERROR,
        "unknown_error": JobState.runner_states.UNKNOWN_ERROR,
        "any_failure": True,
        "any_potential_job_failure": True,  # Add a hook here - later on allow tools to describe things that are definitely input problems.
        "attempt": attempt,
        "seconds_running": seconds_running,
        "seconds_since_queued": seconds_since_queued,
    }

    # Small optimization to eliminate the need to parse AST and eval for simple variables.
    if condition in condition_locals:
        return condition_locals[condition]
    else:
        return safe_eval(condition, condition_locals)


[docs]def failure(app, job_runner, job_state): # Leave handler quickly if no resubmit conditions specified or if the runner state doesn't allow resubmission. resubmit_definitions = job_state.job_destination.get('resubmit') if not resubmit_definitions: return runner_state = getattr(job_state, 'runner_state', None) or JobState.runner_states.UNKNOWN_ERROR if (runner_state not in (JobState.runner_states.WALLTIME_REACHED, JobState.runner_states.MEMORY_LIMIT_REACHED, JobState.runner_states.JOB_OUTPUT_NOT_RETURNED_FROM_CLUSTER, JobState.runner_states.TOOL_DETECT_ERROR, JobState.runner_states.UNKNOWN_ERROR)): # not set or not a handleable runner state return _handle_resubmit_definitions(resubmit_definitions, app, job_runner, job_state)
def _handle_resubmit_definitions(resubmit_definitions, app, job_runner, job_state): runner_state = getattr(job_state, 'runner_state', None) or JobState.runner_states.UNKNOWN_ERROR # Setup environment for evaluating resubmission conditions and related expression. expression_context = _ExpressionContext(job_state) # Intercept jobs that hit the walltime and have a walltime or # nonspecific resubmit destination configured for resubmit in resubmit_definitions: condition = resubmit.get('condition', None) if condition and not expression_context.safe_eval(condition): # There is a resubmit defined for the destination but # its condition is not for the encountered state continue external_id = getattr(job_state, "job_id", None) if external_id: job_log_prefix = f"({job_state.job_wrapper.job_id}/{job_state.job_id})" else: job_log_prefix = "(%s)" % (job_state.job_wrapper.job_id) # Is destination needed here, might these be serialized to the database? destination = resubmit.get('environment') or resubmit.get('destination') log.info("%s Job will be resubmitted to '%s' because %s at " "the '%s' destination", job_log_prefix, destination, MESSAGES[runner_state], job_state.job_wrapper.job_destination.id) # fetch JobDestination for the id or tag if destination: new_destination = app.job_config.get_destination(destination) else: new_destination = job_state.job_destination # Resolve dynamic if necessary new_destination = (job_state.job_wrapper.job_runner_mapper .cache_job_destination(new_destination)) # Reset job state job_state.job_wrapper.clear_working_directory() job_state.job_wrapper.invalidate_external_metadata() job = job_state.job_wrapper.get_job() if resubmit.get('handler', None): log.debug('%s Job reassigned to handler %s', job_log_prefix, resubmit['handler']) job.set_handler(resubmit['handler']) job_runner.sa_session.add(job) # Is this safe to do here? job_runner.sa_session.flush() # Cache the destination to prevent rerunning dynamic after # resubmit job_state.job_wrapper.job_runner_mapper \ .cached_job_destination = new_destination # Handle delaying before resubmission if needed. raw_delay = resubmit.get('delay') if raw_delay: delay = str(expression_context.safe_eval(str(raw_delay))) try: # ensure result acts like a number when persisted. float(delay) new_destination.params['__resubmit_delay_seconds'] = str(delay) except ValueError: log.warning("Cannot delay job with delay [%s], does not appear to be a number." % delay) job_state.job_wrapper.set_job_destination(new_destination) # Clear external ID (state change below flushes the change) job.job_runner_external_id = None # Allow the UI to query for resubmitted state if job.params is None: job.params = {} job_state.runner_state_handled = True info = "This job was resubmitted to the queue because %s on its " \ "compute resource." % MESSAGES[runner_state] job_runner.mark_as_resubmitted(job_state, info=info) return class _ExpressionContext: def __init__(self, job_state): self._job_state = job_state self._lazy_context = None def safe_eval(self, condition): if condition.isdigit(): return int(condition) if self._lazy_context is None: runner_state = getattr(self._job_state, 'runner_state', None) or JobState.runner_states.UNKNOWN_ERROR attempt = 1 now = datetime.utcnow() last_running_state = None last_queued_state = None for state in self._job_state.job_wrapper.get_job().state_history: if state.state == model.Job.states.RUNNING: last_running_state = state elif state.state == model.Job.states.QUEUED: last_queued_state = state elif state.state == model.Job.states.RESUBMITTED: attempt = attempt + 1 seconds_running = 0 seconds_since_queued = 0 if last_running_state: seconds_running = (now - last_running_state.create_time).total_seconds() if last_queued_state: seconds_since_queued = (now - last_queued_state.create_time).total_seconds() self._lazy_context = { "walltime_reached": runner_state == JobState.runner_states.WALLTIME_REACHED, "memory_limit_reached": runner_state == JobState.runner_states.MEMORY_LIMIT_REACHED, "unknown_error": runner_state == JobState.runner_states.UNKNOWN_ERROR, "tool_detected_failure": runner_state == JobState.runner_states.TOOL_DETECT_ERROR, "any_failure": True, "any_potential_job_failure": True, # Add a hook here - later on allow tools to describe things that are definitely input problems. "attempt": attempt, "seconds_running": seconds_running, "seconds_since_queued": seconds_since_queued, } # Small optimization to eliminate the need to parse AST and eval for simple variables. if condition in self._lazy_context: return self._lazy_context[condition] else: return safe_eval(condition, self._lazy_context)