Warning
This document is for an old release of Galaxy. You can alternatively view this page in the latest release if it exists or view the top of the latest release's documentation.
Source code for galaxy.workflow.run
import logging
import uuid
from typing import (
Any,
Dict,
List,
Optional,
Tuple,
TYPE_CHECKING,
Union,
)
from typing_extensions import Protocol
from galaxy import model
from galaxy.exceptions import MessageException
from galaxy.model import (
WorkflowInvocation,
WorkflowInvocationStep,
)
from galaxy.model.base import transaction
from galaxy.schema.invocation import (
CancelReason,
FailureReason,
InvocationCancellationHistoryDeleted,
InvocationFailureCollectionFailed,
InvocationFailureDatasetFailed,
InvocationFailureJobFailed,
InvocationFailureOutputNotFound,
InvocationUnexpectedFailure,
InvocationWarningWorkflowOutputNotFound,
WarningReason,
)
from galaxy.util import ExecutionTimer
from galaxy.workflow import modules
from galaxy.workflow.run_request import (
workflow_request_to_run_config,
workflow_run_config_to_request,
WorkflowRunConfig,
)
if TYPE_CHECKING:
from galaxy.model import (
Workflow,
WorkflowOutput,
WorkflowStep,
WorkflowStepConnection,
)
from galaxy.webapps.base.webapp import GalaxyWebTransaction
from galaxy.work.context import WorkRequestContext
log = logging.getLogger(__name__)
WorkflowOutputsType = Dict[int, Any]
# Entry point for core workflow scheduler.
def schedule(
trans: "WorkRequestContext",
workflow: "Workflow",
workflow_run_config: WorkflowRunConfig,
workflow_invocation: WorkflowInvocation,
) -> Tuple[WorkflowOutputsType, WorkflowInvocation]:
return __invoke(trans, workflow, workflow_run_config, workflow_invocation)
def __invoke(
trans: "WorkRequestContext",
workflow: "Workflow",
workflow_run_config: WorkflowRunConfig,
workflow_invocation: Optional[WorkflowInvocation] = None,
populate_state: bool = False,
) -> Tuple[WorkflowOutputsType, WorkflowInvocation]:
"""Run the supplied workflow in the supplied target_history."""
if populate_state:
modules.populate_module_and_state(
trans,
workflow,
workflow_run_config.param_map,
allow_tool_state_corrections=workflow_run_config.allow_tool_state_corrections,
)
invoker = WorkflowInvoker(
trans,
workflow,
workflow_run_config,
workflow_invocation=workflow_invocation,
)
workflow_invocation = invoker.workflow_invocation
outputs = {}
try:
outputs = invoker.invoke()
except modules.CancelWorkflowEvaluation as e:
if workflow_invocation.cancel():
workflow_invocation.add_message(e.why)
except modules.FailWorkflowEvaluation as e:
workflow_invocation.fail()
workflow_invocation.add_message(e.why)
except MessageException as e:
# Convention for safe message we can show to users
workflow_invocation.fail()
failure = InvocationUnexpectedFailure(reason=FailureReason.unexpected_failure, details=str(e))
workflow_invocation.add_message(failure)
except Exception:
# Could potentially be large and/or contain raw ids or other secrets, don't add details
log.exception("Failed to execute scheduled workflow.")
# Running workflow invocation in background, just mark
# persistent workflow invocation as failed.
failure = InvocationUnexpectedFailure(reason=FailureReason.unexpected_failure)
workflow_invocation.fail()
workflow_invocation.add_message(failure)
# Be sure to update state of workflow_invocation.
trans.sa_session.add(workflow_invocation)
with transaction(trans.sa_session):
trans.sa_session.commit()
return outputs, workflow_invocation
[docs]def queue_invoke(
trans: "GalaxyWebTransaction",
workflow: "Workflow",
workflow_run_config: WorkflowRunConfig,
request_params: Optional[Dict[str, Any]] = None,
populate_state: bool = True,
flush: bool = True,
) -> WorkflowInvocation:
request_params = request_params or {}
if populate_state:
modules.populate_module_and_state(
trans,
workflow,
workflow_run_config.param_map,
allow_tool_state_corrections=workflow_run_config.allow_tool_state_corrections,
)
workflow_invocation = workflow_run_config_to_request(trans, workflow_run_config, workflow)
workflow_invocation.workflow = workflow
return trans.app.workflow_scheduling_manager.queue(workflow_invocation, request_params, flush=flush)
class WorkflowInvoker:
def __init__(
self,
trans: "WorkRequestContext",
workflow: "Workflow",
workflow_run_config: WorkflowRunConfig,
workflow_invocation: Optional[WorkflowInvocation] = None,
progress: Optional["WorkflowProgress"] = None,
) -> None:
self.trans = trans
self.workflow = workflow
self.workflow_invocation: WorkflowInvocation
if progress is not None:
assert workflow_invocation is None
workflow_invocation = progress.workflow_invocation
if workflow_invocation is None:
invocation_uuid = uuid.uuid1()
workflow_invocation = WorkflowInvocation()
workflow_invocation.workflow = self.workflow
# In one way or another, following attributes will become persistent
# so they are available during delayed/revisited workflow scheduling.
workflow_invocation.uuid = invocation_uuid
workflow_invocation.history = workflow_run_config.target_history
self.workflow_invocation = workflow_invocation
module_injector = modules.WorkflowModuleInjector(trans)
if progress is None:
progress = WorkflowProgress(
self.workflow_invocation,
workflow_run_config.inputs,
module_injector,
param_map=workflow_run_config.param_map,
jobs_per_scheduling_iteration=getattr(
trans.app.config, "maximum_workflow_jobs_per_scheduling_iteration", -1
),
copy_inputs_to_history=workflow_run_config.copy_inputs_to_history,
use_cached_job=workflow_run_config.use_cached_job,
replacement_dict=workflow_run_config.replacement_dict,
)
self.progress = progress
def invoke(self) -> Dict[int, Any]:
workflow_invocation = self.workflow_invocation
config = self.trans.app.config
maximum_duration = getattr(config, "maximum_workflow_invocation_duration", -1)
if maximum_duration > 0 and workflow_invocation.seconds_since_created > maximum_duration:
log.debug(
f"Workflow invocation [{workflow_invocation.id}] exceeded maximum number of seconds allowed for scheduling [{maximum_duration}], failing."
)
workflow_invocation.state = model.WorkflowInvocation.states.FAILED
# All jobs ran successfully, so we can save now
self.trans.sa_session.add(workflow_invocation)
# Not flushing in here, because web controller may create multiple
# invocations.
return self.progress.outputs
if workflow_invocation.history.deleted:
raise modules.CancelWorkflowEvaluation(
why=InvocationCancellationHistoryDeleted(
reason=CancelReason.history_deleted, history_id=workflow_invocation.history_id
)
)
remaining_steps = self.progress.remaining_steps()
delayed_steps = False
max_jobs_per_iteration_reached = False
for step, workflow_invocation_step in remaining_steps:
max_jobs_to_schedule = self.progress.maximum_jobs_to_schedule_or_none
if max_jobs_to_schedule is not None and max_jobs_to_schedule <= 0:
max_jobs_per_iteration_reached = True
break
step_delayed = False
step_timer = ExecutionTimer()
try:
self.__check_implicitly_dependent_steps(step)
if not workflow_invocation_step:
workflow_invocation_step = WorkflowInvocationStep()
assert workflow_invocation_step
workflow_invocation_step.workflow_invocation = workflow_invocation
workflow_invocation_step.workflow_step = step
workflow_invocation_step.state = "new"
workflow_invocation.steps.append(workflow_invocation_step)
assert workflow_invocation_step
incomplete_or_none = self._invoke_step(workflow_invocation_step)
if incomplete_or_none is False:
step_delayed = delayed_steps = True
workflow_invocation_step.state = "ready"
self.progress.mark_step_outputs_delayed(step, why="Not all jobs scheduled for state.")
else:
workflow_invocation_step.state = "scheduled"
except modules.DelayedWorkflowEvaluation as de:
step_delayed = delayed_steps = True
self.progress.mark_step_outputs_delayed(step, why=de.why)
except Exception as e:
log.exception(
"Failed to schedule %s, problem occurred on %s.",
self.workflow_invocation.workflow.log_str(),
step.log_str(),
)
if isinstance(e, MessageException):
# This is the highest level at which we can inject the step id
# to provide some more context to the exception.
raise modules.FailWorkflowEvaluation(
why=InvocationUnexpectedFailure(
reason=FailureReason.unexpected_failure, details=str(e), workflow_step_id=step.id
)
)
raise
if not step_delayed:
log.debug(f"Workflow step {step.id} of invocation {workflow_invocation.id} invoked {step_timer}")
if delayed_steps or max_jobs_per_iteration_reached:
state = model.WorkflowInvocation.states.READY
else:
state = model.WorkflowInvocation.states.SCHEDULED
workflow_invocation.state = state
# All jobs ran successfully, so we can save now
self.trans.sa_session.add(workflow_invocation)
# Not flushing in here, because web controller may create multiple
# invocations.
return self.progress.outputs
def __check_implicitly_dependent_steps(self, step):
"""Method will delay the workflow evaluation if implicitly dependent
steps (steps dependent but not through an input->output way) are not
yet complete.
"""
for input_connection in step.input_connections:
if input_connection.non_data_connection:
output_id = input_connection.output_step.id
self.__check_implicitly_dependent_step(output_id, step.id)
def __check_implicitly_dependent_step(self, output_id: int, step_id: int):
step_invocation = self.workflow_invocation.step_invocation_for_step_id(output_id)
# No steps created yet - have to delay evaluation.
if not step_invocation:
delayed_why = f"depends on step [{output_id}] but that step has not been invoked yet"
raise modules.DelayedWorkflowEvaluation(why=delayed_why)
if step_invocation.state != "scheduled":
delayed_why = f"depends on step [{output_id}] job has not finished scheduling yet"
raise modules.DelayedWorkflowEvaluation(delayed_why)
# TODO: Handle implicit dependency on stuff like pause steps.
for job in step_invocation.jobs:
# At least one job in incomplete.
if not job.finished:
delayed_why = (
f"depends on step [{output_id}] but one or more jobs created from that step have not finished yet"
)
raise modules.DelayedWorkflowEvaluation(why=delayed_why)
if job.state != job.states.OK:
raise modules.FailWorkflowEvaluation(
why=InvocationFailureJobFailed(
reason=FailureReason.job_failed,
job_id=job.id,
workflow_step_id=step_id,
dependent_workflow_step_id=output_id,
)
)
def _invoke_step(self, invocation_step: WorkflowInvocationStep) -> Optional[bool]:
incomplete_or_none = invocation_step.workflow_step.module.execute(
self.trans,
self.progress,
invocation_step,
use_cached_job=self.progress.use_cached_job,
)
return incomplete_or_none
STEP_OUTPUT_DELAYED = object()
class ModuleInjector(Protocol):
def inject(self, step, step_args=None, steps=None, **kwargs):
pass
def inject_all(self, workflow: "Workflow", param_map=None, ignore_tool_missing_exception=True, **kwargs):
pass
def compute_runtime_state(self, step, step_args=None):
pass
class WorkflowProgress:
def __init__(
self,
workflow_invocation: WorkflowInvocation,
inputs_by_step_id: Any,
module_injector: ModuleInjector,
param_map: Dict[int, Dict[str, Any]],
jobs_per_scheduling_iteration: int = -1,
copy_inputs_to_history: bool = False,
use_cached_job: bool = False,
replacement_dict: Optional[Dict[str, str]] = None,
subworkflow_collection_info=None,
when_values=None,
) -> None:
self.outputs: Dict[int, Any] = {}
self.module_injector = module_injector
self.workflow_invocation = workflow_invocation
self.inputs_by_step_id = inputs_by_step_id
self.param_map = param_map
self.jobs_per_scheduling_iteration = jobs_per_scheduling_iteration
self.jobs_scheduled_this_iteration = 0
self.copy_inputs_to_history = copy_inputs_to_history
self.use_cached_job = use_cached_job
self.replacement_dict = replacement_dict or {}
self.runtime_replacements: Dict[str, str] = {}
self.subworkflow_collection_info = subworkflow_collection_info
self.subworkflow_structure = subworkflow_collection_info.structure if subworkflow_collection_info else None
self.when_values = when_values
@property
def maximum_jobs_to_schedule_or_none(self) -> Optional[int]:
if self.jobs_per_scheduling_iteration > 0:
return self.jobs_per_scheduling_iteration - self.jobs_scheduled_this_iteration
else:
return None
def record_executed_job_count(self, job_count: int) -> None:
self.jobs_scheduled_this_iteration += job_count
def remaining_steps(
self,
) -> List[Tuple["WorkflowStep", Optional[WorkflowInvocationStep]]]:
# Previously computed and persisted step states.
step_states = self.workflow_invocation.step_states_by_step_id()
steps = self.workflow_invocation.workflow.steps
# TODO: Wouldn't a generator be much better here so we don't have to reason about
# steps we are no where near ready to schedule?
remaining_steps = []
step_invocations_by_id = self.workflow_invocation.step_invocations_by_step_id()
self.module_injector.inject_all(self.workflow_invocation.workflow, param_map=self.param_map)
for step in steps:
step_id = step.id
step_args = self.param_map.get(step_id, {})
self.module_injector.compute_runtime_state(step, step_args=step_args)
if step_id not in step_states:
# Can this ever happen?
public_message = f"Workflow invocation has no step state for step {step.order_index + 1}"
log.error(f"{public_message}. State is known for these step ids: {list(step_states.keys())}.")
raise MessageException(public_message)
runtime_state = step_states[step_id].value
assert step.module
step.state = step.module.decode_runtime_state(runtime_state)
invocation_step = step_invocations_by_id.get(step_id, None)
if invocation_step and invocation_step.state == "scheduled":
self._recover_mapping(invocation_step)
else:
remaining_steps.append((step, invocation_step))
return remaining_steps
def replacement_for_input(self, step: "WorkflowStep", input_dict: Dict[str, Any]) -> Any:
replacement: Union[
modules.NoReplacement,
model.DatasetCollectionInstance,
List[model.DatasetCollectionInstance],
] = modules.NO_REPLACEMENT
prefixed_name = input_dict["name"]
multiple = input_dict["multiple"]
if prefixed_name in step.input_connections_by_name:
connection = step.input_connections_by_name[prefixed_name]
if input_dict["input_type"] == "dataset" and multiple:
temp = [self.replacement_for_connection(c) for c in connection]
# If replacement is just one dataset collection, replace tool
# input_dict with dataset collection - tool framework will extract
# datasets properly.
if len(temp) == 1:
if isinstance(temp[0], model.HistoryDatasetCollectionAssociation):
replacement = temp[0]
else:
replacement = temp
else:
replacement = temp
else:
is_data = input_dict["input_type"] in ["dataset", "dataset_collection"]
replacement = self.replacement_for_connection(connection[0], is_data=is_data)
return replacement
def replacement_for_connection(self, connection: "WorkflowStepConnection", is_data: bool = True) -> Any:
output_step_id = connection.output_step.id
output_name = connection.output_name
if output_step_id not in self.outputs:
raise modules.FailWorkflowEvaluation(
why=InvocationFailureOutputNotFound(
reason=FailureReason.output_not_found,
workflow_step_id=connection.input_step_id,
output_name=output_name,
dependent_workflow_step_id=output_step_id,
)
)
step_outputs = self.outputs[output_step_id]
if step_outputs is STEP_OUTPUT_DELAYED:
delayed_why = f"dependent step [{output_step_id}] delayed, so this step must be delayed"
raise modules.DelayedWorkflowEvaluation(why=delayed_why)
try:
replacement = step_outputs[output_name]
except KeyError:
raise modules.FailWorkflowEvaluation(
why=InvocationFailureOutputNotFound(
reason=FailureReason.output_not_found,
workflow_step_id=connection.input_step_id,
output_name=output_name,
dependent_workflow_step_id=output_step_id,
)
)
if isinstance(replacement, model.HistoryDatasetCollectionAssociation):
if not replacement.collection.populated:
if not replacement.waiting_for_elements:
# If we are not waiting for elements, there was some
# problem creating the collection. Collection will never
# be populated.
raise modules.FailWorkflowEvaluation(
why=InvocationFailureCollectionFailed(
reason=FailureReason.collection_failed,
hdca_id=replacement.id,
workflow_step_id=connection.input_step_id,
dependent_workflow_step_id=output_step_id,
)
)
delayed_why = f"dependent collection [{replacement.id}] not yet populated with datasets"
raise modules.DelayedWorkflowEvaluation(why=delayed_why)
if isinstance(replacement, model.DatasetCollection):
raise NotImplementedError
if not is_data and isinstance(
replacement, (model.HistoryDatasetAssociation, model.HistoryDatasetCollectionAssociation)
):
if isinstance(replacement, model.HistoryDatasetAssociation):
if replacement.is_pending:
raise modules.DelayedWorkflowEvaluation()
if not replacement.is_ok:
raise modules.FailWorkflowEvaluation(
why=InvocationFailureDatasetFailed(
reason=FailureReason.dataset_failed,
hda_id=replacement.id,
workflow_step_id=connection.input_step_id,
dependent_workflow_step_id=output_step_id,
)
)
else:
if not replacement.collection.populated:
raise modules.DelayedWorkflowEvaluation()
pending = False
for dataset_instance in replacement.dataset_instances:
if dataset_instance.is_pending:
pending = True
elif not dataset_instance.is_ok:
raise modules.FailWorkflowEvaluation(
why=InvocationFailureDatasetFailed(
reason=FailureReason.dataset_failed,
hda_id=replacement.id,
workflow_step_id=connection.input_step_id,
dependent_workflow_step_id=output_step_id,
)
)
if pending:
raise modules.DelayedWorkflowEvaluation()
return replacement
def get_replacement_workflow_output(self, workflow_output: "WorkflowOutput") -> Any:
step = workflow_output.workflow_step
output_name = workflow_output.output_name
step_outputs = self.outputs[step.id]
if step_outputs is STEP_OUTPUT_DELAYED:
delayed_why = f"depends on workflow output [{output_name}] but that output has not been created yet"
raise modules.DelayedWorkflowEvaluation(why=delayed_why)
else:
return step_outputs[output_name]
def set_outputs_for_input(
self, invocation_step: WorkflowInvocationStep, outputs: Any = None, already_persisted: bool = False
) -> None:
step = invocation_step.workflow_step
if outputs is None:
outputs = {}
if self.inputs_by_step_id:
step_id = step.id
if step_id not in self.inputs_by_step_id and "output" not in outputs:
default_value = step.input_default_value
if default_value or step.input_optional:
outputs["output"] = default_value
else:
log.error(f"{step.log_str()} not found in inputs_step_id {self.inputs_by_step_id}")
raise modules.FailWorkflowEvaluation(
why=InvocationFailureOutputNotFound(
reason=FailureReason.output_not_found,
workflow_step_id=invocation_step.workflow_step_id,
output_name="output",
dependent_workflow_step_id=invocation_step.workflow_step_id,
)
)
elif step_id in self.inputs_by_step_id:
outputs["output"] = self.inputs_by_step_id[step_id]
if step.label and step.type == "parameter_input" and "output" in outputs:
self.runtime_replacements[step.label] = str(outputs["output"])
self.set_step_outputs(invocation_step, outputs, already_persisted=already_persisted)
def effective_replacement_dict(self):
replacement_dict = {}
for key, value in self.replacement_dict.items():
replacement_dict[key] = value
for key, value in self.runtime_replacements.items():
if key not in replacement_dict:
replacement_dict[key] = value
return replacement_dict
def set_step_outputs(
self, invocation_step: WorkflowInvocationStep, outputs: Dict[str, Any], already_persisted: bool = False
) -> None:
step = invocation_step.workflow_step
if invocation_step.output_value:
outputs[invocation_step.output_value.workflow_output.output_name] = invocation_step.output_value.value
self.outputs[step.id] = outputs
if not already_persisted:
workflow_outputs_by_name = {wo.output_name: wo for wo in step.workflow_outputs}
for output_name, output_object in outputs.items():
if hasattr(output_object, "history_content_type"):
invocation_step.add_output(output_name, output_object)
else:
# Add this non-data, non workflow-output output to the workflow outputs.
# This is required for recovering the output in the next scheduling iteration,
# and should be replaced with a WorkflowInvocationStepOutputValue ASAP.
if not workflow_outputs_by_name.get(output_name) and not output_object == modules.NO_REPLACEMENT:
workflow_output = model.WorkflowOutput(step, output_name=output_name)
step.workflow_outputs.append(workflow_output)
for workflow_output in step.workflow_outputs:
output_name = workflow_output.output_name
if output_name not in outputs:
invocation_step.workflow_invocation.add_message(
InvocationWarningWorkflowOutputNotFound(
reason=WarningReason.workflow_output_not_found,
workflow_step_id=step.id,
output_name=output_name,
)
)
message = f"Failed to find expected workflow output [{output_name}] in step outputs [{outputs}]"
log.debug(message)
continue
output = outputs[output_name]
self._record_workflow_output(
step,
workflow_output,
output=output,
)
def _record_workflow_output(self, step: "WorkflowStep", workflow_output: "WorkflowOutput", output: Any) -> None:
self.workflow_invocation.add_output(workflow_output, step, output)
def mark_step_outputs_delayed(self, step: "WorkflowStep", why: Optional[str] = None) -> None:
if why:
message = f"Marking step {step.id} outputs of invocation {self.workflow_invocation.id} delayed ({why})"
log.debug(message)
self.outputs[step.id] = STEP_OUTPUT_DELAYED
def _subworkflow_invocation(self, step: "WorkflowStep") -> WorkflowInvocation:
workflow_invocation = self.workflow_invocation
subworkflow_invocation = workflow_invocation.get_subworkflow_invocation_for_step(step)
if subworkflow_invocation is None:
raise MessageException(f"Failed to find persisted subworkflow invocation for step [{step.order_index + 1}]")
return subworkflow_invocation
def subworkflow_invoker(
self,
trans: "WorkRequestContext",
step: "WorkflowStep",
use_cached_job: bool = False,
subworkflow_collection_info=None,
when_values=None,
) -> WorkflowInvoker:
subworkflow_invocation = self._subworkflow_invocation(step)
workflow_run_config = workflow_request_to_run_config(subworkflow_invocation, use_cached_job)
subworkflow_progress = self.subworkflow_progress(
subworkflow_invocation,
step,
workflow_run_config.param_map,
subworkflow_collection_info=subworkflow_collection_info,
when_values=when_values,
)
subworkflow_invocation = subworkflow_progress.workflow_invocation
return WorkflowInvoker(
trans,
workflow=subworkflow_invocation.workflow,
workflow_run_config=workflow_run_config,
progress=subworkflow_progress,
)
def subworkflow_progress(
self,
subworkflow_invocation: WorkflowInvocation,
step: "WorkflowStep",
param_map: Dict,
subworkflow_collection_info=None,
when_values=None,
) -> "WorkflowProgress":
subworkflow = subworkflow_invocation.workflow
subworkflow_inputs = {}
for input_subworkflow_step in subworkflow.input_steps:
connection_found = False
subworkflow_step_id = input_subworkflow_step.id
for input_connection in step.input_connections:
if input_connection.input_subworkflow_step_id == subworkflow_step_id:
is_data = input_connection.output_step.type != "parameter_input"
replacement = self.replacement_for_connection(
input_connection,
is_data=is_data,
)
subworkflow_inputs[subworkflow_step_id] = replacement
connection_found = True
break
if not connection_found and not input_subworkflow_step.input_optional:
raise modules.FailWorkflowEvaluation(
InvocationFailureOutputNotFound(
reason=FailureReason.output_not_found,
workflow_step_id=step.id,
output_name=input_connection.output_name,
dependent_workflow_step_id=input_connection.output_step.id,
)
)
return WorkflowProgress(
subworkflow_invocation,
subworkflow_inputs,
self.module_injector,
param_map=param_map,
use_cached_job=self.use_cached_job,
replacement_dict=self.replacement_dict,
subworkflow_collection_info=subworkflow_collection_info,
when_values=when_values,
)
def _recover_mapping(self, step_invocation: WorkflowInvocationStep) -> None:
try:
step_invocation.workflow_step.module.recover_mapping(step_invocation, self)
except modules.DelayedWorkflowEvaluation as de:
self.mark_step_outputs_delayed(step_invocation.workflow_step, de.why)
__all__ = ("queue_invoke", "WorkflowRunConfig")