Warning

This document is for an old release of Galaxy. You can alternatively view this page in the latest release if it exists or view the top of the latest release's documentation.

Source code for galaxy.managers.datasets

"""
Manager and Serializer for Datasets.
"""

import glob
import logging
import os
from typing import (
    Any,
    Dict,
    List,
    Optional,
    Type,
    TypeVar,
)

from sqlalchemy import select

from galaxy import (
    exceptions,
    model,
)
from galaxy.datatypes import sniff
from galaxy.managers import (
    base,
    deletable,
    rbac_secured,
    secured,
    users,
)
from galaxy.model import (
    Dataset,
    DatasetHash,
)
from galaxy.model.base import transaction
from galaxy.schema.tasks import (
    ComputeDatasetHashTaskRequest,
    PurgeDatasetsTaskRequest,
)
from galaxy.structured_app import MinimalManagerApp
from galaxy.util.hash_util import memory_bound_hexdigest

# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# Generic type parameter used to parameterize the dataset association serializer classes below.
T = TypeVar("T")


class DatasetManager(base.ModelManager[model.Dataset], secured.AccessibleManagerMixin, deletable.PurgableManagerMixin):
    """
    Manipulate datasets: the components contained in DatasetAssociations/DatasetInstances/HDAs/LDDAs
    """

    model_class = model.Dataset
    foreign_key_name = "dataset"
    app: MinimalManagerApp

    # TODO:?? get + error_if_uploading is common pattern, should upload check be worked into access/owed?

    def __init__(self, app: MinimalManagerApp):
        """Store the app and set up the permission, user, quota and security helpers."""
        super().__init__(app)
        self.permissions = DatasetRBACPermissions(app)
        # needed for admin test
        self.user_manager = users.UserManager(app)
        self.quota_agent = app.quota_agent
        self.security_agent = app.model.security_agent

    def create(self, manage_roles=None, access_roles=None, flush=True, **kwargs):
        """
        Create and return a new Dataset object.

        :param manage_roles: roles granted manage permission (``None`` is treated as no roles)
        :param access_roles: roles granted access permission (``None`` is treated as no roles)
        :param flush: when true, commit the session before returning
        :param kwargs: passed to the ``model.Dataset`` constructor
        """
        # default to NEW state on new datasets
        kwargs.setdefault("state", model.Dataset.states.NEW)
        dataset = model.Dataset(**kwargs)
        self.session().add(dataset)

        # permissions are never flushed on their own; the commit below covers them
        self.permissions.set(dataset, manage_roles, access_roles, flush=False)

        if flush:
            session = self.session()
            with transaction(session):
                session.commit()
        return dataset

    def copy(self, dataset, **kwargs):
        """Datasets cannot be copied: always raises ``exceptions.NotImplemented``."""
        raise exceptions.NotImplemented("Datasets cannot be copied")

    def purge(self, dataset, flush=True):
        """
        Remove the object_store/file for this dataset from storage and mark
        as purged.

        :param flush: when true, commit the session before returning
        :raises exceptions.ConfigDoesNotAllowException: if the instance doesn't allow
        """
        # The only parameter of the check is an optional error message, so the dataset
        # must not be passed: it would end up as the message of the raised exception.
        self.error_unless_dataset_purge_allowed()

        # the following also marks dataset as purged and deleted
        dataset.full_delete()
        self.session().add(dataset)
        if flush:
            session = self.session()
            with transaction(session):
                session.commit()
        return dataset

    def purge_datasets(self, request: PurgeDatasetsTaskRequest):
        """
        Caution: any additional security checks must be done before executing this action.

        Completely removes a set of object_store/files associated with the datasets from storage and marks them as purged.
        They might not be removed if there are still un-purged associations to the dataset.

        Ids that do not resolve to a dataset are logged and skipped; a failure to purge
        one dataset is logged and does not stop the remaining ones.

        :raises exceptions.ConfigDoesNotAllowException: if the instance doesn't allow
        """
        self.error_unless_dataset_purge_allowed()
        with self.session().begin():
            for dataset_id in request.dataset_ids:
                dataset: Optional[Dataset] = self.session().get(Dataset, dataset_id)
                if dataset is None:
                    # Session.get returns None for an unknown id; do not abort the whole batch
                    log.warning("Unable to purge dataset (%s): dataset not found", dataset_id)
                    continue
                if dataset.user_can_purge:
                    try:
                        dataset.full_delete()
                    except Exception:
                        log.exception(f"Unable to purge dataset ({dataset.id})")

    # TODO: this may be more conv. somewhere else
    # TODO: how to allow admin bypass?
    def error_unless_dataset_purge_allowed(self, msg=None):
        """
        Raise if the instance configuration does not allow users to purge datasets.

        :param msg: optional replacement for the default error message
        :raises exceptions.ConfigDoesNotAllowException: if ``allow_user_dataset_purge`` is falsy
        """
        if not self.app.config.allow_user_dataset_purge:
            msg = msg or "This instance does not allow user dataset purging"
            raise exceptions.ConfigDoesNotAllowException(msg)

    # .... accessibility
    # datasets can implement the accessible interface, but accessibility is checked in an entirely different way
    # than those resources that have a user attribute (histories, pages, etc.)

    def is_accessible(self, item: Any, user: Optional[model.User], **kwargs) -> bool:
        """
        Is this dataset readable/viewable to user?

        Admins (checked with the optional ``trans`` keyword argument) always have access;
        everyone else needs role-based access permission.
        """
        if self.user_manager.is_admin(user, trans=kwargs.get("trans")):
            return True
        if self.has_access_permission(item, user):
            return True
        return False

    def has_access_permission(self, dataset, user):
        """
        Return T/F if the user has role-based access to the dataset.

        A missing user is checked with an empty role list.
        """
        roles = user.all_roles_exploiting_cache() if user else []
        return self.app.security_agent.can_access_dataset(roles, dataset)

    def update_object_store_id(self, trans, dataset, object_store_id: str):
        """
        Point ``dataset`` at a different object store id and commit the change.

        Does nothing (returns ``None``) when the id is unchanged. Quota is relabeled when
        the old and new object stores map to different quota source labels.

        :raises exceptions.RequestParameterInvalidException: if the two object stores do not share a device id
        :raises exceptions.InsufficientPermissionsException: if the security agent denies the change for ``trans.user``
        """
        device_source_map = self.app.object_store.get_device_source_map()
        old_object_store_id = dataset.object_store_id
        new_object_store_id = object_store_id
        if old_object_store_id == new_object_store_id:
            return None
        old_device_id = device_source_map.get_device_id(old_object_store_id)
        new_device_id = device_source_map.get_device_id(new_object_store_id)
        if old_device_id != new_device_id:
            raise exceptions.RequestParameterInvalidException(
                "Cannot swap object store IDs for object stores that don't share a device ID."
            )

        if not self.security_agent.can_change_object_store_id(trans.user, dataset):
            # TODO: probably want separate exceptions for doesn't own the dataset and dataset
            # has been shared.
            raise exceptions.InsufficientPermissionsException("Cannot change dataset permissions...")

        quota_source_map = self.app.object_store.get_quota_source_map()
        if quota_source_map:
            old_label = quota_source_map.get_quota_source_label(old_object_store_id)
            new_label = quota_source_map.get_quota_source_label(new_object_store_id)
            if old_label != new_label:
                self.quota_agent.relabel_quota_for_dataset(dataset, old_label, new_label)
        sa_session = self.app.model.context
        with transaction(sa_session):
            dataset.object_store_id = new_object_store_id
            sa_session.add(dataset)
            sa_session.commit()

    def compute_hash(self, request: ComputeDatasetHashTaskRequest):
        """
        Compute the hash of a dataset file (or one of its extra files) and store it.

        A new ``DatasetHash`` row is committed only when none exists yet for the same
        dataset, hash function and extra files path. Otherwise nothing is written: a
        differing previous value is logged as a warning, an equal one at debug level.
        """
        dataset = self.by_id(request.dataset_id)
        extra_files_path = request.extra_files_path
        # For files in extra_files_path
        if extra_files_path:
            extra_dir = dataset.extra_files_path_name
            file_path = self.app.object_store.get_filename(dataset, extra_dir=extra_dir, alt_name=extra_files_path)
        else:
            file_path = dataset.get_file_name()
        hash_function = request.hash_function
        calculated_hash_value = memory_bound_hexdigest(hash_func_name=hash_function, path=file_path)
        dataset_hash = model.DatasetHash(
            hash_function=hash_function,
            hash_value=calculated_hash_value,
            extra_files_path=extra_files_path,
        )
        dataset_hash.dataset = dataset
        # TODO: replace/update if the combination of dataset_id/hash_function has already
        # been stored.
        sa_session = self.session()
        existing_hash = get_dataset_hash(sa_session, dataset.id, hash_function, extra_files_path)
        if existing_hash is None:
            sa_session.add(dataset_hash)
            with transaction(sa_session):
                sa_session.commit()
        else:
            old_hash_value = existing_hash.hash_value
            if old_hash_value != calculated_hash_value:
                log.warning(
                    f"Re-calculated dataset hash for dataset [{dataset.id}] and new hash value [{calculated_hash_value}] does not equal previous hash value [{old_hash_value}]."
                )
            else:
                log.debug("Duplicated dataset hash request, no update to the database.")
# TODO: implement above for groups # TODO: datatypes? # .... data, object_store # TODO: SecurityAgentDatasetRBACPermissions( object ):
class DatasetRBACPermissions:
    """
    Pairs the "access" and "manage" role-based permission helpers for datasets.

    The methods that change permissions run two operations; only the last one is given
    the caller's ``flush`` value.
    """

    def __init__(self, app):
        """Create the access and manage permission helpers for ``app``."""
        self.app = app
        self.access = rbac_secured.AccessDatasetRBACPermission(app)
        self.manage = rbac_secured.ManageDatasetRBACPermission(app)

    # TODO: temporary facade over security_agent
    def available_roles(self, trans, dataset, controller="root"):
        """Return the result of the security agent's ``get_legitimate_roles`` for ``dataset``."""
        return self.app.security_agent.get_legitimate_roles(trans, dataset, controller)

    def get(self, dataset, flush=True):
        """
        Return a ``(manage, access)`` tuple of the permissions looked up for ``dataset``.

        ``flush`` is accepted but not used.
        """
        manage = self.manage.by_dataset(dataset)
        access = self.access.by_dataset(dataset)
        return (manage, access)

    def set(self, dataset, manage_roles, access_roles, flush=True):
        """
        Set the manage and access roles of ``dataset`` and return ``(manage, access)``.

        ``None`` (or any falsy value) for either role argument is passed on as an empty list.
        """
        manage = self.manage.set(dataset, manage_roles or [], flush=False)
        access = self.access.set(dataset, access_roles or [], flush=flush)
        return (manage, access)

    # ---- conv. settings
    def set_public_with_single_manager(self, dataset, user, flush=True):
        """
        Grant ``user`` manage permission and clear all access permissions on ``dataset``.

        Returns ``([manage], [])``.
        """
        manage = self.manage.grant(dataset, user, flush=False)
        # the final operation carries the flush so that clearing access is flushed too
        self.access.clear(dataset, flush=flush)
        return ([manage], [])

    def set_private_to_one_user(self, dataset, user, flush=True):
        """
        Grant ``user`` manage permission and make ``dataset`` private to that user.

        Returns ``([manage], access)``.
        """
        manage = self.manage.grant(dataset, user, flush=False)
        access = self.access.set_private(dataset, user, flush=flush)
        return ([manage], access)
class DatasetSerializer(base.ModelSerializer[DatasetManager], deletable.PurgableSerializerMixin):
    """Serializer for datasets; registers a single "summary" view, which is also the default."""

    model_manager_class = DatasetManager

    def __init__(self, app: MinimalManagerApp, user_manager: users.UserManager):
        """Store the managers used by the serializers and register the "summary" view."""
        super().__init__(app)
        self.dataset_manager = self.manager
        # needed for admin test
        self.user_manager = user_manager

        self.default_view = "summary"
        summary_keys = [
            "id",
            "create_time",
            "update_time",
            "state",
            "deleted",
            "purged",
            "purgable",
            # 'object_store_id',
            # 'external_filename',
            # 'extra_files_path',
            "file_size",
            "total_size",
            "uuid",
        ]
        self.add_view("summary", summary_keys)
        # could do visualizations and/or display_apps

    def add_serializers(self):
        """Register the per-key serializer functions for datasets."""
        super().add_serializers()
        deletable.PurgableSerializerMixin.add_serializers(self)

        def serialize_uuid(item, key, **context):
            # a missing (falsy) uuid is reported as None instead of the string "None"
            return str(item.uuid) if item.uuid else None

        def serialize_total_size(item, key, **context):
            return int(item.get_total_size())

        def serialize_file_size(item, key, **context):
            return int(item.get_size(calculate_size=False))

        dataset_serializers: Dict[str, base.Serializer] = {
            "create_time": self.serialize_date,
            "update_time": self.serialize_date,
            "uuid": serialize_uuid,
            "file_name": self.serialize_file_name,
            "extra_files_path": self.serialize_extra_files_path,
            "permissions": self.serialize_permissions,
            "total_size": serialize_total_size,
            "file_size": serialize_file_size,
        }
        self.serializers.update(dataset_serializers)

    def serialize_file_name(self, item, key, user=None, **context):
        """
        If the config allows or the user is admin, return the file name
        of the file that contains this dataset's data.

        In every other case (including purged datasets) ``self.skip()`` is called.
        """
        # expensive: allow config option due to cost of operation
        may_expose = (
            self.user_manager.is_admin(user, trans=context.get("trans")) or self.app.config.expose_dataset_path
        )
        if may_expose and not item.purged:
            return item.get_file_name(sync_cache=False)
        self.skip()

    def serialize_extra_files_path(self, item, key, user=None, **context):
        """
        If the config allows or the user is admin, return the file path.

        In every other case (including purged datasets) ``self.skip()`` is called.
        """
        # expensive: allow config option due to cost of operation
        may_expose = (
            self.user_manager.is_admin(user, trans=context.get("trans")) or self.app.config.expose_dataset_path
        )
        if may_expose and not item.purged:
            return item.extra_files_path
        self.skip()

    def serialize_permissions(self, item, key, user=None, **context):
        """
        Return a dict with the encoded role ids of the "manage" and "access" permissions.

        ``self.skip()`` is called first when ``user`` is not permitted to manage the dataset.
        """
        permission_helpers = self.dataset_manager.permissions
        if not permission_helpers.manage.is_permitted(item, user, trans=context.get("trans")):
            self.skip()

        management_permissions = self.dataset_manager.permissions.manage.by_dataset(item)
        access_permissions = self.dataset_manager.permissions.access.by_dataset(item)
        encode_id = self.app.security.encode_id
        return {
            "manage": [encode_id(permission.role.id) for permission in management_permissions],
            "access": [encode_id(permission.role.id) for permission in access_permissions],
        }
# ============================================================================= AKA DatasetInstanceManager
class DatasetAssociationManager(
    base.ModelManager[model.DatasetInstance],
    secured.AccessibleManagerMixin,
    secured.OwnableManagerMixin,
    deletable.PurgableManagerMixin,
):
    """
    DatasetAssociation/DatasetInstances are intended to be working
    proxies to a Dataset, associated with either a library or a
    user/history (HistoryDatasetAssociation).
    """

    # DA's were meant to be proxies - but were never fully implemented as them
    # Instead, a dataset association HAS a dataset but contains metadata specific to a library (lda) or user (hda)
    app: MinimalManagerApp

    # NOTE: model_manager_class should be set in HDA/LDA subclasses

    def __init__(self, app):
        """Create the manager together with a ``DatasetManager`` for the underlying datasets."""
        super().__init__(app)
        self.dataset_manager = DatasetManager(app)

    def is_accessible(self, item, user: Optional[model.User], **kwargs: Any) -> bool:
        """
        Is this DA accessible to `user`?
        """
        # defer to the dataset
        underlying_dataset = item.dataset
        return self.dataset_manager.is_accessible(underlying_dataset, user, **kwargs)

    def delete(self, item, flush: bool = True, stop_job: bool = False, **kwargs):
        """
        Marks this dataset association as deleted.
        If `stop_job` is True, will stop the creating job if all other outputs are deleted.
        """
        super().delete(item, flush=flush)
        if not stop_job:
            return item
        self.stop_creating_job(item, flush=flush)
        return item

    def purge(self, dataset_assoc, flush=True):
        """
        Purge this DatasetInstance and the dataset underlying it.
        """
        # error here if disallowed - before jobs are stopped
        # TODO: this check may belong in the controller
        self.dataset_manager.error_unless_dataset_purge_allowed()

        # We need to ignore a potential flush=False here if jobs are not tracked in the database,
        # so that job cleanup associated with stop_creating_job will see
        # the dataset as purged.
        jobs_not_tracked = not self.app.config.track_jobs_in_database
        super().purge(dataset_assoc, flush=flush or jobs_not_tracked)

        # stop any jobs outputing the dataset_assoc
        self.stop_creating_job(dataset_assoc, flush=True)

        # more importantly, purge underlying dataset as well
        if dataset_assoc.dataset.user_can_purge:
            self.dataset_manager.purge(dataset_assoc.dataset)
        return dataset_assoc

    def by_user(self, user):
        """Abstract: always raises ``exceptions.NotImplemented``."""
        raise exceptions.NotImplemented("Abstract Method")

    # .... associated job
    def creating_job(self, dataset_assoc):
        """
        Return the `Job` that created this dataset or None if not found.
        """
        # TODO: is this needed? Can't you use the dataset_assoc.creating_job attribute? When is this None?
        # TODO: this would be even better if outputs and inputs were the underlying datasets
        # only the first association (if any) is consulted
        for job_output_assoc in dataset_assoc.creating_job_associations:
            return job_output_assoc.job
        return None

    def stop_creating_job(self, dataset_assoc, flush=False):
        """
        Stops a dataset_assoc's creating job if all the job's other outputs are deleted.

        Returns True when the job was marked deleted, False otherwise.
        """
        # Optimize this to skip other checks if this dataset is terminal - we can infer the
        # job is already complete.
        if dataset_assoc.state in model.Dataset.terminal_states:
            return False

        if dataset_assoc.parent_id is not None or len(dataset_assoc.creating_job_associations) == 0:
            return False

        # Mark associated job for deletion
        job = dataset_assoc.creating_job_associations[0].job
        if job.finished:
            return False
        # Are *all* of the job's other output datasets deleted?
        if not job.check_if_output_datasets_deleted():
            return False

        track_jobs_in_database = self.app.config.track_jobs_in_database
        job.mark_deleted(track_jobs_in_database)
        if not track_jobs_in_database:
            self.app.job_manager.stop(job)
        if flush:
            session = self.session()
            with transaction(session):
                session.commit()
        return True

    def is_composite(self, dataset_assoc):
        """
        Return True if this hda/ldda is a composite type dataset.

        .. note:: see also (whereever we keep information on composite datatypes?)
        """
        composite_extensions = self.app.datatypes_registry.get_composite_extensions()
        return dataset_assoc.extension in composite_extensions

    def extra_files(self, dataset_assoc):
        """Return a list of file paths for composite files, an empty list otherwise."""
        if self.is_composite(dataset_assoc):
            pattern = os.path.join(dataset_assoc.dataset.extra_files_path, "*")
            return glob.glob(pattern)
        return []

    def serialize_dataset_association_roles(self, trans, dataset_assoc):
        """
        Return a dict of ``(role name, encoded role id)`` lists for the access and manage roles.

        When ``dataset_assoc`` has a ``library_dataset_dataset_association`` attribute it is
        treated as a library dataset and a ``modify_item_roles`` list is added as well.
        """
        if hasattr(dataset_assoc, "library_dataset_dataset_association"):
            library_dataset = dataset_assoc
            dataset = library_dataset.library_dataset_dataset_association.dataset
        else:
            library_dataset = None
            dataset = dataset_assoc.dataset

        def name_and_encoded_id(roles):
            return [(role.name, trans.security.encode_id(role.id)) for role in roles]

        # Omit duplicated roles by converting to set
        security_agent = trans.app.security_agent
        access_roles = set(dataset.get_access_roles(security_agent))
        manage_roles = set(dataset.get_manage_permissions_roles(security_agent))

        access_dataset_role_list = name_and_encoded_id(access_roles)
        manage_dataset_role_list = name_and_encoded_id(manage_roles)
        rval = dict(access_dataset_roles=access_dataset_role_list, manage_dataset_roles=manage_dataset_role_list)
        if library_dataset is None:
            return rval

        modify_roles = set(
            security_agent.get_roles_for_action(
                library_dataset, trans.app.security_agent.permitted_actions.LIBRARY_MODIFY
            )
        )
        rval["modify_item_roles"] = name_and_encoded_id(modify_roles)
        return rval

    def ensure_dataset_on_disk(self, trans, dataset):
        """
        Raise unless the dataset association looks viewable.

        :raises exceptions.ItemDeletionException: if purged, discarded, or deleted and the
            user is neither admin nor owner
        :raises exceptions.Conflict: if the state is upload, deferred or paused
        """
        # Not a guarantee data is really present, but excludes a lot of expected cases
        # Every check raises, so the order below is the order of precedence.
        if dataset.purged or dataset.dataset.purged:
            raise exceptions.ItemDeletionException("The dataset you are attempting to view has been purged.")
        if dataset.deleted and not trans.user_is_admin and not self.is_owner(dataset, trans.get_user()):
            raise exceptions.ItemDeletionException("The dataset you are attempting to view has been deleted.")
        if dataset.state == Dataset.states.UPLOAD:
            raise exceptions.Conflict("Please wait until this dataset finishes uploading before attempting to view it.")
        if dataset.state == Dataset.states.DISCARDED:
            raise exceptions.ItemDeletionException("The dataset you are attempting to view has been discarded.")
        if dataset.state == Dataset.states.DEFERRED:
            raise exceptions.Conflict(
                "The dataset you are attempting to view has deferred data. You can only use this dataset as input for jobs."
            )
        if dataset.state == Dataset.states.PAUSED:
            raise exceptions.Conflict(
                "The dataset you are attempting to view is in paused state. One of the inputs for the job that creates this dataset has failed."
            )

    def ensure_can_change_datatype(self, dataset: model.DatasetInstance, raiseException: bool = True) -> bool:
        """
        Return True if the current datatype allows being changed.

        Otherwise raise ``exceptions.InsufficientPermissionsException``, or return False
        when ``raiseException`` is false.
        """
        if dataset.datatype.is_datatype_change_allowed():
            return True
        if raiseException:
            raise exceptions.InsufficientPermissionsException(
                f'Changing datatype "{dataset.extension}" is not allowed.'
            )
        return False

    def ensure_can_set_metadata(self, dataset: model.DatasetInstance, raiseException: bool = True) -> bool:
        """
        Return True if ``dataset.ok_to_edit_metadata()`` holds.

        Otherwise raise ``exceptions.ItemAccessibilityException``, or return False
        when ``raiseException`` is false.
        """
        if dataset.ok_to_edit_metadata():
            return True
        if raiseException:
            raise exceptions.ItemAccessibilityException(
                "This dataset is currently being used as input or output. You cannot change datatype until the jobs have completed or you have canceled them."
            )
        return False

    def detect_datatype(self, trans, dataset_assoc):
        """Sniff and assign the datatype to a given dataset association (ldda or hda)"""
        instance = trans.sa_session.get(self.model_class, dataset_assoc.id)
        self.ensure_can_change_datatype(instance)
        self.ensure_can_set_metadata(instance)
        file_path = instance.dataset.get_file_name()
        sniffed_extension = sniff.guess_ext(file_path, trans.app.datatypes_registry.sniff_order)
        trans.app.datatypes_registry.change_datatype(instance, sniffed_extension)
        with transaction(trans.sa_session):
            trans.sa_session.commit()
        self.set_metadata(trans, dataset_assoc)

    def set_metadata(self, trans, dataset_assoc, overwrite=False, validate=True):
        """Trigger a job that detects and sets metadata on a given dataset association (ldda or hda)"""
        instance = trans.sa_session.get(self.model_class, dataset_assoc.id)
        self.ensure_can_set_metadata(instance)
        if overwrite:
            self.overwrite_metadata(instance)

        metadata_tool = self.app.datatypes_registry.set_external_metadata_tool
        # only the first element of the result (the job) is used
        job, *_ = metadata_tool.tool_action.execute(
            metadata_tool,
            trans,
            incoming={"input1": instance, "validate": validate},
            overwrite=overwrite,
        )
        self.app.job_manager.enqueue(job, tool=metadata_tool)

    def overwrite_metadata(self, data):
        """Reset each metadata element that has a truthy spec default to that default."""
        # We need to be careful about the attributes we are resetting
        preserved = ("name", "info", "dbkey", "base_name")
        for name, spec in data.metadata.spec.items():
            if name in preserved:
                continue
            default = spec.get("default")
            if default:
                setattr(data.metadata, name, spec.unwrap(default))

    def update_permissions(self, trans, dataset_assoc, **kwd):
        """
        Change the permissions of the dataset behind ``dataset_assoc``.

        The ``action`` keyword (default ``"set_permissions"``) selects the operation:
        ``"remove_restrictions"`` makes the dataset public, ``"make_private"`` restricts
        access to the current user's private role, and ``"set_permissions"`` hands the role
        ids found in ``kwd`` to ``_set_permissions``.

        :raises exceptions.RequestParameterInvalidException: on an unknown ``action``
        :raises exceptions.InsufficientPermissionsException: if the user may not manage the dataset
        :raises exceptions.InternalServerError: if the dataset is not public/private afterwards
        """
        action = kwd.get("action", "set_permissions")
        if action not in ("remove_restrictions", "make_private", "set_permissions"):
            raise exceptions.RequestParameterInvalidException(
                'The mandatory parameter "action" has an invalid value. '
                'Allowed values are: "remove_restrictions", "make_private", "set_permissions"'
            )

        if hasattr(dataset_assoc, "library_dataset_dataset_association"):
            library_dataset = dataset_assoc
            dataset = library_dataset.library_dataset_dataset_association.dataset
        else:
            library_dataset = None
            dataset = dataset_assoc.dataset

        current_user_roles = trans.get_current_user_roles()
        can_manage = trans.app.security_agent.can_manage_dataset(current_user_roles, dataset) or trans.user_is_admin
        if not can_manage:
            raise exceptions.InsufficientPermissionsException(
                "You do not have proper permissions to manage permissions on this dataset."
            )

        if action == "remove_restrictions":
            trans.app.security_agent.make_dataset_public(dataset)
            if not trans.app.security_agent.dataset_is_public(dataset):
                raise exceptions.InternalServerError("An error occurred while making dataset public.")
            return

        if action == "make_private":
            if not trans.app.security_agent.dataset_is_private_to_user(trans, dataset):
                private_role = trans.app.security_agent.get_private_user_role(trans.user)
                dp = trans.app.model.DatasetPermissions(
                    trans.app.security_agent.permitted_actions.DATASET_ACCESS.action, dataset, private_role
                )
                trans.sa_session.add(dp)
                with transaction(trans.sa_session):
                    trans.sa_session.commit()
            if not trans.app.security_agent.dataset_is_private_to_user(trans, dataset):
                # Check again and inform the user if dataset is not private.
                raise exceptions.InternalServerError("An error occurred and the dataset is NOT private.")
            return

        # action == "set_permissions" is the only value left after the validation above

        def parameters_roles_or_none(role_type):
            # the plain key wins over the "<role_type>_ids[]" form
            return kwd.get(role_type, kwd.get(f"{role_type}_ids[]"))

        access_roles = parameters_roles_or_none("access")
        manage_roles = parameters_roles_or_none("manage")
        modify_roles = parameters_roles_or_none("modify")
        role_ids_dict = {
            "DATASET_MANAGE_PERMISSIONS": manage_roles,
            "DATASET_ACCESS": access_roles,
        }
        if library_dataset is not None:
            role_ids_dict["LIBRARY_MODIFY"] = modify_roles

        self._set_permissions(trans, dataset_assoc, role_ids_dict)

    def _set_permissions(self, trans, dataset_assoc, roles_dict):
        """Not implemented here: always raises ``exceptions.NotImplemented``."""
        raise exceptions.NotImplemented()
class _UnflattenedMetadataDatasetAssociationSerializer(base.ModelSerializer[T], deletable.PurgableSerializerMixin):
    """Serializer for dataset associations that keeps metadata as a nested "metadata" sub-object."""

    def __init__(self, app):
        # NOTE(review): assigned before super().__init__ - presumably because the base
        # constructor calls add_serializers(), which reads it; confirm before reordering.
        self.dataset_serializer = app[DatasetSerializer]
        super().__init__(app)

    def add_serializers(self):
        """Register the per-key serializer functions shared by dataset associations."""
        super().add_serializers()
        deletable.PurgableSerializerMixin.add_serializers(self)

        def serialize_dataset(item, key, **context):
            return self.dataset_serializer.serialize_to_view(item.dataset, view="summary", **context)

        def serialize_size(item, key, **context):
            return int(item.get_size(calculate_size=False))

        def serialize_file_size(item, key, **context):
            # looked up at call time, so a replaced "size" serializer is honoured
            return self.serializers["size"](item, key, **context)

        def serialize_nice_size(item, key, **context):
            return item.get_size(nice_size=True, calculate_size=False)

        def serialize_info(item, key, **context):
            return item.info.strip() if isinstance(item.info, str) else item.info

        def serialize_peek(item, key, **context):
            if item.peek and item.peek != "no peek":
                return item.display_peek()
            return None

        def serialize_genome_build(item, key, **context):
            return str(item.dbkey) if item.dbkey is not None else None

        def serialize_data_type(item, key, **context):
            datatype_class = item.datatype.__class__
            return f"{datatype_class.__module__}.{datatype_class.__name__}"

        association_serializers: Dict[str, base.Serializer] = {
            "create_time": self.serialize_date,
            "update_time": self.serialize_date,
            # underlying dataset
            "dataset": serialize_dataset,
            "dataset_id": self._proxy_to_dataset(proxy_key="id"),
            # TODO: why is this named uuid!? The da doesn't have a uuid - it's the underlying dataset's uuid!
            "uuid": self._proxy_to_dataset(proxy_key="uuid"),
            # 'dataset_uuid': self._proxy_to_dataset( key='uuid' ),
            "file_name": self._proxy_to_dataset(serializer=self.dataset_serializer.serialize_file_name),
            "extra_files_path": self._proxy_to_dataset(serializer=self.dataset_serializer.serialize_extra_files_path),
            "permissions": self._proxy_to_dataset(serializer=self.dataset_serializer.serialize_permissions),
            # TODO: do the sizes proxy accurately/in the same way?
            "size": serialize_size,
            "file_size": serialize_file_size,
            "nice_size": serialize_nice_size,
            # common to lddas and hdas - from mapping.py
            "copied_from_history_dataset_association_id": self.serialize_id,
            "copied_from_library_dataset_dataset_association_id": self.serialize_id,
            "info": serialize_info,
            "blurb": lambda item, key, **context: item.blurb,
            "peek": serialize_peek,
            "meta_files": self.serialize_meta_files,
            "metadata": self.serialize_metadata,
            "creating_job": self.serialize_creating_job,
            "rerunnable": self.serialize_rerunnable,
            "parent_id": self.serialize_id,
            "designation": lambda item, key, **context: item.designation,
            # 'extended_metadata': self.serialize_extended_metadata,
            # 'extended_metadata_id': self.serialize_id,
            # remapped
            # TODO: Replace string cast with https://github.com/pydantic/pydantic/pull/9137 on 24.1
            "genome_build": serialize_genome_build,
            # derived (not mapped) attributes
            "data_type": serialize_data_type,
            "converted": self.serialize_converted_datasets,
            # TODO: metadata/extra files
        }
        self.serializers.update(association_serializers)
        # this an abstract superclass, so no views created
        # because of that: we need to add a few keys that will use the default serializer
        self.serializable_keyset.update(["name", "state", "tool_version", "extension", "visible", "dbkey"])

    def _proxy_to_dataset(self, serializer: Optional[base.Serializer] = None, proxy_key: Optional[str] = None):
        """
        Return a serializer function that serializes ``item.dataset`` instead of ``item``.

        :param serializer: the function to apply to the dataset
        :param proxy_key: a dataset serializer key; when given it replaces ``serializer``
            and is also the key passed on
        :raises TypeError: if no serializer can be determined
        """
        # dataset associations are (rough) proxies to datasets - access their serializer using this remapping fn
        # remapping done by either kwarg key: IOW dataset attr key (e.g. uuid)
        # or by kwarg serializer: a function that's passed in (e.g. permissions)
        if proxy_key:
            serializer = self.dataset_serializer.serializers.get(proxy_key)
        if not serializer:
            raise TypeError("kwarg serializer or key needed")

        def serialize_via_dataset(item, key, **context):
            return serializer(item.dataset, proxy_key or key, **context)

        return serialize_via_dataset

    def serialize_meta_files(self, item, key, **context):
        """
        Cycle through meta files and return them as a list of dictionaries.
        """
        meta_files = []
        for meta_type in item.metadata_file_types:
            # skip file types with no (truthy) value in the metadata
            if not getattr(item.metadata, meta_type, None):
                continue
            download_url = self.url_for(
                "get_metadata_file",
                history_id=self.app.security.encode_id(item.history_id),
                history_content_id=self.app.security.encode_id(item.id),
                query_params={"metadata_file": meta_type},
                context=context,
            )
            meta_files.append(dict(file_type=meta_type, download_url=download_url))
        return meta_files

    def serialize_metadata(self, item, key, excluded=None, **context):
        """
        Cycle through metadata and return as dictionary.

        :param excluded: optional collection of metadata names to leave out
        """
        # dbkey is a repeat actually (metadata_dbkey == genome_build)
        # excluded = [ 'dbkey' ] if excluded is None else excluded
        if excluded is None:
            excluded = []

        metadata = {}
        for name, spec in item.metadata.spec.items():
            if name in excluded:
                continue
            value = item.metadata.get(name)
            # NOTE: no files
            if isinstance(value, model.MetadataFile):
                # only when explicitly set: fetching filepaths can be expensive
                if not self.app.config.expose_dataset_path:
                    continue
                value = value.get_file_name()
            # TODO:? possibly split this off?
            # If no value for metadata, look in datatype for metadata.
            elif value is None and hasattr(item.datatype, name):
                value = getattr(item.datatype, name)
            if value is None and spec.get("optional"):
                continue
            metadata[name] = value
        return metadata

    def serialize_creating_job(self, item, key, **context):
        """
        Return the id of the Job that created this dataset (or its original)
        or None if no `creating_job` is found.
        """
        job = item.creating_job
        if not job:
            return None
        return self.serialize_id(job, "id")

    def serialize_rerunnable(self, item, key, **context):
        """
        Return False if this tool that created this dataset can't be re-run
        (e.g. upload).
        """
        job = item.creating_job
        if not job:
            return False
        tool = self.app.toolbox.get_tool(job.tool_id, job.tool_version)
        return bool(tool and tool.is_workflow_compatible)

    def serialize_converted_datasets(self, item, key, **context):
        """
        Return a file extension -> converted dataset encoded id map with all
        the existing converted datasets associated with this instance.

        This filters out deleted associations.
        """
        return {
            converted.type: self.serialize_id(converted.dataset, "id")
            for converted in item.implicitly_converted_datasets
            if not converted.deleted and converted.dataset
        }
class DatasetAssociationSerializer(_UnflattenedMetadataDatasetAssociationSerializer[T]):
    """Variant of the superclass that emits metadata as flattened ``metadata_<name>`` keys."""

    # TODO: remove this class - metadata should be a sub-object instead as in the superclass

    def add_serializers(self):
        """Register the superclass serializers, minus the nested "metadata" one."""
        super().add_serializers()
        # remove the single nesting key here
        self.serializers.pop("metadata")

    def serialize(self, dataset_assoc, keys, **context):
        """
        Override to add metadata as flattened keys on the serialized DatasetInstance.

        Note that "metadata" is removed from the ``keys`` list passed in.
        """
        # if 'metadata' isn't removed from keys here serialize will retrieve the un-serializable MetadataCollection
        # TODO: remove these when metadata is sub-object
        KEYS_HANDLED_SEPARATELY = ("metadata",)
        handled_here = self._pluck_from_list(keys, KEYS_HANDLED_SEPARATELY)
        serialized = super().serialize(dataset_assoc, keys, **context)

        # add metadata directly to the dict instead of as a sub-object
        if "metadata" in handled_here:
            serialized.update(self._prefixed_metadata(dataset_assoc))
        return serialized

    # TODO: this is more util/gen. use
    def _pluck_from_list(self, list_, elems):
        """
        Removes found elems from list list_ and returns list of found elems if found.

        Only the first occurrence of each elem is removed; ``list_`` is changed in place.
        """
        plucked = []
        for wanted in elems:
            if wanted in list_:
                plucked.append(list_.pop(list_.index(wanted)))
        return plucked

    def _prefixed_metadata(self, dataset_assoc):
        """
        Adds (a prefixed version of) the DatasetInstance metadata to the dict,
        prefixing each key with 'metadata_'.
        """
        # build the original, nested dictionary, then prefix each key within
        nested = self.serialize_metadata(dataset_assoc, "metadata")
        return {f"metadata_{name}": value for name, value in nested.items()}
class DatasetAssociationDeserializer(base.ModelDeserializer, deletable.PurgableDeserializerMixin):
    """Deserializer for the editable attributes of dataset associations."""

    def add_deserializers(self):
        """Register the deserializers for ``name``, ``info`` and ``datatype``."""
        super().add_deserializers()
        deletable.PurgableDeserializerMixin.add_deserializers(self)

        self.deserializers.update(
            {
                "name": self.deserialize_basestring,
                "info": self.deserialize_basestring,
                "datatype": self.deserialize_datatype,
            }
        )
        self.deserializable_keyset.update(self.deserializers.keys())

    # TODO: untested
    def deserialize_metadata(self, dataset_assoc, metadata_key, metadata_dict, **context):
        """
        Deserialize every entry of ``metadata_dict`` onto ``dataset_assoc``.

        ``metadata_dict`` is first validated to be a dict. Returns a dict that maps each key
        to the result of ``deserialize_metadatum`` (``None`` for entries that were skipped).
        """
        self.validate.matches_type(metadata_key, metadata_dict, dict)
        returned = {}
        for key, val in metadata_dict.items():
            returned[key] = self.deserialize_metadatum(dataset_assoc, key, val, **context)
        return returned

    def deserialize_metadatum(self, dataset_assoc, key, val, **context):
        """
        Set a single metadata value on ``dataset_assoc`` and return the unwrapped value.

        Keys that are not in the datatype's metadata spec, or whose spec is marked
        ``readonly``, are ignored and ``None`` is returned.
        """
        if key not in dataset_assoc.datatype.metadata_spec:
            return
        metadata_specification = dataset_assoc.datatype.metadata_spec[key]
        if metadata_specification.get("readonly"):
            return
        unwrapped_val = metadata_specification.unwrap(val)
        setattr(dataset_assoc.metadata, key, unwrapped_val)
        # ...?
        return unwrapped_val

    def deserialize_datatype(self, item, key, val, **context):
        """
        Change the datatype of ``item`` to the extension ``val``, commit, and enqueue a
        set-metadata job. Returns the resulting ``item.datatype``.

        A ``trans`` object is required in ``context``; it is checked before anything is
        changed so that a missing ``trans`` cannot leave a committed datatype change
        without its metadata job.

        :raises exceptions.RequestParameterInvalidException: if the current or target
            datatype does not allow the change, the target datatype does not exist, or
            the metadata cannot be edited right now
        :raises AssertionError: if no ``trans`` was passed in ``context``
        """
        if not item.datatype.is_datatype_change_allowed():
            raise exceptions.RequestParameterInvalidException("The current datatype does not allow datatype changes.")
        target_datatype = self.app.datatypes_registry.get_datatype_by_extension(val)
        if not target_datatype:
            raise exceptions.RequestParameterInvalidException("The target datatype does not exist.")
        if not target_datatype.is_datatype_change_allowed():
            raise exceptions.RequestParameterInvalidException("The target datatype does not allow datatype changes.")
        if not item.ok_to_edit_metadata():
            raise exceptions.RequestParameterInvalidException(
                "Dataset metadata could not be updated because it is used as input or output of a running job."
            )
        # Validate the precondition before mutating and committing anything.
        trans = context.get("trans")
        assert (
            trans
        ), "Logic error in Galaxy, deserialize_datatype not send a transation object"  # TODO: restructure this for stronger typing
        item.change_datatype(val)
        sa_session = self.app.model.context
        with transaction(sa_session):
            sa_session.commit()
        job, *_ = self.app.datatypes_registry.set_external_metadata_tool.tool_action.execute(
            self.app.datatypes_registry.set_external_metadata_tool, trans, incoming={"input1": item}, overwrite=False
        )  # overwrite is False as per existing behavior
        trans.app.job_manager.enqueue(job, tool=trans.app.datatypes_registry.set_external_metadata_tool)
        return item.datatype
class DatasetAssociationFilterParser(base.ModelFilterParser, deletable.PurgableFiltersMixin):
    """Filter parser for dataset associations: ORM filters plus datatype/genome build function filters."""

    def _add_parsers(self):
        """Register the ORM and function filter parsers for dataset associations."""
        super()._add_parsers()
        deletable.PurgableFiltersMixin._add_parsers(self)

        self.orm_filter_parsers.update(
            {
                "name": {"op": ("eq", "contains", "like")},
                "state": {"column": "_state", "op": ("eq", "in")},
                # one-element tuple: a bare ("eq") is just the string "eq", which would turn
                # membership tests on the allowed ops into substring tests
                "visible": {"op": ("eq",), "val": base.parse_bool},
            }
        )
        self.fn_filter_parsers.update(
            {
                "genome_build": self.string_standard_ops("dbkey"),
                "data_type": {"op": {"eq": self.eq_datatype, "isinstance": self.isinstance_datatype}},
            }
        )

    def eq_datatype(self, dataset_assoc, class_str):
        """
        Is the `dataset_assoc` datatype equal to the registered datatype `class_str`?

        Returns False when `class_str` does not resolve to a datatype class.
        """
        comparison_class = self.app.datatypes_registry.get_datatype_class_by_name(class_str)
        if not comparison_class:
            return False
        return bool(dataset_assoc.datatype.__class__ == comparison_class)

    def isinstance_datatype(self, dataset_assoc, class_strs):
        """
        Is the `dataset_assoc` datatype derived from any of the registered datatypes
        in the comma separated string `class_strs`?

        Names that do not resolve to a datatype class are ignored; returns False when
        none resolve.
        """
        parse_datatype_fn = self.app.datatypes_registry.get_datatype_class_by_name
        comparison_classes: List[Type] = []
        for class_str in class_strs.split(","):
            datatype_class = parse_datatype_fn(class_str)
            if datatype_class:
                comparison_classes.append(datatype_class)
        if not comparison_classes:
            return False
        return isinstance(dataset_assoc.datatype, tuple(comparison_classes))
def get_dataset_hash(session, dataset_id, hash_function, extra_files_path):
    """
    Return the ``DatasetHash`` matching the dataset id, hash function and extra files
    path, or ``None`` when there is no such row.
    """
    # several criteria passed to a single where() are joined with AND
    criteria = (
        DatasetHash.dataset_id == dataset_id,
        DatasetHash.hash_function == hash_function,
        DatasetHash.extra_files_path == extra_files_path,
    )
    query = select(DatasetHash).where(*criteria)
    return session.scalars(query).one_or_none()