Source code for galaxy.model.migrate.versions.0005_cleanup_datasets_fix

import datetime
import errno
import logging
import os
import time

from sqlalchemy import (
    and_,
    Boolean,
    Column,
    DateTime,
    false,
    ForeignKey,
    Integer,
    MetaData,
    not_,
    Numeric,
    Table,
    TEXT,
    true
)
from sqlalchemy.orm import (
    backref,
    mapper,
    relation,
    scoped_session,
    sessionmaker
)

from galaxy.model.custom_types import (
    MetadataType,
    TrimmedString
)
from galaxy.model.metadata import MetadataCollection
from galaxy.util.bunch import Bunch

log = logging.getLogger(__name__)
now = datetime.datetime.utcnow
metadata = MetaData()
context = scoped_session(sessionmaker(autoflush=False, autocommit=True))


# classes
def get_permitted_actions(**kwds):
    return Bunch()

def directory_hash_id(id):
    s = str(id)
    l = len(s)
    # Shortcut -- ids 0-999 go under ../000/
    if l < 4:
        return ["000"]
    # Pad with zeros until a multiple of three
    padded = ((3 - len(s) % 3) * "0") + s
    # Drop the last three digits -- 1000 files per directory
    padded = padded[:-3]
    # Break into chunks of three
    return [padded[i * 3:(i + 1) * 3] for i in range(len(padded) // 3)]

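# Illustrative sketch, not part of the original migration module: how
# directory_hash_id() buckets dataset ids into directory components, assuming
# the default Dataset.file_path of "/tmp/":
#
#     directory_hash_id(42)       -> ["000"]           # ids 0-999 share .../000/
#     directory_hash_id(1234)     -> ["001"]           # /tmp/001/dataset_1234.dat
#     directory_hash_id(1234567)  -> ["001", "234"]    # /tmp/001/234/dataset_1234567.dat
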

class Dataset:
    states = Bunch(NEW='new',
                   UPLOAD='upload',
                   QUEUED='queued',
                   RUNNING='running',
                   OK='ok',
                   EMPTY='empty',
                   ERROR='error',
                   DISCARDED='discarded')
    permitted_actions = get_permitted_actions(filter='DATASET')
    file_path = "/tmp/"
    engine = None

    def __init__(self, id=None, state=None, external_filename=None, extra_files_path=None, file_size=None, purgable=True):
        self.id = id
        self.state = state
        self.deleted = False
        self.purged = False
        self.purgable = purgable
        self.external_filename = external_filename
        self._extra_files_path = extra_files_path
        self.file_size = file_size

    def get_file_name(self):
        if not self.external_filename:
            assert self.id is not None, "ID must be set before filename used (commit the object)"
            # First try filename directly under file_path
            filename = os.path.join(self.file_path, "dataset_%d.dat" % self.id)
            # Only use that filename if it already exists (backward compatibility),
            # otherwise construct hashed path
            if not os.path.exists(filename):
                dir = os.path.join(self.file_path, *directory_hash_id(self.id))
                # Create directory if it does not exist
                try:
                    os.makedirs(dir)
                except OSError as e:
                    # File Exists is okay, otherwise reraise
                    if e.errno != errno.EEXIST:
                        raise
                # Return filename inside hashed directory
                return os.path.abspath(os.path.join(dir, "dataset_%d.dat" % self.id))
        else:
            filename = self.external_filename
        # Make filename absolute
        return os.path.abspath(filename)
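
    # Illustrative sketch, not part of the original migration module: for a
    # Dataset with id=1234567 and no external_filename, get_file_name() prefers
    # the flat legacy location and otherwise falls back to the hashed layout:
    #     /tmp/dataset_1234567.dat          (only used if it already exists)
    #     /tmp/001/234/dataset_1234567.dat  (hashed path, directory created on demand)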

    def set_file_name(self, filename):
        if not filename:
            self.external_filename = None
        else:
            self.external_filename = filename

    file_name = property(get_file_name, set_file_name)

    @property
    def extra_files_path(self):
        if self._extra_files_path:
            path = self._extra_files_path
        else:
            path = os.path.join(self.file_path, "dataset_%d_files" % self.id)
            # only use path directly under self.file_path if it exists
            if not os.path.exists(path):
                path = os.path.join(os.path.join(self.file_path, *directory_hash_id(self.id)), "dataset_%d_files" % self.id)
        # Make path absolute
        return os.path.abspath(path)

    def get_size(self):
        """Returns the size of the data on disk"""
        if self.file_size:
            return self.file_size
        else:
            try:
                return os.path.getsize(self.file_name)
            except OSError:
                return 0

    def set_size(self):
        """Sets the size of the data on disk"""
        try:
            if not self.file_size:
                self.file_size = os.path.getsize(self.file_name)
        except OSError:
            self.file_size = 0

    def has_data(self):
        """Detects whether there is any data"""
        return self.get_size() > 0

    def mark_deleted(self, include_children=True):
        self.deleted = True

    # FIXME: sqlalchemy will replace this
    def _delete(self):
        """Remove the file that corresponds to this data"""
        try:
            os.remove(self.data.file_name)
        except OSError as e:
            log.critical('{} delete error {}'.format(self.__class__.__name__, e))


class DatasetInstance:
    """A base class for all 'dataset instances', HDAs, LDAs, etc"""
    states = Dataset.states
    permitted_actions = Dataset.permitted_actions

    def __init__(self, id=None, hid=None, name=None, info=None, blurb=None, peek=None, extension=None, dbkey=None,
                 metadata=None, history=None, dataset=None, deleted=False, designation=None, parent_id=None,
                 validation_errors=None, visible=True, create_dataset=False):
        self.name = name or "Unnamed dataset"
        self.id = id
        self.info = info
        self.blurb = blurb
        self.peek = peek
        self.extension = extension
        self.designation = designation
        self.metadata = metadata or dict()
        if dbkey:  # dbkey is stored in metadata, only set if non-zero, or else we could clobber one supplied by input 'metadata'
            self.dbkey = dbkey
        self.deleted = deleted
        self.visible = visible
        # Relationships
        if not dataset and create_dataset:
            dataset = Dataset(state=Dataset.states.NEW)
            context.add(dataset)
            context.flush()
        self.dataset = dataset
        self.parent_id = parent_id
        self.validation_errors = validation_errors

    @property
    def ext(self):
        return self.extension

    def get_dataset_state(self):
        return self.dataset.state

    def set_dataset_state(self, state):
        self.dataset.state = state
        context.add(self.dataset)
        context.flush()  # flush here, because hda.flush() won't flush the Dataset object

    state = property(get_dataset_state, set_dataset_state)

    def get_file_name(self):
        return self.dataset.get_file_name()

    def set_file_name(self, filename):
        return self.dataset.set_file_name(filename)

    file_name = property(get_file_name, set_file_name)

    @property
    def extra_files_path(self):
        return self.dataset.extra_files_path

    def get_metadata(self):
        if not hasattr(self, '_metadata_collection') or self._metadata_collection.parent != self:
            # using weakref to store parent (to prevent circ ref), does a context.clear() cause parent to be
            # invalidated, while still copying over this non-database attribute?
            self._metadata_collection = MetadataCollection(self)
        return self._metadata_collection

    def set_metadata(self, bunch):
        # Needs to accept a MetadataCollection, a bunch, or a dict
        self._metadata = self.metadata.make_dict_copy(bunch)

    metadata = property(get_metadata, set_metadata)

    # This provides backwards compatibility with using the old dbkey
    # field in the database. That field now maps to "old_dbkey" (see mapping.py).
    def get_dbkey(self):
        dbkey = self.metadata.dbkey
        if not isinstance(dbkey, list):
            dbkey = [dbkey]
        if dbkey in [[None], []]:
            return "?"
        return dbkey[0]
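
    # Illustrative sketch, not part of the original migration module: get_dbkey()
    # normalizes the stored metadata value, so both "hg18" and ["hg18"] read back
    # as "hg18", while a missing dbkey ([None] or []) reads back as "?".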

    def set_dbkey(self, value):
        if "dbkey" in self.datatype.metadata_spec:
            if not isinstance(value, list):
                self.metadata.dbkey = [value]
            else:
                self.metadata.dbkey = value

    dbkey = property(get_dbkey, set_dbkey)

    def get_size(self):
        """Returns the size of the data on disk"""
        return self.dataset.get_size()

    def set_size(self):
        """Sets the size of the data on disk"""
        return self.dataset.set_size()

    def has_data(self):
        """Detects whether there is any data"""
        return self.dataset.has_data()

    def get_raw_data(self):
        """Returns the full data. To stream it open the file_name and read/write as needed"""
        return self.datatype.get_raw_data(self)

    def set_peek(self):
        return self.datatype.set_peek(self)

    def init_meta(self, copy_from=None):
        return self.datatype.init_meta(self, copy_from=copy_from)

    def set_meta(self, **kwd):
        self.clear_associated_files(metadata_safe=True)
        return self.datatype.set_meta(self, **kwd)

    def missing_meta(self, **kwd):
        return self.datatype.missing_meta(self, **kwd)

    def as_display_type(self, type, **kwd):
        return self.datatype.as_display_type(self, type, **kwd)

    def display_peek(self):
        return self.datatype.display_peek(self)

    def display_name(self):
        return self.datatype.display_name(self)

    def display_info(self):
        return self.datatype.display_info(self)

    def get_converted_files_by_type(self, file_type):
        valid = []
        for assoc in self.implicitly_converted_datasets:
            if not assoc.deleted and assoc.type == file_type:
                valid.append(assoc.dataset)
        return valid

    def clear_associated_files(self, metadata_safe=False, purge=False):
        raise Exception('Unimplemented')

    def get_child_by_designation(self, designation):
        for child in self.children:
            if child.designation == designation:
                return child
        return None

    def add_validation_error(self, validation_error):
        self.validation_errors.append(validation_error)

    def extend_validation_errors(self, validation_errors):
        self.validation_errors.extend(validation_errors)

    def mark_deleted(self, include_children=True):
        self.deleted = True
        if include_children:
            for child in self.children:
                child.mark_deleted()

    def mark_undeleted(self, include_children=True):
        self.deleted = False
        if include_children:
            for child in self.children:
                child.mark_undeleted()

    def undeletable(self):
        if self.purged:
            return False
        return True

    @property
    def source_library_dataset(self):
        def get_source(dataset):
            if isinstance(dataset, LibraryDatasetDatasetAssociation):
                if dataset.library_dataset:
                    return (dataset, dataset.library_dataset)
            if dataset.copied_from_library_dataset_dataset_association:
                source = get_source(dataset.copied_from_library_dataset_dataset_association)
                if source:
                    return source
            if dataset.copied_from_history_dataset_association:
                source = get_source(dataset.copied_from_history_dataset_association)
                if source:
                    return source
            return (None, None)
        return get_source(self)


class HistoryDatasetAssociation(DatasetInstance):

    def __init__(self, hid=None, history=None, copied_from_history_dataset_association=None,
                 copied_from_library_dataset_dataset_association=None, **kwd):
        DatasetInstance.__init__(self, **kwd)
        self.hid = hid
        # Relationships
        self.history = history
        self.copied_from_history_dataset_association = copied_from_history_dataset_association
        self.copied_from_library_dataset_dataset_association = copied_from_library_dataset_dataset_association

    def copy(self, copy_children=False, parent_id=None, target_history=None):
        hda = HistoryDatasetAssociation(hid=self.hid,
                                        name=self.name,
                                        info=self.info,
                                        blurb=self.blurb,
                                        peek=self.peek,
                                        extension=self.extension,
                                        dbkey=self.dbkey,
                                        dataset=self.dataset,
                                        visible=self.visible,
                                        deleted=self.deleted,
                                        parent_id=parent_id,
                                        copied_from_history_dataset_association=self,
                                        history=target_history)
        context.add(hda)
        context.flush()
        hda.set_size()
        # Need to set after flushed, as MetadataFiles require dataset.id
        hda.metadata = self.metadata
        if copy_children:
            for child in self.children:
                child.copy(copy_children=copy_children, parent_id=hda.id)
        if not self.datatype.copy_safe_peek:
            # In some instances peek relies on dataset_id, i.e. gmaj.zip for viewing MAFs
            hda.set_peek()
        context.flush()
        return hda

    def to_library_dataset_dataset_association(self, target_folder, replace_dataset=None, parent_id=None):
        if replace_dataset:
            # The replace_dataset param (when not None) refers to a LibraryDataset that is being replaced with a new version.
            library_dataset = replace_dataset
        else:
            # If replace_dataset is None, the Library level permissions will be taken from the folder and applied to the new
            # LibraryDataset, and the current user's DefaultUserPermissions will be applied to the associated Dataset.
            library_dataset = LibraryDataset(folder=target_folder, name=self.name, info=self.info)
            context.add(library_dataset)
            context.flush()
        ldda = LibraryDatasetDatasetAssociation(name=self.name,
                                                info=self.info,
                                                blurb=self.blurb,
                                                peek=self.peek,
                                                extension=self.extension,
                                                dbkey=self.dbkey,
                                                dataset=self.dataset,
                                                library_dataset=library_dataset,
                                                visible=self.visible,
                                                deleted=self.deleted,
                                                parent_id=parent_id,
                                                copied_from_history_dataset_association=self,
                                                user=self.history.user)
        context.add(ldda)
        context.flush()
        # Permissions must be the same on the LibraryDatasetDatasetAssociation and the associated LibraryDataset
        # Must set metadata after ldda flushed, as MetadataFiles require ldda.id
        ldda.metadata = self.metadata
        if not replace_dataset:
            target_folder.add_library_dataset(library_dataset, genome_build=ldda.dbkey)
            context.add(target_folder)
            context.flush()
        library_dataset.library_dataset_dataset_association_id = ldda.id
        context.add(library_dataset)
        context.flush()
        for child in self.children:
            child.to_library_dataset_dataset_association(target_folder=target_folder, replace_dataset=replace_dataset, parent_id=ldda.id)
        if not self.datatype.copy_safe_peek:
            # In some instances peek relies on dataset_id, i.e. gmaj.zip for viewing MAFs
            ldda.set_peek()
        context.flush()
        return ldda

    def clear_associated_files(self, metadata_safe=False, purge=False):
        # metadata_safe = True means to only clear when assoc.metadata_safe == False
        for assoc in self.implicitly_converted_datasets:
            if not metadata_safe or not assoc.metadata_safe:
                assoc.clear(purge=purge)


class LibraryDatasetDatasetAssociation(DatasetInstance):

    def __init__(self, copied_from_history_dataset_association=None, copied_from_library_dataset_dataset_association=None,
                 library_dataset=None, user=None, **kwd):
        DatasetInstance.__init__(self, **kwd)
        self.copied_from_history_dataset_association = copied_from_history_dataset_association
        self.copied_from_library_dataset_dataset_association = copied_from_library_dataset_dataset_association
        self.library_dataset = library_dataset
        self.user = user

    def to_history_dataset_association(self, target_history, parent_id=None):
        hid = target_history._next_hid()
        hda = HistoryDatasetAssociation(name=self.name,
                                        info=self.info,
                                        blurb=self.blurb,
                                        peek=self.peek,
                                        extension=self.extension,
                                        dbkey=self.dbkey,
                                        dataset=self.dataset,
                                        visible=self.visible,
                                        deleted=self.deleted,
                                        parent_id=parent_id,
                                        copied_from_library_dataset_dataset_association=self,
                                        history=target_history,
                                        hid=hid)
        context.add(hda)
        context.flush()
        hda.metadata = self.metadata  # need to set after flushed, as MetadataFiles require dataset.id
        for child in self.children:
            child.to_history_dataset_association(target_history=target_history, parent_id=hda.id)
        if not self.datatype.copy_safe_peek:
            hda.set_peek()  # in some instances peek relies on dataset_id, i.e. gmaj.zip for viewing MAFs
        context.flush()
        return hda

    def copy(self, copy_children=False, parent_id=None, target_folder=None):
        ldda = LibraryDatasetDatasetAssociation(name=self.name,
                                                info=self.info,
                                                blurb=self.blurb,
                                                peek=self.peek,
                                                extension=self.extension,
                                                dbkey=self.dbkey,
                                                dataset=self.dataset,
                                                visible=self.visible,
                                                deleted=self.deleted,
                                                parent_id=parent_id,
                                                copied_from_library_dataset_dataset_association=self,
                                                folder=target_folder)
        context.add(ldda)
        context.flush()
        # Need to set after flushed, as MetadataFiles require dataset.id
        ldda.metadata = self.metadata
        if copy_children:
            for child in self.children:
                child.copy(copy_children=copy_children, parent_id=ldda.id)
        if not self.datatype.copy_safe_peek:
            # In some instances peek relies on dataset_id, i.e. gmaj.zip for viewing MAFs
            ldda.set_peek()
        context.flush()
        return ldda

    def clear_associated_files(self, metadata_safe=False, purge=False):
        return

    def get_library_item_info_templates(self, template_list=[], restrict=False):
        # If restrict is True, we'll return only those templates directly associated with this LibraryDatasetDatasetAssociation
        if self.library_dataset_dataset_info_template_associations:
            template_list.extend([lddita.library_item_info_template
                                  for lddita in self.library_dataset_dataset_info_template_associations
                                  if lddita.library_item_info_template not in template_list])
        self.library_dataset.get_library_item_info_templates(template_list, restrict)
        return template_list


class LibraryDataset:
    # This class acts as a proxy to the currently selected LDDA

    def __init__(self, folder=None, order_id=None, name=None, info=None, library_dataset_dataset_association=None, **kwd):
        self.folder = folder
        self.order_id = order_id
        self.name = name
        self.info = info
        self.library_dataset_dataset_association = library_dataset_dataset_association

    def set_library_dataset_dataset_association(self, ldda):
        self.library_dataset_dataset_association = ldda
        ldda.library_dataset = self
        context.add_all((self, ldda))
        context.flush()

    def get_info(self):
        if self.library_dataset_dataset_association:
            return self.library_dataset_dataset_association.info
        elif self._info:
            return self._info
        else:
            return 'no info'

    def set_info(self, info):
        self._info = info

    info = property(get_info, set_info)

    def get_name(self):
        if self.library_dataset_dataset_association:
            return self.library_dataset_dataset_association.name
        elif self._name:
            return self._name
        else:
            return 'Unnamed dataset'

    def set_name(self, name):
        self._name = name

    name = property(get_name, set_name)

    def display_name(self):
        return self.library_dataset_dataset_association.display_name()

    def get_purged(self):
        return self.library_dataset_dataset_association.dataset.purged

    def set_purged(self, purged):
        if purged:
            raise Exception("Not implemented")
        if not purged and self.purged:
            raise Exception("Cannot unpurge once purged")

    purged = property(get_purged, set_purged)

    def get_library_item_info_templates(self, template_list=[], restrict=False):
        # If restrict is True, we'll return only those templates directly associated with this LibraryDataset
        if self.library_dataset_info_template_associations:
            template_list.extend([ldita.library_item_info_template
                                  for ldita in self.library_dataset_info_template_associations
                                  if ldita.library_item_info_template not in template_list])
        if restrict not in ['True', True]:
            self.folder.get_library_item_info_templates(template_list, restrict)
        return template_list


# tables
Dataset.table = Table(
    "dataset", metadata,
    Column("id", Integer, primary_key=True),
    Column("create_time", DateTime, default=now),
    Column("update_time", DateTime, index=True, default=now, onupdate=now),
    Column("state", TrimmedString(64)),
    Column("deleted", Boolean, index=True, default=False),
    Column("purged", Boolean, index=True, default=False),
    Column("purgable", Boolean, default=True),
    Column("external_filename", TEXT),
    Column("_extra_files_path", TEXT),
    Column('file_size', Numeric(15, 0)))

HistoryDatasetAssociation.table = Table(
    "history_dataset_association", metadata,
    Column("id", Integer, primary_key=True),
    Column("dataset_id", Integer, ForeignKey("dataset.id"), index=True),
    Column("create_time", DateTime, default=now),
    Column("update_time", DateTime, default=now, onupdate=now),
    Column("copied_from_history_dataset_association_id", Integer, ForeignKey("history_dataset_association.id"), nullable=True),
    Column("copied_from_library_dataset_dataset_association_id", Integer, ForeignKey("library_dataset_dataset_association.id"), nullable=True),
    Column("hid", Integer),
    Column("name", TrimmedString(255)),
    Column("info", TrimmedString(255)),
    Column("blurb", TrimmedString(255)),
    Column("peek", TEXT),
    Column("extension", TrimmedString(64)),
    Column("metadata", MetadataType(), key="_metadata"),
    Column("parent_id", Integer, ForeignKey("history_dataset_association.id"), nullable=True),
    Column("designation", TrimmedString(255)),
    Column("deleted", Boolean, index=True, default=False),
    Column("visible", Boolean))

LibraryDatasetDatasetAssociation.table = Table(
    "library_dataset_dataset_association", metadata,
    Column("id", Integer, primary_key=True),
    Column("library_dataset_id", Integer, ForeignKey("library_dataset.id"), index=True),
    Column("dataset_id", Integer, ForeignKey("dataset.id"), index=True),
    Column("create_time", DateTime, default=now),
    Column("update_time", DateTime, default=now, onupdate=now),
    Column("copied_from_history_dataset_association_id", Integer,
           ForeignKey("history_dataset_association.id", use_alter=True, name='history_dataset_association_dataset_id_fkey'),
           nullable=True),
    Column("copied_from_library_dataset_dataset_association_id", Integer,
           ForeignKey("library_dataset_dataset_association.id", use_alter=True, name='library_dataset_dataset_association_id_fkey'),
           nullable=True),
    Column("name", TrimmedString(255)),
    Column("info", TrimmedString(255)),
    Column("blurb", TrimmedString(255)),
    Column("peek", TEXT),
    Column("extension", TrimmedString(64)),
    Column("metadata", MetadataType(), key="_metadata"),
    Column("parent_id", Integer, ForeignKey("library_dataset_dataset_association.id"), nullable=True),
    Column("designation", TrimmedString(255)),
    Column("deleted", Boolean, index=True, default=False),
    Column("visible", Boolean),
    Column("message", TrimmedString(255)))

LibraryDataset.table = Table(
    "library_dataset", metadata,
    Column("id", Integer, primary_key=True),
    Column("library_dataset_dataset_association_id", Integer,
           ForeignKey("library_dataset_dataset_association.id", use_alter=True, name="library_dataset_dataset_association_id_fk"),
           nullable=True, index=True),  # current version of dataset, if null, there is not a current version selected
    Column("order_id", Integer),
    Column("create_time", DateTime, default=now),
    Column("update_time", DateTime, default=now, onupdate=now),
    Column("name", TrimmedString(255), key="_name"),  # when not None/null this will supersede display in library (but not when imported into user's history?)
    Column("info", TrimmedString(255), key="_info"),  # when not None/null this will supersede display in library (but not when imported into user's history?)
    Column("deleted", Boolean, index=True, default=False))

# mappers
mapper(Dataset, Dataset.table, properties=dict(
    history_associations=relation(
        HistoryDatasetAssociation,
        primaryjoin=(Dataset.table.c.id == HistoryDatasetAssociation.table.c.dataset_id)),
    active_history_associations=relation(
        HistoryDatasetAssociation,
        primaryjoin=((Dataset.table.c.id == HistoryDatasetAssociation.table.c.dataset_id)
                     & (HistoryDatasetAssociation.table.c.deleted == false()))),
    library_associations=relation(
        LibraryDatasetDatasetAssociation,
        primaryjoin=(Dataset.table.c.id == LibraryDatasetDatasetAssociation.table.c.dataset_id)),
    active_library_associations=relation(
        LibraryDatasetDatasetAssociation,
        primaryjoin=((Dataset.table.c.id == LibraryDatasetDatasetAssociation.table.c.dataset_id)
                     & (LibraryDatasetDatasetAssociation.table.c.deleted == false())))))

mapper(HistoryDatasetAssociation, HistoryDatasetAssociation.table, properties=dict(
    dataset=relation(
        Dataset,
        primaryjoin=(Dataset.table.c.id == HistoryDatasetAssociation.table.c.dataset_id), lazy=False),
    # .history defined in History mapper
    copied_to_history_dataset_associations=relation(
        HistoryDatasetAssociation,
        primaryjoin=(HistoryDatasetAssociation.table.c.copied_from_history_dataset_association_id == HistoryDatasetAssociation.table.c.id),
        backref=backref("copied_from_history_dataset_association",
                        primaryjoin=(HistoryDatasetAssociation.table.c.copied_from_history_dataset_association_id == HistoryDatasetAssociation.table.c.id),
                        remote_side=[HistoryDatasetAssociation.table.c.id],
                        uselist=False)),
    copied_to_library_dataset_dataset_associations=relation(
        LibraryDatasetDatasetAssociation,
        primaryjoin=(HistoryDatasetAssociation.table.c.copied_from_library_dataset_dataset_association_id == LibraryDatasetDatasetAssociation.table.c.id),
        backref=backref("copied_from_history_dataset_association",
                        primaryjoin=(HistoryDatasetAssociation.table.c.copied_from_library_dataset_dataset_association_id == LibraryDatasetDatasetAssociation.table.c.id),
                        remote_side=[LibraryDatasetDatasetAssociation.table.c.id],
                        uselist=False)),
    children=relation(
        HistoryDatasetAssociation,
        primaryjoin=(HistoryDatasetAssociation.table.c.parent_id == HistoryDatasetAssociation.table.c.id),
        backref=backref("parent",
                        primaryjoin=(HistoryDatasetAssociation.table.c.parent_id == HistoryDatasetAssociation.table.c.id),
                        remote_side=[HistoryDatasetAssociation.table.c.id],
                        uselist=False)),
    visible_children=relation(
        HistoryDatasetAssociation,
        primaryjoin=((HistoryDatasetAssociation.table.c.parent_id == HistoryDatasetAssociation.table.c.id)
                     & (HistoryDatasetAssociation.table.c.visible == true())))))

mapper(LibraryDatasetDatasetAssociation, LibraryDatasetDatasetAssociation.table, properties=dict(
    dataset=relation(Dataset),
    library_dataset=relation(
        LibraryDataset,
        primaryjoin=(LibraryDatasetDatasetAssociation.table.c.library_dataset_id == LibraryDataset.table.c.id)),
    copied_to_library_dataset_dataset_associations=relation(
        LibraryDatasetDatasetAssociation,
        primaryjoin=(LibraryDatasetDatasetAssociation.table.c.copied_from_library_dataset_dataset_association_id == LibraryDatasetDatasetAssociation.table.c.id),
        backref=backref("copied_from_library_dataset_dataset_association",
                        primaryjoin=(LibraryDatasetDatasetAssociation.table.c.copied_from_library_dataset_dataset_association_id == LibraryDatasetDatasetAssociation.table.c.id),
                        remote_side=[LibraryDatasetDatasetAssociation.table.c.id])),
    copied_to_history_dataset_associations=relation(
        HistoryDatasetAssociation,
        primaryjoin=(HistoryDatasetAssociation.table.c.copied_from_library_dataset_dataset_association_id == LibraryDatasetDatasetAssociation.table.c.id),
        backref=backref("copied_from_library_dataset_dataset_association",
                        primaryjoin=(HistoryDatasetAssociation.table.c.copied_from_library_dataset_dataset_association_id == LibraryDatasetDatasetAssociation.table.c.id),
                        remote_side=[LibraryDatasetDatasetAssociation.table.c.id],
                        uselist=False)),
    children=relation(
        LibraryDatasetDatasetAssociation,
        primaryjoin=(LibraryDatasetDatasetAssociation.table.c.parent_id == LibraryDatasetDatasetAssociation.table.c.id),
        backref=backref("parent",
                        primaryjoin=(LibraryDatasetDatasetAssociation.table.c.parent_id == LibraryDatasetDatasetAssociation.table.c.id),
                        remote_side=[LibraryDatasetDatasetAssociation.table.c.id])),
    visible_children=relation(
        LibraryDatasetDatasetAssociation,
        primaryjoin=((LibraryDatasetDatasetAssociation.table.c.parent_id == LibraryDatasetDatasetAssociation.table.c.id)
                     & (LibraryDatasetDatasetAssociation.table.c.visible == true())))))

mapper(LibraryDataset, LibraryDataset.table, properties=dict(
    library_dataset_dataset_association=relation(
        LibraryDatasetDatasetAssociation,
        primaryjoin=(LibraryDataset.table.c.library_dataset_dataset_association_id == LibraryDatasetDatasetAssociation.table.c.id)),
    expired_datasets=relation(
        LibraryDatasetDatasetAssociation,
        foreign_keys=[LibraryDataset.table.c.id, LibraryDataset.table.c.library_dataset_dataset_association_id],
        primaryjoin=((LibraryDataset.table.c.id == LibraryDatasetDatasetAssociation.table.c.library_dataset_id)
                     & (not_(LibraryDataset.table.c.library_dataset_dataset_association_id == LibraryDatasetDatasetAssociation.table.c.id))),
        viewonly=True,
        uselist=True)))


def __guess_dataset_by_filename(filename):
    """Return a guessed dataset by filename"""
    try:
        fields = os.path.split(filename)
        if fields:
            if fields[-1].startswith('dataset_') and fields[-1].endswith('.dat'):  # dataset_%d.dat
                return Dataset.get(int(fields[-1][len('dataset_'): -len('.dat')]))
    except Exception:
        pass  # some parsing error, we can't guess Dataset
    return None
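
# Illustrative sketch, not part of the original migration module:
# __guess_dataset_by_filename() inspects only the basename, so for example
#     __guess_dataset_by_filename("/old/share/path/dataset_42.dat")
# looks up the Dataset with id 42 via Dataset.get(), while any filename not
# matching dataset_<id>.dat (or any parsing error) yields None.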

def upgrade(migrate_engine):
    print(__doc__)
    metadata.bind = migrate_engine

    log.debug("Fixing a discrepancy concerning deleted shared history items.")
    affected_items = 0
    start_time = time.time()
    for dataset in context.query(Dataset).filter(and_(Dataset.deleted == true(), Dataset.purged == false())):
        for dataset_instance in dataset.history_associations + dataset.library_associations:
            if not dataset_instance.deleted:
                dataset.deleted = False
                if dataset.file_size in [None, 0]:
                    dataset.set_size()  # Restore filesize
                affected_items += 1
                break
    context.flush()
    log.debug("%i items affected, and restored." % (affected_items))
    log.debug("Time elapsed: %s" % (time.time() - start_time))

    # fix share before hda
    log.debug("Fixing a discrepancy concerning cleaning up deleted history items shared before HDAs.")
    dataset_by_filename = {}
    changed_associations = 0
    start_time = time.time()
    for dataset in context.query(Dataset).filter(Dataset.external_filename.like('%dataset_%.dat')):
        if dataset.file_name in dataset_by_filename:
            guessed_dataset = dataset_by_filename[dataset.file_name]
        else:
            guessed_dataset = __guess_dataset_by_filename(dataset.file_name)
            if guessed_dataset and dataset.file_name != guessed_dataset.file_name:  # not os.path.samefile( dataset.file_name, guessed_dataset.file_name ):
                guessed_dataset = None
            dataset_by_filename[dataset.file_name] = guessed_dataset
        if guessed_dataset is not None and guessed_dataset.id != dataset.id:  # could we have a self referential dataset?
            for dataset_instance in dataset.history_associations + dataset.library_associations:
                dataset_instance.dataset = guessed_dataset
                changed_associations += 1
            # mark original Dataset as deleted and purged, it is no longer in use, but do not delete file_name contents
            dataset.deleted = True
            dataset.external_filename = "Dataset was result of share before HDA, and has been replaced: {} mapped to Dataset {}".format(dataset.external_filename, guessed_dataset.id)
            dataset.purged = True  # we don't really purge the file here, but we mark it as purged, since this dataset is now defunct
    context.flush()
    log.debug("%i items affected, and restored." % (changed_associations))
    log.debug("Time elapsed: %s" % (time.time() - start_time))
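
# Illustrative sketch, not part of the original migration module: the
# sqlalchemy-migrate framework calls upgrade()/downgrade() with the engine of
# the configured Galaxy database; running this fix by hand would look roughly
# like the following (the database URL is a placeholder):
#
#     from sqlalchemy import create_engine
#     upgrade(create_engine("sqlite:///database/universe.sqlite"))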

def downgrade(migrate_engine):
    pass