Source code for galaxy.model.migrate.versions.0005_cleanup_datasets_fix

import datetime
import errno
import logging
import os
import time

from sqlalchemy import (
    and_,
    Boolean,
    Column,
    DateTime,
    false,
    ForeignKey,
    Integer,
    MetaData,
    not_,
    Numeric,
    Table,
    TEXT,
    true
)
from sqlalchemy.orm import (
    backref,
    mapper,
    relation,
    scoped_session,
    sessionmaker
)

from galaxy.model.custom_types import (
    MetadataType,
    TrimmedString
)
from galaxy.model.metadata import MetadataCollection
from galaxy.util.bunch import Bunch

log = logging.getLogger(__name__)
now = datetime.datetime.utcnow
metadata = MetaData()
context = scoped_session(sessionmaker(autoflush=False, autocommit=True))


# classes
def get_permitted_actions(**kwds):
    return Bunch()

def directory_hash_id(id):
    s = str(id)
    l = len(s)
    # Shortcut -- ids 0-999 go under ../000/
    if l < 4:
        return ["000"]
    # Pad with zeros until a multiple of three
    padded = ((3 - len(s) % 3) * "0") + s
    # Drop the last three digits -- 1000 files per directory
    padded = padded[:-3]
    # Break into chunks of three
    return [padded[i * 3:(i + 1) * 3] for i in range(len(padded) // 3)]

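# Illustrative sketch, not part of the original migration module: how
# directory_hash_id() buckets dataset ids into directory components, assuming
# the default Dataset.file_path of "/tmp/":
#
#     directory_hash_id(42)       -> ["000"]           # ids 0-999 share .../000/
#     directory_hash_id(1234)     -> ["001"]           # /tmp/001/dataset_1234.dat
#     directory_hash_id(1234567)  -> ["001", "234"]    # /tmp/001/234/dataset_1234567.dat
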

class Dataset:
    states = Bunch(NEW='new',
                   UPLOAD='upload',
                   QUEUED='queued',
                   RUNNING='running',
                   OK='ok',
                   EMPTY='empty',
                   ERROR='error',
                   DISCARDED='discarded')
    permitted_actions = get_permitted_actions(filter='DATASET')
    file_path = "/tmp/"
    engine = None

    def __init__(self, id=None, state=None, external_filename=None, extra_files_path=None, file_size=None, purgable=True):
        self.id = id
        self.state = state
        self.deleted = False
        self.purged = False
        self.purgable = purgable
        self.external_filename = external_filename
        self._extra_files_path = extra_files_path
        self.file_size = file_size

    def get_file_name(self):
        if not self.external_filename:
            assert self.id is not None, "ID must be set before filename used (commit the object)"
            # First try filename directly under file_path
            filename = os.path.join(self.file_path, "dataset_%d.dat" % self.id)
            # Only use that filename if it already exists (backward compatibility),
            # otherwise construct hashed path
            if not os.path.exists(filename):
                dir = os.path.join(self.file_path, *directory_hash_id(self.id))
                # Create directory if it does not exist
                try:
                    os.makedirs(dir)
                except OSError as e:
                    # File Exists is okay, otherwise reraise
                    if e.errno != errno.EEXIST:
                        raise
                # Return filename inside hashed directory
                return os.path.abspath(os.path.join(dir, "dataset_%d.dat" % self.id))
        else:
            filename = self.external_filename
        # Make filename absolute
        return os.path.abspath(filename)
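
    # Illustrative sketch, not part of the original migration module: for a
    # Dataset with id=1234567 and no external_filename, get_file_name() prefers
    # the flat legacy location and otherwise falls back to the hashed layout:
    #     /tmp/dataset_1234567.dat          (only used if it already exists)
    #     /tmp/001/234/dataset_1234567.dat  (hashed path, directory created on demand)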

    def set_file_name(self, filename):
        if not filename:
            self.external_filename = None
        else:
            self.external_filename = filename

    file_name = property(get_file_name, set_file_name)

    @property
    def extra_files_path(self):
        if self._extra_files_path:
            path = self._extra_files_path
        else:
            path = os.path.join(self.file_path, "dataset_%d_files" % self.id)
            # only use path directly under self.file_path if it exists
            if not os.path.exists(path):
                path = os.path.join(os.path.join(self.file_path, *directory_hash_id(self.id)), "dataset_%d_files" % self.id)
        # Make path absolute
        return os.path.abspath(path)

    def get_size(self):
        """Returns the size of the data on disk"""
        if self.file_size:
            return self.file_size
        else:
            try:
                return os.path.getsize(self.file_name)
            except OSError:
                return 0

    def set_size(self):
        """Sets the size of the data on disk"""
        try:
            if not self.file_size:
                self.file_size = os.path.getsize(self.file_name)
        except OSError:
            self.file_size = 0

    def has_data(self):
        """Detects whether there is any data"""
        return self.get_size() > 0

    def mark_deleted(self, include_children=True):
        self.deleted = True

    # FIXME: sqlalchemy will replace this
    def _delete(self):
        """Remove the file that corresponds to this data"""
        try:
            os.remove(self.data.file_name)
        except OSError as e:
            log.critical('{} delete error {}'.format(self.__class__.__name__, e))


class DatasetInstance:
    """A base class for all 'dataset instances', HDAs, LDAs, etc"""
    states = Dataset.states
    permitted_actions = Dataset.permitted_actions

    def __init__(self, id=None, hid=None, name=None, info=None, blurb=None, peek=None, extension=None, dbkey=None,
                 metadata=None, history=None, dataset=None, deleted=False, designation=None, parent_id=None,
                 validation_errors=None, visible=True, create_dataset=False):
        self.name = name or "Unnamed dataset"
        self.id = id
        self.info = info
        self.blurb = blurb
        self.peek = peek
        self.extension = extension
        self.designation = designation
        self.metadata = metadata or dict()
        if dbkey:  # dbkey is stored in metadata, only set if non-zero, or else we could clobber one supplied by input 'metadata'
            self.dbkey = dbkey
        self.deleted = deleted
        self.visible = visible
        # Relationships
        if not dataset and create_dataset:
            dataset = Dataset(state=Dataset.states.NEW)
            context.add(dataset)
            context.flush()
        self.dataset = dataset
        self.parent_id = parent_id
        self.validation_errors = validation_errors

    @property
    def ext(self):
        return self.extension

    def get_dataset_state(self):
        return self.dataset.state

    def set_dataset_state(self, state):
        self.dataset.state = state
        context.add(self.dataset)
        context.flush()  # flush here, because hda.flush() won't flush the Dataset object

    state = property(get_dataset_state, set_dataset_state)

    def get_file_name(self):
        return self.dataset.get_file_name()

    def set_file_name(self, filename):
        return self.dataset.set_file_name(filename)

    file_name = property(get_file_name, set_file_name)

    @property
    def extra_files_path(self):
        return self.dataset.extra_files_path

    def get_metadata(self):
        if not hasattr(self, '_metadata_collection') or self._metadata_collection.parent != self:
            # using weakref to store parent (to prevent circ ref), does a context.clear() cause parent to be
            # invalidated, while still copying over this non-database attribute?
            self._metadata_collection = MetadataCollection(self)
        return self._metadata_collection

    def set_metadata(self, bunch):
        # Needs to accept a MetadataCollection, a bunch, or a dict
        self._metadata = self.metadata.make_dict_copy(bunch)

    metadata = property(get_metadata, set_metadata)

    # This provides backwards compatibility with using the old dbkey
    # field in the database. That field now maps to "old_dbkey" (see mapping.py).
    def get_dbkey(self):
        dbkey = self.metadata.dbkey
        if not isinstance(dbkey, list):
            dbkey = [dbkey]
        if dbkey in [[None], []]:
            return "?"
        return dbkey[0]
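
    # Illustrative sketch, not part of the original migration module: get_dbkey()
    # normalizes the stored metadata value, so both "hg18" and ["hg18"] read back
    # as "hg18", while a missing dbkey ([None] or []) reads back as "?".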

    def set_dbkey(self, value):
        if "dbkey" in self.datatype.metadata_spec:
            if not isinstance(value, list):
                self.metadata.dbkey = [value]
            else:
                self.metadata.dbkey = value

    dbkey = property(get_dbkey, set_dbkey)

    def get_size(self):
        """Returns the size of the data on disk"""
        return self.dataset.get_size()

    def set_size(self):
        """Sets the size of the data on disk"""
        return self.dataset.set_size()

    def has_data(self):
        """Detects whether there is any data"""
        return self.dataset.has_data()

    def get_raw_data(self):
        """Returns the full data. To stream it open the file_name and read/write as needed"""
        return self.datatype.get_raw_data(self)

    def set_peek(self):
        return self.datatype.set_peek(self)

    def init_meta(self, copy_from=None):
        return self.datatype.init_meta(self, copy_from=copy_from)

    def set_meta(self, **kwd):
        self.clear_associated_files(metadata_safe=True)
        return self.datatype.set_meta(self, **kwd)

    def missing_meta(self, **kwd):
        return self.datatype.missing_meta(self, **kwd)

    def as_display_type(self, type, **kwd):
        return self.datatype.as_display_type(self, type, **kwd)

    def display_peek(self):
        return self.datatype.display_peek(self)

    def display_name(self):
        return self.datatype.display_name(self)

    def display_info(self):
        return self.datatype.display_info(self)

    def get_converted_files_by_type(self, file_type):
        valid = []
        for assoc in self.implicitly_converted_datasets:
            if not assoc.deleted and assoc.type == file_type:
                valid.append(assoc.dataset)
        return valid

    def clear_associated_files(self, metadata_safe=False, purge=False):
        raise Exception('Unimplemented')

    def get_child_by_designation(self, designation):
        for child in self.children:
            if child.designation == designation:
                return child
        return None

    def add_validation_error(self, validation_error):
        self.validation_errors.append(validation_error)

    def extend_validation_errors(self, validation_errors):
        self.validation_errors.extend(validation_errors)

    def mark_deleted(self, include_children=True):
        self.deleted = True
        if include_children:
            for child in self.children:
                child.mark_deleted()

    def mark_undeleted(self, include_children=True):
        self.deleted = False
        if include_children:
            for child in self.children:
                child.mark_undeleted()

    def undeletable(self):
        if self.purged:
            return False
        return True

    @property
    def source_library_dataset(self):
        def get_source(dataset):
            if isinstance(dataset, LibraryDatasetDatasetAssociation):
                if dataset.library_dataset:
                    return (dataset, dataset.library_dataset)
            if dataset.copied_from_library_dataset_dataset_association:
                source = get_source(dataset.copied_from_library_dataset_dataset_association)
                if source:
                    return source
            if dataset.copied_from_history_dataset_association:
                source = get_source(dataset.copied_from_history_dataset_association)
                if source:
                    return source
            return (None, None)
        return get_source(self)


class HistoryDatasetAssociation(DatasetInstance):

    def __init__(self, hid=None, history=None, copied_from_history_dataset_association=None,
                 copied_from_library_dataset_dataset_association=None, **kwd):
        DatasetInstance.__init__(self, **kwd)
        self.hid = hid
        # Relationships
        self.history = history
        self.copied_from_history_dataset_association = copied_from_history_dataset_association
        self.copied_from_library_dataset_dataset_association = copied_from_library_dataset_dataset_association

    def copy(self, copy_children=False, parent_id=None, target_history=None):
        hda = HistoryDatasetAssociation(hid=self.hid,
                                        name=self.name,
                                        info=self.info,
                                        blurb=self.blurb,
                                        peek=self.peek,
                                        extension=self.extension,
                                        dbkey=self.dbkey,
                                        dataset=self.dataset,
                                        visible=self.visible,
                                        deleted=self.deleted,
                                        parent_id=parent_id,
                                        copied_from_history_dataset_association=self,
                                        history=target_history)
        context.add(hda)
        context.flush()
        hda.set_size()
        # Need to set after flushed, as MetadataFiles require dataset.id
        hda.metadata = self.metadata
        if copy_children:
            for child in self.children:
                child.copy(copy_children=copy_children, parent_id=hda.id)
        if not self.datatype.copy_safe_peek:
            # In some instances peek relies on dataset_id, i.e. gmaj.zip for viewing MAFs
            hda.set_peek()
        context.flush()
        return hda

    def to_library_dataset_dataset_association(self, target_folder, replace_dataset=None, parent_id=None):
        if replace_dataset:
            # The replace_dataset param (when not None) refers to a LibraryDataset that is being replaced with a new version.
            library_dataset = replace_dataset
        else:
            # If replace_dataset is None, the Library level permissions will be taken from the folder and applied to the new
            # LibraryDataset, and the current user's DefaultUserPermissions will be applied to the associated Dataset.
            library_dataset = LibraryDataset(folder=target_folder, name=self.name, info=self.info)
            context.add(library_dataset)
            context.flush()
        ldda = LibraryDatasetDatasetAssociation(name=self.name,
                                                info=self.info,
                                                blurb=self.blurb,
                                                peek=self.peek,
                                                extension=self.extension,
                                                dbkey=self.dbkey,
                                                dataset=self.dataset,
                                                library_dataset=library_dataset,
                                                visible=self.visible,
                                                deleted=self.deleted,
                                                parent_id=parent_id,
                                                copied_from_history_dataset_association=self,
                                                user=self.history.user)
        context.add(ldda)
        context.flush()
        # Permissions must be the same on the LibraryDatasetDatasetAssociation and the associated LibraryDataset
        # Must set metadata after ldda flushed, as MetadataFiles require ldda.id
        ldda.metadata = self.metadata
        if not replace_dataset:
            target_folder.add_library_dataset(library_dataset, genome_build=ldda.dbkey)
            context.add(target_folder)
            context.flush()
        library_dataset.library_dataset_dataset_association_id = ldda.id
        context.add(library_dataset)
        context.flush()
        for child in self.children:
            child.to_library_dataset_dataset_association(target_folder=target_folder, replace_dataset=replace_dataset, parent_id=ldda.id)
        if not self.datatype.copy_safe_peek:
            # In some instances peek relies on dataset_id, i.e. gmaj.zip for viewing MAFs
            ldda.set_peek()
        context.flush()
        return ldda

    def clear_associated_files(self, metadata_safe=False, purge=False):
        # metadata_safe = True means to only clear when assoc.metadata_safe == False
        for assoc in self.implicitly_converted_datasets:
            if not metadata_safe or not assoc.metadata_safe:
                assoc.clear(purge=purge)


class LibraryDatasetDatasetAssociation(DatasetInstance):

    def __init__(self, copied_from_history_dataset_association=None, copied_from_library_dataset_dataset_association=None,
                 library_dataset=None, user=None, **kwd):
        DatasetInstance.__init__(self, **kwd)
        self.copied_from_history_dataset_association = copied_from_history_dataset_association
        self.copied_from_library_dataset_dataset_association = copied_from_library_dataset_dataset_association
        self.library_dataset = library_dataset
        self.user = user

    def to_history_dataset_association(self, target_history, parent_id=None):
        hid = target_history._next_hid()
        hda = HistoryDatasetAssociation(name=self.name,
                                        info=self.info,
                                        blurb=self.blurb,
                                        peek=self.peek,
                                        extension=self.extension,
                                        dbkey=self.dbkey,
                                        dataset=self.dataset,
                                        visible=self.visible,
                                        deleted=self.deleted,
                                        parent_id=parent_id,
                                        copied_from_library_dataset_dataset_association=self,
                                        history=target_history,
                                        hid=hid)
        context.add(hda)
        context.flush()
        hda.metadata = self.metadata  # need to set after flushed, as MetadataFiles require dataset.id
        for child in self.children:
            child.to_history_dataset_association(target_history=target_history, parent_id=hda.id)
        if not self.datatype.copy_safe_peek:
            hda.set_peek()  # in some instances peek relies on dataset_id, i.e. gmaj.zip for viewing MAFs
        context.flush()
        return hda

    def copy(self, copy_children=False, parent_id=None, target_folder=None):
        ldda = LibraryDatasetDatasetAssociation(name=self.name,
                                                info=self.info,
                                                blurb=self.blurb,
                                                peek=self.peek,
                                                extension=self.extension,
                                                dbkey=self.dbkey,
                                                dataset=self.dataset,
                                                visible=self.visible,
                                                deleted=self.deleted,
                                                parent_id=parent_id,
                                                copied_from_library_dataset_dataset_association=self,
                                                folder=target_folder)
        context.add(ldda)
        context.flush()
        # Need to set after flushed, as MetadataFiles require dataset.id
        ldda.metadata = self.metadata
        if copy_children:
            for child in self.children:
                child.copy(copy_children=copy_children, parent_id=ldda.id)
        if not self.datatype.copy_safe_peek:
            # In some instances peek relies on dataset_id, i.e. gmaj.zip for viewing MAFs
            ldda.set_peek()
        context.flush()
        return ldda

    def clear_associated_files(self, metadata_safe=False, purge=False):
        return

    def get_library_item_info_templates(self, template_list=[], restrict=False):
        # If restrict is True, we'll return only those templates directly associated with this LibraryDatasetDatasetAssociation
        if self.library_dataset_dataset_info_template_associations:
            template_list.extend([lddita.library_item_info_template
                                  for lddita in self.library_dataset_dataset_info_template_associations
                                  if lddita.library_item_info_template not in template_list])
        self.library_dataset.get_library_item_info_templates(template_list, restrict)
        return template_list


class LibraryDataset:
    # This class acts as a proxy to the currently selected LDDA

    def __init__(self, folder=None, order_id=None, name=None, info=None, library_dataset_dataset_association=None, **kwd):
        self.folder = folder
        self.order_id = order_id
        self.name = name
        self.info = info
        self.library_dataset_dataset_association = library_dataset_dataset_association

    def set_library_dataset_dataset_association(self, ldda):
        self.library_dataset_dataset_association = ldda
        ldda.library_dataset = self
        context.add_all((self, ldda))
        context.flush()

    def get_info(self):
        if self.library_dataset_dataset_association:
            return self.library_dataset_dataset_association.info
        elif self._info:
            return self._info
        else:
            return 'no info'

    def set_info(self, info):
        self._info = info

    info = property(get_info, set_info)

    def get_name(self):
        if self.library_dataset_dataset_association:
            return self.library_dataset_dataset_association.name
        elif self._name:
            return self._name
        else:
            return 'Unnamed dataset'

    def set_name(self, name):
        self._name = name

    name = property(get_name, set_name)

    def display_name(self):
        return self.library_dataset_dataset_association.display_name()

    def get_purged(self):
        return self.library_dataset_dataset_association.dataset.purged

    def set_purged(self, purged):
        if purged:
            raise Exception("Not implemented")
        if not purged and self.purged:
            raise Exception("Cannot unpurge once purged")

    purged = property(get_purged, set_purged)

    def get_library_item_info_templates(self, template_list=[], restrict=False):
        # If restrict is True, we'll return only those templates directly associated with this LibraryDataset
        if self.library_dataset_info_template_associations:
            template_list.extend([ldita.library_item_info_template
                                  for ldita in self.library_dataset_info_template_associations
                                  if ldita.library_item_info_template not in template_list])
        if restrict not in ['True', True]:
            self.folder.get_library_item_info_templates(template_list, restrict)
        return template_list


# tables
Dataset.table = Table(
    "dataset", metadata,
    Column("id", Integer, primary_key=True),
    Column("create_time", DateTime, default=now),
    Column("update_time", DateTime, index=True, default=now, onupdate=now),
    Column("state", TrimmedString(64)),
    Column("deleted", Boolean, index=True, default=False),
    Column("purged", Boolean, index=True, default=False),
    Column("purgable", Boolean, default=True),
    Column("external_filename", TEXT),
    Column("_extra_files_path", TEXT),
    Column('file_size', Numeric(15, 0)))

HistoryDatasetAssociation.table = Table(
    "history_dataset_association", metadata,
    Column("id", Integer, primary_key=True),
    Column("dataset_id", Integer, ForeignKey("dataset.id"), index=True),
    Column("create_time", DateTime, default=now),
    Column("update_time", DateTime, default=now, onupdate=now),
    Column("copied_from_history_dataset_association_id", Integer, ForeignKey("history_dataset_association.id"), nullable=True),
    Column("copied_from_library_dataset_dataset_association_id", Integer, ForeignKey("library_dataset_dataset_association.id"), nullable=True),
    Column("hid", Integer),
    Column("name", TrimmedString(255)),
    Column("info", TrimmedString(255)),
    Column("blurb", TrimmedString(255)),
    Column("peek", TEXT),
    Column("extension", TrimmedString(64)),
    Column("metadata", MetadataType(), key="_metadata"),
    Column("parent_id", Integer, ForeignKey("history_dataset_association.id"), nullable=True),
    Column("designation", TrimmedString(255)),
    Column("deleted", Boolean, index=True, default=False),
    Column("visible", Boolean))

LibraryDatasetDatasetAssociation.table = Table(
    "library_dataset_dataset_association", metadata,
    Column("id", Integer, primary_key=True),
    Column("library_dataset_id", Integer, ForeignKey("library_dataset.id"), index=True),
    Column("dataset_id", Integer, ForeignKey("dataset.id"), index=True),
    Column("create_time", DateTime, default=now),
    Column("update_time", DateTime, default=now, onupdate=now),
    Column("copied_from_history_dataset_association_id", Integer,
           ForeignKey("history_dataset_association.id", use_alter=True, name='history_dataset_association_dataset_id_fkey'),
           nullable=True),
    Column("copied_from_library_dataset_dataset_association_id", Integer,
           ForeignKey("library_dataset_dataset_association.id", use_alter=True, name='library_dataset_dataset_association_id_fkey'),
           nullable=True),
    Column("name", TrimmedString(255)),
    Column("info", TrimmedString(255)),
    Column("blurb", TrimmedString(255)),
    Column("peek", TEXT),
    Column("extension", TrimmedString(64)),
    Column("metadata", MetadataType(), key="_metadata"),
    Column("parent_id", Integer, ForeignKey("library_dataset_dataset_association.id"), nullable=True),
    Column("designation", TrimmedString(255)),
    Column("deleted", Boolean, index=True, default=False),
    Column("visible", Boolean),
    Column("message", TrimmedString(255)))

LibraryDataset.table = Table(
    "library_dataset", metadata,
    Column("id", Integer, primary_key=True),
    Column("library_dataset_dataset_association_id", Integer,
           ForeignKey("library_dataset_dataset_association.id", use_alter=True, name="library_dataset_dataset_association_id_fk"),
           nullable=True, index=True),  # current version of dataset, if null, there is not a current version selected
    Column("order_id", Integer),
    Column("create_time", DateTime, default=now),
    Column("update_time", DateTime, default=now, onupdate=now),
    Column("name", TrimmedString(255), key="_name"),  # when not None/null this will supersede display in library (but not when imported into user's history?)
    Column("info", TrimmedString(255), key="_info"),  # when not None/null this will supersede display in library (but not when imported into user's history?)
    Column("deleted", Boolean, index=True, default=False))

# mappers
mapper(Dataset, Dataset.table, properties=dict(
    history_associations=relation(
        HistoryDatasetAssociation,
        primaryjoin=(Dataset.table.c.id == HistoryDatasetAssociation.table.c.dataset_id)),
    active_history_associations=relation(
        HistoryDatasetAssociation,
        primaryjoin=((Dataset.table.c.id == HistoryDatasetAssociation.table.c.dataset_id)
                     & (HistoryDatasetAssociation.table.c.deleted == false()))),
    library_associations=relation(
        LibraryDatasetDatasetAssociation,
        primaryjoin=(Dataset.table.c.id == LibraryDatasetDatasetAssociation.table.c.dataset_id)),
    active_library_associations=relation(
        LibraryDatasetDatasetAssociation,
        primaryjoin=((Dataset.table.c.id == LibraryDatasetDatasetAssociation.table.c.dataset_id)
                     & (LibraryDatasetDatasetAssociation.table.c.deleted == false())))))

mapper(HistoryDatasetAssociation, HistoryDatasetAssociation.table, properties=dict(
    dataset=relation(
        Dataset,
        primaryjoin=(Dataset.table.c.id == HistoryDatasetAssociation.table.c.dataset_id), lazy=False),
    # .history defined in History mapper
    copied_to_history_dataset_associations=relation(
        HistoryDatasetAssociation,
        primaryjoin=(HistoryDatasetAssociation.table.c.copied_from_history_dataset_association_id == HistoryDatasetAssociation.table.c.id),
        backref=backref("copied_from_history_dataset_association",
                        primaryjoin=(HistoryDatasetAssociation.table.c.copied_from_history_dataset_association_id == HistoryDatasetAssociation.table.c.id),
                        remote_side=[HistoryDatasetAssociation.table.c.id],
                        uselist=False)),
    copied_to_library_dataset_dataset_associations=relation(
        LibraryDatasetDatasetAssociation,
        primaryjoin=(HistoryDatasetAssociation.table.c.copied_from_library_dataset_dataset_association_id == LibraryDatasetDatasetAssociation.table.c.id),
        backref=backref("copied_from_history_dataset_association",
                        primaryjoin=(HistoryDatasetAssociation.table.c.copied_from_library_dataset_dataset_association_id == LibraryDatasetDatasetAssociation.table.c.id),
                        remote_side=[LibraryDatasetDatasetAssociation.table.c.id],
                        uselist=False)),
    children=relation(
        HistoryDatasetAssociation,
        primaryjoin=(HistoryDatasetAssociation.table.c.parent_id == HistoryDatasetAssociation.table.c.id),
        backref=backref("parent",
                        primaryjoin=(HistoryDatasetAssociation.table.c.parent_id == HistoryDatasetAssociation.table.c.id),
                        remote_side=[HistoryDatasetAssociation.table.c.id],
                        uselist=False)),
    visible_children=relation(
        HistoryDatasetAssociation,
        primaryjoin=((HistoryDatasetAssociation.table.c.parent_id == HistoryDatasetAssociation.table.c.id)
                     & (HistoryDatasetAssociation.table.c.visible == true())))))

mapper(LibraryDatasetDatasetAssociation, LibraryDatasetDatasetAssociation.table, properties=dict(
    dataset=relation(Dataset),
    library_dataset=relation(
        LibraryDataset,
        primaryjoin=(LibraryDatasetDatasetAssociation.table.c.library_dataset_id == LibraryDataset.table.c.id)),
    copied_to_library_dataset_dataset_associations=relation(
        LibraryDatasetDatasetAssociation,
        primaryjoin=(LibraryDatasetDatasetAssociation.table.c.copied_from_library_dataset_dataset_association_id == LibraryDatasetDatasetAssociation.table.c.id),
        backref=backref("copied_from_library_dataset_dataset_association",
                        primaryjoin=(LibraryDatasetDatasetAssociation.table.c.copied_from_library_dataset_dataset_association_id == LibraryDatasetDatasetAssociation.table.c.id),
                        remote_side=[LibraryDatasetDatasetAssociation.table.c.id])),
    copied_to_history_dataset_associations=relation(
        HistoryDatasetAssociation,
        primaryjoin=(HistoryDatasetAssociation.table.c.copied_from_library_dataset_dataset_association_id == LibraryDatasetDatasetAssociation.table.c.id),
        backref=backref("copied_from_library_dataset_dataset_association",
                        primaryjoin=(HistoryDatasetAssociation.table.c.copied_from_library_dataset_dataset_association_id == LibraryDatasetDatasetAssociation.table.c.id),
                        remote_side=[LibraryDatasetDatasetAssociation.table.c.id],
                        uselist=False)),
    children=relation(
        LibraryDatasetDatasetAssociation,
        primaryjoin=(LibraryDatasetDatasetAssociation.table.c.parent_id == LibraryDatasetDatasetAssociation.table.c.id),
        backref=backref("parent",
                        primaryjoin=(LibraryDatasetDatasetAssociation.table.c.parent_id == LibraryDatasetDatasetAssociation.table.c.id),
                        remote_side=[LibraryDatasetDatasetAssociation.table.c.id])),
    visible_children=relation(
        LibraryDatasetDatasetAssociation,
        primaryjoin=((LibraryDatasetDatasetAssociation.table.c.parent_id == LibraryDatasetDatasetAssociation.table.c.id)
                     & (LibraryDatasetDatasetAssociation.table.c.visible == true())))))

mapper(LibraryDataset, LibraryDataset.table, properties=dict(
    library_dataset_dataset_association=relation(
        LibraryDatasetDatasetAssociation,
        primaryjoin=(LibraryDataset.table.c.library_dataset_dataset_association_id == LibraryDatasetDatasetAssociation.table.c.id)),
    expired_datasets=relation(
        LibraryDatasetDatasetAssociation,
        foreign_keys=[LibraryDataset.table.c.id, LibraryDataset.table.c.library_dataset_dataset_association_id],
        primaryjoin=((LibraryDataset.table.c.id == LibraryDatasetDatasetAssociation.table.c.library_dataset_id)
                     & (not_(LibraryDataset.table.c.library_dataset_dataset_association_id == LibraryDatasetDatasetAssociation.table.c.id))),
        viewonly=True,
        uselist=True)))


def __guess_dataset_by_filename(filename):
    """Return a guessed dataset by filename"""
    try:
        fields = os.path.split(filename)
        if fields:
            if fields[-1].startswith('dataset_') and fields[-1].endswith('.dat'):  # dataset_%d.dat
                return Dataset.get(int(fields[-1][len('dataset_'): -len('.dat')]))
    except Exception:
        pass  # some parsing error, we can't guess Dataset
    return None
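
# Illustrative sketch, not part of the original migration module:
# __guess_dataset_by_filename() inspects only the basename, so for example
#     __guess_dataset_by_filename("/old/share/path/dataset_42.dat")
# looks up the Dataset with id 42 via Dataset.get(), while any filename not
# matching dataset_<id>.dat (or any parsing error) yields None.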

def upgrade(migrate_engine):
    print(__doc__)
    metadata.bind = migrate_engine

    log.debug("Fixing a discrepancy concerning deleted shared history items.")
    affected_items = 0
    start_time = time.time()
    for dataset in context.query(Dataset).filter(and_(Dataset.deleted == true(), Dataset.purged == false())):
        for dataset_instance in dataset.history_associations + dataset.library_associations:
            if not dataset_instance.deleted:
                dataset.deleted = False
                if dataset.file_size in [None, 0]:
                    dataset.set_size()  # Restore filesize
                affected_items += 1
                break
    context.flush()
    log.debug("%i items affected, and restored." % (affected_items))
    log.debug("Time elapsed: %s" % (time.time() - start_time))

    # fix share before hda
    log.debug("Fixing a discrepancy concerning cleaning up deleted history items shared before HDAs.")
    dataset_by_filename = {}
    changed_associations = 0
    start_time = time.time()
    for dataset in context.query(Dataset).filter(Dataset.external_filename.like('%dataset_%.dat')):
        if dataset.file_name in dataset_by_filename:
            guessed_dataset = dataset_by_filename[dataset.file_name]
        else:
            guessed_dataset = __guess_dataset_by_filename(dataset.file_name)
            if guessed_dataset and dataset.file_name != guessed_dataset.file_name:  # not os.path.samefile( dataset.file_name, guessed_dataset.file_name ):
                guessed_dataset = None
            dataset_by_filename[dataset.file_name] = guessed_dataset
        if guessed_dataset is not None and guessed_dataset.id != dataset.id:  # could we have a self referential dataset?
            for dataset_instance in dataset.history_associations + dataset.library_associations:
                dataset_instance.dataset = guessed_dataset
                changed_associations += 1
            # mark original Dataset as deleted and purged, it is no longer in use, but do not delete file_name contents
            dataset.deleted = True
            dataset.external_filename = "Dataset was result of share before HDA, and has been replaced: {} mapped to Dataset {}".format(dataset.external_filename, guessed_dataset.id)
            dataset.purged = True  # we don't really purge the file here, but we mark it as purged, since this dataset is now defunct
    context.flush()
    log.debug("%i items affected, and restored." % (changed_associations))
    log.debug("Time elapsed: %s" % (time.time() - start_time))
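
# Illustrative sketch, not part of the original migration module: the
# sqlalchemy-migrate framework calls upgrade()/downgrade() with the engine of
# the configured Galaxy database; running this fix by hand would look roughly
# like the following (the database URL is a placeholder):
#
#     from sqlalchemy import create_engine
#     upgrade(create_engine("sqlite:///database/universe.sqlite"))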

def downgrade(migrate_engine):
    pass