# Source code for galaxy.datatypes.isa

"""
ISA datatype

See https://github.com/ISA-tools
"""

import json
import logging
import os
import os.path
import re
import shutil
import tempfile
from typing import (
    Optional,
    TYPE_CHECKING,
)

logger = logging.getLogger(__name__)

# Import isatools only after setting the "isatools.isatab" logger level to ERROR, so that pandas warnings do not make uploads fail.
logging.getLogger("isatools.isatab").setLevel(logging.ERROR)
from isatools import (
    isajson,
    isatab_meta,
)
from markupsafe import escape

from galaxy import util
from galaxy.datatypes.data import Data
from galaxy.datatypes.protocols import (
    DatasetHasHidProtocol,
    DatasetProtocol,
    HasExtraFilesAndMetadata,
    HasExtraFilesPath,
)
from galaxy.util.compression_utils import CompressedFile
from galaxy.util.sanitize_html import sanitize_html

if TYPE_CHECKING:
    from isatools.model import Investigation

# Regular expressions used to recognise the main file of an ISA archive from
# its file name; both are matched case-insensitively.
# ISA-JSON (used by IsaJson): any file name ending in ".json".
JSON_FILE_REGEX = re.compile(r"^.*\.json$", flags=re.IGNORECASE)
# ISA-Tab (used by IsaTab): investigation file named "i_<word characters>.txt".
INVESTIGATION_FILE_REGEX = re.compile(r"^i_\w+\.txt$", flags=re.IGNORECASE)

# The name of the ISA archive (compressed file) as saved inside Galaxy
ISA_ARCHIVE_NAME = "archive"

# Maximum number of lines of the main file stored in the history peek
_MAX_LINES_HISTORY_PEEK = 11


class _Isa(Data):
    """Base class for implementing ISA datatypes.

    The datatype declares a single, optional composite file: the ISA archive
    (named ``ISA_ARCHIVE_NAME``). ``groom_dataset_content()`` extracts that
    archive, and the other methods read the "main file" found among the files
    of the dataset's extra files folder.

    Concrete subclasses pass the regular expression identifying the main file
    to ``__init__()`` and implement ``_make_investigation_instance()``.
    """

    # Composite datatype; generate_primary_file() below builds an HTML listing
    # of the composite files.
    composite_type = "auto_primary_file"
    is_binary = True

    def _make_investigation_instance(self, filename: str) -> "Investigation":
        """Parse the main file ``filename`` and return the investigation object.

        Must be overridden by subclasses.

        :raises NotImplementedError: always, in this base class.
        """
        raise NotImplementedError()

    def __init__(self, main_file_regex: re.Pattern, **kwd) -> None:
        """Initialise the datatype.

        :param main_file_regex: compiled pattern matched against the file names
            of the ISA folder in order to identify the main file.
        :param kwd: forwarded unchanged to the parent constructor.
        """
        super().__init__(**kwd)
        self._main_file_regex = main_file_regex

        # Add the archive file as the only composite file
        self.add_composite_file(ISA_ARCHIVE_NAME, is_binary=True, optional=True)

    def _get_isa_folder_path(self, dataset: HasExtraFilesPath) -> str:
        """Return the extra files path of ``dataset``.

        :raises Exception: if the dataset has no (or an empty) extra files path.
        """
        isa_folder = dataset.extra_files_path
        if not isa_folder:
            raise Exception("Unvalid dataset object, or no extra files path found for this dataset.")
        return isa_folder

    def _get_main_file(self, dataset: HasExtraFilesPath) -> str:
        """Get the main file of the ISA archive. Either the investigation file i_*.txt for ISA-Tab, or the JSON file for ISA-JSON.

        :returns: the full path of the main file inside the ISA folder.
        :raises FileNotFoundError: if the ISA folder does not exist on disk.
        :raises Exception: if no main file, or more than one, is found.
        """
        isa_folder = self._get_isa_folder_path(dataset)
        # Explicit check instead of `assert`, which is removed when Python runs with -O.
        if not os.path.exists(isa_folder):
            raise FileNotFoundError(f"ISA folder '{isa_folder}' does not exist.")

        # List the content of the ISA archive folder and pick the main file
        main_file = self._find_main_file_in_archive(os.listdir(isa_folder))
        # Make full path
        return os.path.join(isa_folder, main_file)

    def _get_investigation(self, dataset: HasExtraFilesPath) -> "Investigation":
        """Create a contained instance specific to the exact ISA type (Tab or Json).
        We will use it to parse and access information from the archive."""
        return self._make_investigation_instance(self._get_main_file(dataset))

    def _find_main_file_in_archive(self, files_list: list) -> str:
        """Find the main file inside the ISA archive.

        :param files_list: file names to test against the main file pattern.
        :returns: the text matched by the pattern for the single matching name.
        :raises Exception: if no name matches, or if more than one name matches.
        """
        found_file = None

        for name in files_list:
            match = self._main_file_regex.match(name)
            if match is None:
                continue
            if found_file is not None:
                raise Exception(
                    f"More than one file match the pattern '{self._main_file_regex}' to identify the investigation file"
                )
            # group() without argument always returns the whole match as a string
            found_file = match.group()

        if found_file is None:
            raise Exception("Invalid ISA archive. No main file found.")
        return found_file

    def set_peek(self, dataset: DatasetProtocol, **kwd) -> None:
        """Set the peek and blurb text. Get first lines of the main file and set it as the peek.

        The peek is stored as a JSON object ``{"data": [<lines>]}`` holding at
        most ``_MAX_LINES_HISTORY_PEEK`` lines. When the dataset is purged, or
        the main file is empty, a "file does not exist" peek is set instead.
        """
        data: list = []

        # Only touch the disk when the dataset is not purged: the files of a
        # purged dataset may be gone, and looking for them would raise instead
        # of reporting the purged state.
        if not dataset.dataset.purged:
            main_file = self._get_main_file(dataset)
            with open(main_file, encoding="utf-8") as f:
                for line in f:
                    data.append(line)
                    if len(data) >= _MAX_LINES_HISTORY_PEEK:
                        break

        if data:
            dataset.peek = json.dumps({"data": data})
            dataset.blurb = "data"
        else:
            dataset.peek = "file does not exist"
            dataset.blurb = "file purged from disk"

    def display_peek(self, dataset: DatasetProtocol) -> str:
        """Create the HTML table used for displaying peek, from the peek text found by set_peek() method.

        :returns: an HTML table with one row per non-blank peek line, or an
            error message if the peek cannot be built.
        """
        out = ['<table cellspacing="0" cellpadding="3">']
        try:
            if not dataset.peek:
                dataset.set_peek()
            json_data = json.loads(dataset.peek)
            for line in json_data["data"]:
                line = line.strip()
                if not line:
                    continue
                out.append(f"<tr><td>{escape(util.unicodify(line, 'utf-8'))}</td></tr>")
            out.append("</table>")
            return "".join(out)
        except Exception as exc:
            # The result is HTML, so the exception text is escaped as well.
            return f"Can't create peek: {escape(util.unicodify(exc))}"

    def generate_primary_file(self, dataset: HasExtraFilesAndMetadata) -> str:
        """Generate the primary file. It is an HTML file containing description of the composite dataset
        as well as a list of the composite files that it contains.

        :returns: the HTML text of the primary file.
        """
        if not dataset:
            return "<div>No dataset available</div>"

        rval = ["<html><head><title>ISA Dataset </title></head><p/>"]
        if hasattr(dataset, "extra_files_path"):
            rval.append("<div>ISA Dataset composed of the following files:<p/><ul>")
            # Sorted so that the listing is deterministic (os.listdir order is arbitrary).
            for cmp_file in sorted(os.listdir(dataset.extra_files_path)):
                # File names may originate from a user-supplied archive: escape
                # them in the attribute value as well as in the link text.
                safe_name = escape(cmp_file)
                rval.append(f'<li><a href="{safe_name}" type="text/plain">{safe_name}</a></li>')
            rval.append("</ul></div></html>")
        else:
            # Close every element opened so far so that the document is well formed.
            rval.append("<div>ISA Dataset is empty!<p/></div></html>")
        return "\n".join(rval)

    def dataset_content_needs_grooming(self, file_name: str) -> bool:
        """This function is called on an output dataset file after the content is initially generated.

        :returns: ``True`` only when ``file_name`` is the ISA archive file.
        """
        return os.path.basename(file_name) == ISA_ARCHIVE_NAME

    def groom_dataset_content(self, file_name: str) -> None:
        """This method is called by Galaxy to extract files contained in a composite data type.

        When ``file_name`` is the ISA archive, the folder containing it is
        replaced by the content extracted from the archive. Any other file is
        left untouched.
        """
        # XXX Is the right place to extract files? Should this step not be a cleaning step instead?
        # Could extracting be done earlier and composite files declared as files contained inside the archive
        # instead of the archive itself?

        # Nothing to do unless the file corresponds to the ISA archive
        if os.path.basename(file_name) != ISA_ARCHIVE_NAME:
            return

        output_path = os.path.dirname(file_name)

        # For some ZIP files CompressedFile::extract() extract the file inside <output_folder>/<file_name>
        # instead of outputing it inside <output_folder>. So we first create a temporary folder, extract
        # inside it, and move content to final destination.
        temp_folder = tempfile.mkdtemp()
        try:
            with CompressedFile(file_name) as cf:
                cf.extract(temp_folder)
            # The archive is extracted: replace the folder that contained it
            shutil.rmtree(output_path)
            extracted_files = os.listdir(temp_folder)
            logger.debug(" ".join(extracted_files))
            if not extracted_files:
                # Empty archive: leave an empty folder behind
                os.makedirs(output_path)
            elif len(extracted_files) == 1 and os.path.isdir(os.path.join(temp_folder, extracted_files[0])):
                # Single top-level folder: its content becomes the dataset folder
                shutil.move(os.path.join(temp_folder, extracted_files[0]), output_path)
            else:
                # Several entries: the temporary folder itself becomes the dataset folder
                shutil.move(temp_folder, output_path)
        finally:
            # Remove the temporary folder on every path, including failures.
            # It no longer exists when it has been moved to output_path.
            shutil.rmtree(temp_folder, ignore_errors=True)

    @staticmethod
    def _render_investigation_html(investigation: "Investigation") -> str:
        """Build the HTML preview page describing ``investigation``, its studies and their assays."""

        def esc(value) -> str:
            # Every value read from the archive is escaped before being inserted in the page.
            return str(escape(util.unicodify(str(value), "utf-8")))

        parts = [
            "<html><body>",
            f"<h1>{esc(investigation.title)} {esc(investigation.identifier)}</h1>",
        ]

        # Loop on all studies
        for study in investigation.studies:
            parts.append(f"<h2>Study {esc(study.identifier)}</h2>")
            parts.append(f"<h3>{esc(study.title)}</h3>")
            parts.append(f"<p>{esc(study.description)}</p>")
            parts.append(f"<p>Submitted the {esc(study.submission_date)}</p>")
            parts.append(f"<p>Released on {esc(study.public_release_date)}</p>")

            factors = ", ".join(esc(x.name) for x in study.factors)
            parts.append(f"<p>Experimental factors used: {factors}</p>")

            # Loop on all assays of this study
            for assay in study.assays:
                parts.append(f"<h3>Assay {esc(assay.filename)}</h3>")
                parts.append(f"<p>Measurement type: {esc(assay.measurement_type.term)}</p>")  # OntologyAnnotation
                parts.append(f"<p>Technology type: {esc(assay.technology_type.term)}</p>")  # OntologyAnnotation
                parts.append(f"<p>Technology platform: {esc(assay.technology_platform)}</p>")
                if assay.data_files is not None:
                    parts.append("<p>Data files:</p>")
                    parts.append("<ul>")
                    for data_file in assay.data_files:
                        if data_file.filename != "":
                            parts.append(f"<li>{esc(data_file.filename)} - {esc(data_file.label)}</li>")
                    parts.append("</ul>")

        parts.append("</body></html>")
        return "".join(parts)

    def display_data(
        self,
        trans,
        dataset: DatasetHasHidProtocol,
        preview: bool = False,
        filename: Optional[str] = None,
        to_ext: Optional[str] = None,
        offset: Optional[int] = None,
        ck_size: Optional[int] = None,
        **kwd,
    ):
        """Downloads the ISA dataset if `preview` is `False`;
        if `preview` is `True`, it returns a preview of the ISA dataset as a HTML page.
        The preview is triggered when user clicks on the eye icon of the composite dataset.

        :returns: when previewing, a tuple ``(html_bytes, headers)``; otherwise
            whatever the parent implementation returns.
        """

        headers = kwd.get("headers", {})
        # if it is not required a preview use the default behaviour of `display_data`
        if not preview:
            # NOTE(review): `offset` and `ck_size` are not forwarded to the parent - confirm this is intended.
            return super().display_data(trans, dataset, preview, filename, to_ext, **kwd)

        # prepare the preview of the ISA dataset
        try:
            investigation = self._get_investigation(dataset)
        except Exception:
            logger.exception("Failed to display dataset %s", dataset.id)
            html = """<html><header><title>Error while reading ISA archive.</title></header>
                   <body>
                        <h1>An error occurred while reading content of ISA archive.</h1>
                        <p>If you have tried to load your archive with the uploader by selecting isa-tab as composite data type, then try to load it again with isa-json instead. Conversely, if you have tried to load your archive with the uploader by selecting isa-json as composite data type, then try isa-tab instead.</p>
                        <p>You may also try to look into your zip file in order to find out if this is a proper ISA archive. If you see a file i_Investigation.txt inside, then it is an ISA-Tab archive. If you see a file with extension .json inside, then it is an ISA-JSON archive. If you see nothing like that, then either your ISA archive is corrupted, or it is not an ISA archive.</p>
                   </body></html>"""
        else:
            html = self._render_investigation_html(investigation)

        # Set mime type
        mime = "text/html"
        self._clean_and_set_mime_type(trans, mime, headers)

        return sanitize_html(html).encode("utf-8"), headers



class IsaTab(_Isa):
    """ISA-Tab datatype: the main file is the investigation file matched by ``INVESTIGATION_FILE_REGEX``."""

    file_ext = "isa-tab"

    def __init__(self, **kwd) -> None:
        """Initialise the datatype with the ISA-Tab investigation file pattern.

        :param kwd: forwarded unchanged to the parent constructor.
        """
        super().__init__(main_file_regex=INVESTIGATION_FILE_REGEX, **kwd)

    def _make_investigation_instance(self, filename: str) -> "Investigation":
        """Parse the ISA-Tab investigation file ``filename`` and return the investigation.

        The study and assay table files referenced by the investigation are
        looked up in the folder that contains ``filename``.
        """
        # Parse ISA-Tab investigation file
        parser = isatab_meta.InvestigationParser()
        isa_dir = os.path.dirname(filename)
        with open(filename, newline="", encoding="utf8") as fp:
            parser.parse(fp)

        # Parse the table file of every study, then of every assay of that study.
        # NOTE(review): the table parsers are given parser.isa and presumably populate it in place - confirm.
        for study in parser.isa.studies:
            s_parser = isatab_meta.LazyStudySampleTableParser(parser.isa)
            s_parser.parse(os.path.join(isa_dir, study.filename))
            for assay in study.assays:
                a_parser = isatab_meta.LazyAssayTableParser(parser.isa)
                a_parser.parse(os.path.join(isa_dir, assay.filename))

        return parser.isa




class IsaJson(_Isa):
    """ISA-JSON datatype: the main file is the JSON file matched by ``JSON_FILE_REGEX``."""

    file_ext = "isa-json"

    def __init__(self, **kwd) -> None:
        """Initialise the datatype with the ISA-JSON main file pattern.

        :param kwd: forwarded unchanged to the parent constructor.
        """
        super().__init__(main_file_regex=JSON_FILE_REGEX, **kwd)

    def _make_investigation_instance(self, filename: str) -> "Investigation":
        """Load the ISA-JSON file ``filename`` with ``isajson.load`` and return the result."""
        # Parse JSON file
        with open(filename, newline="", encoding="utf8") as fp:
            return isajson.load(fp)