Warning

This document is for an in-development version of Galaxy. You can alternatively view this page in the latest release if it exists or view the top of the latest release's documentation.

Source code for galaxy.tool_util.ontologies.ontology_data

import logging
from collections import defaultdict
from functools import lru_cache
from typing import (
    cast,
    Dict,
    List,
    NamedTuple,
    Optional,
    Tuple,
)

import yaml

from galaxy.tool_util.biotools import BiotoolsMetadataSource
from galaxy.tool_util.parser import ToolSource
from galaxy.tool_util_models.tool_source import XrefDict
from galaxy.util.resources import resource_string

log = logging.getLogger(__name__)


def _multi_dict_mapping(content: str) -> Dict[str, List[str]]:
    mapping: Dict[str, List[str]] = {}
    for x in content.splitlines():
        if x.startswith("#"):
            continue
        key, value = cast(Tuple[str, str], tuple(x.split("\t")))
        mapping.setdefault(key, []).append(value)
    return mapping


def _read_ontology_data_text(filename: str) -> str:
    return resource_string(__name__, filename)


BIOTOOLS_MAPPING_FILENAME = "biotools_mappings.tsv"
EDAM_OPERATION_MAPPING_FILENAME = "edam_operation_mappings.tsv"
EDAM_TOPIC_MAPPING_FILENAME = "edam_topic_mappings.tsv"
TOOL_TAG_MAPPING_FILENAME = "tool_tag_mappings.yml"


@lru_cache(maxsize=1)
def _biotools_mapping() -> Dict[str, List[str]]:
    mapping: Dict[str, List[str]] = defaultdict(list)
    for line in _read_ontology_data_text(BIOTOOLS_MAPPING_FILENAME).splitlines():
        if not line.startswith("#"):
            tool_id, xref = line.split("\t")
            mapping[tool_id].append(xref)
    return mapping


@lru_cache(maxsize=1)
def _edam_operation_mapping() -> Dict[str, List[str]]:
    return _multi_dict_mapping(_read_ontology_data_text(EDAM_OPERATION_MAPPING_FILENAME))


@lru_cache(maxsize=1)
def _edam_topic_mapping() -> Dict[str, List[str]]:
    return _multi_dict_mapping(_read_ontology_data_text(EDAM_TOPIC_MAPPING_FILENAME))


def _load_tool_tag_mapping(content: str) -> Dict[str, List[str]]:
    raw = cast(
        Dict[str, List[str]],
        (yaml.safe_load(content) or {}).get("tool_tags", {}),
    )
    # `Tool.all_ids` is built from lowercased tool ids (see `Tool.parse` in
    # `lib/galaxy/tools/__init__.py`, around the `self_ids = [self.id.lower()]`
    # block), so the curated mapping must use lowercase keys to be looked up
    # successfully. Normalize at load time so admin-supplied YAML files don't
    # have to worry about case.
    return {tool_id.lower(): tags for tool_id, tags in raw.items()}


_TOOL_TAG_MAPPING_OVERRIDE: Optional[Dict[str, List[str]]] = None


def _tool_tag_mapping() -> Dict[str, List[str]]:
    if _TOOL_TAG_MAPPING_OVERRIDE is not None:
        return _TOOL_TAG_MAPPING_OVERRIDE
    return _bundled_tool_tag_mapping()


@lru_cache(maxsize=1)
def _bundled_tool_tag_mapping() -> Dict[str, List[str]]:
    return _load_tool_tag_mapping(_read_ontology_data_text(TOOL_TAG_MAPPING_FILENAME))


[docs] def configure_tool_tag_mapping(file_path: Optional[str]) -> None: """Replace the in-memory curated tool → tag mapping. Galaxy calls this once at startup with the value of the ``tool_tag_mappings_file`` config option. If ``file_path`` is empty or ``None``, the bundled minimal mapping (see ``tool_tag_mappings.yml``) is retained. Missing or unreadable files are logged and the in-memory mapping is left untouched so a typo in ``galaxy.yml`` doesn't take down tool loading. """ if not file_path: return try: with open(file_path, encoding="utf-8") as fh: new_mapping = _load_tool_tag_mapping(fh.read()) except OSError: # Surface the failure but keep the bundled fallback active. log.warning( "Could not read tool_tag_mappings_file %s; falling back to bundled mapping.", file_path, ) return global _TOOL_TAG_MAPPING_OVERRIDE _TOOL_TAG_MAPPING_OVERRIDE = new_mapping
[docs] class OntologyData(NamedTuple): xrefs: List[XrefDict] edam_operations: Optional[List[str]] edam_topics: Optional[List[str]] tool_tags: List[str]
[docs] def biotools_reference(xrefs): for xref in xrefs: if xref["type"] == "bio.tools": return xref["value"] return None
[docs] def legacy_biotools_external_reference(all_ids: List[str]) -> List[str]: biotools_mapping = _biotools_mapping() for tool_id in all_ids: if tool_id in biotools_mapping: return biotools_mapping[tool_id] return []
[docs] def curated_tool_tags(all_ids: List[str]) -> List[str]: mapping = _tool_tag_mapping() seen = set() tags: List[str] = [] for tool_id in all_ids: for tag in mapping.get(tool_id, []): if tag not in seen: seen.add(tag) tags.append(tag) return tags
[docs] def expand_ontology_data( tool_source: ToolSource, all_ids: List[str], biotools_metadata_source: Optional[BiotoolsMetadataSource] ) -> OntologyData: xrefs = tool_source.parse_xrefs() has_biotools_reference = any(x["type"] == "bio.tools" for x in xrefs) if not has_biotools_reference: for legacy_biotools_ref in legacy_biotools_external_reference(all_ids): if legacy_biotools_ref is not None: xrefs.append({"value": legacy_biotools_ref, "type": "bio.tools"}) edam_operations = tool_source.parse_edam_operations() edam_topics = tool_source.parse_edam_topics() edam_operation_mapping = _edam_operation_mapping() for tool_id in all_ids: if tool_id in edam_operation_mapping: edam_operations = edam_operation_mapping[tool_id] break edam_topic_mapping = _edam_topic_mapping() for tool_id in all_ids: if tool_id in edam_topic_mapping: edam_topics = edam_topic_mapping[tool_id] break has_missing_data = len(edam_operations) == 0 or len(edam_topics) == 0 if has_missing_data: biotools_reference_str = biotools_reference(xrefs) if biotools_reference_str and biotools_metadata_source: biotools_entry = biotools_metadata_source.get_biotools_metadata(biotools_reference_str) if biotools_entry: edam_info = biotools_entry.edam_info if len(edam_operations) == 0: edam_operations = edam_info.edam_operations if len(edam_topics) == 0: edam_topics = edam_info.edam_topics return OntologyData( xrefs, edam_operations, edam_topics, curated_tool_tags(all_ids), )