Warning
This document is for an in-development version of Galaxy. You can alternatively view this page in the latest release if it exists or view the top of the latest release's documentation.
Source code for galaxy.tool_shed.tools.data_table_manager
import logging
import os
import shutil
from typing import (
TYPE_CHECKING,
Union,
)
from galaxy.tool_shed.galaxy_install.client import InstallationTarget
from galaxy.tool_shed.util import hg_util
from galaxy.tool_util.data import (
DataTableColumnMismatch,
TabularToolDataTable,
)
from galaxy.util import (
Element,
SubElement,
)
from galaxy.util.tool_shed import xml_util
if TYPE_CHECKING:
from galaxy.model.tool_shed_install import ToolShedRepository
from galaxy.structured_app import BasicSharedApp
from galaxy.util.path import StrPath
log = logging.getLogger(__name__)
RequiredAppT = Union["BasicSharedApp", InstallationTarget]
def _parse_table_columns(table_elem: Element) -> dict[str, int]:
    """Return the column-name -> column-index mapping declared by a ``<table>`` element.

    Delegates to ``TabularToolDataTable.parse_column_spec_element`` and keeps only
    the first item of the triple it returns, so the mapping matches what an
    already-registered table exposes on ``.columns`` (no implicit ``name`` alias).
    """
    column_index_by_name, _second, _third = TabularToolDataTable.parse_column_spec_element(table_elem)
    return column_index_by_name
class BaseShedToolDataTableManager:
    """Tool data table operations that only need ``app.tool_data_tables`` and ``app.config``."""

    def __init__(self, app: RequiredAppT):
        # The application object is the only state this manager carries.
        self.app = app

    def handle_sample_tool_data_table_conf_file(self, filename: "StrPath", persist: bool = False):
        """
        Load the table definitions found in ``filename`` into the in-memory
        ``self.app.tool_data_tables`` registry.

        If ``persist`` is True (should only occur if the call is from the Galaxy
        side, not the tool shed), the new entries will also be appended to
        Galaxy's shed_tool_data_table_conf.xml file on disk.

        Returns an ``(error, message)`` tuple. ``error`` is True when the registry
        reported a non-empty message or when any exception was raised; ``message``
        is that message, or the text of the exception.
        """
        try:
            _added_elems, message = self.app.tool_data_tables.add_new_entries_from_config_file(
                config_filename=filename,
                tool_data_path=self.app.config.shed_tool_data_path,
                shed_tool_data_table_config=self.app.config.shed_tool_data_table_config,
                persist=persist,
            )
        except Exception as exc:
            # Failures are reported to the caller rather than propagated.
            return True, str(exc)
        return bool(message), message

    def reset_tool_data_tables(self):
        """Replace the registry's ``data_tables`` mapping with a new, empty dictionary."""
        self.app.tool_data_tables.data_tables = dict()
[docs]
class ShedToolDataTableManager(BaseShedToolDataTableManager):
app: InstallationTarget
[docs]
def generate_repository_info_elem(
    self, tool_shed: str, repository_name: str, changeset_revision: str, owner: str, parent_elem=None, **kwd
) -> Element:
    """
    Build and return a ``<tool_shed_repository>`` element for one repository revision.

    The element is created standalone when ``parent_elem`` is None; otherwise it
    is attached to ``parent_elem`` as a new child. Its children are, in order,
    ``tool_shed``, ``repository_name``, ``repository_owner`` and
    ``installed_changeset_revision``, followed by one child per extra keyword
    argument (tag = keyword name, text = keyword value).
    """
    if parent_elem is not None:
        repo_elem = SubElement(parent_elem, "tool_shed_repository")
    else:
        repo_elem = Element("tool_shed_repository")
    children = [
        ("tool_shed", tool_shed),
        ("repository_name", repository_name),
        ("repository_owner", owner),
        ("installed_changeset_revision", changeset_revision),
    ]
    # add additional values
    # TODO: enhance additional values to allow e.g. use of dict values that will recurse
    children.extend(kwd.items())
    for tag, text in children:
        SubElement(repo_elem, tag).text = text
    return repo_elem
[docs]
def generate_repository_info_elem_from_repository(self, tool_shed_repository, parent_elem=None, **kwd):
    """
    Build the repository info element for ``tool_shed_repository``.

    Reads ``tool_shed``, ``name``, ``installed_changeset_revision`` and ``owner``
    from the repository object and forwards them, together with ``parent_elem``
    and any extra keyword arguments, to ``generate_repository_info_elem``.
    """
    identity = (
        tool_shed_repository.tool_shed,
        tool_shed_repository.name,
        tool_shed_repository.installed_changeset_revision,
        tool_shed_repository.owner,
    )
    return self.generate_repository_info_elem(*identity, parent_elem=parent_elem, **kwd)
[docs]
def get_tool_index_sample_files(self, sample_files: list[str]) -> list[str]:
    """
    Try to return the list of all appropriate tool data sample files included
    in the repository.

    Galaxy does not follow a standard naming convention for these files, so the
    selection is a best-effort match on the ``.loc.sample``, ``.xml.sample`` and
    ``.txt.sample`` suffixes. Input order is preserved and every match is
    coerced to ``str``.
    """
    # str.endswith accepts a tuple, so one call covers all recognised suffixes.
    index_sample_suffixes = (".loc.sample", ".xml.sample", ".txt.sample")
    return [str(s) for s in sample_files if s.endswith(index_sample_suffixes)]
[docs]
def handle_missing_data_table_entry(self, relative_install_dir, tool_path, repository_tools_tups):
    """
    Check whether any tool has input parameters that are dynamically generated
    select lists requiring entries in the tool_data_table_conf.xml file, and if
    so load the repository's ``tool_data_table_conf.xml.sample``.

    This method is called only from Galaxy (not the tool shed) when a repository
    is being installed or reinstalled. ``repository_tools_tups`` is returned
    unchanged in every case; ``tool_path`` is accepted but not used here.
    """
    # Each tuple is (path, guid, tool); only the tool object is inspected.
    needs_table_entries = any(
        tool.params_with_missing_data_table_entry for _tup_path, _guid, tool in repository_tools_tups
    )
    if not needs_table_entries:
        return repository_tools_tups
    # The repository must contain a tool_data_table_conf.xml.sample file that includes
    # all required entries for all tools in the repository.
    sample_conf = hg_util.get_config_from_disk("tool_data_table_conf.xml.sample", relative_install_dir)
    if not sample_conf:
        return repository_tools_tups
    # Add entries to the ToolDataTableManager's in-memory data_tables dictionary.
    failed, message = self.handle_sample_tool_data_table_conf_file(sample_conf, persist=True)
    if failed:
        # TODO: Do more here than logging an exception.
        log.debug(message)
    # Reset the in-memory tool data tables once the sample file has been handled.
    self.reset_tool_data_tables()
    return repository_tools_tups
[docs]
def get_target_install_dir(self, tool_shed_repository: "ToolShedRepository"):
    """
    Return ``(target_dir, tool_path, relative_target_dir)`` for a repository.

    ``tool_path`` and ``relative_target_dir`` come from
    ``tool_shed_repository.get_tool_relative_path(self.app)``. ``target_dir`` is
    ``relative_target_dir`` joined onto ``self.app.config.shed_tool_data_path``
    and is created (including parents) when it does not exist yet.
    """
    tool_path, relative_target_dir = tool_shed_repository.get_tool_relative_path(self.app)
    # This is where index files will reside on a per repo/installed version basis.
    target_dir = os.path.join(self.app.config.shed_tool_data_path, relative_target_dir)
    # exist_ok avoids the check-then-create race when two installs target the same directory.
    os.makedirs(target_dir, exist_ok=True)
    return target_dir, tool_path, relative_target_dir
def _merge_loc_sample_entries(
    self,
    existing_table: TabularToolDataTable,
    elem: Element,
    loc_basename_to_source: dict,
    tool_shed_repository: "ToolShedRepository",
) -> None:
    """Copy rows from this install's .loc.sample files into the already-registered table.

    For every ``<file path=...>`` child of ``elem``, the path's basename is looked up
    in ``loc_basename_to_source``; when the mapped sample file exists, its rows are
    parsed by ``existing_table`` and appended with an attribution naming the
    installing repository. Most .loc.sample files hold only comments, in which
    case nothing is appended.

    Nothing is done for non-tabular table types (e.g. ``RefgenieToolDataTable``),
    whose ``parse_file_fields`` is not designed to read a ``.loc.sample``.
    """
    table_kind = getattr(existing_table, "type_key", "tabular")
    if table_kind != "tabular":
        return
    repo = tool_shed_repository
    attribution = f"Added by {repo.owner}/{repo.name}@{repo.installed_changeset_revision}"
    declared_paths = (file_elem.get("path") for file_elem in elem.findall("file"))
    # filter(None, ...) skips <file> elements with a missing or empty path.
    for shared_path in filter(None, declared_paths):
        sample_path = loc_basename_to_source.get(os.path.basename(shared_path))
        if sample_path and os.path.exists(sample_path):
            # Keep `${__HERE__}` literal so the appended rows match the shared loc's existing format.
            rows = existing_table.parse_file_fields(sample_path, here="${__HERE__}")
            if rows:
                existing_table.append_entries_with_attribution(rows, attribution)
def _shed_config_has_matching_entry(self, table_name: str, elem: Element) -> bool:
    """Tell whether ``shed_tool_data_table_config`` already lists an equivalent ``<table>``.

    A ``<table>`` in the config is equivalent when its ``name`` equals ``table_name``
    and its set of non-empty ``<file path>`` values equals that of ``elem``.
    Returns False when the config is unset, missing, unreadable or yields no tree.

    Used to avoid writing duplicate ``<table>`` entries for tables already registered
    by a prior install. The shed config remains the persistent source of the shared
    loc file's association with the data table name.
    """

    def file_paths(table_elem):
        # Non-empty ``path`` attributes of the <file> children, as a set.
        return {path for path in (fe.get("path") for fe in table_elem.findall("file")) if path}

    shed_config = self.app.config.shed_tool_data_table_config
    if not (shed_config and os.path.exists(shed_config)):
        return False
    try:
        tree, _ = xml_util.parse_xml(shed_config)
    except OSError as e:
        log.warning("Could not read shed_tool_data_table_config '%s' for dedup check: %s", shed_config, e)
        return False
    if tree is None:
        return False
    wanted_paths = file_paths(elem)
    return any(
        candidate.get("name") == table_name and file_paths(candidate) == wanted_paths
        for candidate in tree.getroot().findall("table")
    )
[docs]
def install_tool_data_tables(self, tool_shed_repository: "ToolShedRepository", tool_index_sample_files):
# Copy the repository's tool data sample files into place, then collect the child elements
# of the installed tool_data_table_conf.xml that should be persisted for this install.
# Returns a (tool_data_table_conf_filename, kept_elems) tuple.
# NOTE(review): this listing has lost its indentation; the comments below describe individual
# statements only and do not assert which branch each statement is nested in.
# File names and suffixes recognised below.
TOOL_DATA_TABLE_FILE_NAME = "tool_data_table_conf.xml"
TOOL_DATA_TABLE_FILE_SAMPLE_NAME = f"{TOOL_DATA_TABLE_FILE_NAME}.sample"
SAMPLE_SUFFIX = ".sample"
SAMPLE_SUFFIX_OFFSET = -len(SAMPLE_SUFFIX)
LOC_SAMPLE_SUFFIX = ".loc.sample"
target_dir, tool_path, relative_target_dir = self.get_target_install_dir(tool_shed_repository)
# Galaxy-managed loc files for shed-installed tools live under tool_data_path/shed/ so they
# are clearly separated from admin-configured loc files in tool_data_path and from any
# entries shipped via tool_data_table_conf.xml.sample.
shared_loc_dir = os.path.join(self.app.config.tool_data_path, "shed")
os.makedirs(shared_loc_dir, exist_ok=True)
# Map shared loc basename -> source .loc.sample, used to merge entries when a table is reinstalled.
loc_basename_to_source: dict[str, str] = {}
for sample_file in tool_index_sample_files:
path, filename = os.path.split(sample_file)
# The installed name is the sample name with a trailing ".sample" removed.
target_filename = filename
if target_filename.endswith(SAMPLE_SUFFIX):
target_filename = target_filename[:SAMPLE_SUFFIX_OFFSET]
source_file = os.path.join(tool_path, sample_file)
# ".loc.sample" files are destined for the shared loc dir; other sample files for target_dir.
if filename.endswith(LOC_SAMPLE_SUFFIX):
target_path_filename = os.path.join(shared_loc_dir, target_filename)
install_dest_dir = shared_loc_dir
loc_basename_to_source[target_filename] = source_file
else:
target_path_filename = os.path.join(target_dir, target_filename)
install_dest_dir = target_dir
# We're not currently uninstalling index files, do not overwrite existing files.
# The condition also lets tool_data_table_conf.xml itself be copied when it already exists.
if not os.path.exists(target_path_filename) or target_filename == TOOL_DATA_TABLE_FILE_NAME:
shutil.copy2(source_file, target_path_filename)
else:
log.debug(
"Did not copy sample file '%s' to install directory '%s' because file already exists.",
filename,
install_dest_dir,
)
# For provenance and to simplify introspection, let's keep the original data table sample file around.
if filename == TOOL_DATA_TABLE_FILE_SAMPLE_NAME:
shutil.copy2(source_file, os.path.join(target_dir, filename))
tool_data_table_conf_filename = os.path.join(target_dir, TOOL_DATA_TABLE_FILE_NAME)
# Child elements of the installed conf file's <tables> root; stays empty if none are read.
elems: list = []
if os.path.exists(tool_data_table_conf_filename):
tree, error_message = xml_util.parse_xml(tool_data_table_conf_filename)
if tree:
root = tree.getroot()
if root.tag == "tables":
elems = list(iter(root))
else:
log.warning(
"The '%s' data table file has '%s' instead of <tables> as root element, skipping.",
tool_data_table_conf_filename,
root.tag,
)
else:
log.warning(
"The '%s' data table file was not found, but was expected to be copied from '%s' during repository installation.",
tool_data_table_conf_filename,
TOOL_DATA_TABLE_FILE_SAMPLE_NAME,
)
registered_tables = self.app.tool_data_tables.data_tables
# Elements that will be written back to the conf file and returned to the caller.
kept_elems: list = []
for elem in elems:
# Elements other than <table> are kept without further processing.
if elem.tag != "table":
kept_elems.append(elem)
continue
# Rewrite each <file path> to the same basename inside shared_loc_dir.
for file_elem in elem.findall("file"):
path = file_elem.get("path", None)
if path:
new_path = os.path.normpath(os.path.join(shared_loc_dir, os.path.split(path)[1]))
file_elem.set("path", new_path)
table_name = elem.get("name") or ""
incoming_columns = _parse_table_columns(elem)
# NOTE(review): presumably raises (e.g. DataTableColumnMismatch) when the incoming columns
# disagree with an already-registered table of the same name; confirm in the callee.
self.app.tool_data_tables.assert_data_table_consistency(table_name, incoming_columns)
existing = registered_tables.get(table_name) if table_name else None
if isinstance(existing, TabularToolDataTable) and existing.columns is not None:
# Already registered with matching columns. Merge any rows from this install's
# .loc.sample(s) into the shared loc file.
self._merge_loc_sample_entries(existing, elem, loc_basename_to_source, tool_shed_repository)
if self._shed_config_has_matching_entry(table_name, elem):
# An identical <table> entry already exists in shed_tool_data_table_config.
# Don't write another one (it would just duplicate the shared loc reference).
continue
kept_elems.append(elem)
if kept_elems:
# Remove old data_table
if os.path.exists(tool_data_table_conf_filename):
os.unlink(tool_data_table_conf_filename)
# Persist new data_table content.
self.app.tool_data_tables.to_xml_file(tool_data_table_conf_filename, kept_elems)
return tool_data_table_conf_filename, kept_elems
# For backwards compatibility with existing data managers
ToolDataTableManager = ShedToolDataTableManager
__all__ = (
"DataTableColumnMismatch",
"ToolDataTableManager",
"ShedToolDataTableManager",
)