"""
XML format classes
"""
import logging
import re
from typing import (
List,
TYPE_CHECKING,
)
from galaxy import util
from galaxy.datatypes.dataproviders.dataset import DatasetDataProvider
from galaxy.datatypes.dataproviders.hierarchy import XMLDataProvider
from galaxy.datatypes.metadata import MetadataElement
from galaxy.datatypes.sniff import (
build_sniff_from_prefix,
disable_parent_class_sniffing,
FilePrefix,
)
from . import (
data,
dataproviders,
)
if TYPE_CHECKING:
from galaxy.model import DatasetInstance
log = logging.getLogger(__name__)
OWL_MARKER = re.compile(r"\<owl:")
SBML_MARKER = re.compile(r"\<sbml")
[docs]@dataproviders.decorators.has_dataproviders
@build_sniff_from_prefix
class GenericXml(data.Text):
"""Base format class for any XML file."""
edam_format = "format_2332"
file_ext = "xml"
[docs] def set_peek(self, dataset: "DatasetInstance", **kwd) -> None:
"""Set the peek and blurb text"""
if not dataset.dataset.purged:
dataset.peek = data.get_file_peek(dataset.file_name)
dataset.blurb = "XML data"
else:
dataset.peek = "file does not exist"
dataset.blurb = "file purged from disk"
def _has_root_element_in_prefix(self, file_prefix: FilePrefix, root: str) -> bool:
for line in file_prefix.line_iterator():
if not line.startswith("<?"):
break
# pattern match <root or <ns:root for any ns string
pattern = r"^<(\w*:)?%s" % root
return re.match(pattern, line) is not None
[docs] def sniff_prefix(self, file_prefix: FilePrefix) -> bool:
"""
Determines whether the file is XML or not
>>> from galaxy.datatypes.sniff import get_test_fname
>>> fname = get_test_fname( 'megablast_xml_parser_test1.blastxml' )
>>> GenericXml().sniff( fname )
True
>>> fname = get_test_fname( 'interval.interval' )
>>> GenericXml().sniff( fname )
False
"""
return file_prefix.startswith("<?xml ")
[docs] @staticmethod
def merge(split_files: List[str], output_file: str) -> None:
"""Merging multiple XML files is non-trivial and must be done in subclasses."""
if len(split_files) > 1:
raise NotImplementedError(
"Merging multiple XML files is non-trivial and must be implemented for each XML type"
)
# For one file only, use base class method (move/copy)
data.Text.merge(split_files, output_file)
[docs] @dataproviders.decorators.dataprovider_factory("xml", XMLDataProvider.settings)
def xml_dataprovider(self, dataset: "DatasetInstance", **settings) -> XMLDataProvider:
dataset_source = DatasetDataProvider(dataset)
return XMLDataProvider(dataset_source, **settings)
[docs]@disable_parent_class_sniffing
class MEMEXml(GenericXml):
"""MEME XML Output data"""
file_ext = "memexml"
[docs] def set_peek(self, dataset: "DatasetInstance", **kwd) -> None:
"""Set the peek and blurb text"""
if not dataset.dataset.purged:
dataset.peek = data.get_file_peek(dataset.file_name)
dataset.blurb = "MEME XML data"
else:
dataset.peek = "file does not exist"
dataset.blurb = "file purged from disk"
[docs]@disable_parent_class_sniffing
class CisML(GenericXml):
"""CisML XML data""" # see: http://www.ncbi.nlm.nih.gov/pubmed/15001475
file_ext = "cisml"
[docs] def set_peek(self, dataset: "DatasetInstance", **kwd) -> None:
"""Set the peek and blurb text"""
if not dataset.dataset.purged:
dataset.peek = data.get_file_peek(dataset.file_name)
dataset.blurb = "CisML data"
else:
dataset.peek = "file does not exist"
dataset.blurb = "file purged from disk"
[docs]class Dzi(GenericXml):
"""
Deep zoom image format, see
https://github.com/openseadragon/openseadragon/wiki/The-DZI-File-Format
"""
# General elements.
MetadataElement(
name="base_name", desc="Base name for this dataset", default="DeepZoomImage", readonly=True, set_in_upload=True
)
MetadataElement(name="format", desc="File format of the tiles", default=None, readonly=True, visible=True)
MetadataElement(name="tile_size", desc="Size of tiles", default=None, readonly=True, visible=True)
# Collection elements.
MetadataElement(
name="max_level", desc="Max pyramid level", default=None, readonly=True, optional=True, visible=True
)
MetadataElement(name="quality", desc="Quality", default=None, readonly=True, optional=True, visible=True)
# Image elements.
MetadataElement(name="height", desc="Size height", default=None, readonly=True, optional=True, visible=True)
MetadataElement(
name="overlap",
desc="Overlap of all four sides of tiles",
default=None,
readonly=True,
optional=True,
visible=True,
)
MetadataElement(name="width", desc="Size width", default=None, readonly=True, optional=True, visible=True)
file_ext = "dzi"
[docs] def __init__(self, **kwd):
super().__init__(**kwd)
[docs] def set_peek(self, dataset: "DatasetInstance", **kwd) -> None:
if not dataset.dataset.purged:
dataset.peek = data.get_file_peek(dataset.file_name)
dataset.blurb = "Deep Zoom Image"
else:
dataset.peek = "file does not exist"
dataset.blurb = "file purged from disc"
[docs] def sniff_prefix(self, file_prefix: FilePrefix) -> bool:
"""
Checking for keyword - 'Collection' or 'Image' in the first 200 lines.
>>> from galaxy.datatypes.sniff import get_test_fname
>>> fname = get_test_fname('1.dzi')
>>> Dzi().sniff(fname)
True
>>> fname = get_test_fname('megablast_xml_parser_test1.blastxml')
>>> Dzi().sniff(fname)
False
"""
for line in file_prefix.line_iterator():
line = line.lower()
if line.find("<collection") >= 0 or line.find("<image") >= 0:
return True
return False
[docs]class Phyloxml(GenericXml):
"""Format for defining phyloxml data http://www.phyloxml.org/"""
edam_data = "data_0872"
edam_format = "format_3159"
file_ext = "phyloxml"
[docs] def set_peek(self, dataset: "DatasetInstance", **kwd) -> None:
"""Set the peek and blurb text"""
if not dataset.dataset.purged:
dataset.peek = data.get_file_peek(dataset.file_name)
dataset.blurb = "Phyloxml data"
else:
dataset.peek = "file does not exist"
dataset.blurb = "file purged from disk"
[docs] def sniff_prefix(self, file_prefix: FilePrefix) -> bool:
""" "Checking for keyword - 'phyloxml' always in lowercase in the first few lines.
>>> from galaxy.datatypes.sniff import get_test_fname
>>> fname = get_test_fname( '1.phyloxml' )
>>> Phyloxml().sniff( fname )
True
>>> fname = get_test_fname( 'interval.interval' )
>>> Phyloxml().sniff( fname )
False
>>> fname = get_test_fname( 'megablast_xml_parser_test1.blastxml' )
>>> Phyloxml().sniff( fname )
False
"""
return self._has_root_element_in_prefix(file_prefix, "phyloxml")
[docs]class Owl(GenericXml):
"""
Web Ontology Language OWL format description
http://www.w3.org/TR/owl-ref/
"""
edam_format = "format_3262"
file_ext = "owl"
[docs] def set_peek(self, dataset: "DatasetInstance", **kwd) -> None:
if not dataset.dataset.purged:
dataset.peek = data.get_file_peek(dataset.file_name)
dataset.blurb = "Web Ontology Language OWL"
else:
dataset.peek = "file does not exist"
dataset.blurb = "file purged from disc"
[docs] def sniff_prefix(self, file_prefix: FilePrefix) -> bool:
"""
Checking for keyword - '<owl' in the first 200 lines.
"""
return file_prefix.search(OWL_MARKER)
[docs]class Sbml(GenericXml):
"""
System Biology Markup Language
http://sbml.org
"""
file_ext = "sbml"
edam_data = "data_2024"
edam_format = "format_2585"
[docs] def set_peek(self, dataset: "DatasetInstance", **kwd) -> None:
if not dataset.dataset.purged:
dataset.peek = data.get_file_peek(dataset.file_name)
dataset.blurb = "System Biology Markup Language SBML"
else:
dataset.peek = "file does not exist"
dataset.blurb = "file purged from disc"
[docs] def sniff_prefix(self, file_prefix: FilePrefix) -> bool:
"""
Checking for keyword - '<sbml' in the first 200 lines.
"""
return file_prefix.search(SBML_MARKER)