Source code for galaxy.datatypes.proteomics
"""
Proteomics Datatypes
"""
import logging
import re
from typing import (
IO,
List,
Optional,
)
from galaxy.datatypes import data
from galaxy.datatypes.binary import Binary
from galaxy.datatypes.data import Text
from galaxy.datatypes.protocols import (
DatasetProtocol,
HasExtraFilesAndMetadata,
)
from galaxy.datatypes.sequence import Sequence
from galaxy.datatypes.sniff import (
build_sniff_from_prefix,
FilePrefix,
)
from galaxy.datatypes.tabular import (
Tabular,
TabularData,
)
from galaxy.datatypes.xml import GenericXml
from galaxy.util import nice_size
log = logging.getLogger(__name__)
class Wiff(Binary):
"""Class for wiff files."""
edam_data = "data_2536"
edam_format = "format_3710"
file_ext = "wiff"
composite_type = "auto_primary_file"
def __init__(self, **kwd):
super().__init__(**kwd)
self.add_composite_file(
"wiff",
description="AB SCIEX files in .wiff format. This can contain all needed information or only metadata.",
is_binary=True,
)
self.add_composite_file(
"wiff_scan",
description="AB SCIEX spectra file (wiff.scan), if the corresponding .wiff file only contains metadata.",
optional="True",
is_binary=True,
)
def generate_primary_file(self, dataset: HasExtraFilesAndMetadata) -> str:
rval = ["<html><head><title>Wiff Composite Dataset </title></head><p/>"]
rval.append("<div>This composite dataset is composed of the following files:<p/><ul>")
for composite_name, composite_file in self.get_composite_files(dataset=dataset).items():
fn = composite_name
opt_text = ""
if composite_file.optional:
opt_text = " (optional)"
if composite_file.get("description"):
rval.append(
f"<li><a href=\"{fn}\" type=\"text/plain\">{fn} ({composite_file.get('description')})</a>{opt_text}</li>"
)
else:
rval.append(f'<li><a href="{fn}" type="text/plain">{fn}</a>{opt_text}</li>')
rval.append("</ul></div></html>")
return "\n".join(rval)
class Wiff2(Binary):
"""Class for wiff2 files."""
edam_data = "data_2536"
edam_format = "format_3710"
file_ext = "wiff2"
composite_type = "auto_primary_file"
def __init__(self, **kwd):
super().__init__(**kwd)
self.add_composite_file(
"wiff2",
description="AB SCIEX files in .wiff2 format. This can contain all needed information or only metadata.",
is_binary=True,
)
self.add_composite_file(
"wiff_scan",
description="AB SCIEX spectra file (wiff.scan), if the corresponding .wiff2 file only contains metadata.",
optional="True",
is_binary=True,
)
def generate_primary_file(self, dataset: HasExtraFilesAndMetadata) -> str:
rval = ["<html><head><title>Wiff2 Composite Dataset </title></head><p/>"]
rval.append("<div>This composite dataset is composed of the following files:<p/><ul>")
for composite_name, composite_file in self.get_composite_files(dataset=dataset).items():
fn = composite_name
opt_text = ""
if composite_file.optional:
opt_text = " (optional)"
if composite_file.get("description"):
rval.append(
f"<li><a href=\"{fn}\" type=\"text/plain\">{fn} ({composite_file.get('description')})</a>{opt_text}</li>"
)
else:
rval.append(f'<li><a href="{fn}" type="text/plain">{fn}</a>{opt_text}</li>')
rval.append("</ul></div></html>")
return "\n".join(rval)
@build_sniff_from_prefix
class MzTab(Text):
"""
exchange format for proteomics and metabolomics results
>>> from galaxy.datatypes.sniff import get_test_fname
>>> fname = get_test_fname('test.mztab')
>>> MzTab().sniff(fname)
True
>>> fname = get_test_fname('test.mztab2')
>>> MzTab().sniff(fname)
False
"""
edam_data = "data_3681"
file_ext = "mztab"
# section names (except MTD)
_sections = ["PRH", "PRT", "PEH", "PEP", "PSH", "PSM", "SMH", "SML", "COM"]
# mandatory metadata fields and list of allowed entries (in lower case)
# (or None if everything is allowed)
_man_mtd = {
"mzTab-mode": ["complete", "summary"],
"mzTab-type": ["quantification", "identification"],
"description": None,
}
_version_re = r"(1)(\.[0-9])?(\.[0-9])?"
def set_peek(self, dataset: DatasetProtocol, **kwd) -> None:
"""Set the peek and blurb text"""
if not dataset.dataset.purged:
dataset.peek = data.get_file_peek(dataset.get_file_name())
dataset.blurb = "mzTab Format"
else:
dataset.peek = "file does not exist"
dataset.blurb = "file purged from disk"
def sniff_prefix(self, file_prefix: FilePrefix) -> bool:
"""Determines whether the file is the correct type."""
has_version = False
found_man_mtd = set()
contents = file_prefix.string_io()
for line in contents:
if re.match(r"^\s*$", line):
continue
columns = line.strip("\r\n").split("\t")
if columns[0] == "MTD":
if columns[1] == "mzTab-version" and re.match(self._version_re, columns[2]) is not None:
has_version = True
elif columns[1] in self._man_mtd:
mandatory_field = self._man_mtd[columns[1]]
if mandatory_field is None or columns[2].lower() in mandatory_field:
found_man_mtd.add(columns[1])
elif columns[0] not in self._sections:
return False
return has_version and found_man_mtd == set(self._man_mtd.keys())
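# A minimal header that the sniffer above accepts (a hand-written illustration,
# not a Galaxy test file): the version line plus every mandatory field from
# _man_mtd, whose values are compared case-insensitively.
_MZTAB_EXAMPLE = "\n".join([
    "MTD\tmzTab-version\t1.0.0",
    "MTD\tmzTab-mode\tSummary",
    "MTD\tmzTab-type\tIdentification",
    "MTD\tdescription\tan example identification result",
])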
class MzTab2(MzTab):
"""
exchange format for proteomics and metabolomics results
>>> from galaxy.datatypes.sniff import get_test_fname
>>> fname = get_test_fname('test.mztab2')
>>> MzTab2().sniff(fname)
True
>>> fname = get_test_fname('test.mztab')
>>> MzTab2().sniff(fname)
False
"""
file_ext = "mztab2"
_sections = ["SMH", "SML", "SFH", "SMF", "SEH", "SME", "COM"]
_version_re = r"(2)(\.[0-9])?(\.[0-9])?-M$"
_man_mtd = {"mzTab-ID": None}
def set_peek(self, dataset: DatasetProtocol, **kwd) -> None:
"""Set the peek and blurb text"""
if not dataset.dataset.purged:
dataset.peek = data.get_file_peek(dataset.get_file_name())
dataset.blurb = "mzTab2 Format"
else:
dataset.peek = "file does not exist"
dataset.blurb = "file purged from disk"
@build_sniff_from_prefix
class Kroenik(Tabular):
"""
Kroenik (HardKloer sibling) files
>>> from galaxy.datatypes.sniff import get_test_fname
>>> fname = get_test_fname('test.kroenik')
>>> Kroenik().sniff(fname)
True
>>> fname = get_test_fname('test.peplist')
>>> Kroenik().sniff(fname)
False
"""
file_ext = "kroenik"
def __init__(self, **kwd):
super().__init__(**kwd)
self.column_names = [
"File",
"First Scan",
"Last Scan",
"Num of Scans",
"Charge",
"Monoisotopic Mass",
"Base Isotope Peak",
"Best Intensity",
"Summed Intensity",
"First RTime",
"Last RTime",
"Best RTime",
"Best Correlation",
"Modifications",
]
def display_peek(self, dataset: DatasetProtocol) -> str:
"""Returns formatted html of peek"""
return self.make_html_table(dataset, column_names=self.column_names)
def sniff_prefix(self, file_prefix: FilePrefix) -> bool:
fh = file_prefix.string_io()
line = [_.strip() for _ in fh.readline().split("\t")]
if line != self.column_names:
return False
line = fh.readline().split("\t")
try:
[int(_) for _ in line[1:5]]
[float(_) for _ in line[5:13]]
except ValueError:
return False
return True
@build_sniff_from_prefix
class PepList(Tabular):
"""
Peplist file as used in OpenMS
https://github.com/OpenMS/OpenMS/blob/0fc8765670a0ad625c883f328de60f738f7325a4/src/openms/source/FORMAT/FileHandler.cpp#L432
>>> from galaxy.datatypes.sniff import get_test_fname
>>> fname = get_test_fname('test.peplist')
>>> PepList().sniff(fname)
True
>>> fname = get_test_fname('test.psms')
>>> PepList().sniff(fname)
False
"""
file_ext = "peplist"
def __init__(self, **kwd):
super().__init__(**kwd)
self.column_names = ["m/z", "rt(min)", "snr", "charge", "intensity"]
def display_peek(self, dataset: DatasetProtocol) -> str:
"""Returns formatted html of peek"""
return self.make_html_table(dataset, column_names=self.column_names)
def sniff_prefix(self, file_prefix: FilePrefix) -> bool:
fh = file_prefix.string_io()
line = [_.strip() for _ in fh.readline().split("\t")]
if line == self.column_names:
return True
return False
@build_sniff_from_prefix
class PSMS(Tabular):
"""
Percolator tab-delimited output (PSM level, .psms) as used in OpenMS
https://github.com/OpenMS/OpenMS/blob/0fc8765670a0ad625c883f328de60f738f7325a4/src/openms/source/FORMAT/FileHandler.cpp#L453
see also http://www.kojak-ms.org/docs/percresults.html
Note that the data rows can have more columns than the header line
since ProteinIds are listed tab-separated.
>>> from galaxy.datatypes.sniff import get_test_fname
>>> fname = get_test_fname('test.psms')
>>> PSMS().sniff(fname)
True
>>> fname = get_test_fname('test.kroenik')
>>> PSMS().sniff(fname)
False
"""
file_ext = "psms"
def __init__(self, **kwd):
super().__init__(**kwd)
self.column_names = ["PSMId", "score", "q-value", "posterior_error_prob", "peptide", "proteinIds"]
def display_peek(self, dataset: DatasetProtocol) -> str:
"""Returns formatted html of peek"""
return self.make_html_table(dataset, column_names=self.column_names)
def sniff_prefix(self, file_prefix: FilePrefix) -> bool:
fh = file_prefix.string_io()
line = [_.strip() for _ in fh.readline().split("\t")]
if line == self.column_names:
return True
return False
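# Both PepList and PSMS sniff purely on an exact header match. For reference, the
# first line of a matching .psms file (hand-written example; data rows may append
# extra tab-separated ProteinIds beyond these six columns):
_PSMS_HEADER_EXAMPLE = "PSMId\tscore\tq-value\tposterior_error_prob\tpeptide\tproteinIds"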
@build_sniff_from_prefix
class PEFF(Sequence):
"""
PSI Extended FASTA Format
https://github.com/HUPO-PSI/PEFF
"""
file_ext = "peff"
def sniff_prefix(self, file_prefix: FilePrefix) -> bool:
"""
>>> from galaxy.datatypes.sniff import get_test_fname
>>> fname = get_test_fname('test.peff')
>>> PEFF().sniff(fname)
True
>>> fname = get_test_fname('sequence.fasta')
>>> PEFF().sniff(fname)
False
"""
fh = file_prefix.string_io()
if re.match(r"# PEFF \d+.\d+", fh.readline()):
return True
else:
return False
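# The PEFF sniffer inspects only the first line; a conforming file begins with a
# version header such as the following (hand-written example):
#
#     # PEFF 1.0
#     # DbName=example
#     >P1:sp_P04637 \Length=393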
class PepXmlReport(Tabular):
"""pepxml converted to tabular report"""
edam_data = "data_2536"
file_ext = "pepxml.tsv"
def __init__(self, **kwd):
super().__init__(**kwd)
self.column_names = [
"Protein",
"Peptide",
"Assumed Charge",
"Neutral Pep Mass (calculated)",
"Neutral Mass",
"Retention Time",
"Start Scan",
"End Scan",
"Search Engine",
"PeptideProphet Probability",
"Interprophet Probability",
]
def display_peek(self, dataset: DatasetProtocol) -> str:
"""Returns formatted html of peek"""
return self.make_html_table(dataset, column_names=self.column_names)
class ProtXmlReport(Tabular):
"""protxml converted to tabular report"""
edam_data = "data_2536"
file_ext = "protxml.tsv"
comment_lines = 1
def __init__(self, **kwd):
super().__init__(**kwd)
self.column_names = [
"Entry Number",
"Group Probability",
"Protein",
"Protein Link",
"Protein Probability",
"Percent Coverage",
"Number of Unique Peptides",
"Total Independent Spectra",
"Percent Share of Spectrum ID's",
"Description",
"Protein Molecular Weight",
"Protein Length",
"Is Nondegenerate Evidence",
"Weight",
"Precursor Ion Charge",
"Peptide sequence",
"Peptide Link",
"NSP Adjusted Probability",
"Initial Probability",
"Number of Total Termini",
"Number of Sibling Peptides Bin",
"Number of Instances",
"Peptide Group Designator",
"Is Evidence?",
]
def display_peek(self, dataset: DatasetProtocol) -> str:
"""Returns formatted html of peek"""
return self.make_html_table(dataset, column_names=self.column_names)
class Dta(TabularData):
"""dta
The first line contains the singly protonated peptide mass (MH+) and the
peptide charge state separated by a space. Subsequent lines contain space
separated pairs of fragment ion m/z and intensity values.
"""
file_ext = "dta"
comment_lines = 0
def set_meta(self, dataset: DatasetProtocol, overwrite: bool = True, **kwd) -> None:
column_types = []
data_row: List = []
data_lines = 0
if dataset.has_data():
with open(dataset.get_file_name()) as dtafile:
for _ in dtafile:
data_lines += 1
# Guess column types
for cell in data_row:
column_types.append(self.guess_type(cell))
# Set metadata
dataset.metadata.data_lines = data_lines
dataset.metadata.comment_lines = 0
dataset.metadata.column_types = ["float", "float"]
dataset.metadata.columns = 2
dataset.metadata.column_names = ["m/z", "intensity"]
dataset.metadata.delimiter = " "
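# Shape of a .dta file (hand-written example): the first line holds MH+ and the
# charge state, subsequent lines hold "m/z intensity" pairs, all space-separated,
# which is why set_meta above fixes two float columns and delimiter " ".
#
#     2100.43 3
#     147.11 15.2
#     175.12 7.9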
@build_sniff_from_prefix
class Dta2d(TabularData):
"""
dta2d: files with three tab/space-separated columns.
The default format is: retention time (seconds), m/z, intensity.
If the first line starts with '#', a different order is defined by the
order of the keywords 'MIN' (retention time in minutes) or 'SEC' (retention
time in seconds), 'MZ', and 'INT'.
Example: '#MZ MIN INT'
The peaks of one retention time have to be in subsequent lines.
Note: the sniffer only detects (tab- or space-separated) dta2d files with a
correct header; without a header the format is too generic.
>>> from galaxy.datatypes.sniff import get_test_fname
>>> fname = get_test_fname('test.dta2d')
>>> Dta2d().sniff(fname)
True
>>> fname = get_test_fname('test.edta')
>>> Dta2d().sniff(fname)
False
"""
file_ext = "dta2d"
comment_lines = 0
def _parse_header(self, line: List) -> Optional[List]:
if len(line) != 3 or len(line[0]) < 3 or not line[0].startswith("#"):
return None
line[0] = line[0].lstrip("#")
line = [_.strip() for _ in line]
if "MZ" not in line or "INT" not in line or ("MIN" not in line and "SEC" not in line):
return None
return line
def _parse_delimiter(self, line: str) -> Optional[str]:
if len(line.split(" ")) == 3:
return " "
elif len(line.split("\t")) == 3:
return "\t"
return None
def _parse_dataline(self, line: List) -> bool:
try:
line = [float(_) for _ in line]
except ValueError:
return False
if not all(_ >= 0 for _ in line):
return False
return True
def set_meta(self, dataset: DatasetProtocol, overwrite: bool = True, **kwd) -> None:
data_lines = 0
delim = None
if dataset.has_data():
with open(dataset.get_file_name()) as dtafile:
for line in dtafile:
if delim is None:
delim = self._parse_delimiter(line)
dataset.metadata.column_names = self._parse_header(line.split(delim))
data_lines += 1
# Set metadata
if delim is not None:
dataset.metadata.delimiter = delim
dataset.metadata.data_lines = data_lines
dataset.metadata.comment_lines = 0
dataset.metadata.column_types = ["float", "float", "float"]
dataset.metadata.columns = 3
if dataset.metadata.column_names is None or dataset.metadata.column_names == []:
dataset.metadata.comment_lines += 1
dataset.metadata.data_lines -= 1
dataset.metadata.column_names = ["SEC", "MZ", "INT"]
def sniff_prefix(self, file_prefix: FilePrefix) -> bool:
sep = None
header = None
for idx, line in enumerate(file_prefix.line_iterator()):
line = line.strip()
if sep is None:
sep = self._parse_delimiter(line)
if sep is None:
return False
line = line.split(sep)
if len(line) != 3:
return False
if idx == 0:
header = self._parse_header(line)
if (header is None) and not self._parse_dataline(line):
return False
elif not self._parse_dataline(line):
return False
if sep is None or header is None:
return False
return True
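# A dta2d file the sniffer above accepts (hand-written example): the '#' header
# fixes the column order, here m/z, retention time in minutes, intensity.
#
#     #MZ MIN INT
#     445.12 11.5 17832.0
#     445.35 11.6 2210.5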
@build_sniff_from_prefix
class Edta(TabularData):
"""
Input text file containing tab-, space- or comma-separated columns.
The separator between columns is checked in the first line in this order.
It supports three variants of this format.
1. Columns are: RT, MZ, Intensity. A header is optional.
2. Columns are: RT, MZ, Intensity, Charge, <Meta-Data> columns{0,}. A header is mandatory.
3. Columns are: (RT, MZ, Intensity, Charge){1,}, <Meta-Data> columns{0,}.
A header is mandatory. The first quadruplet is the consensus; all following
quadruplets describe the sub-features. This variant is discerned from
variant #2 by the name of the fifth column, which is required to be RT1
(or rt1). All other column names for sub-features are faithfully ignored.
Note the sniffer only detects files with header.
>>> from galaxy.datatypes.sniff import get_test_fname
>>> fname = get_test_fname('test.edta')
>>> Edta().sniff(fname)
True
>>> fname = get_test_fname('test.dta2d')
>>> Edta().sniff(fname)
False
"""
file_ext = "edta"
comment_lines = 0
def _parse_delimiter(self, line: str) -> Optional[str]:
if len(line.split(" ")) >= 3:
return " "
elif len(line.split("\t")) >= 3:
return "\t"
elif len(line.split(",")) >= 3:
return "\t"
return None
def _parse_type(self, line: List) -> Optional[int]:
"""
parse the type from the header line
types 1-3 as in the class docs; 0: type 1 without / with a wrong header
"""
if len(line) < 3:
return None
line = [_.lower().replace("/", "") for _ in line]
if len(line) == 3:
if line[0] == "rt" and line[1] == "mz" and (line[2] == "int" or line[2] == "intensity"):
return 1
else:
return None
if line[0] != "rt" or line[1] != "mz" or (line[2] != "int" and line[2] != "intensity") or line[3] != "charge":
return None
if not line[4].startswith("rt"):
return 2
else:
return 3
def _parse_dataline(self, line: List, tpe: Optional[int]) -> bool:
if tpe == 2 or tpe == 3:
idx = 4
else:
idx = 3
try:
line = [float(_) for _ in line[:idx]]
except ValueError:
return False
if not all(_ >= 0 for _ in line[:idx]):
return False
return True
def _clean_header(self, line: List) -> List:
for idx, el in enumerate(line):
el = el.lower()
if el.startswith("rt"):
line[idx] = "RT"
elif el.startswith("int"):
line[idx] = "intensity"
elif el.startswith("mz"):
line[idx] = "m/z"
elif el.startswith("charge"):
line[idx] = "charge"
else:
break
if idx // 4 > 0:
line[idx] += str(idx // 4)
return line
def set_meta(self, dataset: DatasetProtocol, overwrite: bool = True, **kwd) -> None:
data_lines = 0
delim = None
tpe = None
if dataset.has_data():
with open(dataset.get_file_name()) as dtafile:
for idx, line in enumerate(dtafile):
if idx == 0:
delim = self._parse_delimiter(line)
tpe = self._parse_type(line.split(delim))
if tpe == 0:
dataset.metadata.column_names = ["RT", "m/z", "intensity"]
else:
dataset.metadata.column_names = self._clean_header(line.split(delim))
data_lines += 1
# Set metadata
if delim is not None:
dataset.metadata.delimiter = delim
for c in dataset.metadata.column_names:
if any(c.startswith(_) for _ in ["RT", "m/z", "intensity", "charge"]):
dataset.metadata.column_types.append("float")
else:
dataset.metadata.column_types.append("str")
dataset.metadata.data_lines = data_lines
dataset.metadata.comment_lines = 0
dataset.metadata.columns = len(dataset.metadata.column_names)
if tpe is not None and tpe > 0:
dataset.metadata.comment_lines += 1
dataset.metadata.data_lines -= 1
def sniff_prefix(self, file_prefix: FilePrefix) -> bool:
sep = None
tpe = None
for idx, line in enumerate(file_prefix.line_iterator()):
line = line.strip("\r\n")
if sep is None:
sep = self._parse_delimiter(line)
if sep is None:
return False
line = line.split(sep)
if idx == 0:
tpe = self._parse_type(line)
if tpe is None:
return False
elif tpe == 0 and not self._parse_dataline(line, tpe):
return False
elif not self._parse_dataline(line, tpe):
return False
if tpe is None:
return False
return True
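# Hand-written header examples for the three edta variants described in the
# class docstring (the sniffer only accepts files that carry such a header):
#
#     variant 1:  RT mz intensity
#     variant 2:  RT mz intensity charge label
#     variant 3:  RT mz intensity charge RT1 mz1 intensity1 charge1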
class ProteomicsXml(GenericXml):
"""An enhanced XML datatype used to reuse code across several
proteomic/mass-spec datatypes."""
edam_data = "data_2536"
edam_format = "format_2032"
root: str
def sniff_prefix(self, file_prefix: FilePrefix) -> bool:
"""Determines whether the file is the correct XML type."""
for line in file_prefix.line_iterator():
line = line.strip()
if not line.startswith("<?"):
break
# pattern match <root or <ns:root for any ns string
pattern = r"<(\w*:)?%s" % self.root
return re.search(pattern, line) is not None
def set_peek(self, dataset: DatasetProtocol, **kwd) -> None:
"""Set the peek and blurb text"""
if not dataset.dataset.purged:
dataset.peek = data.get_file_peek(dataset.get_file_name())
dataset.blurb = "ProteomicsXML data"
else:
dataset.peek = "file does not exist"
dataset.blurb = "file purged from disk"
class ParamXml(ProteomicsXml):
"""Store parameters in XML format"""
file_ext = "paramxml"
blurb = "parameters in xmls"
root = "(parameters|PARAMETERS)"
class PepXml(ProteomicsXml):
"""pepXML data"""
edam_format = "format_3655"
file_ext = "pepxml"
blurb = "pepXML data"
root = "msms_pipeline_analysis"
class MascotXML(ProteomicsXml):
"""Mascot XML data"""
file_ext = "mascotxml"
blurb = "mascot Mass Spectrometry data"
root = "mascot_search_results"
class MzML(ProteomicsXml):
"""mzML data"""
edam_format = "format_3244"
file_ext = "mzml"
blurb = "mzML Mass Spectrometry data"
root = "(mzML|indexedmzML)"
class NmrML(ProteomicsXml):
"""nmrML data"""
# No edam format number yet.
file_ext = "nmrml"
blurb = "nmrML NMR data"
root = "nmrML"
class ProtXML(ProteomicsXml):
"""protXML data"""
file_ext = "protxml"
blurb = "prot XML Search Results"
root = "protein_summary"
class MzXML(ProteomicsXml):
"""mzXML data"""
edam_format = "format_3654"
file_ext = "mzxml"
blurb = "mzXML Mass Spectrometry data"
root = "mzXML"
class MzData(ProteomicsXml):
"""mzData data"""
edam_format = "format_3245"
file_ext = "mzdata"
blurb = "mzData Mass Spectrometry data"
root = "mzData"
class MzIdentML(ProteomicsXml):
edam_format = "format_3247"
file_ext = "mzid"
blurb = "XML identified peptides and proteins."
root = "MzIdentML"
class TraML(ProteomicsXml):
edam_format = "format_3246"
file_ext = "traml"
blurb = "TraML transition list"
root = "TraML"
class TrafoXML(ProteomicsXml):
file_ext = "trafoxml"
blurb = "RT alignment transformation"
root = "TrafoXML"
class MzQuantML(ProteomicsXml):
edam_format = "format_3248"
file_ext = "mzq"
blurb = "XML quantification data"
root = "MzQuantML"
class ConsensusXML(ProteomicsXml):
file_ext = "consensusxml"
blurb = "OpenMS multiple LC-MS map alignment file"
root = "consensusXML"
class FeatureXML(ProteomicsXml):
file_ext = "featurexml"
blurb = "OpenMS feature file"
root = "featureMap"
class IdXML(ProteomicsXml):
file_ext = "idxml"
blurb = "OpenMS identification file"
root = "IdXML"
class TandemXML(ProteomicsXml):
edam_format = "format_3711"
file_ext = "tandem"
blurb = "X!Tandem search results file"
root = "bioml"
class UniProtXML(ProteomicsXml):
file_ext = "uniprotxml"
blurb = "UniProt Proteome file"
root = "uniprot"
class XquestXML(ProteomicsXml):
file_ext = "xquest.xml"
blurb = "XQuest XML file"
root = "xquest_results"
class XquestSpecXML(ProteomicsXml):
"""spec.xml"""
file_ext = "spec.xml"
blurb = "xquest_spectra"
root = "xquest_spectra"
class QCML(ProteomicsXml):
"""qcml
https://github.com/OpenMS/OpenMS/blob/113c49d01677f7f03343ce7cd542d83c99b351ee/share/OpenMS/SCHEMAS/mzQCML_0_0_5.xsd
https://github.com/OpenMS/OpenMS/blob/3cfc57ad1788e7ab2bd6dd9862818b2855234c3f/share/OpenMS/SCHEMAS/qcML_0.0.7.xsd
"""
file_ext = "qcml"
blurb = "QualityAssessments to runs"
root = "qcML|MzQualityML)"
class Mgf(Text):
"""Mascot Generic Format data"""
edam_data = "data_2536"
edam_format = "format_3651"
file_ext = "mgf"
def set_peek(self, dataset: DatasetProtocol, **kwd) -> None:
"""Set the peek and blurb text"""
if not dataset.dataset.purged:
dataset.peek = data.get_file_peek(dataset.get_file_name())
dataset.blurb = "mgf Mascot Generic Format"
else:
dataset.peek = "file does not exist"
dataset.blurb = "file purged from disk"
def sniff(self, filename: str) -> bool:
mgf_begin_ions = "BEGIN IONS"
max_lines = 100
with open(filename) as handle:
for i, line in enumerate(handle):
line = line.rstrip()
if line == mgf_begin_ions:
return True
if i > max_lines:
return False
return False
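# A minimal MGF block for reference (hand-written example); the sniffer above
# only looks for the literal "BEGIN IONS" line among the first ~100 lines.
#
#     BEGIN IONS
#     TITLE=example spectrum
#     PEPMASS=896.05
#     CHARGE=2+
#     175.119 1654.2
#     END IONS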
class MascotDat(Text):
"""Mascot search results"""
edam_data = "data_2536"
edam_format = "format_3713"
file_ext = "mascotdat"
def set_peek(self, dataset: DatasetProtocol, **kwd) -> None:
"""Set the peek and blurb text"""
if not dataset.dataset.purged:
dataset.peek = data.get_file_peek(dataset.get_file_name())
dataset.blurb = "mascotdat Mascot Search Results"
else:
dataset.peek = "file does not exist"
dataset.blurb = "file purged from disk"
def sniff(self, filename: str) -> bool:
mime_version = "MIME-Version: 1.0 (Generated by Mascot version 1.0)"
max_lines = 10
with open(filename) as handle:
for i, line in enumerate(handle):
line = line.rstrip()
if line == mime_version:
return True
if i > max_lines:
return False
return False
class ThermoRAW(Binary):
"""Class describing a Thermo Finnigan binary RAW file"""
edam_data = "data_2536"
edam_format = "format_3712"
file_ext = "thermo.raw"
def sniff(self, filename: str) -> bool:
# Thermo Finnigan RAW format is proprietary and hence not well documented.
# Files start with 2 bytes that seem to differ followed by F\0i\0n\0n\0i\0g\0a\0n
# This combination represents 17 bytes, but to play safe we read 20 bytes from
# the start of the file.
try:
with open(filename, "rb") as handle:
header = handle.read(20)
finnigan = b"F\0i\0n\0n\0i\0g\0a\0n"
return header.find(finnigan) != -1
except Exception:
return False
def set_peek(self, dataset: DatasetProtocol, **kwd) -> None:
if not dataset.dataset.purged:
dataset.peek = "Thermo Finnigan RAW file"
dataset.blurb = nice_size(dataset.get_size())
else:
dataset.peek = "file does not exist"
dataset.blurb = "file purged from disk"
def display_peek(self, dataset: DatasetProtocol) -> str:
try:
return dataset.peek
except Exception:
return f"Thermo Finnigan RAW file ({nice_size(dataset.get_size())})"
@build_sniff_from_prefix
class Msp(Text):
"""Output of NIST MS Search Program chemdata.nist.gov/mass-spc/ftp/mass-spc/PepLib.pdf"""
file_ext = "msp"
@staticmethod
def next_line_starts_with(contents: IO, prefix: str) -> bool:
next_line = contents.readline()
return next_line is not None and next_line.startswith(prefix)
def sniff_prefix(self, file_prefix: FilePrefix) -> bool:
"""Determines whether the file is a NIST MSP output file."""
begin_contents = file_prefix.contents_header
if "\n" not in begin_contents:
return False
lines = begin_contents.splitlines()
if len(lines) < 2:
return False
return lines[0].startswith("Name:") and lines[1].startswith("MW:")
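# Opening lines of an MSP library entry as expected by the sniffer (hand-written
# example; only the "Name:"/"MW:" prefixes of the first two lines are checked):
#
#     Name: AAAACALTPGPLADLAAR/2
#     MW: 1638.88
#     Comment: example consensus spectrum
#     Num peaks: 3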
class SPLibNoIndex(Text):
"""SPlib without index file"""
file_ext = "splib_noindex"
def set_peek(self, dataset: DatasetProtocol, **kwd) -> None:
"""Set the peek and blurb text"""
if not dataset.dataset.purged:
dataset.peek = data.get_file_peek(dataset.get_file_name())
dataset.blurb = "Spectral Library without index files"
else:
dataset.peek = "file does not exist"
dataset.blurb = "file purged from disk"
@build_sniff_from_prefix
class SPLib(Msp):
"""SpectraST Spectral Library. Closely related to msp format"""
file_ext = "splib"
composite_type = "auto_primary_file"
def __init__(self, **kwd):
super().__init__(**kwd)
self.add_composite_file(
"library.splib", description="Spectral Library. Contains actual library spectra", is_binary=False
)
self.add_composite_file("library.spidx", description="Spectrum index", is_binary=False)
self.add_composite_file("library.pepidx", description="Peptide index", is_binary=False)
def generate_primary_file(self, dataset: HasExtraFilesAndMetadata) -> str:
rval = ["<html><head><title>Spectral Library Composite Dataset </title></head><p/>"]
rval.append("<div>This composite dataset is composed of the following files:<p/><ul>")
for composite_name, composite_file in self.get_composite_files(dataset=dataset).items():
fn = composite_name
opt_text = ""
if composite_file.optional:
opt_text = " (optional)"
if composite_file.get("description"):
rval.append(
f"<li><a href=\"{fn}\" type=\"text/plain\">{fn} ({composite_file.get('description')})</a>{opt_text}</li>"
)
else:
rval.append(f'<li><a href="{fn}" type="text/plain">{fn}</a>{opt_text}</li>')
rval.append("</ul></div></html>")
return "\n".join(rval)
def set_peek(self, dataset: DatasetProtocol, **kwd) -> None:
"""Set the peek and blurb text"""
if not dataset.dataset.purged:
dataset.peek = data.get_file_peek(dataset.get_file_name())
dataset.blurb = "splib Spectral Library Format"
else:
dataset.peek = "file does not exist"
dataset.blurb = "file purged from disk"
def sniff_prefix(self, file_prefix: FilePrefix) -> bool:
"""Determines whether the file is a SpectraST generated file."""
contents = file_prefix.string_io()
return Msp.next_line_starts_with(contents, "Name:") and Msp.next_line_starts_with(contents, "LibID:")
@build_sniff_from_prefix
class Ms2(Text):
file_ext = "ms2"
def sniff_prefix(self, file_prefix: FilePrefix) -> bool:
"""Determines whether the file is a valid ms2 file."""
header_lines = []
for line in file_prefix.line_iterator():
if line.strip() == "":
continue
elif line.startswith("H\t"):
header_lines.append(line)
else:
break
for header_field in ["CreationDate", "Extractor", "ExtractorVersion", "ExtractorOptions"]:
found_header = False
for header_line in header_lines:
if header_line.startswith(f"H\t{header_field}"):
found_header = True
break
if not found_header:
return False
return True
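# An ms2 file that satisfies the sniffer starts with 'H'-prefixed, tab-separated
# header lines covering all four required fields (hand-written example):
_MS2_HEADER_EXAMPLE = [
    "H\tCreationDate\t2024-01-01",
    "H\tExtractor\tExampleExtractor",
    "H\tExtractorVersion\t1.0",
    "H\tExtractorOptions\tnone",
]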
# unsniffable binary format, should do something about this
class XHunterAslFormat(Binary):
"""Annotated Spectra in the HLF format http://www.thegpm.org/HUNTER/format_2006_09_15.html"""
file_ext = "hlf"
class ImzML(Binary):
"""
Class for imzML files.
http://www.imzml.org
"""
edam_format = "format_3682"
file_ext = "imzml"
composite_type = "auto_primary_file"
def __init__(self, **kwd):
super().__init__(**kwd)
self.add_composite_file("imzml", description="The imzML metadata component.", is_binary=False)
self.add_composite_file("ibd", description="The mass spectral data component.", is_binary=True)
def generate_primary_file(self, dataset: HasExtraFilesAndMetadata) -> str:
rval = ["<html><head><title>imzML Composite Dataset </title></head><p/>"]
rval.append("<div>This composite dataset is composed of the following files:<p/><ul>")
for composite_name, composite_file in self.get_composite_files(dataset=dataset).items():
fn = composite_name
opt_text = ""
if composite_file.get("description"):
rval.append(
f"<li><a href=\"{fn}\" type=\"text/plain\">{fn} ({composite_file.get('description')})</a>{opt_text}</li>"
)
else:
rval.append(f'<li><a href="{fn}" type="text/plain">{fn}</a>{opt_text}</li>')
rval.append("</ul></div></html>")
return "\n".join(rval)