Warning
This document is for an old release of Galaxy. You can alternatively view this page in the latest release if it exists or view the top of the latest release's documentation.
Source code for galaxy.datatypes.blast
# This file is now part of the Galaxy Project, but due to historical reasons
# reflecting time developed outside of the Galaxy Project, this file is under
# the MIT license.
#
# The MIT License (MIT)
# Copyright (c) 2012,2013,2014,2015,2016 Peter Cock
# Copyright (c) 2012 Edward Kirton
# Copyright (c) 2013 Nicola Soranzo
# Copyright (c) 2014 Bjoern Gruening
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to
# deal in the Software without restriction, including without limitation the
# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
# sell copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.
#
"""NCBI BLAST datatypes.
Covers the ``blastxml`` format and the BLAST databases.
"""
import logging
import os
from time import sleep
from galaxy.datatypes.sniff import (
build_sniff_from_prefix,
FilePrefix,
)
from galaxy.util import smart_str
from .data import (
Data,
get_file_peek,
Text
)
from .xml import GenericXml
log = logging.getLogger(__name__)
@build_sniff_from_prefix
class BlastXml(GenericXml):
    """NCBI Blast XML Output data"""

    file_ext = "blastxml"
    edam_format = "format_3331"
    edam_data = "data_0857"

    # First line expected in every BLAST XML file.
    _XML_DECLARATION = '<?xml version="1.0"?>'
    # DOCTYPE declarations accepted on the second line (full URL or bare DTD name).
    _DOCTYPE_LINES = (
        '<!DOCTYPE BlastOutput PUBLIC "-//NCBI//NCBI BlastOutput/EN" "http://www.ncbi.nlm.nih.gov/dtd/NCBI_BlastOutput.dtd">',
        '<!DOCTYPE BlastOutput PUBLIC "-//NCBI//NCBI BlastOutput/EN" "NCBI_BlastOutput.dtd">',
    )

    def set_peek(self, dataset):
        """Set the peek and blurb text"""
        if not dataset.dataset.purged:
            dataset.peek = get_file_peek(dataset.file_name)
            dataset.blurb = 'NCBI Blast XML data'
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff_prefix(self, file_prefix: FilePrefix):
        """Determines whether the file is blastxml

        The first three lines must be the XML declaration, one of the two
        accepted BlastOutput DOCTYPE declarations, and ``<BlastOutput>``.

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname('megablast_xml_parser_test1.blastxml')
        >>> BlastXml().sniff(fname)
        True
        >>> fname = get_test_fname('tblastn_four_human_vs_rhodopsin.blastxml')
        >>> BlastXml().sniff(fname)
        True
        >>> fname = get_test_fname('interval.interval')
        >>> BlastXml().sniff(fname)
        False
        """
        handle = file_prefix.string_io()
        line = handle.readline()
        if line.strip() != self._XML_DECLARATION:
            return False
        line = handle.readline()
        if line.strip() not in self._DOCTYPE_LINES:
            return False
        line = handle.readline()
        return line.strip() == '<BlastOutput>'

    @staticmethod
    def _open_with_retry(path):
        """Open ``path`` and read its first line, retrying once after 1s.

        A missing or empty file is re-checked once after a one second pause
        (could be a transient error with a networked file system).

        :param path: path of a BLAST XML file
        :returns: ``(handle, first_line)``; the caller must close ``handle``
        :raises ValueError: if the file is still missing or empty after the retry
        """
        if not os.path.isfile(path):
            log.warning(f"BLAST XML file {path} missing, retry in 1s...")
            sleep(1)
            if not os.path.isfile(path):
                log.error(f"BLAST XML file {path} missing")
                raise ValueError(f"BLAST XML file {path} missing")
        for attempt in (1, 2):
            handle = open(path)
            try:
                first_line = handle.readline()
            except BaseException:
                # Never leak the handle if the read itself fails
                handle.close()
                raise
            if first_line:
                return handle, first_line
            handle.close()
            if attempt == 1:
                # Retry, could be transient error with networked file system...
                log.warning(f"BLAST XML file {path} empty, retry in 1s...")
                sleep(1)
        log.error(f"BLAST XML file {path} was empty")
        raise ValueError(f"BLAST XML file {path} was empty")

    @staticmethod
    def merge(split_files, output_file):
        """Merge several BLAST XML files into one BLAST XML file.

        A single input is handed to ``Text.merge``. Otherwise the header of
        the first file (everything up to and including its first
        ``<Iteration>`` line) is written once, and for each later file only
        the lines after its header up to ``</BlastOutput_iterations>`` are
        appended. The first 300 characters of each later header must match
        those of the first file.

        :param split_files: paths of the BLAST XML files to merge, in order
        :param output_file: path of the merged file to write
        :raises ValueError: if no files are given, a file is missing or empty,
            a file is not BLAST XML, a header is truncated or longer than
            10000 characters, or the headers do not match
        """
        if len(split_files) == 1:
            # For one file only, use base class method (move/copy)
            return Text.merge(split_files, output_file)
        if not split_files:
            raise ValueError("Given no BLAST XML files, %r, to merge into %s"
                             % (split_files, output_file))
        with open(output_file, "w") as out:
            old_header = None
            # Use the position, not the path, to identify the first file so a
            # path listed twice does not get its header written a second time.
            for index, f in enumerate(split_files):
                h, header = BlastXml._open_with_retry(f)
                # The with-block closes the input handle on every exit path
                with h:
                    if header.strip() != BlastXml._XML_DECLARATION:
                        out.write(header)  # for diagnosis
                        raise ValueError(f"{f} is not an XML file!")
                    line = h.readline()
                    header += line
                    if line.strip() not in BlastXml._DOCTYPE_LINES:
                        out.write(header)  # for diagnosis
                        raise ValueError(f"{f} is not a BLAST XML file!")
                    while True:
                        line = h.readline()
                        if not line:
                            out.write(header)  # for diagnosis
                            raise ValueError(f"BLAST XML file {f} ended prematurely")
                        header += line
                        if "<Iteration>" in line:
                            break
                        if len(header) > 10000:
                            # Something has gone wrong, don't load too much into memory!
                            # Write what we have to the merged file for diagnostics
                            out.write(header)
                            raise ValueError(f"The header in BLAST XML file {f} is too long")
                    if "<BlastOutput>" not in header:
                        raise ValueError(f"{f} is not a BLAST XML file:\n{header}\n...")
                    if index == 0:
                        out.write(header)
                        old_header = header
                    elif old_header[:300] != header[:300]:
                        # Enough to check <BlastOutput_program> and <BlastOutput_version> match
                        raise ValueError("BLAST XML headers don't match for %s and %s - have:\n%s\n...\n\nAnd:\n%s\n...\n"
                                         % (split_files[0], f, old_header[:300], header[:300]))
                    else:
                        out.write(" <Iteration>\n")
                    for line in h:
                        if "</BlastOutput_iterations>" in line:
                            break
                        # TODO - Increment <Iteration_iter-num> and if required automatic query names
                        # like <Iteration_query-ID>Query_3</Iteration_query-ID> to be increasing?
                        out.write(line)
            out.write(" </BlastOutput_iterations>\n")
            out.write("</BlastOutput>\n")
class _BlastDb(Data):
    """Base class for BLAST database datatype.

    Provides the peek/preview text shared by the BLAST database datatypes
    and declares that these datasets can be neither merged nor split.
    """

    # Preview titles keyed by datatype extension. The ``*5`` extensions are the
    # ones used by the ``Blast*Db5`` subclasses defined in this module.
    _PREVIEW_TITLES = {
        "blastdbn": "This is a nucleotide BLAST database",
        "blastdbp": "This is a protein BLAST database",
        "blastdbd": "This is a domain BLAST database",
        "blastdbn5": "This is a nucleotide BLAST database",
        "blastdbp5": "This is a protein BLAST database",
        "blastdbd5": "This is a domain BLAST database",
    }

    def set_peek(self, dataset):
        """Set the peek and blurb text."""
        if not dataset.dataset.purged:
            dataset.peek = "BLAST database (multiple files)"
            dataset.blurb = "BLAST database (multiple files)"
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def display_peek(self, dataset):
        """Create HTML content, used for displaying peek."""
        try:
            return dataset.peek
        except Exception:
            return "BLAST database (multiple files)"

    def display_data(self, trans, data, preview=False, filename=None,
                     to_ext=None, size=None, offset=None, **kwd):
        """
        If preview is `True` allows us to format the data shown in the central pane via the "eye" icon.
        If preview is `False` triggers download.

        For a preview, returns a tuple of an HTML page and the ``headers``
        entry of ``kwd`` (an empty dict when absent). The page shows the text
        of ``data.file_name``, or a title chosen from ``self.file_ext`` when
        that file cannot be read or is empty.
        """
        # Imported locally so this block does not depend on a file-level import
        from html import escape

        headers = kwd.get("headers", {})
        if not preview:
            return super().display_data(trans,
                                        data=data,
                                        preview=preview,
                                        filename=filename,
                                        to_ext=to_ext,
                                        size=size,
                                        offset=offset,
                                        **kwd)
        # Unknown extensions fall back to a generic title
        title = self._PREVIEW_TITLES.get(self.file_ext, "This is a BLAST database.")
        msg = ""
        try:
            # Try to use any text recorded in the dummy index file:
            with open(data.file_name, encoding='utf-8') as handle:
                msg = handle.read().strip()
        except Exception:
            # Deliberately best effort: fall back to the title, but leave a trace
            log.debug("Could not read text from the BLAST database index file", exc_info=True)
        if not msg:
            msg = title
        # Galaxy assumes HTML for the display of composite datatypes,
        # so the file text is escaped before being embedded in the page.
        return smart_str(f"<html><head><title>{title}</title></head><body><pre>{escape(msg)}</pre></body></html>"), headers

    @staticmethod
    def merge(split_files, output_file):
        """Merge BLAST databases (not implemented for now).

        :raises NotImplementedError: always
        """
        raise NotImplementedError("Merging BLAST databases is non-trivial (do this via makeblastdb?)")

    @classmethod
    def split(cls, input_datasets, subdir_generator_function, split_params):
        """Split a BLAST database (not implemented for now).

        :returns: ``None`` when ``split_params`` is ``None``
        :raises NotImplementedError: when ``split_params`` is not ``None``
        """
        if split_params is None:
            return None
        raise NotImplementedError("Can't split BLAST databases")
class BlastNucDb(_BlastDb):
    """Composite datatype for the files of a nucleotide BLAST database."""

    file_ext = 'blastdbn'
    composite_type = 'basic'

    def __init__(self, **kwd):
        """Register the component files of a nucleotide BLAST database."""
        super().__init__(**kwd)
        # Required binary components
        for required in (
            'blastdb.nhr',  # sequence headers
            'blastdb.nin',  # index file
            'blastdb.nsq',  # nucleotide sequences
        ):
            self.add_composite_file(required, is_binary=True)
        # alias ( -gi_mask option of makeblastdb); the only non-binary component
        self.add_composite_file('blastdb.nal', is_binary=False, optional=True)
        # Optional binary components
        for extra in (
            'blastdb.nhd',  # sorted sequence hash values ( -hash_index option of makeblastdb)
            'blastdb.nhi',  # index of sequence hash values ( -hash_index option of makeblastdb)
            'blastdb.nnd',  # sorted GI values ( -parse_seqids option of makeblastdb and gi present in the description lines)
            'blastdb.nni',  # index of GI values ( -parse_seqids option of makeblastdb and gi present in the description lines)
            'blastdb.nog',  # OID->GI lookup file ( -hash_index or -parse_seqids option of makeblastdb)
            'blastdb.nsd',  # sorted sequence accession values ( -hash_index or -parse_seqids option of makeblastdb)
            'blastdb.nsi',  # index of sequence accession values ( -hash_index or -parse_seqids option of makeblastdb)
            'blastdb.shd',  # MegaBLAST index superheader (-old_style_index false option of makembindex)
        ):
            self.add_composite_file(extra, is_binary=True, optional=True)
        # Not registered here:
        # - 'blastdb.00.idx', the first volume of the MegaBLAST index generated by
        #   makembindex; it would have to be repeated for each index volume, with
        #   filename extensions like '.01.idx', '.02.idx', etc.
        # - 'blastdb.naa' (index of a WriteDB column for e.g. mask data),
        #   'blastdb.nab' (data of a WriteDB column) and 'blastdb.nac' (multiple
        #   byte order for a WriteDB column); these would have to be repeated for
        #   each WriteDB column, with filename extensions like
        #   ('.nba', '.nbb', '.nbc'), ('.nca', '.ncb', '.ncc'), etc.
class BlastProtDb(_BlastDb):
    """Composite datatype for the files of a protein BLAST database."""

    file_ext = 'blastdbp'
    composite_type = 'basic'

    def __init__(self, **kwd):
        """Register the component files of a protein BLAST database."""
        super().__init__(**kwd)
        # Component file comments are as in BlastNucDb except where noted
        # Required binary components
        for required in (
            'blastdb.phr',
            'blastdb.pin',
            'blastdb.psq',  # protein sequences
        ):
            self.add_composite_file(required, is_binary=True)
        # Optional binary components
        for extra in (
            'blastdb.phd',
            'blastdb.phi',
            'blastdb.pnd',
            'blastdb.pni',
            'blastdb.pog',
            'blastdb.psd',
            'blastdb.psi',
        ):
            self.add_composite_file(extra, is_binary=True, optional=True)
        # Not registered here: 'blastdb.paa', 'blastdb.pab' and 'blastdb.pac'.
        # They would have to be repeated for each WriteDB column, with filename
        # extensions like ('.pba', '.pbb', '.pbc'), ('.pca', '.pcb', '.pcc'), etc.
class BlastDomainDb(_BlastDb):
    """Composite datatype for the files of a domain BLAST database."""

    file_ext = 'blastdbd'
    composite_type = 'basic'

    def __init__(self, **kwd):
        """Register the component files of a domain BLAST database."""
        super().__init__(**kwd)
        # Required binary components
        for required in ('blastdb.phr', 'blastdb.pin', 'blastdb.psq'):
            self.add_composite_file(required, is_binary=True)
        # Optional binary components
        for extra in (
            'blastdb.freq',
            'blastdb.loo',
            'blastdb.psd',
            'blastdb.psi',
            'blastdb.rps',
            'blastdb.aux',
        ):
            self.add_composite_file(extra, is_binary=True, optional=True)
class LastDb(Data):
    """Composite datatype for the files of a LAST database."""

    file_ext = 'lastdb'
    composite_type = 'basic'

    def set_peek(self, dataset):
        """Fill in the peek and blurb of ``dataset``."""
        if dataset.dataset.purged:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'
            return
        dataset.peek = "LAST database (multiple files)"
        dataset.blurb = "LAST database (multiple files)"

    def display_peek(self, dataset):
        """Return the stored peek, or a fixed description if reading it fails."""
        fallback = "LAST database (multiple files)"
        try:
            peek = dataset.peek
        except Exception:
            return fallback
        return peek

    def __init__(self, **kwd):
        """Register the component files of a LAST database."""
        super().__init__(**kwd)
        self.add_composite_file('lastdb.bck', is_binary=True)
        # The two text components carry a description
        for text_file, label in (
            ('lastdb.des', "Description file"),
            ('lastdb.prj', "Project resume file"),
        ):
            self.add_composite_file(text_file, description=label, is_binary=False)
        # Remaining binary components
        for binary_file in ('lastdb.sds', 'lastdb.ssp', 'lastdb.suf', 'lastdb.tis'):
            self.add_composite_file(binary_file, is_binary=True)
class BlastNucDb5(_BlastDb):
    """Nucleotide BLAST database files, registered under the ``blastdbn5`` extension."""

    file_ext = 'blastdbn5'
    composite_type = 'basic'

    def __init__(self, **kwd):
        """Register the component files of the nucleotide database."""
        super().__init__(**kwd)
        # Required binary components
        for required in (
            'blastdb.nhr',  # sequence headers
            'blastdb.nin',  # index file
            'blastdb.nsq',  # nucleotide sequences
        ):
            self.add_composite_file(required, is_binary=True)
        # alias ( -gi_mask option of makeblastdb); the only non-binary component
        self.add_composite_file('blastdb.nal', is_binary=False, optional=True)
        # Optional binary components
        for extra in (
            'blastdb.nhd',  # sorted sequence hash values ( -hash_index option of makeblastdb)
            'blastdb.nhi',  # index of sequence hash values ( -hash_index option of makeblastdb)
            'blastdb.nnd',  # sorted GI values ( -parse_seqids option of makeblastdb and gi present in the description lines)
            'blastdb.nni',  # index of GI values ( -parse_seqids option of makeblastdb and gi present in the description lines)
            'blastdb.nog',  # OID->GI lookup file ( -hash_index or -parse_seqids option of makeblastdb)
            'blastdb.nsd',  # sorted sequence accession values ( -hash_index or -parse_seqids option of makeblastdb)
            'blastdb.nsi',  # index of sequence accession values ( -hash_index or -parse_seqids option of makeblastdb)
            'blastdb.shd',  # MegaBLAST index superheader (-old_style_index false option of makembindex)
        ):
            self.add_composite_file(extra, is_binary=True, optional=True)
        # Not registered here:
        # - 'blastdb.00.idx', the first volume of the MegaBLAST index generated by
        #   makembindex; it would have to be repeated for each index volume, with
        #   filename extensions like '.01.idx', '.02.idx', etc.
        # - 'blastdb.naa' (index of a WriteDB column for e.g. mask data),
        #   'blastdb.nab' (data of a WriteDB column) and 'blastdb.nac' (multiple
        #   byte order for a WriteDB column); these would have to be repeated for
        #   each WriteDB column, with filename extensions like
        #   ('.nba', '.nbb', '.nbc'), ('.nca', '.ncb', '.ncc'), etc.
class BlastProtDb5(_BlastDb):
    """Protein BLAST database files, registered under the ``blastdbp5`` extension."""

    file_ext = 'blastdbp5'
    composite_type = 'basic'

    def __init__(self, **kwd):
        """Register the component files of the protein database."""
        super().__init__(**kwd)
        # Component file comments are as in BlastNucDb except where noted
        # Required binary components
        for required in (
            'blastdb.phr',
            'blastdb.pin',
            'blastdb.psq',  # protein sequences
        ):
            self.add_composite_file(required, is_binary=True)
        # Optional binary components
        for extra in (
            'blastdb.phd',
            'blastdb.phi',
            'blastdb.pnd',
            'blastdb.pni',
            'blastdb.pog',
            'blastdb.psd',
            'blastdb.psi',
        ):
            self.add_composite_file(extra, is_binary=True, optional=True)
        # Not registered here: 'blastdb.paa', 'blastdb.pab' and 'blastdb.pac'.
        # They would have to be repeated for each WriteDB column, with filename
        # extensions like ('.pba', '.pbb', '.pbc'), ('.pca', '.pcb', '.pcc'), etc.
class BlastDomainDb5(_BlastDb):
    """Domain BLAST database files, registered under the ``blastdbd5`` extension."""

    file_ext = 'blastdbd5'
    composite_type = 'basic'

    def __init__(self, **kwd):
        """Register the component files of the domain database."""
        super().__init__(**kwd)
        # Required binary components
        for required in ('blastdb.phr', 'blastdb.pin', 'blastdb.psq'):
            self.add_composite_file(required, is_binary=True)
        # Optional binary components
        for extra in (
            'blastdb.freq',
            'blastdb.loo',
            'blastdb.psd',
            'blastdb.psi',
            'blastdb.rps',
            'blastdb.aux',
        ):
            self.add_composite_file(extra, is_binary=True, optional=True)