Source code for galaxy.datatypes.phylip

"""
Created on January 05, 2018

@authors: Kenzo-Hugo Hillion and Fabien Mareuil, Institut Pasteur, Paris
@contacts: kehillio@pasteur.fr and fabien.mareuil@pasteur.fr
@project: galaxy
@githuborganization: C3BI
Phylip datatype sniffer
"""

from typing import TYPE_CHECKING

from galaxy import util
from galaxy.datatypes.data import (
    get_file_peek,
    Text,
)
from galaxy.datatypes.protocols import DatasetProtocol
from galaxy.datatypes.sniff import (
    build_sniff_from_prefix,
    FilePrefix,
)
from galaxy.util import nice_size
from .metadata import MetadataElement

if TYPE_CHECKING:
    from io import StringIO


@build_sniff_from_prefix
class Phylip(Text):
    """Phylip format stores a multiple sequence alignment"""

    edam_data = "data_0863"
    edam_format = "format_1997"
    file_ext = "phylip"

    MetadataElement(
        name="sequences",
        default=0,
        desc="Number of sequences",
        readonly=True,
        visible=False,
        optional=True,
        no_value=0,
    )

    def set_meta(self, dataset: DatasetProtocol, overwrite: bool = True, **kwd) -> None:
        """
        Set the number of sequences and the number of data lines in dataset.

        The number of sequences is read from the first whitespace-separated
        field of the first line of the dataset file.

        Raises ``Exception`` when that field is missing or is not an integer
        (or the file cannot be read); the underlying error is chained as the
        cause.
        """
        dataset.metadata.data_lines = self.count_data_lines(dataset)
        try:
            # Use a context manager so the file handle is always released.
            with open(dataset.get_file_name()) as handle:
                header = handle.readline()
            dataset.metadata.sequences = int(header.split()[0])
        except Exception as err:
            raise Exception("Header does not correspond to PHYLIP header.") from err

    def set_peek(self, dataset: DatasetProtocol, **kwd) -> None:
        """
        Set the peek and blurb of the dataset.

        The blurb shows the number of sequences when that metadata is set and
        falls back to the dataset size otherwise. Purged datasets get fixed
        placeholder texts instead.
        """
        if dataset.dataset.purged:
            dataset.peek = "file does not exist"
            dataset.blurb = "file purged from disk"
            return
        dataset.peek = get_file_peek(dataset.get_file_name())
        if dataset.metadata.sequences:
            dataset.blurb = f"{util.commaify(str(dataset.metadata.sequences))} sequences"
        else:
            dataset.blurb = nice_size(dataset.get_size())

    @staticmethod
    def _is_plausible_sequence(seq: str) -> bool:
        """Return True if ``seq`` is pure ASCII and contains no digit."""
        # Could tighten up further by requiring IUPAC strings chars
        return seq.isascii() and not any(c.isdigit() for c in seq)

    def sniff_strict_interleaved(self, nb_seq: int, seq_length: int, alignment_prefix: "StringIO") -> bool:
        """
        Check whether the next ``nb_seq`` lines look like the first block of a
        strict interleaved PHYLIP alignment.

        ``alignment_prefix`` must be positioned right after the header line.
        Returns False as soon as a line is missing, is too short to hold a
        sequence, is longer than ``seq_length``, differs in length from the
        previous sequence parts, or contains non-ASCII characters or digits.
        """
        found_seq_length = None
        for _ in range(nb_seq):
            line = alignment_prefix.readline()
            if not line:
                # Not enough lines, either the prefix is too short or this is not PHYLIP
                return False
            line = line.rstrip("\n")
            if len(line) < 11:
                # Sequence characters immediately follow the sequence ID.
                # They must start at the 11th character in the line, as the
                # first 10 characters are reserved for the sequence ID
                return False
            seq = line[10:].replace(" ", "")
            this_seq_length = len(seq)
            if this_seq_length > seq_length:
                return False
            if found_seq_length is None:
                found_seq_length = this_seq_length
            elif this_seq_length != found_seq_length:
                # All sequence parts should have the same length
                return False
            if not self._is_plausible_sequence(seq):
                return False
        # There may be more lines with the remaining parts of the sequences
        return True

    def sniff_strict_sequential(self, nb_seq: int, seq_length: int, alignment_prefix: "StringIO") -> bool:
        """Sniffing of the strict sequential layout is not implemented."""
        raise NotImplementedError

    def sniff_relaxed_interleaved(self, nb_seq: int, seq_length: int, alignment_prefix: "StringIO") -> bool:
        """
        Check whether the next ``nb_seq`` lines look like the first block of a
        relaxed interleaved PHYLIP alignment, followed by a blank line.

        ``alignment_prefix`` must be positioned right after the header line.
        Returns False as soon as a line is missing, has no sequence part, is
        longer than ``seq_length``, differs in length from the previous
        sequence parts, or contains non-ASCII characters or digits, and also
        when the line following the block is not blank.
        """
        found_seq_length = None
        for _ in range(nb_seq):
            line = alignment_prefix.readline()
            if not line:
                # Not enough lines, either the prefix is too short or this is not PHYLIP
                return False
            line = line.rstrip("\n")
            # In the relaxed format the sequence id can have any length.
            # The id and sequence are separated by some whitespaces.
            fields = line.split(None, 1)
            if len(fields) < 2:
                # No sequence after the id (or an empty line): not PHYLIP
                return False
            seq = fields[1].replace(" ", "")
            this_seq_length = len(seq)
            if this_seq_length > seq_length:
                return False
            if found_seq_length is None:
                found_seq_length = this_seq_length
            elif this_seq_length != found_seq_length:
                # All sequence parts should have the same length
                return False
            if not self._is_plausible_sequence(seq):
                return False
        line = alignment_prefix.readline()
        if line.strip():
            # There should be a newline separating alignments.
            # If we got more content this is probably not a phylip file
            return False
        # There may be more lines with the remaining parts of the sequences
        return True

    def sniff_prefix(self, file_prefix: FilePrefix) -> bool:
        """
        All Phylip files starts with the number of sequences so we can use this
        to count the following number of sequences in the first 'stack'

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname('test_strict_interleaved.phylip')
        >>> Phylip().sniff(fname)
        True
        >>> fname = get_test_fname('test_relaxed_interleaved.phylip')
        >>> Phylip().sniff(fname)
        True
        >>> fname = get_test_fname("not_a_phylip_file.tabular")
        >>> Phylip().sniff(fname)
        False
        """
        f = file_prefix.string_io()
        # Get number of sequences and sequence length from first line
        try:
            nb_seq, seq_length = (int(n) for n in f.readline().split())
        except ValueError:
            # Header is empty, non-numeric or does not have exactly two fields
            return False
        if nb_seq <= 0 or seq_length <= 0:
            return False
        file_pos = f.tell()
        layout_sniffers = (
            self.sniff_strict_interleaved,
            self.sniff_strict_sequential,
            self.sniff_relaxed_interleaved,
        )
        for layout_sniffer in layout_sniffers:
            # Every layout is tried from the first line after the header
            f.seek(file_pos)
            try:
                if layout_sniffer(nb_seq, seq_length, f):
                    return True
            except Exception:
                # Best effort: a layout that cannot be checked (e.g. not
                # implemented) simply does not match; try the next one.
                continue
        return False