Warning

This document is for an old release of Galaxy. You can alternatively view this page in the latest release if it exists or view the top of the latest release's documentation.

Source code for galaxy.datatypes.phylip

"""
Created on January 05, 2018

@authors: Kenzo-Hugo Hillion and Fabien Mareuil, Institut Pasteur, Paris
@contacts: kehillio@pasteur.fr and fabien.mareuil@pasteur.fr
@project: galaxy
@githuborganization: C3BI
Phylip datatype sniffer
"""

from typing import TYPE_CHECKING

from galaxy import util
from galaxy.datatypes.data import (
    get_file_peek,
    Text,
)
from galaxy.datatypes.protocols import DatasetProtocol
from galaxy.datatypes.sniff import (
    build_sniff_from_prefix,
    FilePrefix,
)
from galaxy.util import nice_size
from .metadata import MetadataElement

if TYPE_CHECKING:
    from io import StringIO


@build_sniff_from_prefix
class Phylip(Text):
    """Phylip format stores a multiple sequence alignment"""

    edam_data = "data_0863"
    edam_format = "format_1997"
    file_ext = "phylip"

    MetadataElement(
        name="sequences", default=0, desc="Number of sequences", readonly=True, visible=False, optional=True, no_value=0
    )

    def set_meta(self, dataset: DatasetProtocol, overwrite: bool = True, **kwd) -> None:
        """
        Set the number of sequences and the number of data lines in dataset.

        The number of sequences is read from the first whitespace-separated
        token of the first line of the dataset file.

        :raises Exception: if the file cannot be read or its first token is
            not an integer.
        """
        dataset.metadata.data_lines = self.count_data_lines(dataset)
        try:
            # Use a context manager so the file handle is always closed.
            with open(dataset.get_file_name()) as fh:
                dataset.metadata.sequences = int(fh.readline().split()[0])
        except Exception as err:
            raise Exception("Header does not correspond to PHYLIP header.") from err

    def set_peek(self, dataset: DatasetProtocol, **kwd) -> None:
        """
        Set the peek and blurb of the dataset.

        The blurb is the sequence count when the ``sequences`` metadata is
        set (non-zero), otherwise the human-readable file size. Purged
        datasets get fixed placeholder texts.
        """
        if dataset.dataset.purged:
            dataset.peek = "file does not exist"
            dataset.blurb = "file purged from disk"
            return
        dataset.peek = get_file_peek(dataset.get_file_name())
        if dataset.metadata.sequences:
            dataset.blurb = f"{util.commaify(str(dataset.metadata.sequences))} sequences"
        else:
            dataset.blurb = nice_size(dataset.get_size())

    @staticmethod
    def _is_plausible_sequence_part(seq: str, seq_length: int) -> bool:
        """
        Check one whitespace-stripped sequence fragment from an alignment line.

        :param seq: sequence characters of one line, with spaces removed
        :param seq_length: total sequence length announced in the header
        :returns: False if the fragment is longer than the announced length,
            contains non-ASCII characters, or contains digits; True otherwise.
        """
        if len(seq) > seq_length:
            return False
        if not seq.isascii():
            return False
        # Could tighten up further by requiring IUPAC strings chars
        return not any(c.isdigit() for c in seq)

    def sniff_strict_interleaved(self, nb_seq: int, seq_length: int, alignment_prefix: "StringIO") -> bool:
        """
        Check whether the next ``nb_seq`` lines look like the first block of a
        strict interleaved PHYLIP alignment.

        In the strict format the first 10 characters of each line are reserved
        for the sequence ID and the sequence starts at the 11th character.

        :param nb_seq: number of sequences announced in the header
        :param seq_length: sequence length announced in the header
        :param alignment_prefix: stream positioned just after the header line
        :returns: True if every one of the ``nb_seq`` lines passes the checks.
        """
        found_seq_length = None
        for _ in range(nb_seq):
            line = alignment_prefix.readline()
            if not line:
                # Not enough lines, either the prefix is too short or this is not PHYLIP
                return False
            line = line.rstrip("\n")
            if len(line) < 11:
                # Sequence characters immediately follow the sequence ID.
                # They must start at the 11th character in the line, as the
                # first 10 characters are reserved for the sequence ID
                return False
            seq = line[10:].replace(" ", "")
            if not self._is_plausible_sequence_part(seq, seq_length):
                return False
            if found_seq_length is None:
                found_seq_length = len(seq)
            elif len(seq) != found_seq_length:
                # All sequence parts should have the same length
                return False
        # There may be more lines with the remaining parts of the sequences
        return True

    def sniff_strict_sequential(self, nb_seq: int, seq_length: int, alignment_prefix: "StringIO") -> bool:
        """
        Placeholder for sniffing the strict sequential PHYLIP layout.

        :raises NotImplementedError: always; ``sniff_prefix`` treats this as
            "no match" and moves on to the next layout.
        """
        raise NotImplementedError

    def sniff_relaxed_interleaved(self, nb_seq: int, seq_length: int, alignment_prefix: "StringIO") -> bool:
        """
        Check whether the next lines look like the first block of a relaxed
        interleaved PHYLIP alignment.

        In the relaxed format the sequence id can have any length and is
        separated from the sequence by whitespace. The block of ``nb_seq``
        lines must be followed by a blank line (or by the end of the prefix).

        :param nb_seq: number of sequences announced in the header
        :param seq_length: sequence length announced in the header
        :param alignment_prefix: stream positioned just after the header line
        :returns: True if the block passes all checks, False otherwise.
        """
        found_seq_length = None
        for _ in range(nb_seq):
            line = alignment_prefix.readline()
            if not line:
                # Not enough lines, either the prefix is too short or this is not PHYLIP
                return False
            # In the relaxed format the sequence id can have any length.
            # The id and sequence are separated by some whitespaces.
            fields = line.rstrip("\n").split(None, 1)
            if len(fields) < 2:
                # A line without a sequence part cannot belong to an alignment block
                return False
            seq = fields[1].replace(" ", "")
            if not self._is_plausible_sequence_part(seq, seq_length):
                return False
            if found_seq_length is None:
                found_seq_length = len(seq)
            elif len(seq) != found_seq_length:
                # All sequence parts should have the same length
                return False
        line = alignment_prefix.readline()
        if line.strip():
            # There should be a newline separating alignments.
            # If we got more content this is probably not a phylip file
            return False
        # There may be more lines with the remaining parts of the sequences
        return True

    def sniff_prefix(self, file_prefix: FilePrefix) -> bool:
        """
        All Phylip files starts with the number of sequences so we can use this
        to count the following number of sequences in the first 'stack'

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname('test_strict_interleaved.phylip')
        >>> Phylip().sniff(fname)
        True
        >>> fname = get_test_fname('test_relaxed_interleaved.phylip')
        >>> Phylip().sniff(fname)
        True
        >>> fname = get_test_fname("not_a_phylip_file.tabular")
        >>> Phylip().sniff(fname)
        False
        """
        f = file_prefix.string_io()
        # Get number of sequences and sequence length from first line
        try:
            nb_seq, seq_length = (int(n) for n in f.readline().split())
        except ValueError:
            # Header is not exactly two integers, so this is not PHYLIP
            return False
        if nb_seq <= 0 or seq_length <= 0:
            return False
        file_pos = f.tell()
        layout_sniffers = (
            self.sniff_strict_interleaved,
            self.sniff_strict_sequential,
            self.sniff_relaxed_interleaved,
        )
        for sniff_layout in layout_sniffers:
            # Every layout check starts reading right after the header line
            f.seek(file_pos)
            try:
                if sniff_layout(nb_seq, seq_length, f):
                    return True
            except Exception:
                # Deliberately best-effort: a layout check that fails for any
                # reason (including NotImplementedError) counts as "no match".
                continue
        return False