Warning

This document is for an in-development version of Galaxy. You can alternatively view this page in the latest release if it exists or view the top of the latest release's documentation.

Source code for galaxy.datatypes.assembly

"""
velvet datatypes
James E Johnson - University of Minnesota
for velvet assembler tool in galaxy
"""

import logging
import os
import re

from galaxy.datatypes import (
    data,
    sequence,
)
from galaxy.datatypes.metadata import MetadataElement
from galaxy.datatypes.protocols import (
    DatasetProtocol,
    HasExtraFilesAndMetadata,
)
from galaxy.datatypes.sniff import (
    build_sniff_from_prefix,
    FilePrefix,
)
from galaxy.datatypes.text import Html

log = logging.getLogger(__name__)


@build_sniff_from_prefix
class Amos(data.Text):
    """Class describing the AMOS assembly file"""

    edam_data = "data_0925"
    edam_format = "format_3582"
    file_ext = "afg"

    def sniff_prefix(self, file_prefix: FilePrefix) -> bool:
        """
        Determines whether the file is an amos assembly file format.

        Returns True as soon as any stripped line of the prefix is exactly
        ``{RED``, ``{CTG`` or ``{TLE``; returns False otherwise.

        Example::

          {CTG
          iid:1
          eid:1
          seq:
          CCTCTCCTGTAGAGTTCAACCGA-GCCGGTAGAGTTTTATCA
          .
          qlt:
          DDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDD
          .
          {TLE
          src:1027
          off:0
          clr:618,0
          gap:
          250 612
          .
          }
          }
        """
        for raw_line in file_prefix.line_iterator():
            if not raw_line:
                # A falsy item is treated as the end of the available input.
                break
            candidate = raw_line.strip()
            # A blank line can never start with "{", so no separate emptiness
            # check is needed before testing for a message header.
            if candidate.startswith("{") and re.match(r"{(RED|CTG|TLE)$", candidate):
                return True
        return False
@build_sniff_from_prefix
class Sequences(sequence.Fasta):
    """Class describing the Sequences file generated by velveth"""

    edam_data = "data_0925"
    file_ext = "sequences"

    def sniff_prefix(self, file_prefix: FilePrefix) -> bool:
        """
        Determines whether the file is a velveth produced fasta format.

        The id line has 3 fields separated by tabs:
        sequence_name sequence_index category::

          >SEQUENCE_0_length_35   1       1
          GGATATAGGGCCAACCCAACTCAACGGCCTGTCTT
          >SEQUENCE_1_length_35   2       1
          CGACGAATGACAGGTCACGAATTTGGCGGGGATTA

        Only the first non-blank line and the line directly after it are
        inspected.
        """
        stream = file_prefix.string_io()
        for raw_line in stream:
            header = raw_line.strip()
            if not header:
                # Leading blank lines are skipped.
                continue
            if not header.startswith(">"):
                return False
            if re.match(r">[^\t]+\t\d+\t\d+$", header) is None:
                return False
            # The line after the header must carry sequence data: it may be
            # neither blank nor another header.
            following = stream.readline().strip()
            return bool(following) and not following.startswith(">")
        return False
@build_sniff_from_prefix
class Roadmaps(data.Text):
    """Class describing the Roadmaps file generated by velveth"""

    edam_format = "format_2561"
    file_ext = "roadmaps"

    def sniff_prefix(self, file_prefix: FilePrefix) -> bool:
        """
        Determines whether the file is a velveth produced RoadMap.

        The first line must hold three tab-separated integers and the second
        line must be ``ROADMAP <n>``::

          142858  21      1
          ROADMAP 1
          ROADMAP 2
          ...

        Only the first two lines are inspected. Unlike a sniffer that skips
        leading blank lines, a blank first line makes this return False.
        """
        fh = file_prefix.string_io()
        # The loop body returns on its first pass; the loop form only guards
        # against an empty prefix, which falls through to the final return.
        for first_line in fh:
            header = first_line.strip()
            if not header:
                # A blank first line is rejected rather than skipped.
                return False
            if not re.match(r"\d+\t\d+\t\d+$", header):
                return False
            # The line right after the header should be e.g. 'ROADMAP 1'.
            second_line = fh.readline().strip()
            return bool(re.match(r"ROADMAP \d+$", second_line))
        return False
class Velvet(Html):
    """
    Composite dataset for velveth output.

    The primary file is an HTML index that is generated automatically; the
    composite members registered in ``__init__`` are ``Sequences``,
    ``Roadmaps`` and an optional ``Log``.
    """

    MetadataElement(
        name="base_name", desc="base name for velveth dataset", default="velvet", readonly=True, set_in_upload=True
    )
    MetadataElement(
        name="paired_end_reads", desc="has paired-end reads", default="False", readonly=False, set_in_upload=True
    )
    MetadataElement(name="long_reads", desc="has long reads", default="False", readonly=False, set_in_upload=True)
    MetadataElement(
        name="short2_reads", desc="has 2nd short reads", default="False", readonly=False, set_in_upload=True
    )

    composite_type = "auto_primary_file"
    file_ext = "velvet"

    def __init__(self, **kwd):
        """Register the composite members that make up a velvet dataset."""
        super().__init__(**kwd)
        self.add_composite_file(
            "Sequences",
            mimetype="text/html",
            description="Sequences",
            substitute_name_with_metadata=None,
            is_binary=False,
        )
        self.add_composite_file(
            "Roadmaps",
            mimetype="text/html",
            description="Roadmaps",
            substitute_name_with_metadata=None,
            is_binary=False,
        )
        self.add_composite_file(
            "Log",
            mimetype="text/html",
            description="Log",
            optional="True",
            substitute_name_with_metadata=None,
            is_binary=False,
        )

    @staticmethod
    def _composite_list_item(name: str, composite_file) -> str:
        """Render one composite file as an HTML ``<li>`` link entry."""
        opt_text = " (optional)" if composite_file.optional else ""
        description = composite_file.get("description")
        if description:
            return f'<li><a href="{name}" type="text/plain">{name} ({description})</a>{opt_text}</li>'
        return f'<li><a href="{name}" type="text/plain">{name}</a>{opt_text}</li>'

    def generate_primary_file(self, dataset: HasExtraFilesAndMetadata) -> str:
        """
        Build the initial HTML index listing every composite file.

        :param dataset: dataset whose composite files are listed
        :returns: the HTML document as a newline-joined string
        """
        log.debug("Velvet log info JJ generate_primary_file %s", dataset)
        rval = ["<html><head><title>Velvet Galaxy Composite Dataset </title></head><p/>"]
        rval.append("<div>This composite dataset is composed of the following files:<p/><ul>")
        for composite_name, composite_file in self.get_composite_files(dataset=dataset).items():
            log.debug("Velvet log info JJ generate_primary_file %s %s", composite_name, composite_file)
            rval.append(self._composite_list_item(composite_name, composite_file))
        rval.append("</ul></div></html>")
        return "\n".join(rval)

    def regenerate_primary_file(self, dataset: DatasetProtocol) -> None:
        """
        Rewrite the primary HTML file using information from the velveth Log.

        cannot do this until we are setting metadata

        Reads up to the first 1000 characters of the ``Log`` file in the
        dataset's extra files directory, derives the ``paired_end_reads``,
        ``long_reads`` and ``short2_reads`` metadata and ``dataset.info`` from
        it, then writes a fresh HTML index (omitting the Log entry) to the
        dataset file. Reading the Log is best effort: any failure is logged at
        debug level and the index is still written.

        :param dataset: dataset whose metadata and primary file are updated
        """
        log.debug("Velvet log info JJ regenerate_primary_file")
        gen_msg = ""
        # Bound before the try so the except handler can always report it,
        # even when looking up extra_files_path is what failed.
        efp = None
        try:
            efp = dataset.extra_files_path
            log_path = os.path.join(efp, "Log")
            with open(log_path) as f:
                log_content = f.read(1000)
            # Drop directory components so only file names remain in the text.
            log_msg = re.sub(r"/\S*/", "", log_content)
            log.debug("Velveth log info %s", log_msg)
            paired_end_reads = re.search(r"-(short|long)Paired", log_msg) is not None
            dataset.metadata.paired_end_reads = paired_end_reads
            long_reads = re.search(r"-long", log_msg) is not None
            dataset.metadata.long_reads = long_reads
            short2_reads = re.search(r"-short(Paired)?2", log_msg) is not None
            dataset.metadata.short2_reads = short2_reads
            dataset.info = re.sub(r".*velveth \S+", "hash_length", re.sub(r"\n", " ", log_msg))
            if paired_end_reads:
                gen_msg = f"{gen_msg} Paired-End Reads"
            if long_reads:
                gen_msg = f"{gen_msg} Long Reads"
            if len(gen_msg) > 0:
                gen_msg = f"Uses: {gen_msg}"
        except Exception:
            # The Log member is optional, so failing to read it is not fatal.
            log.debug("Velveth could not read Log file in %s", efp)
        log.debug("Velveth log info %s", gen_msg)
        rval = ["<html><head><title>Velvet Galaxy Composite Dataset </title></head><p/>"]
        rval.append(f"<div>Generated:<p/> {gen_msg} </div>")
        rval.append("<div>Velveth dataset:<p/><ul>")
        for composite_name, composite_file in self.get_composite_files(dataset=dataset).items():
            log.debug("Velvet log info JJ regenerate_primary_file %s %s", composite_name, composite_file)
            if "Log" in composite_name:
                # Any composite whose name contains "Log" is left out of the index.
                continue
            rval.append(self._composite_list_item(composite_name, composite_file))
        rval.append("</ul></div></html>")
        with open(dataset.get_file_name(), "w") as f:
            f.write("\n".join(rval))
            f.write("\n")

    def set_meta(self, dataset: DatasetProtocol, overwrite: bool = True, **kwd) -> None:
        """Set the inherited Html metadata, then rebuild the primary file."""
        Html.set_meta(self, dataset, overwrite=overwrite, **kwd)
        self.regenerate_primary_file(dataset)