Source code for galaxy.datatypes.assembly

"""
velvet datatypes
James E Johnson - University of Minnesota
for velvet assembler tool in galaxy
"""
from __future__ import absolute_import

import logging
import os
import re
import sys

from galaxy.datatypes import data
from galaxy.datatypes import sequence
from galaxy.datatypes.metadata import MetadataElement
from galaxy.datatypes.sniff import build_sniff_from_prefix
from galaxy.datatypes.text import Html

log = logging.getLogger(__name__)


@build_sniff_from_prefix
class Amos(data.Text):
    """Datatype for AMOS assembly files (extension ``afg``)."""
    edam_data = "data_0925"
    edam_format = "format_3582"
    file_ext = 'afg'

    def sniff_prefix(self, file_prefix):
        """
        Report whether the file prefix looks like an AMOS assembly file.

        Every line of the prefix is inspected; the prefix is accepted as soon
        as one stripped line consists solely of an opening ``{RED``, ``{CTG``
        or ``{TLE`` message tag.

        Example::

          {CTG
          iid:1
          eid:1
          seq:
          CCTCTCCTGTAGAGTTCAACCGA-GCCGGTAGAGTTTTATCA
          .
          qlt:
          DDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDD
          .
          {TLE
          src:1027
          off:0
          clr:618,0
          gap:
          250 612
          .
          }
          }

        :param file_prefix: object exposing ``line_iterator()`` over the
            leading part of the candidate file
        :returns: ``True`` when a matching message tag is found, else ``False``
        """
        for raw_line in file_prefix.line_iterator():
            if not raw_line:
                # A falsy item from the iterator is treated as end of input.
                return False
            candidate = raw_line.strip()
            if not candidate.startswith('{'):
                # Blank lines and non-message lines are simply skipped.
                continue
            if re.match(r'{(RED|CTG|TLE)$', candidate):
                return True
        return False
@build_sniff_from_prefix
class Sequences(sequence.Fasta):
    """Datatype for the ``Sequences`` file written by velveth."""
    edam_data = "data_0925"
    file_ext = 'sequences'

    def sniff_prefix(self, file_prefix):
        """
        Report whether the file prefix looks like velveth-produced fasta.

        The id line has 3 fields separated by tabs:
        sequence_name sequence_index category::

          >SEQUENCE_0_length_35 1 1
          GGATATAGGGCCAACCCAACTCAACGGCCTGTCTT
          >SEQUENCE_1_length_35 2 1
          CGACGAATGACAGGTCACGAATTTGGCGGGGATTA

        Leading blank lines are skipped. The first non-blank line must be a
        header of the form above, and the line right after it must be
        non-blank and must not itself be a header.

        :param file_prefix: object exposing ``string_io()`` over the leading
            part of the candidate file
        :returns: ``True`` if the prefix matches the layout, else ``False``
        """
        stream = file_prefix.string_io()

        # Advance to the first line that has content once stripped.
        header = ''
        while not header:
            raw_line = stream.readline()
            if not raw_line:
                return False  # reached EOF without finding any content
            header = raw_line.strip()

        if not header.startswith('>'):
            return False  # first content line is not a fasta header
        if not re.match(r'>[^\t]+\t\d+\t\d+$', header):
            return False  # a fasta header, but not the velveth 3-field form

        # The header must be followed directly by sequence data.
        following = stream.readline().strip()
        return not (following == '' or following.startswith('>'))
@build_sniff_from_prefix
class Roadmaps(data.Text):
    """Datatype for the ``Roadmaps`` file written by velveth."""
    edam_format = "format_2561"
    file_ext = 'roadmaps'

    def sniff_prefix(self, file_prefix):
        """
        Report whether the file prefix looks like a velveth RoadMap file::

          142858  21      1
          ROADMAP 1
          ROADMAP 2
          ...

        Only the first two lines are examined: the first must hold three
        tab-separated integers and the second must read ``ROADMAP <n>``.
        A blank (or missing) first line is rejected rather than skipped.

        :param file_prefix: object exposing ``string_io()`` over the leading
            part of the candidate file
        :returns: ``True`` if both lines match, else ``False``
        """
        stream = file_prefix.string_io()

        counts_line = stream.readline().strip()
        if not counts_line:
            return False  # empty input or a blank first line
        if not re.match(r'\d+\t\d+\t\d+$', counts_line):
            return False

        # The line after the counts should be 'ROADMAP 1'.
        roadmap_line = stream.readline().strip()
        return re.match(r'ROADMAP \d+$', roadmap_line) is not None
class Velvet(Html):
    """
    Composite velveth dataset.

    The primary file is an auto-generated HTML index; the composite members
    are ``Sequences``, ``Roadmaps`` and an optional ``Log`` file. The
    read-type metadata flags are filled in from the ``Log`` file by
    :meth:`regenerate_primary_file`.
    """
    MetadataElement(name="base_name", desc="base name for velveth dataset", default="velvet", readonly=True, set_in_upload=True)
    MetadataElement(name="paired_end_reads", desc="has paired-end reads", default="False", readonly=False, set_in_upload=True)
    MetadataElement(name="long_reads", desc="has long reads", default="False", readonly=False, set_in_upload=True)
    MetadataElement(name="short2_reads", desc="has 2nd short reads", default="False", readonly=False, set_in_upload=True)

    composite_type = 'auto_primary_file'
    allow_datatype_change = False
    file_ext = 'velvet'

    def __init__(self, **kwd):
        """Register the composite member files on top of the Html datatype."""
        Html.__init__(self, **kwd)
        self.add_composite_file('Sequences', mimetype='text/html', description='Sequences', substitute_name_with_metadata=None, is_binary=False)
        self.add_composite_file('Roadmaps', mimetype='text/html', description='Roadmaps', substitute_name_with_metadata=None, is_binary=False)
        self.add_composite_file('Log', mimetype='text/html', description='Log', optional='True', substitute_name_with_metadata=None, is_binary=False)

    def _composite_list_item(self, name, composite_file):
        """Return the ``<li>`` HTML entry linking to one composite file."""
        opt_text = ' (optional)' if composite_file.optional else ''
        description = composite_file.get('description')
        if description:
            return '<li><a href="%s" type="text/plain">%s (%s)</a>%s</li>' % (name, name, description, opt_text)
        return '<li><a href="%s" type="text/plain">%s</a>%s</li>' % (name, name, opt_text)

    def generate_primary_file(self, dataset=None):
        """
        Build the initial HTML index listing every composite file.

        :param dataset: dataset passed through to ``get_composite_files``
        :returns: the HTML document as a single newline-joined string
        """
        log.debug("Velvet log info %s %s", 'JJ generate_primary_file', dataset)
        rval = ['<html><head><title>Velvet Galaxy Composite Dataset </title></head><p/>']
        rval.append('<div>This composite dataset is composed of the following files:<p/><ul>')
        for composite_name, composite_file in self.get_composite_files(dataset=dataset).items():
            log.debug("Velvet log info %s %s %s", 'JJ generate_primary_file', composite_name, composite_file)
            rval.append(self._composite_list_item(composite_name, composite_file))
        rval.append('</ul></div></html>')
        return "\n".join(rval)

    def regenerate_primary_file(self, dataset):
        """
        Rewrite the primary HTML file using information from the ``Log`` file.

        cannot do this until we are setting metadata

        The first 1000 characters of ``<extra_files_path>/Log`` are read,
        path-like fragments are stripped, and the result is used to set the
        ``paired_end_reads``, ``long_reads`` and ``short2_reads`` metadata and
        ``dataset.info``. Reading the log is best-effort: any failure is
        logged at debug level and the index is still written, just without
        the "Uses:" summary.

        :param dataset: dataset whose ``extra_files_path``, ``metadata``,
            ``info`` and ``file_name`` attributes are used
        """
        log.debug("Velvet log info %s", 'JJ regenerate_primary_file')
        gen_msg = ''
        # Bind before the try so the except handler can always reference it,
        # even when looking up extra_files_path is what failed.
        efp = None
        try:
            efp = dataset.extra_files_path
            log_path = os.path.join(efp, 'Log')
            with open(log_path, 'r') as f:
                log_content = f.read(1000)
            # Drop directory components so only file names / options remain.
            log_msg = re.sub(r'/\S*/', '', log_content)
            log.debug("Velveth log info %s", log_msg)
            paired_end_reads = re.search(r'-(short|long)Paired', log_msg) is not None
            dataset.metadata.paired_end_reads = paired_end_reads
            long_reads = re.search(r'-long', log_msg) is not None
            dataset.metadata.long_reads = long_reads
            short2_reads = re.search(r'-short(Paired)?2', log_msg) is not None
            dataset.metadata.short2_reads = short2_reads
            dataset.info = re.sub(r'.*velveth \S+', 'hash_length', re.sub(r'\n', ' ', log_msg))
            if paired_end_reads:
                gen_msg = gen_msg + ' Paired-End Reads'
            if long_reads:
                gen_msg = gen_msg + ' Long Reads'
            if len(gen_msg) > 0:
                gen_msg = 'Uses: ' + gen_msg
        except Exception:
            # The Log file is optional, so this stays a debug-level message;
            # exc_info records which step actually failed.
            log.debug("Velveth could not read Log file in %s", efp, exc_info=True)
        log.debug("Velveth log info %s", gen_msg)
        rval = ['<html><head><title>Velvet Galaxy Composite Dataset </title></head><p/>']
        rval.append('<div>Generated:<p/> %s </div>' % (gen_msg))
        rval.append('<div>Velveth dataset:<p/><ul>')
        for composite_name, composite_file in self.get_composite_files(dataset=dataset).items():
            log.debug("Velvet log info %s %s %s", 'JJ regenerate_primary_file', composite_name, composite_file)
            # The Log file itself is left out of the regenerated listing.
            if re.search('Log', composite_name) is None:
                rval.append(self._composite_list_item(composite_name, composite_file))
        rval.append('</ul></div></html>')
        with open(dataset.file_name, 'w') as f:
            f.write("\n".join(rval))
            f.write('\n')

    def set_meta(self, dataset, **kwd):
        """Set Html metadata, then refresh the primary file from the Log."""
        Html.set_meta(self, dataset, **kwd)
        self.regenerate_primary_file(dataset)
if __name__ == '__main__':
    # When executed as a script, run the doctests found in this module.
    import doctest

    this_module = sys.modules[__name__]
    doctest.testmod(this_module)