Warning

This document is for an old release of Galaxy. You can alternatively view this page in the latest release if it exists or view the top of the latest release's documentation.

Source code for galaxy.datatypes.converters.interval_to_fli

'''
Creates a feature location index (FLI) for a given BED/GFF file.
FLI index has the form::

    [line_length]
    <symbol1_in_lowercase><tab><symbol1><tab><location>
    <symbol2_in_lowercase><tab><symbol2><tab><location>
    ...

where location is formatted as::

    contig:start-end

and symbols are sorted in lexicographical order.
'''
import optparse

from bx.tabular.io import Comment, Header

from galaxy.datatypes.util.gff_util import convert_gff_coords_to_bed, GFFReaderWrapper, read_unordered_gtf


def _is_number(value):
    """Return True when *value* parses as a float, mirroring ``float(value)``."""
    try:
        float(value)
    except ValueError:
        return False
    return True


def _index_gff_features(in_reader, name_loc_dict):
    """Record a location in *name_loc_dict* for every non-numeric attribute value.

    *in_reader* is iterated once; items that are ``Header`` or ``Comment``
    instances are skipped. When the same attribute value is seen on several
    features, the stored start/end are widened to cover all of them (the
    contig of the first occurrence is kept).
    """
    for feature in in_reader:
        if isinstance(feature, (Header, Comment)):
            continue

        # Convert coordinates exactly once per feature. The original code made
        # this call once per non-numeric attribute; if the conversion mutates
        # the feature in place (NOTE(review): confirm against gff_util), a
        # feature with several attributes had its coordinates shifted
        # repeatedly.
        convert_gff_coords_to_bed(feature)

        for name in feature.attributes:
            val = feature.attributes[name]
            # Numeric values (scores, counts, ...) are not indexed.
            if _is_number(val):
                continue

            loc = name_loc_dict.get(val)
            if loc is None:
                # First time this value is seen.
                name_loc_dict[val] = {
                    'contig': feature.chrom,
                    'start': feature.start,
                    'end': feature.end
                }
            else:
                # Value already indexed: grow the span to include this feature.
                loc['start'] = min(loc['start'], feature.start)
                loc['end'] = max(loc['end'], feature.end)


def _index_bed_lines(lines, name_loc_dict):
    """Record a location in *name_loc_dict* for every named BED line in *lines*.

    Lines starting with ``track`` and lines with fewer than four
    whitespace-separated fields (i.e. no name column) are ignored. A name that
    occurs more than once keeps the location of its last occurrence.

    Raises ``ValueError`` if the start or end column of a named line is not an
    integer.
    """
    for line in lines:
        # Ignore track lines.
        if line.startswith("track"):
            continue

        fields = line.split()

        # Ignore lines with no feature name.
        if len(fields) < 4:
            continue

        name_loc_dict[fields[3]] = {
            'contig': fields[0],
            'start': int(fields[1]),
            'end': int(fields[2])
        }


def _write_index(name_loc_dict, out_fname):
    """Write the sorted, fixed-width FLI index for *name_loc_dict* to *out_fname*.

    The first line holds the line length (longest entry plus the newline);
    every following entry is right-padded with spaces to the longest entry so
    that all entry lines have the same length.
    """
    entries = []
    for name in sorted(name_loc_dict):
        loc = name_loc_dict[name]
        entries.append('{}\t{}\t{}'.format(
            name.lower(), name, '%s:%i-%i' % (loc['contig'], loc['start'], loc['end'])))

    max_len = max(len(entry) for entry in entries) if entries else 0

    with open(out_fname, 'w') as out:
        # +1 accounts for the trailing newline of each entry line.
        out.write(str(max_len + 1).ljust(max_len) + '\n')
        for entry in entries:
            out.write(entry.ljust(max_len) + '\n')


def main():
    """Build a feature location index from the command line.

    Usage: ``-F <gff|gtf|bed> <input file> <output file>``. The format name is
    case-insensitive. An unrecognised format produces an index with no entries.
    A missing format option or a wrong number of positional arguments ends the
    program with an optparse usage error (exit status 2).
    """
    # Process arguments.
    parser = optparse.OptionParser()
    parser.add_option('-F', '--format', dest="input_format")
    (options, args) = parser.parse_args()
    if options.input_format is None:
        parser.error('an input format must be given with -F/--format')
    if len(args) != 2:
        parser.error('expected exactly two arguments: <input file> <output file>')
    in_fname, out_fname = args
    input_format = options.input_format.lower()

    # Create dict of name-location pairings.
    name_loc_dict = {}
    if input_format in ('gff', 'gtf'):
        # The reader must be fully consumed while the file is still open, so
        # indexing happens inside the with-block.
        with open(in_fname) as in_file:
            if input_format == 'gff':
                in_reader = GFFReaderWrapper(in_file)
            else:  # input_format == 'gtf'
                in_reader = read_unordered_gtf(in_file)
            _index_gff_features(in_reader, name_loc_dict)
    elif input_format == 'bed':
        with open(in_fname) as in_file:
            _index_bed_lines(in_file, name_loc_dict)

    _write_index(name_loc_dict, out_fname)
# Run the converter only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()