Warning
This document is for an old release of Galaxy. You can alternatively view this page in the latest release if it exists or view the top of the latest release's documentation.
Source code for galaxy.datatypes.converters.interval_to_fli
'''
Creates a feature location index (FLI) for a given BED/GFF file.
FLI index has the form::
[line_length]
<symbol1_in_lowercase><tab><symbol1><tab><location>
<symbol2_in_lowercase><tab><symbol2><tab><location>
...
where location is formatted as:
contig:start-end
and symbols are sorted in lexicographical order.
'''
import optparse
from bx.tabular.io import Comment, Header
from galaxy.datatypes.util.gff_util import convert_gff_coords_to_bed, GFFReaderWrapper, read_unordered_gtf
def _is_number(value):
    '''Return True when ``value`` can be parsed by ``float()``.'''
    try:
        float(value)
    except ValueError:
        return False
    return True


def main():
    '''
    Build a feature location index (FLI) from a BED, GFF or GTF file.

    Command line: ``-F FORMAT input_file output_file`` where FORMAT is one of
    ``bed``, ``gff`` or ``gtf`` (case-insensitive). Any other format produces
    an index that contains only the header line.

    For GFF/GTF input every non-numeric attribute value of a feature is
    indexed; when the same value occurs on several features the stored
    location is widened to span all of them. For BED input the fourth column
    is indexed and a later line with the same name replaces an earlier one.

    The output starts with a header line holding the line length (including
    the newline), followed by one space-padded, fixed-width line per symbol.

    Exits through ``parser.error`` (``SystemExit``) when the format option is
    missing or the number of positional arguments is not exactly two.
    '''
    # Process arguments.
    parser = optparse.OptionParser(usage='%prog -F FORMAT input_file output_file')
    parser.add_option('-F', '--format', dest="input_format")
    (options, args) = parser.parse_args()
    if options.input_format is None:
        parser.error('an input format must be given with -F/--format')
    if len(args) != 2:
        parser.error('expected exactly two arguments: input_file output_file')
    in_fname, out_fname = args
    input_format = options.input_format.lower()

    # Create dict of name-location pairings.
    name_loc_dict = {}
    if input_format in ('gff', 'gtf'):
        # GTF/GFF format. The file stays open for the whole loop because the
        # readers may consume it lazily.
        with open(in_fname) as in_file:
            if input_format == 'gff':
                in_reader = GFFReaderWrapper(in_file)
            else:  # input_format == 'gtf'
                in_reader = read_unordered_gtf(in_file)

            for feature in in_reader:
                if isinstance(feature, (Header, Comment)):
                    continue

                # The conversion result is not returned to us, so it acts on
                # the feature in place; do it exactly once per feature rather
                # than once per indexable attribute, which would shift the
                # coordinates repeatedly.
                convert_gff_coords_to_bed(feature)

                for attr_name in feature.attributes:
                    val = feature.attributes[attr_name]
                    if _is_number(val):
                        # Numeric values are not useful as search symbols.
                        continue

                    loc = name_loc_dict.get(val)
                    if loc is None:
                        # First time this value is seen.
                        name_loc_dict[val] = {
                            'contig': feature.chrom,
                            'start': feature.start,
                            'end': feature.end
                        }
                    else:
                        # Value already indexed: widen the stored span.
                        if feature.start < loc['start']:
                            loc['start'] = feature.start
                        if feature.end > loc['end']:
                            loc['end'] = feature.end
    elif input_format == 'bed':
        # BED format.
        with open(in_fname) as in_file:
            for line in in_file:
                # Ignore track lines.
                if line.startswith("track"):
                    continue
                fields = line.split()
                # Ignore lines with no feature name.
                if len(fields) < 4:
                    continue
                name_loc_dict[fields[3]] = {
                    'contig': fields[0],
                    'start': int(fields[1]),
                    'end': int(fields[2])
                }

    # Create sorted list of entries. Each line starts with the lowercased
    # symbol, so order by that key (original spelling breaks ties) to keep the
    # file sorted by its own first column.
    # NOTE(review): the fixed-width lines suggest readers binary-search on the
    # lowercase symbol -- confirm against the consumer of this index.
    max_len = 0
    entries = []
    for name in sorted(name_loc_dict, key=lambda symbol: (symbol.lower(), symbol)):
        loc = name_loc_dict[name]
        location = '%s:%i-%i' % (loc['contig'], loc['start'], loc['end'])
        entry = '{}\t{}\t{}'.format(name.lower(), name, location)
        max_len = max(max_len, len(entry))
        entries.append(entry)

    # Write padded entries; the header records the line length with newline.
    with open(out_fname, 'w') as out:
        out.write(str(max_len + 1).ljust(max_len) + '\n')
        for entry in entries:
            out.write(entry.ljust(max_len) + '\n')
# Run the converter only when this file is executed as a script.
if __name__ == "__main__":
    main()