Warning

This document is for an in-development version of Galaxy. You can alternatively view this page in the latest release if it exists or view the top of the latest release's documentation.

Source code for galaxy.datatypes.converters.interval_to_bedstrict_converter

#!/usr/bin/env python
# Dan Blankenberg
from __future__ import print_function

import sys

import bx.intervals.io

assert sys.version_info[:2] >= (2, 6)


[docs]def stop_err(msg): sys.stderr.write(msg) sys.exit()
[docs]def force_bed_field_count(fields, region_count, force_num_columns): if force_num_columns >= 4 and len(fields) < 4: fields.append('region_%i' % (region_count)) if force_num_columns >= 5 and len(fields) < 5: fields.append('0') if force_num_columns >= 6 and len(fields) < 6: fields.append('+') if force_num_columns >= 7 and len(fields) < 7: fields.append(fields[1]) if force_num_columns >= 8 and len(fields) < 8: fields.append(fields[2]) if force_num_columns >= 9 and len(fields) < 9: fields.append('0') if force_num_columns >= 10 and len(fields) < 10: fields.append('0') if force_num_columns >= 11 and len(fields) < 11: fields.append(',') if force_num_columns >= 12 and len(fields) < 12: fields.append(',') return fields[:force_num_columns]
def __main__(): output_name = sys.argv[1] input_name = sys.argv[2] try: chromCol = int(sys.argv[3]) - 1 except Exception: stop_err("'%s' is an invalid chrom column, correct the column settings before attempting to convert the data format." % str(sys.argv[3])) try: startCol = int(sys.argv[4]) - 1 except Exception: stop_err("'%s' is an invalid start column, correct the column settings before attempting to convert the data format." % str(sys.argv[4])) try: endCol = int(sys.argv[5]) - 1 except Exception: stop_err("'%s' is an invalid end column, correct the column settings before attempting to convert the data format." % str(sys.argv[5])) try: strandCol = int(sys.argv[6]) - 1 except Exception: strandCol = -1 try: nameCol = int(sys.argv[7]) - 1 except Exception: nameCol = -1 try: extension = sys.argv[8] except IndexError: extension = 'interval' # default extension try: force_num_columns = int(sys.argv[9]) except Exception: force_num_columns = None skipped_lines = 0 first_skipped_line = None count = 0 # does file already conform to bed strict? # if so, we want to keep extended columns, otherwise we'll create a generic 6 column bed file strict_bed = True if extension in ['bed', 'bedstrict', 'bed6', 'bed12'] and (chromCol, startCol, endCol) == (0, 1, 2) and (nameCol < 0 or nameCol == 3) and (strandCol < 0 or strandCol == 5): with open(input_name) as fh, open(output_name, 'w') as out: for count, line in enumerate(fh): line = line.rstrip('\n\r') if line == "" or line.startswith("#"): skipped_lines += 1 if first_skipped_line is None: first_skipped_line = count + 1 continue fields = line.split('\t') try: assert len(fields) >= 3, 'A BED file requires at least 3 columns' # we can't fix this if len(fields) > 12: strict_bed = False break # name (fields[3]) can be anything, no verification needed if len(fields) > 4: float(fields[4]) # score - A score between 0 and 1000. If the track line useScore attribute is set to 1 for this annotation data set, the score value will determine the level of gray in which this feature is displayed (higher numbers = darker gray). if len(fields) > 5: assert fields[5] in ['+', '-'], 'Invalid strand' # strand - Defines the strand - either '+' or '-'. if len(fields) > 6: int(fields[6]) # thickStart - The starting position at which the feature is drawn thickly (for example, the start codon in gene displays). if len(fields) > 7: int(fields[7]) # thickEnd - The ending position at which the feature is drawn thickly (for example, the stop codon in gene displays). if len(fields) > 8: if fields[8] != '0': # itemRgb - An RGB value of the form R,G,B (e.g. 255,0,0). If the track line itemRgb attribute is set to "On", this RBG value will determine the display color of the data contained in this BED line. NOTE: It is recommended that a simple color scheme (eight colors or less) be used with this attribute to avoid overwhelming the color resources of the Genome Browser and your Internet browser. fields2 = fields[8].split(',') assert len(fields2) == 3, 'RGB value must be 0 or have length of 3' for field in fields2: int(field) # rgb values are integers if len(fields) > 9: int(fields[9]) # blockCount - The number of blocks (exons) in the BED line. if len(fields) > 10: if fields[10] != ',': # blockSizes - A comma-separated list of the block sizes. The number of items in this list should correspond to blockCount. fields2 = fields[10].rstrip(",").split(",") # remove trailing comma and split on comma for field in fields2: int(field) if len(fields) > 11: if fields[11] != ',': # blockStarts - A comma-separated list of block starts. All of the blockStart positions should be calculated relative to chromStart. The number of items in this list should correspond to blockCount. fields2 = fields[11].rstrip(",").split(",") # remove trailing comma and split on comma for field in fields2: int(field) except Exception: strict_bed = False break if force_num_columns is not None and len(fields) != force_num_columns: line = '\t'.join(force_bed_field_count(fields, count, force_num_columns)) out.write("%s\n" % line) else: strict_bed = False if not strict_bed: skipped_lines = 0 first_skipped_line = None count = 0 with open(input_name, 'r') as fh, open(output_name, 'w') as out: for count, region in enumerate(bx.intervals.io.NiceReaderWrapper(fh, chrom_col=chromCol, start_col=startCol, end_col=endCol, strand_col=strandCol, fix_strand=True, return_header=False, return_comments=False)): try: if nameCol >= 0: name = region.fields[nameCol] else: raise IndexError except Exception: name = "region_%i" % count try: fields = [str(item) for item in (region.chrom, region.start, region.end, name, 0, region.strand)] if force_num_columns is not None and len(fields) != force_num_columns: fields = force_bed_field_count(fields, count, force_num_columns) out.write("%s\n" % '\t'.join(fields)) except Exception: skipped_lines += 1 if first_skipped_line is None: first_skipped_line = count + 1 print("%i regions converted to BED." % (count + 1 - skipped_lines)) if skipped_lines > 0: print("Skipped %d blank or invalid lines starting with line # %d." % (skipped_lines, first_skipped_line)) if __name__ == "__main__": __main__()