Warning

This document is for an old release of Galaxy. You can alternatively view this page in the latest release if it exists or view the top of the latest release's documentation.

Source code for galaxy.visualization.data_providers.cigar

'''
Functions for working with SAM/BAM CIGAR representation.
'''

import operator


[docs]def get_ref_based_read_seq_and_cigar(read_seq, read_start, ref_seq, ref_seq_start, cigar): ''' Returns a ( new_read_seq, new_cigar ) that can be used with reference sequence to reconstruct the read. The new read sequence includes only bases that cannot be recovered from the reference: mismatches and insertions (soft clipped bases are not included). The new cigar replaces Ms with =s and Xs because the M operation can denote a sequence match or mismatch. ''' if not ref_seq: return read_seq, cigar # Set up position for reference, read. ref_seq_pos = read_start - ref_seq_start read_pos = 0 # Create new read sequence, cigar. new_read_seq = '' new_cigar = '' cigar_ops = 'MIDNSHP=X' for op_tuple in cigar: op, op_len = op_tuple # Op is index into string 'MIDNSHP=X' if op == 0: # Match # Transform Ms to =s and Xs using reference. new_op = '' total_count = 0 while total_count < op_len and ref_seq_pos < len(ref_seq): match, count = _match_mismatch_counter(read_seq, read_pos, ref_seq, ref_seq_pos) # Use min because count cannot exceed remainder of operation. count = min(count, op_len - total_count) if match: new_op = '=' else: new_op = 'X' # Include mismatched bases in new read sequence. new_read_seq += read_seq[read_pos:read_pos + count] new_cigar += '%i%s' % (count, new_op) total_count += count read_pos += count ref_seq_pos += count # If end of read falls outside of ref_seq data, leave as M. if total_count < op_len: new_cigar += '%iM' % (op_len - total_count) elif op == 1: # Insertion new_cigar += '%i%s' % (op_len, cigar_ops[op]) # Include insertion bases in new read sequence. new_read_seq += read_seq[read_pos:read_pos + op_len] read_pos += op_len elif op in [2, 3, 6]: # Deletion, Skip, or Padding ref_seq_pos += op_len new_cigar += '%i%s' % (op_len, cigar_ops[op]) elif op == 4: # Soft clipping read_pos += op_len new_cigar += '%i%s' % (op_len, cigar_ops[op]) elif op == 5: # Hard clipping new_cigar += '%i%s' % (op_len, cigar_ops[op]) elif op in [7, 8]: # Match or mismatch if op == 8: # Include mismatched bases in new read sequence. new_read_seq += read_seq[read_pos:read_pos + op_len] read_pos += op_len ref_seq_pos += op_len new_cigar += '%i%s' % (op_len, cigar_ops[op]) return (new_read_seq, new_cigar)
def _match_mismatch_counter(s1, p1, s2, p2): ''' Count consecutive matches/mismatches between strings s1 and s2 starting at p1 and p2, respectively. ''' # Do initial comparison to determine whether to count matches or # mismatches. if s1[p1] == s2[p2]: cmp_fn = operator.eq match = True else: cmp_fn = operator.ne match = False # Increment counts to move to next characters. count = 1 p1 += 1 p2 += 1 # Count matches/mismatches. while p1 < len(s1) and p2 < len(s2) and cmp_fn(s1[p1], s2[p2]): count += 1 p1 += 1 p2 += 1 return match, count