Source code for galaxy.visualization.data_providers.cigar

'''
Functions for working with SAM/BAM CIGAR representation.
'''

import operator


[docs]def get_ref_based_read_seq_and_cigar(read_seq, read_start, ref_seq, ref_seq_start, cigar):
    '''
    Returns a ( new_read_seq, new_cigar ) that can be used with reference
    sequence to reconstruct the read. The new read sequence includes only
    bases that cannot be recovered from the reference: mismatches and
    insertions (soft clipped bases are not included). The new cigar replaces
    Ms with =s and Xs because the M operation can denote a sequence match or
    mismatch.
    '''

    if not ref_seq:
        return read_seq, cigar

    # Set up position for reference, read.
    ref_seq_pos = read_start - ref_seq_start
    read_pos = 0

    # Create new read sequence, cigar.
    new_read_seq = ''
    new_cigar = ''
    cigar_ops = 'MIDNSHP=X'
    for op_tuple in cigar:
        op, op_len = op_tuple
        # Op is index into string 'MIDNSHP=X'
        if op == 0:  # Match
            # Transform Ms to =s and Xs using reference.
            new_op = ''
            total_count = 0
            while total_count < op_len and ref_seq_pos < len(ref_seq):
                match, count = _match_mismatch_counter(read_seq, read_pos, ref_seq, ref_seq_pos)
                # Use min because count cannot exceed remainder of operation.
                count = min(count, op_len - total_count)
                if match:
                    new_op = '='
                else:
                    new_op = 'X'
                    # Include mismatched bases in new read sequence.
                    new_read_seq += read_seq[read_pos:read_pos + count]
                new_cigar += '%i%s' % (count, new_op)
                total_count += count
                read_pos += count
                ref_seq_pos += count

            # If end of read falls outside of ref_seq data, leave as M.
            if total_count < op_len:
                new_cigar += '%iM' % (op_len - total_count)
        elif op == 1:  # Insertion
            new_cigar += '%i%s' % (op_len, cigar_ops[op])
            # Include insertion bases in new read sequence.
            new_read_seq += read_seq[read_pos:read_pos + op_len]
            read_pos += op_len
        elif op in [2, 3, 6]:  # Deletion, Skip, or Padding
            ref_seq_pos += op_len
            new_cigar += '%i%s' % (op_len, cigar_ops[op])
        elif op == 4:  # Soft clipping
            read_pos += op_len
            new_cigar += '%i%s' % (op_len, cigar_ops[op])
        elif op == 5:  # Hard clipping
            new_cigar += '%i%s' % (op_len, cigar_ops[op])
        elif op in [7, 8]:  # Match or mismatch
            if op == 8:
                # Include mismatched bases in new read sequence.
                new_read_seq += read_seq[read_pos:read_pos + op_len]
            read_pos += op_len
            ref_seq_pos += op_len
            new_cigar += '%i%s' % (op_len, cigar_ops[op])

    return (new_read_seq, new_cigar)


def _match_mismatch_counter(s1, p1, s2, p2):
    '''
    Count consecutive matches/mismatches between strings s1 and s2
    starting at p1 and p2, respectively.
    '''

    # Do initial comparison to determine whether to count matches or
    # mismatches.
    if s1[p1] == s2[p2]:
        cmp_fn = operator.eq
        match = True
    else:
        cmp_fn = operator.ne
        match = False

    # Increment counts to move to next characters.
    count = 1
    p1 += 1
    p2 += 1

    # Count matches/mismatches.
    while p1 < len(s1) and p2 < len(s2) and cmp_fn(s1[p1], s2[p2]):
        count += 1
        p1 += 1
        p2 += 1

    return match, count