knowah/fix_secondary_hard_clipping.py

## fix_secondary_hard_clipping.py
#!/usr/bin/env python3

#Copyright 2020, Noah Kessler
#
#Copying and distribution of this file, with or without modification, are
#permitted in any medium without royalty, provided the copyright notice and
#this notice are preserved. This file is offered as-is, without any warranty.

# fix hard clipping after biostars352930
# see https://www.biostars.org/p/352930/#352989

import sys
import re

# CIGAR made of an optional leading hard clip, any number of other
# operations, and an optional trailing hard clip. The two groups capture the
# leading and trailing "<n>H" operations.
H_pattern = re.compile(r"^([0-9]+H)?(?:[0-9]+[MIDNSPX=])*(\d+H)?$")
# CIGAR operations that consume bases of the stored SEQ string.
MISXEQ_pattern = re.compile(r"([0-9]+[MIS=X])")

# FLAG bit tested to decide whether a record is a secondary alignment.
SECONDARY_FLAG = 0x100


def fix_sam_line(line: str) -> str:
    """Return one SAM line with SEQ/QUAL trimmed to match its hard clips.

    Only secondary alignments (FLAG bit 0x100) whose SEQ is longer than the
    length implied by the M/I/S/=/X operations of the CIGAR are modified:
    the leading and trailing hard-clip lengths are removed from both ends of
    SEQ and QUAL. Every other line is returned unchanged.

    Args:
        line: A single SAM line without its trailing newline.

    Returns:
        The (possibly corrected) line, still without a trailing newline.

    Raises:
        ValueError: If the FLAG field of a record is not an integer.
        IndexError: If a record has too few tab-separated fields.
    """
    # Blank lines and header lines are passed through untouched.
    if not line or line.startswith('@'):
        return line

    tokens = line.split('\t')
    if not (int(tokens[1]) & SECONDARY_FLAG):
        return line

    cigar = tokens[5]
    seq = tokens[9]
    # A SEQ of "*" means no sequence is stored, so there is nothing to clip.
    if seq == '*':
        return line

    expected_seq_len = sum(
        int(op[:-1]) for op in MISXEQ_pattern.findall(cigar)
    )
    if len(seq) <= expected_seq_len:
        return line

    clips = H_pattern.match(cigar)
    if clips is None:
        # CIGAR is "*" or otherwise not in the expected shape: leave the
        # record alone rather than guess how much to remove.
        return line

    left, right = clips.groups()
    h_clip_left = int(left[:-1]) if left else 0
    h_clip_right = int(right[:-1]) if right else 0

    tokens[9] = seq[h_clip_left:len(seq) - h_clip_right]
    qual = tokens[10]
    # QUAL may be "*" (not stored) even when SEQ is present; slicing it
    # would leave an empty, invalid field.
    if qual != '*':
        tokens[10] = qual[h_clip_left:len(qual) - h_clip_right]

    return '\t'.join(tokens)


def main() -> None:
    """Copy SAM text from stdin to stdout, fixing secondary alignments."""
    for line in sys.stdin:
        print(fix_sam_line(line.rstrip('\n')))


if __name__ == "__main__":
    main()
	#!/usr/bin/env python3

	#Copyright 2020, Noah Kessler
	#
	#Copying and distribution of this file, with or without modification, are
	#permitted in any medium without royalty, provided the copyright notice and
	#this notice are preserved. This file is offered as-is, without any warranty.

	# fix hard clipping after biostars352930
	# see https://www.biostars.org/p/352930/#352989

	import sys
	import re

	# CIGAR made of an optional leading hard clip, any number of other
	# operations, and an optional trailing hard clip. The two groups capture
	# the leading and trailing "<n>H" operations.
	H_pattern = re.compile(r"^([0-9]+H)?(?:[0-9]+[MIDNSPX=])*(\d+H)?$")
	# CIGAR operations that consume bases of the stored SEQ string.
	MISXEQ_pattern = re.compile(r"([0-9]+[MIS=X])")

	# FLAG bit tested to decide whether a record is a secondary alignment.
	SECONDARY_FLAG = 0x100


	def fix_sam_line(line: str) -> str:
	    """Return one SAM line with SEQ/QUAL trimmed to match its hard clips.

	    Only secondary alignments (FLAG bit 0x100) whose SEQ is longer than
	    the length implied by the M/I/S/=/X operations of the CIGAR are
	    modified: the leading and trailing hard-clip lengths are removed from
	    both ends of SEQ and QUAL. Every other line is returned unchanged.

	    Args:
	        line: A single SAM line without its trailing newline.

	    Returns:
	        The (possibly corrected) line, still without a trailing newline.

	    Raises:
	        ValueError: If the FLAG field of a record is not an integer.
	        IndexError: If a record has too few tab-separated fields.
	    """
	    # Blank lines and header lines are passed through untouched.
	    if not line or line.startswith('@'):
	        return line

	    tokens = line.split('\t')
	    if not (int(tokens[1]) & SECONDARY_FLAG):
	        return line

	    cigar = tokens[5]
	    seq = tokens[9]
	    # A SEQ of "*" means no sequence is stored, so there is nothing to clip.
	    if seq == '*':
	        return line

	    expected_seq_len = sum(
	        int(op[:-1]) for op in MISXEQ_pattern.findall(cigar)
	    )
	    if len(seq) <= expected_seq_len:
	        return line

	    clips = H_pattern.match(cigar)
	    if clips is None:
	        # CIGAR is "*" or otherwise not in the expected shape: leave the
	        # record alone rather than guess how much to remove.
	        return line

	    left, right = clips.groups()
	    h_clip_left = int(left[:-1]) if left else 0
	    h_clip_right = int(right[:-1]) if right else 0

	    tokens[9] = seq[h_clip_left:len(seq) - h_clip_right]
	    qual = tokens[10]
	    # QUAL may be "*" (not stored) even when SEQ is present; slicing it
	    # would leave an empty, invalid field.
	    if qual != '*':
	        tokens[10] = qual[h_clip_left:len(qual) - h_clip_right]

	    return '\t'.join(tokens)


	def main() -> None:
	    """Copy SAM text from stdin to stdout, fixing secondary alignments."""
	    for line in sys.stdin:
	        print(fix_sam_line(line.rstrip('\n')))


	if __name__ == "__main__":
	    main()