Last active
November 1, 2020 01:27
-
-
Save knowah/24c8665334180120b94be3274c5cc706 to your computer and use it in GitHub Desktop.
This tool fixes the error "CIGAR and query sequence are of different length / [W::sam_read1] Parse error" that occurs when SEQ and QUAL strings are not hard-clipped to the correct length (e.g. after running http://lindenb.github.io/jvarkit/Biostar352930.html - which replaces the missing SEQ/QUAL strings in secondary alignments by tools like bwa/…
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
#Copyright 2020, Noah Kessler | |
# | |
#Copying and distribution of this file, with or without modification, are | |
#permitted in any medium without royalty, provided the copyright notice and | |
#this notice are preserved. This file is offered as-is, without any warranty. | |
# fix hard clipping after biostars352930 | |
# see https://www.biostars.org/p/352930/#352989 | |
import sys | |
import re | |
H_pattern = re.compile("^([0-9]+H)?(?:[0-9]+[MIDNSPX=])*(\\d+H)?$") | |
MISXEQ_pattern = re.compile("([0-9]+[MIS=X])") | |
with sys.stdin as samf: | |
for line in samf: | |
# remove newline | |
line = line.rstrip('\n') | |
# skip header lines | |
if line[0] == '@': | |
print(line) | |
continue | |
tokens = line.split('\t') | |
if int(tokens[1]) & 0x100 == 0x100: # secondary alignment | |
expected_seq_len = sum([int(x[:-1]) for x in re.findall(MISXEQ_pattern, tokens[5])]) | |
if len(tokens[9]) > expected_seq_len: # need to do clipping | |
h_clips = re.findall(H_pattern, tokens[5])[0] | |
if len(h_clips) == 2: # CIGAR hard clipping properly formatted | |
h_clip_left = int(h_clips[0][:-1]) if len(h_clips[0]) > 0 else 0 | |
h_clip_right = int(h_clips[1][:-1]) if len(h_clips[1]) > 0 else 0 | |
# clip SEQ and QUAL strings | |
tokens[9] = tokens[9][h_clip_left:(len(tokens[9])-h_clip_right)] | |
tokens[10] = tokens[10][h_clip_left:(len(tokens[10])-h_clip_right)] | |
print('\t'.join(tokens)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment