Skip to content

Instantly share code, notes, and snippets.

@malonge
Last active March 3, 2020 20:36
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save malonge/5d61e0c45a56f406bb37fc099bf4bf84 to your computer and use it in GitHub Desktop.
Save malonge/5d61e0c45a56f406bb37fc099bf4bf84 to your computer and use it in GitHub Desktop.
import re
import gzip
import argparse
def main():
message = """
Summarize the rate of edits (insertion, deletion, mismatch) from minimap2 alignments of a query to a target.
For each alignment, the rate is defined as the number of edits divided by the alignment length
with respect to the query.
"""
parser = argparse.ArgumentParser(description=message)
parser.add_argument("paf_file", metavar="<alns.paf(.gz)>", type=str, help="PAF file (difference string (minimap2 --cs) required).")
parser.add_argument("--normalize", action='store_true', default=False, help="Divide the number of edits by the query alignment length + gaps.")
# Get the command line arguments
args = parser.parse_args()
paf_file = args.paf_file
normalize = args.normalize
re_cs = re.compile(r'([:=*+-])(\d+|[A-Za-z]+)')
del_sum = 0
ins_sum = 0
mm_sum = 0
m_sum = 0
denom = 0
# Iterate through the PAF file
if paf_file[-3:] == ".gz":
f = gzip.open(paf_file)
else:
f = open(paf_file, 'r')
for line in f:
if not isinstance(line, str):
l = line.decode("utf-8").rstrip().split('\t')
else:
l = line.rstrip().split('\t')
cs = None
# Get the info we need
q_len = int(l[3]) - int(l[2])
# Get the difference string
for i in l[11:]:
if i.startswith("cs:Z"):
cs = i[5:]
if cs is None:
raise ValueError("The difference stirng (--cs) is required")
# Parse the difference string
del_line_sum = 0
for m in re.findall(re_cs, cs):
if m[0] == '*':
mm_sum += 1
elif m[0] == "+":
ins_sum += len(m[1])
elif m[0] == "-":
del_sum += len(m[1])
del_line_sum += len(m[1])
else:
m_sum += int(m[1])
if normalize:
denom += q_len
denom += del_line_sum
else:
denom += q_len
f.close()
print("The insertion rate is {:.3f}%".format((ins_sum/denom)*100))
print("The deletion rate is {:.3f}%".format((del_sum / denom) * 100))
print("The mismatch rate is {:.3f}%".format((mm_sum / denom) * 100))
print("The match rate is {:.3f}%".format((m_sum / denom) * 100))
if __name__ == "__main__":
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment