Skip to content

Instantly share code, notes, and snippets.

@fedarko
Created April 30, 2024 23:53
Show Gist options
  • Save fedarko/35ea56fb1fb776c256f85f3da60ea08f to your computer and use it in GitHub Desktop.
Save fedarko/35ea56fb1fb776c256f85f3da60ea08f to your computer and use it in GitHub Desktop.
Filter low-kmer-coverage segments from a jumboDBG GFA file
#! /usr/bin/env python
# Filters low-coverage segments from a jumboDBG GFA file.
# This assumes that the second-from-the-last entry in each segment line will be
# length, and that the last entry in each segment line will be k-mer coverage.
# It also uses the definition of k-mer coverage assumed by jumboDBG -- so, the
# conventional "coverage" of a segment can be computed as KC / (length - K).
K = 5001
MINCOV = 5
UPDATE_FREQ = 1000
INPUT_GFA = "cg.noseq.fixed.gfa"
OUTPUT_GFA = f"cg.noseq.fixed.covgeq{MINCOV}.gfa"
new_gfa_text = ""
# Checking if something is present in a set is MUCH faster than checking if
# something is present in a list -- the difference is night-and-day when
# removed_segments can contain millions of elements.
# https://stackoverflow.com/a/38723424
removed_segments = set()
removed_link_ct = 0
good_segment_ct = 0
with open(INPUT_GFA, 'r') as fh:
for li, line in enumerate(fh):
if li % UPDATE_FREQ == 0:
print(
f"On line {li:,} ({good_segment_ct:,} \u2265 {MINCOV:,}x, "
f"{len(removed_segments):,} < {MINCOV:,}x)..."
)
if line.startswith("S\t"):
parts = line.split("\t")
assert parts[-1].startswith("KC:i:")
kc = int(parts[-1][5:])
assert parts[-2].startswith("LN:i:")
length = int(parts[-2][5:])
cov = kc / (length - K)
if cov >= MINCOV:
new_gfa_text += line
good_segment_ct += 1
else:
removed_segments.add(parts[1])
elif line.startswith("L\t"):
parts = line.split("\t")
src = parts[1]
tgt = parts[3]
if src in removed_segments or tgt in removed_segments:
removed_link_ct += 1
else:
new_gfa_text += line
else:
new_gfa_text += line
with open(OUTPUT_GFA, 'w') as fh:
fh.write(new_gfa_text)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment