Created
April 30, 2024 23:53
-
-
Save fedarko/35ea56fb1fb776c256f85f3da60ea08f to your computer and use it in GitHub Desktop.
Filter low-kmer-coverage segments from a jumboDBG GFA file
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python | |
# Filters low-coverage segments from a jumboDBG GFA file. | |
# This assumes that the second-from-the-last entry in each segment line will be | |
# length, and that the last entry in each segment line will be k-mer coverage. | |
# It also uses the definition of k-mer coverage assumed by jumboDBG -- so, the | |
# conventional "coverage" of a segment can be computed as KC / (length - K). | |
K = 5001 | |
MINCOV = 5 | |
UPDATE_FREQ = 1000 | |
INPUT_GFA = "cg.noseq.fixed.gfa" | |
OUTPUT_GFA = f"cg.noseq.fixed.covgeq{MINCOV}.gfa" | |
new_gfa_text = "" | |
# Checking if something is present in a set is MUCH faster than checking if | |
# something is present in a list -- the difference is night-and-day when | |
# removed_segments can contain millions of elements. | |
# https://stackoverflow.com/a/38723424 | |
removed_segments = set() | |
removed_link_ct = 0 | |
good_segment_ct = 0 | |
with open(INPUT_GFA, 'r') as fh: | |
for li, line in enumerate(fh): | |
if li % UPDATE_FREQ == 0: | |
print( | |
f"On line {li:,} ({good_segment_ct:,} \u2265 {MINCOV:,}x, " | |
f"{len(removed_segments):,} < {MINCOV:,}x)..." | |
) | |
if line.startswith("S\t"): | |
parts = line.split("\t") | |
assert parts[-1].startswith("KC:i:") | |
kc = int(parts[-1][5:]) | |
assert parts[-2].startswith("LN:i:") | |
length = int(parts[-2][5:]) | |
cov = kc / (length - K) | |
if cov >= MINCOV: | |
new_gfa_text += line | |
good_segment_ct += 1 | |
else: | |
removed_segments.add(parts[1]) | |
elif line.startswith("L\t"): | |
parts = line.split("\t") | |
src = parts[1] | |
tgt = parts[3] | |
if src in removed_segments or tgt in removed_segments: | |
removed_link_ct += 1 | |
else: | |
new_gfa_text += line | |
else: | |
new_gfa_text += line | |
with open(OUTPUT_GFA, 'w') as fh: | |
fh.write(new_gfa_text) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment