fedarko/filter_gfa_lowcov.py

## filter_gfa_lowcov.py
#! /usr/bin/env python
# Filters low-coverage segments from a jumboDBG GFA file.
# This assumes that the second-from-the-last entry in each segment line will be
# length, and that the last entry in each segment line will be k-mer coverage.
# It also uses the definition of k-mer coverage assumed by jumboDBG -- so, the
# conventional "coverage" of a segment can be computed as KC / (length - K).

K = 5001
MINCOV = 5
UPDATE_FREQ = 1000

INPUT_GFA = "cg.noseq.fixed.gfa"
OUTPUT_GFA = f"cg.noseq.fixed.covgeq{MINCOV}.gfa"

new_gfa_text = ""
# Checking if something is present in a set is MUCH faster than checking if
# something is present in a list -- the difference is night-and-day when
# removed_segments can contain millions of elements.
# https://stackoverflow.com/a/38723424
removed_segments = set()
removed_link_ct = 0
good_segment_ct = 0
with open(INPUT_GFA, 'r') as fh:
    for li, line in enumerate(fh):
        if li % UPDATE_FREQ == 0:
            print(
                f"On line {li:,} ({good_segment_ct:,} \u2265 {MINCOV:,}x, "
                f"{len(removed_segments):,} < {MINCOV:,}x)..."
            )

        if line.startswith("S\t"):
            parts = line.split("\t")

            assert parts[-1].startswith("KC:i:")
            kc = int(parts[-1][5:])
            assert parts[-2].startswith("LN:i:")
            length = int(parts[-2][5:])

            cov = kc / (length - K)
            if cov >= MINCOV:
                new_gfa_text += line
                good_segment_ct += 1
            else:
                removed_segments.add(parts[1])

        elif line.startswith("L\t"):
            parts = line.split("\t")
            src = parts[1]
            tgt = parts[3]
            if src in removed_segments or tgt in removed_segments:
                removed_link_ct += 1
            else:
                new_gfa_text += line
        else:
            new_gfa_text += line

with open(OUTPUT_GFA, 'w') as fh:
    fh.write(new_gfa_text)
	#! /usr/bin/env python
	# Filters low-coverage segments from a jumboDBG GFA file.
	# This assumes that the second-from-the-last entry in each segment line will be
	# length, and that the last entry in each segment line will be k-mer coverage.
	# It also uses the definition of k-mer coverage assumed by jumboDBG -- so, the
	# conventional "coverage" of a segment can be computed as KC / (length - K).

	K = 5001
	MINCOV = 5
	UPDATE_FREQ = 1000

	INPUT_GFA = "cg.noseq.fixed.gfa"
	OUTPUT_GFA = f"cg.noseq.fixed.covgeq{MINCOV}.gfa"

	new_gfa_text = ""
	# Checking if something is present in a set is MUCH faster than checking if
	# something is present in a list -- the difference is night-and-day when
	# removed_segments can contain millions of elements.
	# https://stackoverflow.com/a/38723424
	removed_segments = set()
	removed_link_ct = 0
	good_segment_ct = 0
	with open(INPUT_GFA, 'r') as fh:
	for li, line in enumerate(fh):
	if li % UPDATE_FREQ == 0:
	print(
	f"On line {li:,} ({good_segment_ct:,} \u2265 {MINCOV:,}x, "
	f"{len(removed_segments):,} < {MINCOV:,}x)..."
	)

	if line.startswith("S\t"):
	parts = line.split("\t")

	assert parts[-1].startswith("KC:i:")
	kc = int(parts[-1][5:])
	assert parts[-2].startswith("LN:i:")
	length = int(parts[-2][5:])

	cov = kc / (length - K)
	if cov >= MINCOV:
	new_gfa_text += line
	good_segment_ct += 1
	else:
	removed_segments.add(parts[1])

	elif line.startswith("L\t"):
	parts = line.split("\t")
	src = parts[1]
	tgt = parts[3]
	if src in removed_segments or tgt in removed_segments:
	removed_link_ct += 1
	else:
	new_gfa_text += line
	else:
	new_gfa_text += line

	with open(OUTPUT_GFA, 'w') as fh:
	fh.write(new_gfa_text)