fedarko/rm_seqs_from_gfa.py

## rm_seqs_from_gfa.py
#! /usr/bin/env python3
#
# SUMMARY
# =======
# Outputs a copy of a GFA 1 file with each segment (S) line that contains a
# sequence (not just a "*" character) altered as follows:
#
#   - If an LN:i tag does not exist for this sequence:
#       - We will add an LN:i tag describing the length of the sequence.
#       - We will replace the sequence with a "*" character.
#
#   - If an LN:i tag DOES exist for this sequence:
#       - We will raise an error. (Maybe this is overly strict, but I figure
#         better safe than sorry.)
#
# All other lines (including S lines that already do not contain a sequence,
# and other types of lines [e.g. H, L, ...]) will be included unchanged in the
# output.
#
# MOTIVATION
# ==========
# Often, we are only interested in the topology of a GFA file (rather than the
# details of the sequences contained therein). The sequences in a GFA file can
# take up a massive amount of space, though; this can make the storage and
# visualization of these files challenging. This script helps with these
# problems -- it's often faster to, for example, download a GFA file without
# sequences to your laptop than it is to download the same file containing
# sequences.
#
# USAGE
# =======
# Depends on Python 3.6 or later. The input GFA file should be a GFA 1 file, in
# particular.
#
# You can use this script by running the following command (assuming this
# script is in the current working directory and is executable):
#
# $ ./rm_seqs_from_gfa.py input.gfa output.gfa

import sys

IN_FILENAME = sys.argv[1]
OUT_FILENAME = sys.argv[2]

print(f"Converting GFA {IN_FILENAME} --> no-seq GFA {OUT_FILENAME}...")

num_segs = 0
num_segs_with_seqs_rmd = 0

with open(IN_FILENAME, "r") as infile:
    with open(OUT_FILENAME, "w") as outfile:
        for line in infile:
            if line.startswith("S\t"):
                split = line.strip().split("\t")
                seq = split[2]
                if seq != "*":
                    for of in split[3:]:
                        if of.startswith("LN:i:"):
                            # Uh oh -- the sequence was given, but a LN:i tag
                            # was also given. This is technically allowed in
                            # the GFA spec, but this script is not designed to
                            # handle these complex cases. So I'm just gonna
                            # bail out.
                            raise ValueError(
                                f"Segment {split[1]} contains a defined "
                                "sequence and an LN:i tag; these are mutually "
                                "exclusive."
                            )
                    # OK, now we know that there *is* a defined sequence and
                    # there *is not* a defined LN:i tag here. Nice.
                    slen = len(seq)

                    # This works fine even if there are no optional fields --
                    # in that case, split[3:] will be [], so new_ofs will be
                    # "".
                    new_ofs = "\t".join(split[3:])
                    # ... But we do need to add another tab character if
                    # new_ofs isn't "", to separate this stuff from the new
                    # LN:i: tag we'll be adding in.
                    if len(new_ofs) > 0:
                        new_ofs = "\t" + new_ofs

                    new_line = f"S\t{split[1]}\t*\tLN:i:{slen}{new_ofs}\n"
                    num_segs_with_seqs_rmd += 1
                else:
                    new_line = line
                num_segs += 1
            else:
                new_line = line
            outfile.write(new_line)

print(f"Observed {num_segs:,} segment(s).")
print(
    f"{num_segs_with_seqs_rmd:,} of these segment(s) had defined sequences "
    "given."
)
print(
    f"Replaced these {num_segs_with_seqs_rmd:,} segment(s)' sequences with * "
    "characters and added LN:i tags."
)
finallogline = f"GFA with segment sequences removed created at {OUT_FILENAME}."
print("=" * len(finallogline))
print(finallogline)
	#! /usr/bin/env python3
	#
	# SUMMARY
	# =======
	# Outputs a copy of a GFA 1 file with each segment (S) line that contains a
	# sequence (not just a "*" character) altered as follows:
	#
	# - If an LN:i tag does not exist for this sequence:
	# - We will add an LN:i tag describing the length of the sequence.
	# - We will replace the sequence with a "*" character.
	#
	# - If an LN:i tag DOES exist for this sequence:
	# - We will raise an error. (Maybe this is overly strict, but I figure
	# better safe than sorry.)
	#
	# All other lines (including S lines that already do not contain a sequence,
	# and other types of lines [e.g. H, L, ...]) will be included unchanged in the
	# output.
	#
	# MOTIVATION
	# ==========
	# Often, we are only interested in the topology of a GFA file (rather than the
	# details of the sequences contained therein). The sequences in a GFA file can
	# take up a massive amount of space, though; this can make the storage and
	# visualization of these files challenging. This script helps with these
	# problems -- it's often faster to, for example, download a GFA file without
	# sequences to your laptop than it is to download the same file containing
	# sequences.
	#
	# USAGE
	# =======
	# Depends on Python 3.6 or later. The input GFA file should be a GFA 1 file, in
	# particular.
	#
	# You can use this script by running the following command (assuming this
	# script is in the current working directory and is executable):
	#
	# $ ./rm_seqs_from_gfa.py input.gfa output.gfa

	import sys

	IN_FILENAME = sys.argv[1]
	OUT_FILENAME = sys.argv[2]

	print(f"Converting GFA {IN_FILENAME} --> no-seq GFA {OUT_FILENAME}...")

	num_segs = 0
	num_segs_with_seqs_rmd = 0

	with open(IN_FILENAME, "r") as infile:
	with open(OUT_FILENAME, "w") as outfile:
	for line in infile:
	if line.startswith("S\t"):
	split = line.strip().split("\t")
	seq = split[2]
	if seq != "*":
	for of in split[3:]:
	if of.startswith("LN:i:"):
	# Uh oh -- the sequence was given, but a LN:i tag
	# was also given. This is technically allowed in
	# the GFA spec, but this script is not designed to
	# handle these complex cases. So I'm just gonna
	# bail out.
	raise ValueError(
	f"Segment {split[1]} contains a defined "
	"sequence and an LN:i tag; these are mutually "
	"exclusive."
	)
	# OK, now we know that there is a defined sequence and
	# there is not a defined LN:i tag here. Nice.
	slen = len(seq)

	# This works fine even if there are no optional fields --
	# in that case, split[3:] will be [], so new_ofs will be
	# "".
	new_ofs = "\t".join(split[3:])
	# ... But we do need to add another tab character if
	# new_ofs isn't "", to separate this stuff from the new
	# LN:i: tag we'll be adding in.
	if len(new_ofs) > 0:
	new_ofs = "\t" + new_ofs

	new_line = f"S\t{split[1]}\t*\tLN:i:{slen}{new_ofs}\n"
	num_segs_with_seqs_rmd += 1
	else:
	new_line = line
	num_segs += 1
	else:
	new_line = line
	outfile.write(new_line)

	print(f"Observed {num_segs:,} segment(s).")
	print(
	f"{num_segs_with_seqs_rmd:,} of these segment(s) had defined sequences "
	"given."
	)
	print(
	f"Replaced these {num_segs_with_seqs_rmd:,} segment(s)' sequences with * "
	"characters and added LN:i tags."
	)
	finallogline = f"GFA with segment sequences removed created at {OUT_FILENAME}."
	print("=" * len(finallogline))
	print(finallogline)