Skip to content

Instantly share code, notes, and snippets.

@fedarko
Last active May 29, 2024 05:15
Show Gist options
  • Save fedarko/1a98cc0641b75e4dc5314b46f1d5993a to your computer and use it in GitHub Desktop.
Save fedarko/1a98cc0641b75e4dc5314b46f1d5993a to your computer and use it in GitHub Desktop.
Remove sequences from a GFA 1 file
#! /usr/bin/env python3
#
# SUMMARY
# =======
# Outputs a copy of a GFA 1 file with each segment (S) line that contains a
# sequence (not just a "*" character) altered to have an LN:i: tag describing
# the length of the sequence, and the sequence replaced with a "*" character.
#
# All other lines (including S lines that already do not contain a sequence,
# and other types of lines [e.g. H, L, ...]) will be included unchanged in the
# output.
#
# MOTIVATION
# ==========
# Often, we are mostly interested in the structure of the graph described in a
# GFA file (rather than the details of the sequences contained therein). The
# sequences in a GFA file can take up a massive amount of space, though; this
# can make the storage and visualization of these files challenging. This
# script helps with these problems -- it's faster to, for example, download a
# GFA file without sequences to your laptop than it is to download a version of
# this file that contains sequences.
#
# USAGE
# =======
# Depends on Python 3.6 or later. The input GFA file should be a GFA 1 file (I
# thiiink this should also work with GFA 2 files, but I'm not sure about that).
#
# You can use this script by running the following command (assuming this
# script is in the current working directory and is executable):
#
# $ ./rm_seqs_from_gfa.py input.gfa output.gfa
import sys
if len(sys.argv) != 3:
print(f"Use as {sys.argv[0]} input.gfa output.gfa")
sys.exit()
IN_FILENAME = sys.argv[1]
OUT_FILENAME = sys.argv[2]
print(f"Converting GFA {IN_FILENAME} --> no-seq GFA {OUT_FILENAME}...")
num_segs = 0
num_segs_with_seqs_rmd = 0
with open(IN_FILENAME, "r") as infile:
with open(OUT_FILENAME, "w") as outfile:
for line in infile:
if line.startswith("S\t"):
split = line.strip().split("\t")
seq = split[2]
if seq != "*":
has_len_tag = False
for of in split[3:]:
if of.startswith("LN:i:"):
has_len_tag = True
given_len = int(of[5:])
if given_len != len(seq):
raise ValueError(
f"Segment {split[1]} contains a defined "
f"sequence (length {len(seq):,}) and the "
f"inconsistent length tag {of}."
)
if has_len_tag:
# no need to add a new LN tag
lntagtxt = ""
else:
slen = len(seq)
lntagtxt = f"\tLN:i:{slen}"
# This works fine even if there are no optional fields --
# in that case, split[3:] will be [], so new_ofs will be
# "".
new_ofs = "\t".join(split[3:])
# ... But we do need to add another tab character at the
# beginning if new_ofs isn't ""
if len(new_ofs) > 0:
new_ofs = "\t" + new_ofs
new_line = f"S\t{split[1]}\t*{lntagtxt}{new_ofs}\n"
num_segs_with_seqs_rmd += 1
else:
new_line = line
num_segs += 1
else:
new_line = line
outfile.write(new_line)
print(f"Observed {num_segs:,} segment(s).")
print(
f"{num_segs_with_seqs_rmd:,} of these segment(s) had defined sequences "
"given."
)
print(
f"Replaced these {num_segs_with_seqs_rmd:,} segment(s)' sequences with * "
"characters and added LN:i tags."
)
finallogline = f"GFA with segment sequences removed created at {OUT_FILENAME}."
print("=" * len(finallogline))
print(finallogline)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment