Created
August 6, 2023 02:42
-
-
Save fedarko/1a98cc0641b75e4dc5314b46f1d5993a to your computer and use it in GitHub Desktop.
Remove sequences from a GFA 1 file
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python3 | |
# | |
# SUMMARY | |
# ======= | |
# Outputs a copy of a GFA 1 file with each segment (S) line that contains a | |
# sequence (not just a "*" character) altered as follows: | |
# | |
# - If an LN:i tag does not exist for this sequence: | |
# - We will add an LN:i tag describing the length of the sequence. | |
# - We will replace the sequence with a "*" character. | |
# | |
# - If an LN:i tag DOES exist for this sequence: | |
# - We will raise an error. (Maybe this is overly strict, but I figure | |
# better safe than sorry.) | |
# | |
# All other lines (including S lines that already do not contain a sequence, | |
# and other types of lines [e.g. H, L, ...]) will be included unchanged in the | |
# output. | |
# | |
# MOTIVATION | |
# ========== | |
# Often, we are only interested in the topology of a GFA file (rather than the | |
# details of the sequences contained therein). The sequences in a GFA file can | |
# take up a massive amount of space, though; this can make the storage and | |
# visualization of these files challenging. This script helps with these | |
# problems -- it's often faster to, for example, download a GFA file without | |
# sequences to your laptop than it is to download the same file containing | |
# sequences. | |
# | |
# USAGE | |
# ======= | |
# Depends on Python 3.6 or later. The input GFA file should be a GFA 1 file, in | |
# particular. | |
# | |
# You can use this script by running the following command (assuming this | |
# script is in the current working directory and is executable): | |
# | |
# $ ./rm_seqs_from_gfa.py input.gfa output.gfa | |
import sys | |
IN_FILENAME = sys.argv[1] | |
OUT_FILENAME = sys.argv[2] | |
print(f"Converting GFA {IN_FILENAME} --> no-seq GFA {OUT_FILENAME}...") | |
num_segs = 0 | |
num_segs_with_seqs_rmd = 0 | |
with open(IN_FILENAME, "r") as infile: | |
with open(OUT_FILENAME, "w") as outfile: | |
for line in infile: | |
if line.startswith("S\t"): | |
split = line.strip().split("\t") | |
seq = split[2] | |
if seq != "*": | |
for of in split[3:]: | |
if of.startswith("LN:i:"): | |
# Uh oh -- the sequence was given, but a LN:i tag | |
# was also given. This is technically allowed in | |
# the GFA spec, but this script is not designed to | |
# handle these complex cases. So I'm just gonna | |
# bail out. | |
raise ValueError( | |
f"Segment {split[1]} contains a defined " | |
"sequence and an LN:i tag; these are mutually " | |
"exclusive." | |
) | |
# OK, now we know that there *is* a defined sequence and | |
# there *is not* a defined LN:i tag here. Nice. | |
slen = len(seq) | |
# This works fine even if there are no optional fields -- | |
# in that case, split[3:] will be [], so new_ofs will be | |
# "". | |
new_ofs = "\t".join(split[3:]) | |
# ... But we do need to add another tab character if | |
# new_ofs isn't "", to separate this stuff from the new | |
# LN:i: tag we'll be adding in. | |
if len(new_ofs) > 0: | |
new_ofs = "\t" + new_ofs | |
new_line = f"S\t{split[1]}\t*\tLN:i:{slen}{new_ofs}\n" | |
num_segs_with_seqs_rmd += 1 | |
else: | |
new_line = line | |
num_segs += 1 | |
else: | |
new_line = line | |
outfile.write(new_line) | |
print(f"Observed {num_segs:,} segment(s).") | |
print( | |
f"{num_segs_with_seqs_rmd:,} of these segment(s) had defined sequences " | |
"given." | |
) | |
print( | |
f"Replaced these {num_segs_with_seqs_rmd:,} segment(s)' sequences with * " | |
"characters and added LN:i tags." | |
) | |
finallogline = f"GFA with segment sequences removed created at {OUT_FILENAME}." | |
print("=" * len(finallogline)) | |
print(finallogline) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment