Last active
May 29, 2024 05:15
-
-
Save fedarko/1a98cc0641b75e4dc5314b46f1d5993a to your computer and use it in GitHub Desktop.
Remove sequences from a GFA 1 file
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python3 | |
# | |
# SUMMARY | |
# ======= | |
# Outputs a copy of a GFA 1 file with each segment (S) line that contains a | |
# sequence (not just a "*" character) altered to have an LN:i: tag describing | |
# the length of the sequence, and the sequence replaced with a "*" character. | |
# | |
# All other lines (including S lines that already do not contain a sequence, | |
# and other types of lines [e.g. H, L, ...]) will be included unchanged in the | |
# output. | |
# | |
# MOTIVATION | |
# ========== | |
# Often, we are mostly interested in the structure of the graph described in a | |
# GFA file (rather than the details of the sequences contained therein). The | |
# sequences in a GFA file can take up a massive amount of space, though; this | |
# can make the storage and visualization of these files challenging. This | |
# script helps with these problems -- it's faster to, for example, download a | |
# GFA file without sequences to your laptop than it is to download a version of | |
# this file that contains sequences. | |
# | |
# USAGE | |
# ======= | |
# Depends on Python 3.6 or later. The input GFA file should be a GFA 1 file (I | |
# thiiink this should also work with GFA 2 files, but I'm not sure about that). | |
# | |
# You can use this script by running the following command (assuming this | |
# script is in the current working directory and is executable): | |
# | |
# $ ./rm_seqs_from_gfa.py input.gfa output.gfa | |
import sys | |
if len(sys.argv) != 3: | |
print(f"Use as {sys.argv[0]} input.gfa output.gfa") | |
sys.exit() | |
IN_FILENAME = sys.argv[1] | |
OUT_FILENAME = sys.argv[2] | |
print(f"Converting GFA {IN_FILENAME} --> no-seq GFA {OUT_FILENAME}...") | |
num_segs = 0 | |
num_segs_with_seqs_rmd = 0 | |
with open(IN_FILENAME, "r") as infile: | |
with open(OUT_FILENAME, "w") as outfile: | |
for line in infile: | |
if line.startswith("S\t"): | |
split = line.strip().split("\t") | |
seq = split[2] | |
if seq != "*": | |
has_len_tag = False | |
for of in split[3:]: | |
if of.startswith("LN:i:"): | |
has_len_tag = True | |
given_len = int(of[5:]) | |
if given_len != len(seq): | |
raise ValueError( | |
f"Segment {split[1]} contains a defined " | |
f"sequence (length {len(seq):,}) and the " | |
f"inconsistent length tag {of}." | |
) | |
if has_len_tag: | |
# no need to add a new LN tag | |
lntagtxt = "" | |
else: | |
slen = len(seq) | |
lntagtxt = f"\tLN:i:{slen}" | |
# This works fine even if there are no optional fields -- | |
# in that case, split[3:] will be [], so new_ofs will be | |
# "". | |
new_ofs = "\t".join(split[3:]) | |
# ... But we do need to add another tab character at the | |
# beginning if new_ofs isn't "" | |
if len(new_ofs) > 0: | |
new_ofs = "\t" + new_ofs | |
new_line = f"S\t{split[1]}\t*{lntagtxt}{new_ofs}\n" | |
num_segs_with_seqs_rmd += 1 | |
else: | |
new_line = line | |
num_segs += 1 | |
else: | |
new_line = line | |
outfile.write(new_line) | |
print(f"Observed {num_segs:,} segment(s).") | |
print( | |
f"{num_segs_with_seqs_rmd:,} of these segment(s) had defined sequences " | |
"given." | |
) | |
print( | |
f"Replaced these {num_segs_with_seqs_rmd:,} segment(s)' sequences with * " | |
"characters and added LN:i tags." | |
) | |
finallogline = f"GFA with segment sequences removed created at {OUT_FILENAME}." | |
print("=" * len(finallogline)) | |
print(finallogline) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment