Skip to content

Instantly share code, notes, and snippets.

@jvkersch
Created June 21, 2021 19:44
Show Gist options
  • Save jvkersch/9e830dd6d9ee82b244a02343bc43b613 to your computer and use it in GitHub Desktop.
Save jvkersch/9e830dd6d9ee82b244a02343bc43b613 to your computer and use it in GitHub Desktop.
Deduplicate a FASTA file
import argparse
import sys
from Bio import SeqIO
def _parse_args():
    """Parse the command line and return the value of ``input``.

    The parser declares a single required positional argument named
    ``input``; only that value is returned, not the whole namespace.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("input")
    namespace = parser.parse_args()
    return namespace.input
def _seq_generator(handle):
    """Yield FASTA records parsed from *handle*, skipping repeated sequences.

    Two records count as duplicates when the upper-cased text of their
    sequences is equal. Only the first record seen for a given sequence is
    yielded, and its ``seq`` is replaced by the upper-cased sequence before
    it is yielded. Records come out in the order they are parsed.
    """
    emitted = set()
    for rec in SeqIO.parse(handle, "fasta"):
        key = str(rec.seq).upper()
        if key in emitted:
            # Same sequence (ignoring case) was already yielded.
            continue
        emitted.add(key)
        rec.seq = rec.seq.upper()
        yield rec
def main():
    """Deduplicate the FASTA file named on the command line.

    The file is opened as UTF-8 text, passed through ``_seq_generator``,
    and the resulting records are written to standard output in FASTA
    format.
    """
    path = _parse_args()
    with open(path, encoding="utf-8") as fasta_in:
        unique_records = _seq_generator(fasta_in)
        # The generator is lazy, so it must be consumed while the file
        # is still open.
        SeqIO.write(unique_records, sys.stdout, "fasta")
# Run the deduplication only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment