Last active
March 15, 2019 14:10
-
-
Save mkweskin/8869358 to your computer and use it in GitHub Desktop.
Keep longest sequence when there are duplicate descriptions in FASTA file: Reads a FASTA file and if >1 sequence has the same description line, it only keeps the longest sequence. It outputs all the sequences to stdout when complete.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python
"""
Reads a FASTA file and if >1 sequence has the same description line,
it only keeps the longest sequence. It outputs all the sequences to stdout
when complete.
"""
import sys
from collections import OrderedDict

from Bio import SeqIO
def keep_longest(records):
    """Reduce (name, sequence) pairs to the longest sequence per name.

    Args:
        records: Iterable of ``(name, sequence)`` pairs. A sequence only
            needs to support ``len()``.

    Returns:
        An ``OrderedDict`` mapping each name to the longest sequence seen
        for it, ordered by the first appearance of each name. When two
        sequences with the same name have equal length, the later one wins.
    """
    longest = OrderedDict()
    for name, seq in records:
        # "<=" rather than "<": on a length tie the later sequence replaces
        # the earlier one.
        if name not in longest or len(longest[name]) <= len(seq):
            longest[name] = seq
    return longest


def main(argv):
    """Read the FASTA file named in ``argv[1]`` and print it de-duplicated.

    Each surviving record is written to stdout as a ``>name`` header line
    followed by its sequence on a single line.

    Args:
        argv: Command-line argument list; ``argv[1]`` is the FASTA path.

    Raises:
        SystemExit: If no filename was supplied. The message goes to stderr
            and the exit status is non-zero, so it cannot be mistaken for
            FASTA output when stdout is redirected.
    """
    if len(argv) < 2:
        sys.exit("ERROR: Please enter the filename to read as the first argument after the program name")

    fasta_path = argv[1]
    # NOTE(review): records are keyed on SeqRecord.name. For FASTA input
    # Biopython appears to set this to the first whitespace-delimited word of
    # the header rather than the full description line -- confirm that this
    # is the intended duplicate key.
    parsed = ((record.name, record.seq)
              for record in SeqIO.parse(fasta_path, "fasta"))
    longest = keep_longest(parsed)

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    for name, seq in longest.items():
        print(">" + name)
        print(seq)
    # For debugging: len(longest) is the number of unique names that were kept.


if __name__ == "__main__":
    main(sys.argv)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment