Skip to content

Instantly share code, notes, and snippets.

@kdiverson
Created November 14, 2014 16:01
Show Gist options
  • Save kdiverson/7aa94126978079fd1a68 to your computer and use it in GitHub Desktop.
Save kdiverson/7aa94126978079fd1a68 to your computer and use it in GitHub Desktop.
import sys
def get_next_fasta(fileObject):
    '''Yield (header, sequence) pairs from FASTA-formatted lines.

    usage: for header, seq in get_next_fasta(fileObject):

    fileObject may be any iterable of text lines: an open file, a list
    of strings, a generator, etc.

    Each yielded header is the stripped '>' line (the leading '>' is
    kept). Each yielded sequence is the concatenation of the stripped
    lines that follow it, up to the next header or the end of input.
    Lines before the first header are ignored. Nothing is yielded when
    the input contains no header line.
    '''
    # Share ONE iterator between both loops. Calling iter() on a file
    # returns the file itself, but for a re-iterable input such as a list
    # a second plain `for` would restart from the first line.
    lines = iter(fileObject)

    header = ''
    # Find the header of the first fasta record, skipping any leading
    # junk in the input.
    for line in lines:
        if line.startswith('>'):
            header = line.strip()
            break

    # Collect sequence fragments in a list and join once per record, which
    # avoids repeated string concatenation on long sequences.
    chunks = []
    for line in lines:
        if line.startswith('>'):
            yield header, ''.join(chunks)
            header = line.strip()
            chunks = []
        else:
            chunks.append(line.strip())

    # Yield the last entry (skipped when no header was ever found).
    if header:
        yield header, ''.join(chunks)
# Dereplicate a FASTA file.
#   sys.argv[1]: path of the input FASTA file
#   sys.argv[2]: path of the output FASTA file (overwritten)
# Each distinct sequence is written once, in order of first appearance,
# uppercased, under a header that is its 1-based running number. The
# original headers are discarded.
with open(sys.argv[1], 'r') as fasta, open(sys.argv[2], 'w') as outfile:
    seqset = set()
    count = 0
    for header, seq in get_next_fasta(fasta):
        # Normalize case BEFORE the duplicate check: the output is
        # uppercased, so sequences differing only in case would otherwise
        # be written as identical records.
        seq = seq.upper()
        if seq in seqset:
            continue
        seqset.add(seq)
        count += 1
        outfile.write(">%s\n%s\n" % (count, seq))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment