Created
November 14, 2014 16:01
-
-
Save kdiverson/7aa94126978079fd1a68 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys
def get_next_fasta(fileObject):
    """Yield ``(header, sequence)`` pairs from FASTA-formatted lines.

    usage: for header, seq in get_next_fasta(fileObject):

    ``fileObject`` is any iterable of text lines (an open file, a list of
    strings, ...).  A record starts at a line beginning with ``>``; that
    line, stripped of surrounding whitespace but still carrying the leading
    ``>``, is the header.  Every following line up to the next header is
    stripped and concatenated to form the sequence, so multi-line sequences
    are joined and blank lines contribute nothing.

    Lines that appear before the first header are skipped as leading junk.
    If the input contains no header line at all, nothing is yielded.
    """
    header = None
    chunks = []
    # A single pass keeps the parser correct for re-iterable inputs (e.g. a
    # list of lines) as well as one-shot iterators such as file objects.
    for line in fileObject:
        if line.startswith('>'):
            if header is not None:
                yield header, ''.join(chunks)
            header = line.strip()
            chunks = []
        elif header is not None:
            # Collect pieces and join once instead of repeated string +=.
            chunks.append(line.strip())
        # else: still in the leading junk before the first record; ignore.
    # Yield the last entry, which has no following header to trigger it.
    if header is not None:
        yield header, ''.join(chunks)
def dereplicate(records, outfile):
    """Write each distinct sequence from ``records`` to ``outfile`` once.

    ``records`` is an iterable of ``(header, seq)`` pairs; the headers are
    ignored.  Sequences are upper-cased before the duplicate check so that
    records differing only in letter case are treated as the same sequence
    (the output is upper-cased, so comparing raw text would let such
    duplicates through).  The first occurrence of each sequence is written
    as a two-line FASTA record whose header is a running 1-based counter.

    Returns the number of records written.
    """
    seen = set()
    count = 0
    for _header, seq in records:
        seq = seq.upper()
        if seq in seen:
            continue
        seen.add(seq)
        count += 1
        outfile.write(">%s\n%s\n" % (count, seq))
    return count


def main(argv=None):
    """Dereplicate the FASTA file ``argv[1]`` into the file ``argv[2]``.

    ``argv`` defaults to ``sys.argv``.  Exits with a usage message when the
    input or output path is missing.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 3:
        sys.exit("usage: %s <input.fasta> <output.fasta>" % argv[0])
    with open(argv[1], 'r') as fasta, open(argv[2], 'w') as outfile:
        dereplicate(get_next_fasta(fasta), outfile)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment