Skip to content

Instantly share code, notes, and snippets.

@yk-tanigawa
Last active May 18, 2016 08:51
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save yk-tanigawa/4c514fb6ba414ab527a2c230b2592ce6 to your computer and use it in GitHub Desktop.
Save yk-tanigawa/4c514fb6ba414ab527a2c230b2592ce6 to your computer and use it in GitHub Desktop.
from itertools import groupby
import gzip
# Fasta IO
def fasta_iter(fasta_name):
    """Yield ``(header, sequence)`` tuples for each record of a FASTA file.

    Modified from Brent Pedersen, "Correct Way To Parse A Fasta File In
    Python", https://www.biostars.org/p/710/

    Args:
        fasta_name: Path of the FASTA file as a string. Names ending in
            ``.gz`` or ``.gzip`` are read through ``gzip``; anything else
            is opened as plain text. Both are decoded as UTF-8.

    Yields:
        ``(header, seq)`` where ``header`` is the header line without the
        leading ``>`` and surrounding whitespace, and ``seq`` is all of the
        record's sequence lines stripped and concatenated.  A header that
        is not followed by any sequence line yields an empty ``seq``.

    Lines that appear before the first header belong to no record and are
    skipped.
    """
    # Text mode lets both file kinds share one parser and decodes each
    # line exactly once.
    opener = gzip.open if fasta_name.endswith(('.gz', '.gzip')) else open
    with opener(fasta_name, 'rt', encoding='utf-8') as f:
        header = None
        chunks = []
        for line in f:
            if line.startswith('>'):
                # A new header closes the previous record, even when that
                # record had no sequence lines at all.
                if header is not None:
                    yield (header, ''.join(chunks))
                # drop the ">"
                header = line[1:].strip()
                chunks = []
            elif header is not None:
                chunks.append(line.strip())
        # Flush the final record; a header at end of file yields an empty
        # sequence instead of raising from inside the generator.
        if header is not None:
            yield (header, ''.join(chunks))
fasta = '/somewhere/hg19.fa.gz'
# Load every record into a header -> sequence mapping; when a header
# occurs more than once, the last record with that header wins.
seqs = dict(fasta_iter(fasta))
print(seqs)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment