Last active
May 18, 2016 08:51
-
-
Save yk-tanigawa/4c514fb6ba414ab527a2c230b2592ce6 to your computer and use it in GitHub Desktop.
FASTA file parser snippet. Ref: http://qiita.com/yk-tanigawa/items/0361b029863fa17e7ed9
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from itertools import groupby | |
import gzip | |
# Fasta IO | |
def fasta_iter(fasta_name):
    """Yield ``(header, sequence)`` tuples from a FASTA file.

    Adapted from Brent Pedersen, "Correct Way To Parse A Fasta File In
    Python": https://www.biostars.org/p/710/

    Args:
        fasta_name: Path to the FASTA file. A path ending in ``.gz`` or
            ``.gzip`` is read through ``gzip`` and decoded as UTF-8; any
            other path is opened as plain text with the platform default
            encoding.

    Yields:
        ``(header, sequence)`` where ``header`` is the header line without
        the leading ``>`` and surrounding whitespace, and ``sequence`` is
        all following sequence lines, each stripped, joined into one string.
        A header with no sequence lines yields an empty sequence.

    Lines that appear before the first header are ignored.
    """
    if fasta_name.endswith(('.gz', '.gzip')):
        # Text mode lets gzip decode once at the I/O boundary instead of
        # decoding every line by hand.
        handle = gzip.open(fasta_name, 'rt', encoding='utf-8')
    else:
        handle = open(fasta_name)

    with handle as f:
        header = None
        chunks = []
        for line in f:
            if line.startswith('>'):
                # A new header closes the previous record, even when that
                # record had no sequence lines at all.
                if header is not None:
                    yield (header, ''.join(chunks))
                # drop the ">"
                header = line[1:].strip()
                chunks = []
            elif header is not None:
                chunks.append(line.strip())
            # else: content before the first header belongs to no record.
        # Flush the final record; a trailing header yields an empty sequence.
        if header is not None:
            yield (header, ''.join(chunks))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Example usage: load every record of a FASTA file into memory.
fasta = '/somewhere/hg19.fa.gz'
# Map each header to its sequence; if a header repeats, the last record wins.
seqs = {head: seq for head, seq in fasta_iter(fasta)}
print(seqs)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment