Skip to content

Instantly share code, notes, and snippets.

@ivan-krukov
Created May 29, 2013 21:04
Show Gist options
  • Save ivan-krukov/5673834 to your computer and use it in GitHub Desktop.
Save ivan-krukov/5673834 to your computer and use it in GitHub Desktop.
Different ways of parsing fasta files. Last one is my favorite - generators and named tuples.
#load everything into memory
#split
def split_fasta(input_file):
    """Parse a FASTA file by reading it whole and splitting it into records.

    Args:
        input_file: Path of the FASTA file to read.

    Returns:
        A list of ``(header, sequence)`` tuples in file order. ``header`` is
        the header line without its leading ``>``; ``sequence`` is the rest
        of the record with all newlines removed (``""`` when the record has
        no sequence lines). Text before the first header line is ignored.
    """
    with open(input_file) as fasta_file:
        # Prefix a newline so the first record is delimited like all others.
        # Splitting on "\n>" instead of a bare ">" treats ">" as a record
        # marker only at the start of a line, so a ">" inside a header's
        # description no longer cuts the record in two.
        records = ("\n" + fasta_file.read()).split("\n>")[1:]

    data = []
    for record in records:
        # partition() tolerates a header with no newline after it (e.g. the
        # last line of the file); unpacking split("\n", 1) raised ValueError.
        header, _, sequence = record.partition("\n")
        data.append((header, sequence.replace("\n", "")))
    return data
#same using regular expressions
from re import findall, split, sub, MULTILINE
def re_fasta(input_file):
    """Parse a FASTA file with regular expressions, reading it whole.

    Args:
        input_file: Path of the FASTA file to read.

    Returns:
        A list of ``(header, sequence)`` tuples in file order. ``header`` is
        the full header line *including* its leading ``>`` (the pattern
        matches the marker as part of the record); ``sequence`` is the rest
        of the record with all newlines removed (``""`` when the record has
        no sequence lines). Text before the first header line is ignored.
    """
    with open(input_file) as fasta_file:
        # One record = a line starting with ">" plus every following line
        # that does not itself start with ">". The ^ anchor (which is why
        # MULTILINE is needed) keeps a ">" inside a header's description
        # from being taken as the start of a new record.
        records = findall(r"^>.*(?:\n(?!>).*)*", fasta_file.read(), MULTILINE)

    data = []
    for record in records:
        # maxsplit is passed by keyword: the positional form is deprecated
        # for re.split since Python 3.13.
        parts = split(r"\n", record, maxsplit=1)
        header = parts[0]
        # A header with nothing after it produces a single part; treat that
        # as an empty sequence instead of failing to unpack.
        sequence = sub(r"\n", "", parts[1]) if len(parts) == 2 else ""
        data.append((header, sequence))
    return data
#using named tuples
from collections import namedtuple
# Record type returned by nt_fasta: (header, seq), accessible by name.
Entry = namedtuple("Entry", ("header", "seq"))


def nt_fasta(input_file):
    """Parse a FASTA file, read whole, into a list of ``Entry`` named tuples.

    Args:
        input_file: Path of the FASTA file to read.

    Returns:
        A list of ``Entry(header, seq)`` in file order. ``header`` is the
        header line without its leading ``>``; ``seq`` is the rest of the
        record with all newlines removed (``""`` when the record has no
        sequence lines). Text before the first header line is ignored.
    """
    with open(input_file) as fasta_file:
        # Prefix a newline so the first record is delimited like all others.
        # Splitting on "\n>" instead of a bare ">" treats ">" as a record
        # marker only at the start of a line, so a ">" inside a header's
        # description no longer cuts the record in two.
        records = ("\n" + fasta_file.read()).split("\n>")[1:]

    data = []
    for record in records:
        # partition() tolerates a header with no newline after it (e.g. the
        # last line of the file); unpacking split("\n", 1) raised ValueError.
        header, _, sequence = record.partition("\n")
        data.append(Entry(header, seq=sequence.replace("\n", "")))
    return data
#line by line with generators
def gen_fasta(input_file):
    """Lazily yield FASTA records, reading the file one line at a time.

    Args:
        input_file: Path of the FASTA file to read.

    Yields:
        ``Entry(header, seq)`` named tuples in file order. ``header`` is the
        whitespace-stripped header line *including* its leading ``>``;
        ``seq`` is the record's stripped sequence lines joined together
        (``""`` when the record has no sequence lines). Lines before the
        first header are ignored, and an empty file yields nothing.
    """
    # Built per call so the generator does not depend on any module-level
    # record type being defined.
    Entry = namedtuple("Entry", ("header", "seq"))
    # None means "no record opened yet". Tracking the header, not whether
    # the buffer is non-empty, is what lets empty-sequence records through
    # and avoids touching an unbound header on empty or headerless input.
    header = None
    seq_buffer = []
    with open(input_file) as fasta_file:
        for line in fasta_file:
            line = line.strip()
            if line.startswith(">"):
                # A new header closes the record collected so far.
                if header is not None:
                    yield Entry(header, "".join(seq_buffer))
                header = line
                seq_buffer = []
            elif header is not None:
                seq_buffer.append(line)
    # Flush the final record, if the file contained any header at all.
    if header is not None:
        yield Entry(header, "".join(seq_buffer))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment