# ivan-krukov/fastaparse.py

## fastaparse.py
#load everything in memory

#split
def split_fasta(input_file):
	"""Parse a FASTA file into a list of ``(header, sequence)`` tuples.

	The whole file is read into memory and cut on ``">"``; any text before
	the first ``">"`` is discarded. For each record the header is its first
	line (without the leading ``">"``) and the sequence is everything after
	that line with the newlines removed.

	:param input_file: path of the FASTA file to read.
	:return: list of ``(header, sequence)`` tuples in file order.
	"""
	with open(input_file) as fasta_file:
		records = fasta_file.read().split(">")[1:]
	data = []
	for record in records:
		# partition() always returns three parts, so a header-only record
		# with no trailing newline gives an empty sequence instead of
		# failing to unpack with ValueError.
		header, _, sequence = record.partition("\n")
		data.append((header, sequence.replace("\n", "")))
	return data

#same using regular expressions
from re import findall, split, sub, MULTILINE
def re_fasta(input_file):
	"""Parse a FASTA file into ``(header, sequence)`` tuples using ``re``.

	The whole file is read into memory. Each record is one match of
	``>[^>]+``: a ``">"`` followed by at least one non-``">"`` character.
	Because the ``">"`` is part of the match, the returned header keeps its
	leading ``">"``. The sequence is the rest of the record with the
	newlines removed.

	:param input_file: path of the FASTA file to read.
	:return: list of ``(header, sequence)`` tuples in file order.
	"""
	with open(input_file) as fasta_file:
		# No flags needed: the pattern contains no ^ or $ anchors.
		records = findall(r">[^>]+", fasta_file.read())
	data = []
	for record in records:
		fields = split("\n", record, 1)
		header = fields[0]
		# A header-only record with no trailing newline splits into a single
		# field; treat it as an empty sequence rather than raising ValueError.
		sequence = sub("\n", "", fields[1]) if len(fields) > 1 else ""
		data.append((header, sequence))
	return data

#using named tuples
from collections import namedtuple
# Record type returned by nt_fasta: fields are ``header`` and ``seq``.
Entry = namedtuple("Entry",("header","seq"))
def nt_fasta(input_file):
	"""Parse a FASTA file into a list of ``Entry(header, seq)`` named tuples.

	The whole file is read into memory and cut on ``">"``; any text before
	the first ``">"`` is discarded. ``header`` is the first line of each
	record (without the leading ``">"``) and ``seq`` is the remainder with
	the newlines removed.

	:param input_file: path of the FASTA file to read.
	:return: list of ``Entry`` named tuples in file order.
	"""
	with open(input_file) as fasta_file:
		records = fasta_file.read().split(">")[1:]
	data = []
	for record in records:
		# partition() always returns three parts, so a header-only record
		# with no trailing newline gives an empty sequence instead of
		# failing to unpack with ValueError.
		header, _, sequence = record.partition("\n")
		data.append(Entry(header, seq=sequence.replace("\n", "")))
	return data

#line by line with generators
def gen_fasta(input_file):
	"""Lazily yield ``Entry(header, seq)`` records from a FASTA file.

	The file is read line by line, so only one record is buffered at a
	time. Each line is stripped of surrounding whitespace. ``header`` is
	the stripped header line and keeps its leading ``">"``; ``seq`` is the
	concatenation of the stripped lines that follow it.

	Lines before the first header are ignored, a header with no sequence
	lines yields an empty ``seq``, and a file without any header yields
	nothing.

	:param input_file: path of the FASTA file to read.
	:return: generator of ``Entry`` named tuples in file order.
	"""
	# Local import keeps the generator self-contained.
	from collections import namedtuple
	Entry = namedtuple("Entry",("header","seq"))
	header = None  # None until the first ">" line has been seen
	seq_buffer = []
	with open(input_file) as fasta_file:
		for line in fasta_file:
			line = line.strip()
			if line.startswith(">"):
				# Flush the previous record even when its buffer is empty,
				# so a header directly followed by another header is kept.
				if header is not None:
					yield Entry(header,"".join(seq_buffer))
				header = line
				seq_buffer = []
			elif header is not None:
				seq_buffer.append(line)
		# Nothing to flush if the file contained no header at all.
		if header is not None:
			yield Entry(header,"".join(seq_buffer))
	#load everything in memory

	#split
	def split_fasta(input_file):
		"""Parse a FASTA file into a list of ``(header, sequence)`` tuples.

		The whole file is read into memory and cut on ``">"``; any text
		before the first ``">"`` is discarded. The header is the first line
		of each record (without the leading ``">"``) and the sequence is the
		remainder with the newlines removed.

		:param input_file: path of the FASTA file to read.
		:return: list of ``(header, sequence)`` tuples in file order.
		"""
		# NOTE(review): this looks like an accidental re-paste of the
		# module-level split_fasta whose indentation had been lost; the
		# structure is restored here - consider removing the duplicate.
		with open(input_file) as fasta_file:
			records = fasta_file.read().split(">")[1:]
		data = []
		for record in records:
			# partition() always returns three parts, so a header-only
			# record with no trailing newline gives an empty sequence
			# instead of failing to unpack with ValueError.
			header, _, sequence = record.partition("\n")
			data.append((header, sequence.replace("\n", "")))
		return data

	#same using regular expressions
	from re import findall, split, sub, MULTILINE
	def re_fasta(input_file):
		"""Parse a FASTA file into ``(header, sequence)`` tuples using ``re``.

		The whole file is read into memory. Each record is one match of
		``>[^>]+``, so the returned header keeps its leading ``">"``. The
		sequence is the rest of the record with the newlines removed.

		:param input_file: path of the FASTA file to read.
		:return: list of ``(header, sequence)`` tuples in file order.
		"""
		# NOTE(review): this looks like an accidental re-paste of the
		# module-level re_fasta whose indentation had been lost; the
		# structure is restored here - consider removing the duplicate.
		with open(input_file) as fasta_file:
			# No flags needed: the pattern contains no ^ or $ anchors.
			records = findall(r">[^>]+", fasta_file.read())
		data = []
		for record in records:
			fields = split("\n", record, 1)
			header = fields[0]
			# A header-only record with no trailing newline splits into a
			# single field; treat it as an empty sequence rather than
			# raising ValueError.
			sequence = sub("\n", "", fields[1]) if len(fields) > 1 else ""
			data.append((header, sequence))
		return data

	#using named tuples
	from collections import namedtuple
	# Record type returned by nt_fasta: fields are ``header`` and ``seq``.
	Entry = namedtuple("Entry",("header","seq"))
	def nt_fasta(input_file):
		"""Parse a FASTA file into a list of ``Entry(header, seq)`` tuples.

		The whole file is read into memory and cut on ``">"``; any text
		before the first ``">"`` is discarded. ``header`` is the first line
		of each record (without the leading ``">"``) and ``seq`` is the
		remainder with the newlines removed.

		:param input_file: path of the FASTA file to read.
		:return: list of ``Entry`` named tuples in file order.
		"""
		# NOTE(review): this looks like an accidental re-paste of the
		# module-level nt_fasta whose indentation had been lost; the
		# structure is restored here - consider removing the duplicate.
		with open(input_file) as fasta_file:
			records = fasta_file.read().split(">")[1:]
		data = []
		for record in records:
			# partition() always returns three parts, so a header-only
			# record with no trailing newline gives an empty sequence
			# instead of failing to unpack with ValueError.
			header, _, sequence = record.partition("\n")
			data.append(Entry(header, seq=sequence.replace("\n", "")))
		return data

	#line by line with generators
	def gen_fasta(input_file):
		"""Lazily yield ``Entry(header, seq)`` records from a FASTA file.

		The file is read line by line. Each line is stripped of surrounding
		whitespace. ``header`` is the stripped header line and keeps its
		leading ``">"``; ``seq`` is the concatenation of the stripped lines
		that follow it.

		Lines before the first header are ignored, a header with no sequence
		lines yields an empty ``seq``, and a file without any header yields
		nothing.

		:param input_file: path of the FASTA file to read.
		:return: generator of ``Entry`` named tuples in file order.
		"""
		# NOTE(review): this looks like an accidental re-paste of the
		# module-level gen_fasta whose indentation had been lost; the
		# structure is restored here - consider removing the duplicate.
		# Local import keeps the generator self-contained.
		from collections import namedtuple
		Entry = namedtuple("Entry",("header","seq"))
		header = None  # None until the first ">" line has been seen
		seq_buffer = []
		with open(input_file) as fasta_file:
			for line in fasta_file:
				line = line.strip()
				if line.startswith(">"):
					# Flush the previous record even when its buffer is
					# empty, so adjacent headers are not dropped.
					if header is not None:
						yield Entry(header,"".join(seq_buffer))
					header = line
					seq_buffer = []
				elif header is not None:
					seq_buffer.append(line)
			# Nothing to flush if the file contained no header at all.
			if header is not None:
				yield Entry(header,"".join(seq_buffer))