# kgori/fasta.py

## fasta.py
import re

# Fasta one-liners
# Parse fasta from a string
def fasta_parse_string(s):
    """Parse FASTA-formatted text into a dict of description -> sequence.

    A record starts at every line whose first character is '>'.  The rest
    of that line, stripped of surrounding whitespace, becomes the key; all
    text up to the next header line, with every whitespace character
    removed, becomes the value.  Text before the first header line is
    ignored.  If a description occurs more than once, the last record wins.

    Args:
        s: The FASTA text.

    Returns:
        dict mapping each description string to its sequence string.
    """
    records = {}
    # Split only on '>' at the start of a line: a bare split('>') would also
    # cut description lines that contain '>' (e.g. "A>G") into bogus records.
    for chunk in re.split(r'^>', s, flags=re.MULTILINE)[1:]:
        header, _, body = chunk.partition('\n')
        records[header.strip()] = re.sub(r'\s+', '', body)
    return records

# Parse fasta from a file
def fasta_parse_file(f):
    """Read the file at path ``f`` in full and parse it as FASTA.

    Args:
        f: Path passed to ``open``.

    Returns:
        Whatever ``fasta_parse_string`` returns for the file's contents.
    """
    with open(f) as handle:
        contents = handle.read()
    return fasta_parse_string(contents)

# Write data (d) back into fasta format, breaking into lines (length=n)
def fasta_format(d, n):
    """Render a mapping of description -> sequence as FASTA text.

    Each entry becomes a '>' header line followed by the sequence cut into
    pieces of at most ``n`` characters, one piece per line.  Records are
    joined with a newline; the result has no trailing newline.

    Args:
        d: Mapping whose ``items()`` yields (description, sequence) pairs.
        n: Number of sequence characters per output line.

    Returns:
        The formatted FASTA string.
    """
    records = []
    for name, seq in d.items():
        pieces = [seq[start:start + n] for start in range(0, len(seq), n)]
        records.append('>{}\n'.format(name) + '\n'.join(pieces))
    return '\n'.join(records)

# Memory efficient fasta
class Fasta(object):
    """Iterate over the records of a FASTA file, reading one line at a time.

    Each step of the iteration returns a ``(description, sequence)`` tuple:
    the header line without its leading '>' and surrounding whitespace, and
    the following lines up to the next header with all whitespace removed.
    Lines before the first header are skipped.  The file is closed when its
    end is reached, or earlier via ``close()`` / use as a context manager.

    Attributes:
        file: The open file object being read.
        buffer: The most recently read line ('' once the end is reached).
        exhausted: True once no further records can be returned.
    """

    def __init__(self, filename):
        """Open ``filename`` and advance to the first header line."""
        self.file = open(filename, 'r')
        self.buffer = ''
        self.exhausted = False
        try:
            while not self.buffer.startswith('>'):
                self.consume()
        except StopIteration:
            # No header line anywhere in the file: behave as an empty
            # iterator rather than leaking StopIteration to the caller.
            self.exhausted = True

    def consume(self):
        """Read the next line into ``self.buffer``.

        Raises:
            StopIteration: at end of file, after closing the file.
        """
        self.buffer = self.file.readline()
        if self.buffer == '':
            self.file.close()
            raise StopIteration

    def close(self):
        """Close the underlying file and end the iteration."""
        self.exhausted = True
        self.file.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def __next__(self):
        """Return the next ``(description, sequence)`` tuple.

        Raises:
            StopIteration: when every record has been returned.
        """
        if self.exhausted:
            raise StopIteration

        # lstrip, not strip: only the leading marker is removed, so a
        # description ending in '>' on an unterminated last line is kept.
        desc = self.buffer.lstrip('>').strip()
        seq = []

        # Every consume() is guarded, including the one right after the
        # header, so a final record with no sequence lines is still returned.
        try:
            self.consume()
            while not self.buffer.startswith('>'):
                seq.append(self.buffer)
                self.consume()
        except StopIteration:
            self.exhausted = True

        return desc, re.sub(r'\s+', '', ''.join(seq))

    # Python 2 spells the iterator method ``next``; keep both names working.
    next = __next__

    def __iter__(self):
        return self

def parse_fasta(filename):
    """Collect every record produced by ``Fasta(filename)`` into a dict.

    Args:
        filename: Path handed to ``Fasta``.

    Returns:
        dict mapping the first element of each yielded pair to the second;
        a repeated key keeps the value from the later pair.
    """
    records = {}
    for name, sequence in Fasta(filename):
        records[name] = sequence
    return records
	import re

	# Fasta one-liners
	# Parse fasta from a string
	def fasta_parse_string(s):
		"""Parse FASTA-formatted text into a dict of description -> sequence.

		A record starts at every line whose first character is '>'.  The rest
		of that line, stripped of surrounding whitespace, becomes the key; all
		text up to the next header line, with every whitespace character
		removed, becomes the value.  Text before the first header line is
		ignored.  If a description occurs more than once, the last record wins.

		Args:
			s: The FASTA text.

		Returns:
			dict mapping each description string to its sequence string.
		"""
		records = {}
		# Split only on '>' at the start of a line: a bare split('>') would
		# also cut description lines containing '>' into bogus records.
		for chunk in re.split(r'^>', s, flags=re.MULTILINE)[1:]:
			header, _, body = chunk.partition('\n')
			records[header.strip()] = re.sub(r'\s+', '', body)
		return records

	# Parse fasta from a file
	def fasta_parse_file(f):
		"""Read the file at path ``f`` in full and parse it as FASTA.

		Args:
			f: Path passed to ``open``.

		Returns:
			Whatever ``fasta_parse_string`` returns for the file's contents.
		"""
		with open(f) as handle:
			contents = handle.read()
		return fasta_parse_string(contents)

	# Write data (d) back into fasta format, breaking into lines (length=n)
	def fasta_format(d, n):
		"""Render a mapping of description -> sequence as FASTA text.

		Each entry becomes a '>' header line followed by the sequence cut into
		pieces of at most ``n`` characters, one piece per line.  Records are
		joined with a newline; the result has no trailing newline.

		Args:
			d: Mapping whose ``items()`` yields (description, sequence) pairs.
			n: Number of sequence characters per output line.

		Returns:
			The formatted FASTA string.
		"""
		records = []
		for name, seq in d.items():
			pieces = [seq[start:start + n] for start in range(0, len(seq), n)]
			records.append('>{}\n'.format(name) + '\n'.join(pieces))
		return '\n'.join(records)

	# Memory efficient fasta
	class Fasta(object):
		"""Iterate over the records of a FASTA file, reading one line at a time.

		Each step of the iteration returns a ``(description, sequence)`` tuple:
		the header line without its leading '>' and surrounding whitespace, and
		the following lines up to the next header with all whitespace removed.
		Lines before the first header are skipped.  The file is closed when its
		end is reached, or earlier via ``close()`` / use as a context manager.

		Attributes:
			file: The open file object being read.
			buffer: The most recently read line ('' once the end is reached).
			exhausted: True once no further records can be returned.
		"""

		def __init__(self, filename):
			"""Open ``filename`` and advance to the first header line."""
			self.file = open(filename, 'r')
			self.buffer = ''
			self.exhausted = False
			try:
				while not self.buffer.startswith('>'):
					self.consume()
			except StopIteration:
				# No header line anywhere in the file: behave as an empty
				# iterator rather than leaking StopIteration to the caller.
				self.exhausted = True

		def consume(self):
			"""Read the next line into ``self.buffer``.

			Raises:
				StopIteration: at end of file, after closing the file.
			"""
			self.buffer = self.file.readline()
			if self.buffer == '':
				self.file.close()
				raise StopIteration

		def close(self):
			"""Close the underlying file and end the iteration."""
			self.exhausted = True
			self.file.close()

		def __enter__(self):
			return self

		def __exit__(self, exc_type, exc_value, traceback):
			self.close()

		def __next__(self):
			"""Return the next ``(description, sequence)`` tuple.

			Raises:
				StopIteration: when every record has been returned.
			"""
			if self.exhausted:
				raise StopIteration

			# lstrip, not strip: only the leading marker is removed, so a
			# description ending in '>' on an unterminated last line is kept.
			desc = self.buffer.lstrip('>').strip()
			seq = []

			# Every consume() is guarded, including the one right after the
			# header, so a final record with no sequence lines is returned.
			try:
				self.consume()
				while not self.buffer.startswith('>'):
					seq.append(self.buffer)
					self.consume()
			except StopIteration:
				self.exhausted = True

			return desc, re.sub(r'\s+', '', ''.join(seq))

		# Python 2 spells the iterator method ``next``; keep both names working.
		next = __next__

		def __iter__(self):
			return self

	def parse_fasta(filename):
		"""Collect every record produced by ``Fasta(filename)`` into a dict.

		Args:
			filename: Path handed to ``Fasta``.

		Returns:
			dict mapping the first element of each yielded pair to the second;
			a repeated key keeps the value from the later pair.
		"""
		records = {}
		for name, sequence in Fasta(filename):
			records[name] = sequence
		return records