/streaming_fasta_stats.py

## streaming_fasta_stats.py
#!/usr/bin/env python

# calculate mean and variance of the sequence lengths in a large fasta file

# method taken from:
# http://math.stackexchange.com/questions/20593/calculate-variance-from-a-stream-of-sample-values

# m_k = m_(k-1) + x_k + m_(k-1)
# v_k = v_(k-1) + (x_k -m_(k-1))(x_k - m_k)

# m - mean
# v - variance

from Bio import SeqIO
import argparse

def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--fasta-file')
    return parser.parse_args()

def main():
    args = parse_args()

    m = None

    with open(args.fasta_file) as handle:
        records = SeqIO.parse(handle, 'fasta')

        # extremely un-pythonic literal translation of John D. Cook's C++ code
        # http://www.johndcook.com/standard_deviation.html
        for k, record in enumerate(records, start=1):
            x = float(len(record.seq))
            if k == 1:
                m_oldM = m_newM = x
                m_oldS = 0
            else:
                m_newM = m_oldM + (x - m_oldM)/k
                m_newS = m_oldS + (x - m_oldM)*(x - m_newM)

                m_oldM = m_newM
                m_oldS = m_newS


        print 'mean = %s, variance = %s' % (m_newM, m_newS)

if __name__ == '__main__':
    main()
	#!/usr/bin/env python

	# calculate mean and variance of the sequence lengths in a large fasta file

	# method taken from:
	# http://math.stackexchange.com/questions/20593/calculate-variance-from-a-stream-of-sample-values

	# m_k = m_(k-1) + x_k + m_(k-1)
	# v_k = v_(k-1) + (x_k -m_(k-1))(x_k - m_k)

	# m - mean
	# v - variance

	from Bio import SeqIO
	import argparse

	def parse_args():
	parser = argparse.ArgumentParser()
	parser.add_argument('--fasta-file')
	return parser.parse_args()

	def main():
	args = parse_args()

	m = None

	with open(args.fasta_file) as handle:
	records = SeqIO.parse(handle, 'fasta')

	# extremely un-pythonic literal translation of John D. Cook's C++ code
	# http://www.johndcook.com/standard_deviation.html
	for k, record in enumerate(records, start=1):
	x = float(len(record.seq))
	if k == 1:
	m_oldM = m_newM = x
	m_oldS = 0
	else:
	m_newM = m_oldM + (x - m_oldM)/k
	m_newS = m_oldS + (x - m_oldM)*(x - m_newM)

	m_oldM = m_newM
	m_oldS = m_newS


	print 'mean = %s, variance = %s' % (m_newM, m_newS)

	if __name__ == '__main__':
	main()