Skip to content

Instantly share code, notes, and snippets.

@audy
Forked from anonymous/streaming_fasta_stats.py
Last active December 23, 2015 09:09
Show Gist options
  • Save audy/6612637 to your computer and use it in GitHub Desktop.
Save audy/6612637 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
# calculate mean and variance of the sequence lengths in a large fasta file
# method taken from:
# http://math.stackexchange.com/questions/20593/calculate-variance-from-a-stream-of-sample-values
# m_k = m_(k-1) + x_k + m_(k-1)
# v_k = v_(k-1) + (x_k -m_(k-1))(x_k - m_k)
# m - mean
# v - variance
from Bio import SeqIO
import argparse
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument('--fasta-file')
return parser.parse_args()
def main():
args = parse_args()
m = None
with open(args.fasta_file) as handle:
records = SeqIO.parse(handle, 'fasta')
# extremely un-pythonic literal translation of John D. Cook's C++ code
# http://www.johndcook.com/standard_deviation.html
for k, record in enumerate(records, start=1):
x = float(len(record.seq))
if k == 1:
m_oldM = m_newM = x
m_oldS = 0
else:
m_newM = m_oldM + (x - m_oldM)/k
m_newS = m_oldS + (x - m_oldM)*(x - m_newM)
m_oldM = m_newM
m_oldS = m_newS
print 'mean = %s, variance = %s' % (m_newM, m_newS/k)
if __name__ == '__main__':
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment