Skip to content

Instantly share code, notes, and snippets.

@kgori
Last active March 23, 2018 11:31
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kgori/06ab0a7b338e223c38022c09fc5ae037 to your computer and use it in GitHub Desktop.
Save kgori/06ab0a7b338e223c38022c09fc5ae037 to your computer and use it in GitHub Desktop.
Estimate insert size distribution of proper pairs in bam file
import pysam
import math
from itertools import islice
def filterfn(read):
return (read.is_proper_pair and
read.is_paired and
read.tlen > 0 and
not read.is_supplementary and
not read.is_duplicate and
not read.is_unmapped and
not read.mate_is_unmapped)
if __name__ == '__main__':
import sys
try:
filename = sys.argv[1]
samplesize = int(sys.argv[2])
except:
print('Usage: {} <filename:BAM> <samplesize:INT>'.format(sys.argv[0]))
bam = pysam.Samfile(filename, 'rb')
l = islice((read.tlen for read in bam if filterfn(read)), samplesize)
l = list(l)
assert len(l) == samplesize
mean = float(sum(l)) / len(l)
sdev = math.sqrt(float(sum([(x - mean)**2 for x in l])) / (len(l) - 1))
print('Insert size distribution of {}, based on {} reads:'.format(filename, samplesize))
print('Mean: {:.2f}\nStandard deviation: {:.2f}'.format(mean, sdev))
print('5*: {:.0f}'.format(round(mean + 5 * sdev)))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment