Skip to content

Instantly share code, notes, and snippets.

What would you like to do?
Counting read spacing
#!/usr/env python
from argparse import ArgumentParser
import os, re
from pybedtools import BedTool
import HTSeq
import numpy as np
import pandas as pd
import string
import itertools
import cPickle as pickle
def makeWindows(windowWidth, genome):
"""Generate 1kb windows genome-wide."""
w = BedTool.window_maker(BedTool(), genome=genome, w=windowWidth)
windows = dict()
for interval in w:
feature = HTSeq.GenomicInterval(
name = string.join(interval.fields, sep="_")
windows[name] = feature
return windows
def distances(bam, intervals, fragmentsize, duplicates=True):
""" Gets read coverage in bed regions, returns dict with region:count.
bam - Bam object from HTSeq.BAM_Reader.
intervals - dict with HTSeq.GenomicInterval objects as values.
fragmentsize - integer.
duplicates - boolean.
dists = dict()
for name, feature in intervals.items():
# Fetch alignments in feature window
for aln1, aln2 in itertools.combinations(bam[feature], 2):
# check if duplicate
if not duplicates and (aln1.pcr_or_optical_duplicate or aln2.pcr_or_optical_duplicate):
# adjust fragment to size
aln1.iv.length = fragmentsize
aln2.iv.length = fragmentsize
dist = abs(aln1.iv.start - aln2.iv.start)
# add +1 to dict
if dist not in dists.keys():
dists[dist] = 1
dists[dist] += 1
return dists
def main(args):
args.plots_dir = os.path.abspath(args.plots_dir)
# Get sample names
names = list()
for bam in args.bamfiles:
names.append(re.sub("\.bam", "", os.path.basename(bam)))
# Get genome-wide windows
print("Making %ibp windows genome-wide" % args.window_width)
windows = makeWindows(args.window_width, args.genome)
# Loop through all signals, compute distances, plot
for bam in xrange(len(args.bamfiles)):
print("Sample " + names[bam])
# Load bam
bamfile = HTSeq.BAM_Reader(os.path.abspath(args.bamfiles[bam]))
# Get dataframe of bam coverage in bed regions, append to dict
dists = distances(bamfile, windows, args.fragment_size, args.duplicates)
pickle.dump(dists, open(names[bam] + ".counts.pickle", "wb"), protocol = pickle.HIGHEST_PROTOCOL)
x = range(50, 200)
y = [dists[i] for i in x]
plt.scatter(x, y)
p20 = np.poly1d(np.polyfit(x, y, 20))
p50 = np.poly1d(np.polyfit(x, y, 50))
p100 = np.poly1d(np.polyfit(x, y, 100))
plt.plot(x, p20(x), '-', x, p50(x), '--', x, p100(x), '.')
plt.savefig(os.path.join(args.plots_dir, names[bam] + ".fit.pdf"))
if __name__ == '__main__':
### Parse command-line arguments
parser = ArgumentParser(
description = '',
usage = 'python <directory> file1, file2... '
### Global options
# positional arguments
parser.add_argument(dest='plots_dir', type=str, help='Directory to save plots to.')
parser.add_argument('bamfiles', nargs = '*', help = 'bamFiles')
# optional arguments
parser.add_argument('--duplicates', dest='duplicates', action='store_true')
parser.add_argument('--window-width', dest='window_width', type=int, default=1000)
parser.add_argument('--fragment-size', dest='fragment_size', type=int, default=1)
parser.add_argument('--genome', dest='genome', type=str, default='hg19')
args = parser.parse_args()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment