Skip to content

Instantly share code, notes, and snippets.

@crazyhottommy
Last active December 31, 2015 02:29
Show Gist options
  • Save crazyhottommy/7921488 to your computer and use it in GitHub Desktop.
Save crazyhottommy/7921488 to your computer and use it in GitHub Desktop.
def TSS_Profile(ifile1,ifile2):
'''read in three files, ifile1 is the sortedbamfile prepared by samtool
ifile2 is the promoters (upstream 5kb of TSS) bed file with five columns: chr, start ,end, name and strand'''
import HTSeq
import numpy
import itertools
sortedbamfile=HTSeq.BAM_Reader(ifile1)
promoters = open(ifile2)
halfwinwidth=5000
fragmentsize=200
promoters_pos=set()
for line in promoters:
linelist=line.split()
if linelist[4] == "+":
promoters_pos.add(HTSeq.GenomicPosition(linelist[0],int(linelist[2]),linelist[4]))
else:
promoters_pos.add(HTSeq.GenomicPosition(linelist[0],int(linelist[1]),linelist[4]))
for promoter in itertools.islice(promoters_pos,10):
print promoter # print out 10 promoters
profile=numpy.zeros(2*halfwinwidth, dtype='i')
for p in promoters_pos:
try:
window=HTSeq.GenomicInterval(p.chrom, p.pos-halfwinwidth-fragmentsize,p.pos+halfwinwidth + fragmentsize,".")
for almnt in sortedbamfile[window]:
almnt.iv.length=fragmentsize
if p.strand=="+":
start_in_window=almnt.iv.start- p.pos +halfwinwidth
end_in_window =almnt.iv.end - p.pos +halfwinwidth
else:
start_in_window=p.pos+halfwinwidth-almnt.iv.end
end_in_window =p.pos+halfwinwidth-almnt.iv.start
start_in_window=max(start_in_window,0)
end_in_window=min(end_in_window, 2*halfwinwidth)
if start_in_window >= 2*halfwinwidth or end_in_window <0:
continue
profile[start_in_window : end_in_window] +=1
except:
continue
return profile
ifile1.close()
ifile2.close()
def aligned_counts(ifile1):
'''count how many alignments are aligned back to genome, ifile1 is a sorted bam file'''
import HTSeq
sortedbamfile= HTSeq.BAM_Reader(ifile1)
aligned_counts=0
unaligned_counts=0
for almnt in sortedbamfile:
if almnt.aligned:
aligned_counts+= 1
else:
unaligned_counts+=1
print "number of aligned tags of %s is %d " % (ifile1, aligned_counts)
print "number of unaligned tags of %s is %d "% (ifile1, unaligned_counts)
return aligned_counts
input_bamfile=("/home/tommy/Tet1/Tet1.sorted.bam")
counts=aligned_counts(input_bamfile)
halfwinwidth=5000
profile1=TSS_Profile(input_bamfile,\
"/home/tommy/Tet1/CpG_rich_promoters.bed")
profile1_normalized= (profile1 *1000000.0/counts) /16171.0
#normalize to count per million (cpm)
profile2=TSS_Profile(input_bamfile,\
"/home/tommy/Tet1/CpG_poor_promoters.bed")
profile2_normalized= (profile2*1000000.0/counts) /16384.0
from matplotlib import pyplot
import numpy
line1=pyplot.plot(numpy.arange(-halfwinwidth, halfwinwidth), profile1_normalized, color="red",label="CpG_rich_promoters")
line2=pyplot.plot(numpy.arange(-halfwinwidth, halfwinwidth), profile2_normalized, color="blue",label="CpG_poor_promoters")
pyplot.legend()
pyplot.xlabel("distance related to TSS bp")
pyplot.ylabel("Tet1 tag density cpm")
pyplot.title("Tet1 ChIP-seq intensity around TSS ")
pyplot.show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment