crazyhottommy/Tet1_CpG.py

## Tet1_CpG.py
# This code was modified from the tss plot code. It can plot any other ChIP-seq signal
# at other genomic positions in addtion to tss. In this case, it is the HRE. HIF1a ChIP-seq data
# is available, peaks were called by MACS, generated a bed file. the middle point
# of each peak is used as the center of the plot (you can also use summit of the peak from the exel file
# generated from MACS. HREs at promoters are not included
# 04/10/13

def TSS_Profile(ifile1,ifile2):
    '''read in three files, ifile1 is the sortedbamfile prepared by samtool
    ifile2 is the promoters (upstream 5kb of TSS) bed  file with five  columns: chr, start ,end, name and  strand'''

    import HTSeq
    import numpy
    import itertools

    sortedbamfile=HTSeq.BAM_Reader(ifile1)
    promoters = open(ifile2)
    halfwinwidth=5000
    fragmentsize=200


    promoters_pos=set()
    for line in promoters:
        linelist=line.split()
        if linelist[4] == "+":
            promoters_pos.add(HTSeq.GenomicPosition(linelist[0],int(linelist[2]),linelist[4]))
        else:
            promoters_pos.add(HTSeq.GenomicPosition(linelist[0],int(linelist[1]),linelist[4]))


    for promoter in itertools.islice(promoters_pos,10):
        print promoter  # print out 10 promoters

    profile=numpy.zeros(2*halfwinwidth, dtype='i')
    for p in promoters_pos:
        try:
            window=HTSeq.GenomicInterval(p.chrom, p.pos-halfwinwidth-fragmentsize,p.pos+halfwinwidth + fragmentsize,".")
            for almnt in sortedbamfile[window]:
                almnt.iv.length=fragmentsize
                if p.strand=="+":
                    start_in_window=almnt.iv.start- p.pos +halfwinwidth
                    end_in_window  =almnt.iv.end  - p.pos +halfwinwidth
                else:
                    start_in_window=p.pos+halfwinwidth-almnt.iv.end
                    end_in_window =p.pos+halfwinwidth-almnt.iv.start
                start_in_window=max(start_in_window,0)
                end_in_window=min(end_in_window, 2*halfwinwidth)
                if start_in_window >= 2*halfwinwidth or end_in_window <0:
                    continue
                profile[start_in_window : end_in_window] +=1
        except:
           continue
    return profile
    ifile1.close()
    ifile2.close()

def aligned_counts(ifile1):
    '''count how many alignments are aligned back to genome, ifile1 is a sorted bam file'''
    import HTSeq
    sortedbamfile= HTSeq.BAM_Reader(ifile1)
    aligned_counts=0
    unaligned_counts=0
    for almnt in sortedbamfile:
        if almnt.aligned:
            aligned_counts+= 1
        else:
            unaligned_counts+=1
    print "number of aligned tags of %s is %d " % (ifile1, aligned_counts)
    print "number of unaligned tags of %s is %d "% (ifile1, unaligned_counts)
    return aligned_counts

input_bamfile=("/home/tommy/Tet1/SRR764931_shGFP.sorted.bam")

#counts=aligned_counts(input_bamfile)

counts = 49437901

halfwinwidth=5000

profile1=TSS_Profile(input_bamfile,\
                     "/home/tommy/Tet1/CpG_rich_promoters.bed")
profile1_normalized= (profile1 *1000000.0/counts) /16171.0

profile2=TSS_Profile(input_bamfile,\
                     "/home/tommy/Tet1/CpG_poor_promoters.bed")
profile2_normalized= (profile2*1000000.0/counts) /16384.0


from matplotlib import pyplot
import numpy


line1=pyplot.plot(numpy.arange(-halfwinwidth, halfwinwidth), profile1_normalized, color="red",label="CpG_rich_promoters")
line2=pyplot.plot(numpy.arange(-halfwinwidth, halfwinwidth), profile2_normalized, color="blue",label="CpG_poor_promoters")

pyplot.legend()
pyplot.xlabel("distance related to TSS  bp")
pyplot.ylabel("MeDIP-seq  tag density cpm")
pyplot.title("MeDIP-seq intensity around TSS ")


pyplot.show()
	# This code was modified from the tss plot code. It can plot any other ChIP-seq signal
	# at other genomic positions in addtion to tss. In this case, it is the HRE. HIF1a ChIP-seq data
	# is available, peaks were called by MACS, generated a bed file. the middle point
	# of each peak is used as the center of the plot (you can also use summit of the peak from the exel file
	# generated from MACS. HREs at promoters are not included
	# 04/10/13

	def TSS_Profile(ifile1,ifile2):
	'''read in three files, ifile1 is the sortedbamfile prepared by samtool
	ifile2 is the promoters (upstream 5kb of TSS) bed file with five columns: chr, start ,end, name and strand'''

	import HTSeq
	import numpy
	import itertools

	sortedbamfile=HTSeq.BAM_Reader(ifile1)
	promoters = open(ifile2)
	halfwinwidth=5000
	fragmentsize=200


	promoters_pos=set()
	for line in promoters:
	linelist=line.split()
	if linelist[4] == "+":
	promoters_pos.add(HTSeq.GenomicPosition(linelist[0],int(linelist[2]),linelist[4]))
	else:
	promoters_pos.add(HTSeq.GenomicPosition(linelist[0],int(linelist[1]),linelist[4]))


	for promoter in itertools.islice(promoters_pos,10):
	print promoter # print out 10 promoters

	profile=numpy.zeros(2*halfwinwidth, dtype='i')
	for p in promoters_pos:
	try:
	window=HTSeq.GenomicInterval(p.chrom, p.pos-halfwinwidth-fragmentsize,p.pos+halfwinwidth + fragmentsize,".")
	for almnt in sortedbamfile[window]:
	almnt.iv.length=fragmentsize
	if p.strand=="+":
	start_in_window=almnt.iv.start- p.pos +halfwinwidth
	end_in_window =almnt.iv.end - p.pos +halfwinwidth
	else:
	start_in_window=p.pos+halfwinwidth-almnt.iv.end
	end_in_window =p.pos+halfwinwidth-almnt.iv.start
	start_in_window=max(start_in_window,0)
	end_in_window=min(end_in_window, 2*halfwinwidth)
	if start_in_window >= 2*halfwinwidth or end_in_window <0:
	continue
	profile[start_in_window : end_in_window] +=1
	except:
	continue
	return profile
	ifile1.close()
	ifile2.close()

	def aligned_counts(ifile1):
	'''count how many alignments are aligned back to genome, ifile1 is a sorted bam file'''
	import HTSeq
	sortedbamfile= HTSeq.BAM_Reader(ifile1)
	aligned_counts=0
	unaligned_counts=0
	for almnt in sortedbamfile:
	if almnt.aligned:
	aligned_counts+= 1
	else:
	unaligned_counts+=1
	print "number of aligned tags of %s is %d " % (ifile1, aligned_counts)
	print "number of unaligned tags of %s is %d "% (ifile1, unaligned_counts)
	return aligned_counts

	input_bamfile=("/home/tommy/Tet1/SRR764931_shGFP.sorted.bam")

	#counts=aligned_counts(input_bamfile)

	counts = 49437901

	halfwinwidth=5000

	profile1=TSS_Profile(input_bamfile,\
	"/home/tommy/Tet1/CpG_rich_promoters.bed")
	profile1_normalized= (profile1 *1000000.0/counts) /16171.0

	profile2=TSS_Profile(input_bamfile,\
	"/home/tommy/Tet1/CpG_poor_promoters.bed")
	profile2_normalized= (profile2*1000000.0/counts) /16384.0


	from matplotlib import pyplot
	import numpy


	line1=pyplot.plot(numpy.arange(-halfwinwidth, halfwinwidth), profile1_normalized, color="red",label="CpG_rich_promoters")
	line2=pyplot.plot(numpy.arange(-halfwinwidth, halfwinwidth), profile2_normalized, color="blue",label="CpG_poor_promoters")

	pyplot.legend()
	pyplot.xlabel("distance related to TSS bp")
	pyplot.ylabel("MeDIP-seq tag density cpm")
	pyplot.title("MeDIP-seq intensity around TSS ")


	pyplot.show()