Created
August 27, 2011 15:02
-
-
Save brentp/1175483 to your computer and use it in GitHub Desktop.
calculate per-exon coverage of CpG islands
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
import sys | |
from subprocess import Popen, PIPE | |
genome = sys.argv[1] | |
gene = sys.argv[2] | |
annos = "refGene" | |
def sh(cmd): | |
p = Popen(cmd, stdout=PIPE, stderr=PIPE, shell=True) | |
p.wait() | |
if p.returncode != 0: | |
print >>sys.stderr, "%s \nfailed" % (cmd) | |
print p.stderr.read() | |
sys.exit(p.returncode) | |
return p.stdout.read() | |
MYSQL = 'mysql --user=genome --host=genome-mysql.cse.ucsc.edu -A -D %(genome)s -e ' % dict(genome=genome) | |
def get_coverage(estart, eend, cpgs): | |
exon = set(range(estart, eend)) | |
# inters is each cpg intersected with the exon | |
if len(cpgs) == 0: return 0 | |
inters = [exon.intersection(range(cstart, cend)) for cstart, cend in cpgs] | |
# covered is the union of the intersections | |
if len(inters) == 1: | |
covered = inters[0] | |
else: | |
covered = inters[0].union(*inters[1:]) | |
return len(covered) / float(len(exon)) | |
def get_exons(gene_name, anno_table): | |
""" | |
get the exons for a gene by name | |
""" | |
sql = '"SELECT exonStarts,exonEnds from %(table)s where name2 = \'%(gene)s\' limit 1;"' | |
sql %= dict(table=anno_table, gene=gene_name) | |
exons = (sh(MYSQL + sql).split("\n")[1]).split() | |
starts = [int(x) for x in exons[0].rstrip(",").split(",")] | |
ends = [int(x) for x in exons[1].rstrip(",").split(",")] | |
return zip(starts, ends) | |
def get_cpgs(start, end): | |
sql = '"SELECT chromStart,chromEnd, perGc, name from cpgIslandExt where chromEnd >= %(start)i and chromStart <= %(end)i"' | |
sql %= locals() | |
cpgs = sh(MYSQL + sql).strip().split("\n") | |
cpgs = [c.split() for c in cpgs[1:]] | |
ses = [(int(x[0]), int(x[1])) for x in cpgs] | |
# name is "name:%gc" | |
names = ["%s:%s" % (x[3], x[2]) for x in cpgs] | |
return ses, names | |
exons = get_exons(gene, annos) | |
print "exon-start\texon-end\tcoverage\tcpg:pct,..." | |
for start, end in exons: | |
cpgs, names = get_cpgs(start, end) | |
print "\t".join(map(str, (start, end, get_coverage(start, end, cpgs), ",".join(names)))) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
$ python exon-cpg.py hg19 gata3 | |
exon-start exon-end coverage cpg:pct,... | |
8096666 8096854 1.0 CpG::60.1 | |
8097249 8097859 1.0 CpG::60.1 | |
8100267 8100804 0.715083798883 CpG::71.4 | |
8105955 8106101 0 | |
8111435 8111561 0 | |
8115701 8117164 0 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment