Skip to content

Instantly share code, notes, and snippets.

@brentp
Created August 27, 2011 15:02
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save brentp/1175483 to your computer and use it in GitHub Desktop.
Save brentp/1175483 to your computer and use it in GitHub Desktop.
calculate per-exon coverage of CpG islands
#!/usr/bin/python
import sys
from subprocess import Popen, PIPE
genome = sys.argv[1]
gene = sys.argv[2]
annos = "refGene"
def sh(cmd):
p = Popen(cmd, stdout=PIPE, stderr=PIPE, shell=True)
p.wait()
if p.returncode != 0:
print >>sys.stderr, "%s \nfailed" % (cmd)
print p.stderr.read()
sys.exit(p.returncode)
return p.stdout.read()
MYSQL = 'mysql --user=genome --host=genome-mysql.cse.ucsc.edu -A -D %(genome)s -e ' % dict(genome=genome)
def get_coverage(estart, eend, cpgs):
exon = set(range(estart, eend))
# inters is each cpg intersected with the exon
if len(cpgs) == 0: return 0
inters = [exon.intersection(range(cstart, cend)) for cstart, cend in cpgs]
# covered is the union of the intersections
if len(inters) == 1:
covered = inters[0]
else:
covered = inters[0].union(*inters[1:])
return len(covered) / float(len(exon))
def get_exons(gene_name, anno_table):
"""
get the exons for a gene by name
"""
sql = '"SELECT exonStarts,exonEnds from %(table)s where name2 = \'%(gene)s\' limit 1;"'
sql %= dict(table=anno_table, gene=gene_name)
exons = (sh(MYSQL + sql).split("\n")[1]).split()
starts = [int(x) for x in exons[0].rstrip(",").split(",")]
ends = [int(x) for x in exons[1].rstrip(",").split(",")]
return zip(starts, ends)
def get_cpgs(start, end):
sql = '"SELECT chromStart,chromEnd, perGc, name from cpgIslandExt where chromEnd >= %(start)i and chromStart <= %(end)i"'
sql %= locals()
cpgs = sh(MYSQL + sql).strip().split("\n")
cpgs = [c.split() for c in cpgs[1:]]
ses = [(int(x[0]), int(x[1])) for x in cpgs]
# name is "name:%gc"
names = ["%s:%s" % (x[3], x[2]) for x in cpgs]
return ses, names
exons = get_exons(gene, annos)
print "exon-start\texon-end\tcoverage\tcpg:pct,..."
for start, end in exons:
cpgs, names = get_cpgs(start, end)
print "\t".join(map(str, (start, end, get_coverage(start, end, cpgs), ",".join(names))))
$ python exon-cpg.py hg19 gata3
exon-start exon-end coverage cpg:pct,...
8096666 8096854 1.0 CpG::60.1
8097249 8097859 1.0 CpG::60.1
8100267 8100804 0.715083798883 CpG::71.4
8105955 8106101 0
8111435 8111561 0
8115701 8117164 0
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment