willtownes/refgene2bed.py

## refgene2bed.py
"""
Python Script to read in a reference genome of refseq IDs and output several tab-delimited BED (text) files suitable for use with bedtools coverage for counting ChIP-seq reads that map to various gene features.
All output files have the structure expected by bedtools, namely,
CHROM POSITION1 POSITION2 REFSEQ_ID
Possible output files include:
1. distal promoter (transcription start [-5KB,-1KB]) KB means kilobase pairs, not kilobyte
2. proximal promoter (transcription start [-1KB,1KB])
3. gene body (anywhere between transcription start and transcription end)
4. transcript (anywhere in an exon)- outputs each exon as a separate line
5. first 1/3 transcript- outputs each exon as a separate line
6. middle 1/3 transcript- outputs each exon as a separate line
7. last 1/3 transcript- outputs each exon as a separate line
8. introns- outputs each intron as a separate line
"""

def splitExons(exons):
    '''given a list of tuples [(a,b),(c,d)...] representing intervals of exon locations, return a dictionary of tuple lists with the exons split out among first, mid, and last thirds of the overall transcript length. Exons that straddle the split point(s), will be split into separate exon intervals. Returns a dictionary of form:
    {"first":[(a,b),...],"mid":[(c,d),...],"last":[(e,f),...]}'''
    l = [i[1]-i[0] for i in exons] #all values should be positive!
    length = sum(l)
    Q = length/3 #boundary between first and mid
    R = 2*length/3 #boundary between mid and high
    first = []
    mid = []
    last = []
    acc = 0 #accumulated length
    for i in xrange(len(exons)):
        acc_new = acc + l[i]
        if acc_new <= Q: #entirely in the first third
            first.append(exons[i])
        elif Q<acc_new<=R and acc >Q: #entirely in the middle third
            mid.append(exons[i])
        elif acc_new>R and acc>R: #entirely in the last third
            last.append(exons[i])
        elif Q<acc_new<=R and acc<=Q: #straddling first and mid thirds
            cut_loc = exons[i][0]+Q-acc
            first.append((exons[i][0],cut_loc))
            mid.append((cut_loc,exons[i][1]))
        elif acc_new>R and Q<acc<=R: #straddling mid and last thirds
            cut_loc = exons[i][0]+R-acc
            mid.append((exons[i][0],cut_loc))
            last.append((cut_loc,exons[i][1]))
        elif acc_new>R and acc<=Q: #straddling all
            cut_loc_1 = exons[i][0]+Q-acc
            cut_loc_2 = exons[i][1]-(acc_new-R)
            first.append((exons[i][0],cut_loc_1))
            mid.append((cut_loc_1,cut_loc_2))
            last.append((cut_loc_2,exons[i][1]))
        else: raise Exception("Unable to process exon %d: %s"%(i,exons[i]))
        acc = acc_new
    return {"first":first,"mid":mid,"last":last}

def refgene_stream(genome_filename="refGene.txt"):
    """Given a refseq genome file (for example download from:
    http://hgdownload.soe.ucsc.edu/downloads.html
    streams out dicts of information about each line of the file in following format:
    "chrom":chromosome number
    "refseq_id":refseq ID
    "txstart":transcription start site
    "txend": transcription end site
    "exons": list of (start,end) tuples for all exons in the refseq_id/gene
    "introns": list of (start,end) tuples for all introns in the refseq_id/gene, should be one less than the number of exons.
    No uniqueness checks are performed.
    """
    with open(genome_filename,mode="r") as genome_file:
        for line in genome_file:
            vals=line.split('\t')
            #print("len(vals)=",len(vals)) #debugging
            exonstarts = [int(i) for i in vals[9][:-1].split(",")] #:-1 to get rid of trailing comma
            exonends = [int(i) for i in vals[10][:-1].split(",")]
            exons = zip(exonstarts,exonends)
            exon_bins = splitExons(exons)
            introns = [(exons[i-1][1],exons[i][0]) for i in xrange(1,len(exons))]
            yield {"chrom":vals[2],"refseq_id":vals[1],"txstart":int(vals[4]),"txend":int(vals[5]),"introns":introns,"exons":exons,"exon_bins":exon_bins}

def distalPromoter(g):
    '''given a gene/refseq dictionary emitted by refgene_stream, output info about the distal promoter as a tuple'''
    tx = g["txstart"]
    #note this can return an interval of form (0,0) if the gene is very close to the end of the chromosome!
    yield (g["chrom"],str(max(0,tx-5000)),str(max(0,tx-1000)),g["refseq_id"])

def proximalPromoter(g):
    '''given a gene/refseq dictionary emitted by refgene_stream, output info about the proximal promoter as a tuple'''
    tx = g["txstart"]
    yield (g["chrom"],str(max(0,tx-1000)),str(tx+1000),g["refseq_id"])

def geneBody(g):
    '''given a gene/refseq dictionary emitted by refgene_stream, output info about the gene body as a tuple'''
    yield (g["chrom"],str(g["txstart"]),str(g["txend"]),g["refseq_id"])

def exonStream(g,subset="all"):
    '''given a gene/refseq dictionary emitted by refgene_stream, stream out info about each exon.
    subset argument can take on several values (case sensitive!)
    subset="all": emit all exons
    subset="first": emit all exons in the first third of the transcript
    subset="mid": emit all exons in the middle third of the transcript
    subset="last": emit all exons in the last third of the transcript.
    For options "first","mid","last", if an exon lies on the first/mid or mid/last boundary, the exon is partitioned into two exons and assigned to the separate boundaries'''
    #if subset not in ("all","first","mid","last"):
    #    raise ValueError('Invalid Subset, correct values include "all","first","mid","last"')
    if subset=="all":
        exons = g["exons"]
    else:
        exons = g["exon_bins"][subset]
    for i in xrange(len(exons)):
        yield (g["chrom"],str(exons[i][0]),str(exons[i][1]),g["refseq_id"])#+"_exon"+str(i))

def exonsAll(g):
    return exonStream(g,subset="all")
def exonsFirst(g):
    return exonStream(g,subset="first")
def exonsMid(g):
    return exonStream(g,subset="mid")
def exonsLast(g):
    return exonStream(g,subset="last")

def introns(g):
    '''given a gene/refseq dictionary emitted by refgene_stream, stream out info about each intron.'''
    introns = g["introns"]
    for i in xrange(len(introns)):
        yield (g["chrom"],str(introns[i][0]),str(introns[i][1]),g["refseq_id"])#+"_intron"+str(i))

def genome2bed(genome,ofilename,func=geneBody):
    """write out BED (txt, tab delim) files for a feature (func is a function to stream out the features desired).
    genome can be a stream (for example from parsing refGene.txt via refgene_stream()."""
    with open(ofilename,"wb") as ofile:
        for g in genome:
            for res in func(g):
                ofile.write("\t".join(res)+"\n")

if __name__=="__main__":
    #genome = list(refgene_stream("refGene_test.txt"))
    #assert len(genome[0]["introns"])==len(genome[0]["exons"])-1
    genome = list(refgene_stream("refGene.txt")) #big object, but shouldn't take more than a few seconds to load
    methods = [distalPromoter,proximalPromoter,geneBody,exonsAll,exonsFirst,exonsMid,exonsLast,introns]
    ofiles = ["refGene%d.txt"%i for i in xrange(1,9)]
    with open("file_index_refGene.txt","w") as out:
        for i in xrange(8):
            out.write(ofiles[i]+"\t"+methods[i].func_name+"\n")
            genome2bed(genome,ofiles[i],methods[i])

# refGene1.txt distalPromoter
# refGene2.txt proximalPromoter
# refGene3.txt geneBody
# refGene4.txt exonsAll
# refGene5.txt exonsFirst
# refGene6.txt exonsMid
# refGene7.txt exonsLast
# refGene8.txt introns
	"""
	Python Script to read in a reference genome of refseq IDs and output several tab-delimited BED (text) files suitable for use with bedtools coverage for counting ChIP-seq reads that map to various gene features.
	All output files have the structure expected by bedtools, namely,
	CHROM POSITION1 POSITION2 REFSEQ_ID
	Possible output files include:
	1. distal promoter (transcription start [-5KB,-1KB]) KB means kilobase pairs, not kilobyte
	2. proximal promoter (transcription start [-1KB,1KB])
	3. gene body (anywhere between transcription start and transcription end)
	4. transcript (anywhere in an exon)- outputs each exon as a separate line
	5. first 1/3 transcript- outputs each exon as a separate line
	6. middle 1/3 transcript- outputs each exon as a separate line
	7. last 1/3 transcript- outputs each exon as a separate line
	8. introns- outputs each intron as a separate line
	"""

	def splitExons(exons):
	'''given a list of tuples [(a,b),(c,d)...] representing intervals of exon locations, return a dictionary of tuple lists with the exons split out among first, mid, and last thirds of the overall transcript length. Exons that straddle the split point(s), will be split into separate exon intervals. Returns a dictionary of form:
	{"first":[(a,b),...],"mid":[(c,d),...],"last":[(e,f),...]}'''
	l = [i[1]-i[0] for i in exons] #all values should be positive!
	length = sum(l)
	Q = length/3 #boundary between first and mid
	R = 2*length/3 #boundary between mid and high
	first = []
	mid = []
	last = []
	acc = 0 #accumulated length
	for i in xrange(len(exons)):
	acc_new = acc + l[i]
	if acc_new <= Q: #entirely in the first third
	first.append(exons[i])
	elif Q<acc_new<=R and acc >Q: #entirely in the middle third
	mid.append(exons[i])
	elif acc_new>R and acc>R: #entirely in the last third
	last.append(exons[i])
	elif Q<acc_new<=R and acc<=Q: #straddling first and mid thirds
	cut_loc = exons[i][0]+Q-acc
	first.append((exons[i][0],cut_loc))
	mid.append((cut_loc,exons[i][1]))
	elif acc_new>R and Q<acc<=R: #straddling mid and last thirds
	cut_loc = exons[i][0]+R-acc
	mid.append((exons[i][0],cut_loc))
	last.append((cut_loc,exons[i][1]))
	elif acc_new>R and acc<=Q: #straddling all
	cut_loc_1 = exons[i][0]+Q-acc
	cut_loc_2 = exons[i][1]-(acc_new-R)
	first.append((exons[i][0],cut_loc_1))
	mid.append((cut_loc_1,cut_loc_2))
	last.append((cut_loc_2,exons[i][1]))
	else: raise Exception("Unable to process exon %d: %s"%(i,exons[i]))
	acc = acc_new
	return {"first":first,"mid":mid,"last":last}

	def refgene_stream(genome_filename="refGene.txt"):
	"""Given a refseq genome file (for example download from:
	http://hgdownload.soe.ucsc.edu/downloads.html
	streams out dicts of information about each line of the file in following format:
	"chrom":chromosome number
	"refseq_id":refseq ID
	"txstart":transcription start site
	"txend": transcription end site
	"exons": list of (start,end) tuples for all exons in the refseq_id/gene
	"introns": list of (start,end) tuples for all introns in the refseq_id/gene, should be one less than the number of exons.
	No uniqueness checks are performed.
	"""
	with open(genome_filename,mode="r") as genome_file:
	for line in genome_file:
	vals=line.split('\t')
	#print("len(vals)=",len(vals)) #debugging
	exonstarts = [int(i) for i in vals[9][:-1].split(",")] #:-1 to get rid of trailing comma
	exonends = [int(i) for i in vals[10][:-1].split(",")]
	exons = zip(exonstarts,exonends)
	exon_bins = splitExons(exons)
	introns = [(exons[i-1][1],exons[i][0]) for i in xrange(1,len(exons))]
	yield {"chrom":vals[2],"refseq_id":vals[1],"txstart":int(vals[4]),"txend":int(vals[5]),"introns":introns,"exons":exons,"exon_bins":exon_bins}

	def distalPromoter(g):
	'''given a gene/refseq dictionary emitted by refgene_stream, output info about the distal promoter as a tuple'''
	tx = g["txstart"]
	#note this can return an interval of form (0,0) if the gene is very close to the end of the chromosome!
	yield (g["chrom"],str(max(0,tx-5000)),str(max(0,tx-1000)),g["refseq_id"])

	def proximalPromoter(g):
	'''given a gene/refseq dictionary emitted by refgene_stream, output info about the proximal promoter as a tuple'''
	tx = g["txstart"]
	yield (g["chrom"],str(max(0,tx-1000)),str(tx+1000),g["refseq_id"])

	def geneBody(g):
	'''given a gene/refseq dictionary emitted by refgene_stream, output info about the gene body as a tuple'''
	yield (g["chrom"],str(g["txstart"]),str(g["txend"]),g["refseq_id"])

	def exonStream(g,subset="all"):
	'''given a gene/refseq dictionary emitted by refgene_stream, stream out info about each exon.
	subset argument can take on several values (case sensitive!)
	subset="all": emit all exons
	subset="first": emit all exons in the first third of the transcript
	subset="mid": emit all exons in the middle third of the transcript
	subset="last": emit all exons in the last third of the transcript.
	For options "first","mid","last", if an exon lies on the first/mid or mid/last boundary, the exon is partitioned into two exons and assigned to the separate boundaries'''
	#if subset not in ("all","first","mid","last"):
	# raise ValueError('Invalid Subset, correct values include "all","first","mid","last"')
	if subset=="all":
	exons = g["exons"]
	else:
	exons = g["exon_bins"][subset]
	for i in xrange(len(exons)):
	yield (g["chrom"],str(exons[i][0]),str(exons[i][1]),g["refseq_id"])#+"_exon"+str(i))

	def exonsAll(g):
	return exonStream(g,subset="all")
	def exonsFirst(g):
	return exonStream(g,subset="first")
	def exonsMid(g):
	return exonStream(g,subset="mid")
	def exonsLast(g):
	return exonStream(g,subset="last")

	def introns(g):
	'''given a gene/refseq dictionary emitted by refgene_stream, stream out info about each intron.'''
	introns = g["introns"]
	for i in xrange(len(introns)):
	yield (g["chrom"],str(introns[i][0]),str(introns[i][1]),g["refseq_id"])#+"_intron"+str(i))

	def genome2bed(genome,ofilename,func=geneBody):
	"""write out BED (txt, tab delim) files for a feature (func is a function to stream out the features desired).
	genome can be a stream (for example from parsing refGene.txt via refgene_stream()."""
	with open(ofilename,"wb") as ofile:
	for g in genome:
	for res in func(g):
	ofile.write("\t".join(res)+"\n")

	if __name__=="__main__":
	#genome = list(refgene_stream("refGene_test.txt"))
	#assert len(genome[0]["introns"])==len(genome[0]["exons"])-1
	genome = list(refgene_stream("refGene.txt")) #big object, but shouldn't take more than a few seconds to load
	methods = [distalPromoter,proximalPromoter,geneBody,exonsAll,exonsFirst,exonsMid,exonsLast,introns]
	ofiles = ["refGene%d.txt"%i for i in xrange(1,9)]
	with open("file_index_refGene.txt","w") as out:
	for i in xrange(8):
	out.write(ofiles[i]+"\t"+methods[i].func_name+"\n")
	genome2bed(genome,ofiles[i],methods[i])

	# refGene1.txt distalPromoter
	# refGene2.txt proximalPromoter
	# refGene3.txt geneBody
	# refGene4.txt exonsAll
	# refGene5.txt exonsFirst
	# refGene6.txt exonsMid
	# refGene7.txt exonsLast
	# refGene8.txt introns