Skip to content

Instantly share code, notes, and snippets.

@willtownes
Last active January 30, 2024 16:31
Show Gist options
  • Save willtownes/ebb87e7b224dd206a3d2 to your computer and use it in GitHub Desktop.
Save willtownes/ebb87e7b224dd206a3d2 to your computer and use it in GitHub Desktop.
Splits a refGene.txt file into multiple .bed files for various genome features (exon,intron, etc), suitable for input to bedtools coverage
"""
Python Script to read in a reference genome of refseq IDs and output several tab-delimited BED (text) files suitable for use with bedtools coverage for counting ChIP-seq reads that map to various gene features.
All output files have the structure expected by bedtools, namely,
CHROM POSITION1 POSITION2 REFSEQ_ID
Possible output files include:
1. distal promoter (transcription start [-5KB,-1KB]) KB means kilobase pairs, not kilobyte
2. proximal promoter (transcription start [-1KB,1KB])
3. gene body (anywhere between transcription start and transcription end)
4. transcript (anywhere in an exon)- outputs each exon as a separate line
5. first 1/3 transcript- outputs each exon as a separate line
6. middle 1/3 transcript- outputs each exon as a separate line
7. last 1/3 transcript- outputs each exon as a separate line
8. introns- outputs each intron as a separate line
"""
def splitExons(exons):
'''given a list of tuples [(a,b),(c,d)...] representing intervals of exon locations, return a dictionary of tuple lists with the exons split out among first, mid, and last thirds of the overall transcript length. Exons that straddle the split point(s), will be split into separate exon intervals. Returns a dictionary of form:
{"first":[(a,b),...],"mid":[(c,d),...],"last":[(e,f),...]}'''
l = [i[1]-i[0] for i in exons] #all values should be positive!
length = sum(l)
Q = length/3 #boundary between first and mid
R = 2*length/3 #boundary between mid and high
first = []
mid = []
last = []
acc = 0 #accumulated length
for i in xrange(len(exons)):
acc_new = acc + l[i]
if acc_new <= Q: #entirely in the first third
first.append(exons[i])
elif Q<acc_new<=R and acc >Q: #entirely in the middle third
mid.append(exons[i])
elif acc_new>R and acc>R: #entirely in the last third
last.append(exons[i])
elif Q<acc_new<=R and acc<=Q: #straddling first and mid thirds
cut_loc = exons[i][0]+Q-acc
first.append((exons[i][0],cut_loc))
mid.append((cut_loc,exons[i][1]))
elif acc_new>R and Q<acc<=R: #straddling mid and last thirds
cut_loc = exons[i][0]+R-acc
mid.append((exons[i][0],cut_loc))
last.append((cut_loc,exons[i][1]))
elif acc_new>R and acc<=Q: #straddling all
cut_loc_1 = exons[i][0]+Q-acc
cut_loc_2 = exons[i][1]-(acc_new-R)
first.append((exons[i][0],cut_loc_1))
mid.append((cut_loc_1,cut_loc_2))
last.append((cut_loc_2,exons[i][1]))
else: raise Exception("Unable to process exon %d: %s"%(i,exons[i]))
acc = acc_new
return {"first":first,"mid":mid,"last":last}
def refgene_stream(genome_filename="refGene.txt"):
"""Given a refseq genome file (for example download from:
http://hgdownload.soe.ucsc.edu/downloads.html
streams out dicts of information about each line of the file in following format:
"chrom":chromosome number
"refseq_id":refseq ID
"txstart":transcription start site
"txend": transcription end site
"exons": list of (start,end) tuples for all exons in the refseq_id/gene
"introns": list of (start,end) tuples for all introns in the refseq_id/gene, should be one less than the number of exons.
No uniqueness checks are performed.
"""
with open(genome_filename,mode="r") as genome_file:
for line in genome_file:
vals=line.split('\t')
#print("len(vals)=",len(vals)) #debugging
exonstarts = [int(i) for i in vals[9][:-1].split(",")] #:-1 to get rid of trailing comma
exonends = [int(i) for i in vals[10][:-1].split(",")]
exons = zip(exonstarts,exonends)
exon_bins = splitExons(exons)
introns = [(exons[i-1][1],exons[i][0]) for i in xrange(1,len(exons))]
yield {"chrom":vals[2],"refseq_id":vals[1],"txstart":int(vals[4]),"txend":int(vals[5]),"introns":introns,"exons":exons,"exon_bins":exon_bins}
def distalPromoter(g):
'''given a gene/refseq dictionary emitted by refgene_stream, output info about the distal promoter as a tuple'''
tx = g["txstart"]
#note this can return an interval of form (0,0) if the gene is very close to the end of the chromosome!
yield (g["chrom"],str(max(0,tx-5000)),str(max(0,tx-1000)),g["refseq_id"])
def proximalPromoter(g):
'''given a gene/refseq dictionary emitted by refgene_stream, output info about the proximal promoter as a tuple'''
tx = g["txstart"]
yield (g["chrom"],str(max(0,tx-1000)),str(tx+1000),g["refseq_id"])
def geneBody(g):
'''given a gene/refseq dictionary emitted by refgene_stream, output info about the gene body as a tuple'''
yield (g["chrom"],str(g["txstart"]),str(g["txend"]),g["refseq_id"])
def exonStream(g,subset="all"):
'''given a gene/refseq dictionary emitted by refgene_stream, stream out info about each exon.
subset argument can take on several values (case sensitive!)
subset="all": emit all exons
subset="first": emit all exons in the first third of the transcript
subset="mid": emit all exons in the middle third of the transcript
subset="last": emit all exons in the last third of the transcript.
For options "first","mid","last", if an exon lies on the first/mid or mid/last boundary, the exon is partitioned into two exons and assigned to the separate boundaries'''
#if subset not in ("all","first","mid","last"):
# raise ValueError('Invalid Subset, correct values include "all","first","mid","last"')
if subset=="all":
exons = g["exons"]
else:
exons = g["exon_bins"][subset]
for i in xrange(len(exons)):
yield (g["chrom"],str(exons[i][0]),str(exons[i][1]),g["refseq_id"])#+"_exon"+str(i))
def exonsAll(g):
return exonStream(g,subset="all")
def exonsFirst(g):
return exonStream(g,subset="first")
def exonsMid(g):
return exonStream(g,subset="mid")
def exonsLast(g):
return exonStream(g,subset="last")
def introns(g):
'''given a gene/refseq dictionary emitted by refgene_stream, stream out info about each intron.'''
introns = g["introns"]
for i in xrange(len(introns)):
yield (g["chrom"],str(introns[i][0]),str(introns[i][1]),g["refseq_id"])#+"_intron"+str(i))
def genome2bed(genome,ofilename,func=geneBody):
"""write out BED (txt, tab delim) files for a feature (func is a function to stream out the features desired).
genome can be a stream (for example from parsing refGene.txt via refgene_stream()."""
with open(ofilename,"wb") as ofile:
for g in genome:
for res in func(g):
ofile.write("\t".join(res)+"\n")
if __name__=="__main__":
#genome = list(refgene_stream("refGene_test.txt"))
#assert len(genome[0]["introns"])==len(genome[0]["exons"])-1
genome = list(refgene_stream("refGene.txt")) #big object, but shouldn't take more than a few seconds to load
methods = [distalPromoter,proximalPromoter,geneBody,exonsAll,exonsFirst,exonsMid,exonsLast,introns]
ofiles = ["refGene%d.txt"%i for i in xrange(1,9)]
with open("file_index_refGene.txt","w") as out:
for i in xrange(8):
out.write(ofiles[i]+"\t"+methods[i].func_name+"\n")
genome2bed(genome,ofiles[i],methods[i])
# refGene1.txt distalPromoter
# refGene2.txt proximalPromoter
# refGene3.txt geneBody
# refGene4.txt exonsAll
# refGene5.txt exonsFirst
# refGene6.txt exonsMid
# refGene7.txt exonsLast
# refGene8.txt introns
@willtownes
Copy link
Author

Thanks for the note. I wrote this so long ago I don't even remember what it was for (maybe a homework assignment?). Future users should consider the gist deprecated. If you have a correct implementation please feel free to link to it so others won't be led astray by my mistakes.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment