Skip to content

Instantly share code, notes, and snippets.

@seandavi
Created March 28, 2011 22:58
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save seandavi/891496 to your computer and use it in GitHub Desktop.
Save seandavi/891496 to your computer and use it in GitHub Desktop.
Use this file as a sketch for finding the genomic location of a base relative to an transcript, exon (or intron), and offset
#!/usr/bin/env python
import csv
import itertools
import argparse
# File "refGene.txt" downloaded from UCSC and gunzipped
# must be in this directory
# Works only on "+" strand genes
# Minus strand genes left as an exercise
# To get help, type:
# "python javedgene.py --help"
def parse_ucsc_file(fname):
"""
Takes a filename of a refGene-like file from UCSC
downloaded from http://hgdownload.cse.ucsc.edu/goldenPath/hg18/database/
Breaks up the transcripts into component exon
and intron lists and returns a dict keyed
by the transcript name"""
transcripts = {}
for row in csv.reader(open(fname,'r'),delimiter="\t"):
# Only work with positive strand genes
if(row[3]=='-'):
next
starts=row[9].strip(",").split(",")
ends = row[10].strip(",").split(",")
exons=[tuple((int(start),int(end))) for start,end in itertools.izip(starts,ends)]
introns=[tuple((int(start),int(end))) for start,end in itertools.izip(ends[0:(len(ends)-1)],starts[1:len(starts)])]
transcripts[row[1]]={'intron':introns,
'exon':exons,
'chromosome':row[2],
'strand':row[3]}
return transcripts
def main():
parser = argparse.ArgumentParser(
description="""
Use this little program by doing something like:
"python javedgene.py -t NR_026874 -f exon -o -10 -n 2 refGene.txt"
This will return something like:
"location for exon 2, offset -10 of transcript NR_026874 is 843254 on chromosome chr1"
"""
)
parser.add_argument("-t","--transcript",
help="Name of the transcript")
parser.add_argument("-f","--feature",choices=['intron','exon'],
help="Type of feature, either 'intron' or 'exon'")
parser.add_argument("-n","--number",type=int,
help="intron or exon number, counting from 5' end")
parser.add_argument('-o','--offset',type=int,
help="offset from 5' end of intron or exon")
parser.add_argument("ucscfile",
help="Location of UCSC refgene-like file")
args=parser.parse_args()
transcripts=parse_ucsc_file(args.ucscfile)
if(args.transcript not in transcripts):
print "transcript %s not represented in transcript database" % (args.transcript)
exit(1)
loc = transcripts[args.transcript][args.feature][args.number-1][0]+args.offset
print "location for %s %d, offset %d of transcript %s is %d on chromosome %s" % (args.feature,
args.number,
args.offset,
args.transcript,
loc,
transcripts[args.transcript]['chromosome'])
if __name__=="__main__":
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment