Created
March 28, 2011 22:58
-
-
Save seandavi/891496 to your computer and use it in GitHub Desktop.
Use this file as a sketch for finding the genomic location of a base relative to an transcript, exon (or intron), and offset
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import csv | |
import itertools | |
import argparse | |
# File "refGene.txt" downloaded from UCSC and gunzipped | |
# must be in this directory | |
# Works only on "+" strand genes | |
# Minus strand genes left as an exercise | |
# To get help, type: | |
# "python javedgene.py --help" | |
def parse_ucsc_file(fname): | |
""" | |
Takes a filename of a refGene-like file from UCSC | |
downloaded from http://hgdownload.cse.ucsc.edu/goldenPath/hg18/database/ | |
Breaks up the transcripts into component exon | |
and intron lists and returns a dict keyed | |
by the transcript name""" | |
transcripts = {} | |
for row in csv.reader(open(fname,'r'),delimiter="\t"): | |
# Only work with positive strand genes | |
if(row[3]=='-'): | |
next | |
starts=row[9].strip(",").split(",") | |
ends = row[10].strip(",").split(",") | |
exons=[tuple((int(start),int(end))) for start,end in itertools.izip(starts,ends)] | |
introns=[tuple((int(start),int(end))) for start,end in itertools.izip(ends[0:(len(ends)-1)],starts[1:len(starts)])] | |
transcripts[row[1]]={'intron':introns, | |
'exon':exons, | |
'chromosome':row[2], | |
'strand':row[3]} | |
return transcripts | |
def main(): | |
parser = argparse.ArgumentParser( | |
description=""" | |
Use this little program by doing something like: | |
"python javedgene.py -t NR_026874 -f exon -o -10 -n 2 refGene.txt" | |
This will return something like: | |
"location for exon 2, offset -10 of transcript NR_026874 is 843254 on chromosome chr1" | |
""" | |
) | |
parser.add_argument("-t","--transcript", | |
help="Name of the transcript") | |
parser.add_argument("-f","--feature",choices=['intron','exon'], | |
help="Type of feature, either 'intron' or 'exon'") | |
parser.add_argument("-n","--number",type=int, | |
help="intron or exon number, counting from 5' end") | |
parser.add_argument('-o','--offset',type=int, | |
help="offset from 5' end of intron or exon") | |
parser.add_argument("ucscfile", | |
help="Location of UCSC refgene-like file") | |
args=parser.parse_args() | |
transcripts=parse_ucsc_file(args.ucscfile) | |
if(args.transcript not in transcripts): | |
print "transcript %s not represented in transcript database" % (args.transcript) | |
exit(1) | |
loc = transcripts[args.transcript][args.feature][args.number-1][0]+args.offset | |
print "location for %s %d, offset %d of transcript %s is %d on chromosome %s" % (args.feature, | |
args.number, | |
args.offset, | |
args.transcript, | |
loc, | |
transcripts[args.transcript]['chromosome']) | |
if __name__=="__main__": | |
main() | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment