seandavi/javedgene.py

## javedgene.py
#!/usr/bin/env python
import csv
import itertools
import argparse

# File "refGene.txt" downloaded from UCSC and gunzipped
# must be in this directory
# Works only on "+" strand genes
# Minus strand genes left as an exercise

# To get help, type:
# "python javedgene.py --help"


def parse_ucsc_file(fname):
    """
    Takes a filename of a refGene-like file from UCSC
    downloaded from http://hgdownload.cse.ucsc.edu/goldenPath/hg18/database/

    Breaks up the transcripts into component exon
    and intron lists and returns a dict keyed
    by the transcript name"""

    transcripts = {}
    for row in csv.reader(open(fname,'r'),delimiter="\t"):
        # Only work with positive strand genes
        if(row[3]=='-'):
            next
        starts=row[9].strip(",").split(",")
        ends = row[10].strip(",").split(",")
        exons=[tuple((int(start),int(end))) for start,end in itertools.izip(starts,ends)]
        introns=[tuple((int(start),int(end))) for start,end in itertools.izip(ends[0:(len(ends)-1)],starts[1:len(starts)])]
        transcripts[row[1]]={'intron':introns,
                             'exon':exons,
                             'chromosome':row[2],
                             'strand':row[3]}
    return transcripts

def main():
    parser = argparse.ArgumentParser(
        description="""
        Use this little program by doing something like:

        "python javedgene.py -t NR_026874 -f exon -o -10 -n 2 refGene.txt"

        This will return something like:

        "location for exon 2, offset -10 of transcript NR_026874 is 843254 on chromosome chr1"
        """
        )
    parser.add_argument("-t","--transcript",
                        help="Name of the transcript")
    parser.add_argument("-f","--feature",choices=['intron','exon'],
                        help="Type of feature, either 'intron' or 'exon'")
    parser.add_argument("-n","--number",type=int,
                        help="intron or exon number, counting from 5' end")
    parser.add_argument('-o','--offset',type=int,
                        help="offset from 5' end of intron or exon")
    parser.add_argument("ucscfile",
                        help="Location of UCSC refgene-like file")

    args=parser.parse_args()

    transcripts=parse_ucsc_file(args.ucscfile)

    if(args.transcript not in transcripts):
        print "transcript %s not represented in transcript database" % (args.transcript)
        exit(1)
    loc = transcripts[args.transcript][args.feature][args.number-1][0]+args.offset
    print "location for %s %d, offset %d of transcript %s is %d on chromosome %s" % (args.feature,
                                                                                     args.number,
                                                                                     args.offset,
                                                                                     args.transcript,
                                                                                     loc,
                                                                                     transcripts[args.transcript]['chromosome'])

if __name__=="__main__":
    main()
	#!/usr/bin/env python
	import csv
	import itertools
	import argparse

	# File "refGene.txt" downloaded from UCSC and gunzipped
	# must be in this directory
	# Works only on "+" strand genes
	# Minus strand genes left as an exercise

	# To get help, type:
	# "python javedgene.py --help"


	def parse_ucsc_file(fname):
	"""
	Takes a filename of a refGene-like file from UCSC
	downloaded from http://hgdownload.cse.ucsc.edu/goldenPath/hg18/database/

	Breaks up the transcripts into component exon
	and intron lists and returns a dict keyed
	by the transcript name"""

	transcripts = {}
	for row in csv.reader(open(fname,'r'),delimiter="\t"):
	# Only work with positive strand genes
	if(row[3]=='-'):
	next
	starts=row[9].strip(",").split(",")
	ends = row[10].strip(",").split(",")
	exons=[tuple((int(start),int(end))) for start,end in itertools.izip(starts,ends)]
	introns=[tuple((int(start),int(end))) for start,end in itertools.izip(ends[0:(len(ends)-1)],starts[1:len(starts)])]
	transcripts[row[1]]={'intron':introns,
	'exon':exons,
	'chromosome':row[2],
	'strand':row[3]}
	return transcripts

	def main():
	parser = argparse.ArgumentParser(
	description="""
	Use this little program by doing something like:

	"python javedgene.py -t NR_026874 -f exon -o -10 -n 2 refGene.txt"

	This will return something like:

	"location for exon 2, offset -10 of transcript NR_026874 is 843254 on chromosome chr1"
	"""
	)
	parser.add_argument("-t","--transcript",
	help="Name of the transcript")
	parser.add_argument("-f","--feature",choices=['intron','exon'],
	help="Type of feature, either 'intron' or 'exon'")
	parser.add_argument("-n","--number",type=int,
	help="intron or exon number, counting from 5' end")
	parser.add_argument('-o','--offset',type=int,
	help="offset from 5' end of intron or exon")
	parser.add_argument("ucscfile",
	help="Location of UCSC refgene-like file")

	args=parser.parse_args()

	transcripts=parse_ucsc_file(args.ucscfile)

	if(args.transcript not in transcripts):
	print "transcript %s not represented in transcript database" % (args.transcript)
	exit(1)
	loc = transcripts[args.transcript][args.feature][args.number-1][0]+args.offset
	print "location for %s %d, offset %d of transcript %s is %d on chromosome %s" % (args.feature,
	args.number,
	args.offset,
	args.transcript,
	loc,
	transcripts[args.transcript]['chromosome'])

	if __name__=="__main__":
	main()