finswimmer/intron2fasta.py

## intron2fasta.py
import requests
import sys
import argparse


def set_args():
    parser = argparse.ArgumentParser()
    group = parser.add_mutually_exclusive_group()
    group.add_argument("-t", "--transcript", help="ensembl's transcript id")
    group.add_argument("-l", "--list", help="file which contains list of transcript id's")
    parser.add_argument("-o", "--out", help="write output to given file instead of stdout")
    return parser.parse_args()


def read_transcript_file(list):
    with open(list) as f:
        for line in f:
            yield line.strip()


def lookup_transcript(transcript):
    server = "http://rest.ensembl.org"
    ext = "/lookup/id/"+transcript+"?expand=1"

    r = requests.get(server+ext, headers={"Content-Type": "application/json"})

    if not r.ok:
        r.raise_for_status()
        sys.exit()

    lookup = r.json()

    return lookup


def intron_coords(exons, i):
    if exons[i]['strand'] == -1:
        intron_start = str(exons[i+1]['end']+1)
        intron_end = str(exons[i]['start']-1)
    else:
        intron_start = str(exons[i]['end']+1)
        intron_end = str(exons[i+1]['start']-1)

    coords = str(exons[i]['seq_region_name'])+":"+intron_start+".."+intron_end+":"+str(exons[i]['strand'])

    return coords


if __name__ == "__main__":
    args = set_args()
    transcripts = []

    if args.transcript:
        transcripts.append(args.transcript)
    elif args.list:
        transcripts = read_transcript_file(args.list)

    for t in transcripts:
        lookup = lookup_transcript(t)

        for i in range(0, len(lookup['Exon'])-1):
            coords = intron_coords(lookup['Exon'], i)
            server = "http://rest.ensembl.org"
            ext = "/sequence/region/human/"+coords

            r = requests.get(server+ext, headers={"Content-Type": "text/x-fasta"})

            if not r.ok:
                r.raise_for_status()
                sys.exit()
            # create a header like >ENST00000412061.3.Intron_1 chromosome:GRCh38:17:43094861:43095845:-1
            # use headline = ">"+t+".Intron_"+str(i+1) if you don't want the chromosomal position
            headline = ">"+t+".Intron_"+str(i+1)+" "+r.text.split("\n")[0][1:]
            sequence = r.text.split("\n", 1)[1]

            if args.out:
                outfile = open(args.out, "a")
                outfile.write(headline+"\n"+sequence)
                outfile.close()
            else:
                print(headline, sequence, sep="\n")
	import requests
	import sys
	import argparse


	def set_args():
	parser = argparse.ArgumentParser()
	group = parser.add_mutually_exclusive_group()
	group.add_argument("-t", "--transcript", help="ensembl's transcript id")
	group.add_argument("-l", "--list", help="file which contains list of transcript id's")
	parser.add_argument("-o", "--out", help="write output to given file instead of stdout")
	return parser.parse_args()


	def read_transcript_file(list):
	with open(list) as f:
	for line in f:
	yield line.strip()


	def lookup_transcript(transcript):
	server = "http://rest.ensembl.org"
	ext = "/lookup/id/"+transcript+"?expand=1"

	r = requests.get(server+ext, headers={"Content-Type": "application/json"})

	if not r.ok:
	r.raise_for_status()
	sys.exit()

	lookup = r.json()

	return lookup


	def intron_coords(exons, i):
	if exons[i]['strand'] == -1:
	intron_start = str(exons[i+1]['end']+1)
	intron_end = str(exons[i]['start']-1)
	else:
	intron_start = str(exons[i]['end']+1)
	intron_end = str(exons[i+1]['start']-1)

	coords = str(exons[i]['seq_region_name'])+":"+intron_start+".."+intron_end+":"+str(exons[i]['strand'])

	return coords


	if __name__ == "__main__":
	args = set_args()
	transcripts = []

	if args.transcript:
	transcripts.append(args.transcript)
	elif args.list:
	transcripts = read_transcript_file(args.list)

	for t in transcripts:
	lookup = lookup_transcript(t)

	for i in range(0, len(lookup['Exon'])-1):
	coords = intron_coords(lookup['Exon'], i)
	server = "http://rest.ensembl.org"
	ext = "/sequence/region/human/"+coords

	r = requests.get(server+ext, headers={"Content-Type": "text/x-fasta"})

	if not r.ok:
	r.raise_for_status()
	sys.exit()
	# create a header like >ENST00000412061.3.Intron_1 chromosome:GRCh38:17:43094861:43095845:-1
	# use headline = ">"+t+".Intron_"+str(i+1) if you don't want the chromosomal position
	headline = ">"+t+".Intron_"+str(i+1)+" "+r.text.split("\n")[0][1:]
	sequence = r.text.split("\n", 1)[1]

	if args.out:
	outfile = open(args.out, "a")
	outfile.write(headline+"\n"+sequence)
	outfile.close()
	else:
	print(headline, sequence, sep="\n")