Created
May 16, 2018 12:43
-
-
Save finswimmer/b0aa4646a5ffd154e2eb6e31cdbc5e2c to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
import sys | |
import argparse | |
def set_args(): | |
parser = argparse.ArgumentParser() | |
group = parser.add_mutually_exclusive_group() | |
group.add_argument("-t", "--transcript", help="ensembl's transcript id") | |
group.add_argument("-l", "--list", help="file which contains list of transcript id's") | |
parser.add_argument("-o", "--out", help="write output to given file instead of stdout") | |
return parser.parse_args() | |
def read_transcript_file(list): | |
with open(list) as f: | |
for line in f: | |
yield line.strip() | |
def lookup_transcript(transcript): | |
server = "http://rest.ensembl.org" | |
ext = "/lookup/id/"+transcript+"?expand=1" | |
r = requests.get(server+ext, headers={"Content-Type": "application/json"}) | |
if not r.ok: | |
r.raise_for_status() | |
sys.exit() | |
lookup = r.json() | |
return lookup | |
def intron_coords(exons, i): | |
if exons[i]['strand'] == -1: | |
intron_start = str(exons[i+1]['end']+1) | |
intron_end = str(exons[i]['start']-1) | |
else: | |
intron_start = str(exons[i]['end']+1) | |
intron_end = str(exons[i+1]['start']-1) | |
coords = str(exons[i]['seq_region_name'])+":"+intron_start+".."+intron_end+":"+str(exons[i]['strand']) | |
return coords | |
if __name__ == "__main__": | |
args = set_args() | |
transcripts = [] | |
if args.transcript: | |
transcripts.append(args.transcript) | |
elif args.list: | |
transcripts = read_transcript_file(args.list) | |
for t in transcripts: | |
lookup = lookup_transcript(t) | |
for i in range(0, len(lookup['Exon'])-1): | |
coords = intron_coords(lookup['Exon'], i) | |
server = "http://rest.ensembl.org" | |
ext = "/sequence/region/human/"+coords | |
r = requests.get(server+ext, headers={"Content-Type": "text/x-fasta"}) | |
if not r.ok: | |
r.raise_for_status() | |
sys.exit() | |
# create a header like >ENST00000412061.3.Intron_1 chromosome:GRCh38:17:43094861:43095845:-1 | |
# use headline = ">"+t+".Intron_"+str(i+1) if you don't want the chromosomal position | |
headline = ">"+t+".Intron_"+str(i+1)+" "+r.text.split("\n")[0][1:] | |
sequence = r.text.split("\n", 1)[1] | |
if args.out: | |
outfile = open(args.out, "a") | |
outfile.write(headline+"\n"+sequence) | |
outfile.close() | |
else: | |
print(headline, sequence, sep="\n") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Answer to biostar question: https://www.biostars.org/p/259768