Skip to content

Instantly share code, notes, and snippets.

@finswimmer
Created May 16, 2018 12:43
Show Gist options
  • Save finswimmer/b0aa4646a5ffd154e2eb6e31cdbc5e2c to your computer and use it in GitHub Desktop.
Save finswimmer/b0aa4646a5ffd154e2eb6e31cdbc5e2c to your computer and use it in GitHub Desktop.
import requests
import sys
import argparse
def set_args():
parser = argparse.ArgumentParser()
group = parser.add_mutually_exclusive_group()
group.add_argument("-t", "--transcript", help="ensembl's transcript id")
group.add_argument("-l", "--list", help="file which contains list of transcript id's")
parser.add_argument("-o", "--out", help="write output to given file instead of stdout")
return parser.parse_args()
def read_transcript_file(list):
with open(list) as f:
for line in f:
yield line.strip()
def lookup_transcript(transcript):
server = "http://rest.ensembl.org"
ext = "/lookup/id/"+transcript+"?expand=1"
r = requests.get(server+ext, headers={"Content-Type": "application/json"})
if not r.ok:
r.raise_for_status()
sys.exit()
lookup = r.json()
return lookup
def intron_coords(exons, i):
if exons[i]['strand'] == -1:
intron_start = str(exons[i+1]['end']+1)
intron_end = str(exons[i]['start']-1)
else:
intron_start = str(exons[i]['end']+1)
intron_end = str(exons[i+1]['start']-1)
coords = str(exons[i]['seq_region_name'])+":"+intron_start+".."+intron_end+":"+str(exons[i]['strand'])
return coords
if __name__ == "__main__":
args = set_args()
transcripts = []
if args.transcript:
transcripts.append(args.transcript)
elif args.list:
transcripts = read_transcript_file(args.list)
for t in transcripts:
lookup = lookup_transcript(t)
for i in range(0, len(lookup['Exon'])-1):
coords = intron_coords(lookup['Exon'], i)
server = "http://rest.ensembl.org"
ext = "/sequence/region/human/"+coords
r = requests.get(server+ext, headers={"Content-Type": "text/x-fasta"})
if not r.ok:
r.raise_for_status()
sys.exit()
# create a header like >ENST00000412061.3.Intron_1 chromosome:GRCh38:17:43094861:43095845:-1
# use headline = ">"+t+".Intron_"+str(i+1) if you don't want the chromosomal position
headline = ">"+t+".Intron_"+str(i+1)+" "+r.text.split("\n")[0][1:]
sequence = r.text.split("\n", 1)[1]
if args.out:
outfile = open(args.out, "a")
outfile.write(headline+"\n"+sequence)
outfile.close()
else:
print(headline, sequence, sep="\n")
@finswimmer
Copy link
Author

Answer to biostar question: https://www.biostars.org/p/259768

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment