Created
September 23, 2018 17:45
-
-
Save carolinecunningham/985291a36b94a78b0aca3d2fe444946d to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from itertools import cycle | |
from flask import * | |
import multiprocessing | |
import json | |
import requests | |
import urllib | |
from requests.packages.urllib3.exceptions import InsecureRequestWarning | |
requests.packages.urllib3.disable_warnings(InsecureRequestWarning) | |
def mp_ensembl_worker(raw_mutations_chunk): | |
""" Helper function for ENSEMBL API call """ | |
cn = complementary_nucleotide | |
verify_reference = {} | |
contexts = {} # result | |
api_input = {'regions': []} | |
for i, (assembly, chrom, pos, x, y) in enumerate(raw_mutations_chunk): | |
# print("!!!", assembly, chrom, pos, x, y) | |
api_input['regions'].append("{}:{}..{}".format(chrom, int(pos)-MAX_POST_SIZE, int(pos)+MAX_POST_SIZE)) | |
#api_input['regions'].append("{}:{}..{}".format(chrom, int(pos)-1, int(pos)+1)) | |
verify_reference[(chrom, int(pos))] = x | |
api_input_json = json.dumps(api_input) | |
asm = { | |
38: "GRCh38", | |
37: "GRCh37", | |
36: "NCBI36", | |
35: "NCBI35", | |
34: "NCBI34"} | |
server = "https://rest.ensembl.org" | |
ext = "/sequence/region/human/?coord_system_version={}".format(asm.get(int(assembly), "GRCh37")) | |
r = requests.post(server + ext, headers={ | |
"Content-Type": "application/json", | |
"Accept": "application/json"}, data=api_input_json, timeout=2.000, verify=False) # 2 second timeout | |
if not r.ok: | |
print("ENSEMBL Exception") | |
return contexts | |
# raise Exception("POST request does not work") | |
decoded = r.json() | |
if 'error' in decoded: | |
return contexts | |
for i, record in enumerate(decoded): | |
if 'id' not in record: | |
continue | |
_, asm, chrom, begin, end, expand = record['id'].split(":") | |
pos = int(begin) + 1 | |
if _ != "chromosome": | |
continue | |
seq = record['seq'] | |
seq_with_coords = list(zip( | |
cycle([chrom]), | |
range(pos -1 , pos + 2 * MAX_POST_SIZE), | |
seq, | |
cycle("+"))) | |
nuc3 = seq_with_coords[MAX_POST_SIZE-1][2] | |
nuc = seq_with_coords[MAX_POST_SIZE][2] | |
nuc5 = seq_with_coords[MAX_POST_SIZE + 1][2] | |
#nuc5 = seq[0] | |
#nuc = seq[1] | |
#nuc3 = seq[2] | |
#x = verify_reference.get((chrom, pos)) | |
#print(x, nuc) | |
if nuc != "N" and nuc != x: | |
if cn[nuc] == x: | |
nuc3 = cn[nuc5] | |
nuc5 = cn[nuc3] | |
# else: | |
# print("Could not match reference allele in ", raw_mutations_chunk[i], "in sequence", seq) | |
# nuc3 = 'N' | |
# nuc5 = 'N' | |
# print("{} {}: {} [{} > ?] {}".format(chrom, pos, nuc5, x, nuc3)) | |
contexts[(chrom, pos + MAX_POST_SIZE -1)] = (nuc5, nuc3), seq_with_coords | |
return contexts | |
def get_context_ensembl(mutations, assembly): | |
""" | |
Use Ensembl to get context of mutations | |
ENSEMBL REST API http://rest.ensembl.org/documentation/info/sequence_region_post | |
""" | |
raw_mutations = [] | |
for (chrom, pos, x, y) in mutations: | |
raw_mutations.append((assembly, chrom, pos, x, y)) | |
L = len(raw_mutations) | |
global MAX_POST_SIZE | |
MAX_POST_SIZE = 50 # https://github.com/Ensembl/ensembl-rest/blob/78eebccb589798662c0ea758081ba2b2cb6acbca/ensembl_rest.conf.default max_post_size = 50 | |
chunks = [] | |
for chunk in range(0, L, MAX_POST_SIZE): | |
chunks.append(raw_mutations[chunk: chunk + MAX_POST_SIZE]) | |
p = multiprocessing.Pool(7) | |
cc = p.map(mp_ensembl_worker, chunks, MAX_POST_SIZE) | |
#except: | |
# raise | |
#cc = [] | |
#finally: | |
p.close() | |
p.join() | |
contexts = {} | |
for c in cc: | |
contexts.update(c) | |
if len(contexts) != len(raw_mutations): | |
# print("Not all mutations have context") | |
pass | |
return contexts |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment