Created
July 25, 2019 20:46
-
-
Save ahwagner/3a3be37c109c2d34464320a42fc6b792 to your computer and use it in GitHub Desktop.
Add VR index to VICC records
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"from ga4gh.core import ga4gh_digest\n", | |
"from ga4gh.vr import __version__, ga4gh_identify, ga4gh_serialize, models, normalize\n", | |
"from ga4gh.vr.extras.dataproxy import SeqRepoRESTDataProxy\n", | |
"from ga4gh.vr.extras.translator import Translator\n", | |
"\n", | |
"seqrepo_rest_service_url = \"http://localhost:5000/seqrepo\"" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"dp = SeqRepoRESTDataProxy(base_url=seqrepo_rest_service_url)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"tlr = Translator(data_proxy=dp)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"import json" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"from bioutils.exceptions import BioutilsError" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 95, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"def vr_file_encode(sourcename):\n", | |
" with open('{}.json'.format(sourcename), 'r') as infile, open('{}.vr.json'.format(sourcename), 'w') as outfile:\n", | |
" for record in infile:\n", | |
" json_record = json.loads(record)\n", | |
" vr_record_encode(json_record)\n", | |
" json.dump(json_record, outfile)\n", | |
" outfile.write('\\n')\n", | |
" return '{} complete.'.format(sourcename)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 33, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"def vr_record_encode(record):\n", | |
" vr_set = set()\n", | |
" for feature in record['features']:\n", | |
" feature_vr_set = set()\n", | |
" feature_ga4gh = list()\n", | |
" if 'hgvs' not in feature:\n", | |
" continue\n", | |
" for hgvs_string in feature['hgvs']:\n", | |
" try:\n", | |
" vr_obj = tlr.from_hgvs(hgvs_string)\n", | |
" except (KeyError, ValueError, BioutilsError, AttributeError):\n", | |
" continue\n", | |
" vr_obj_dict = vr_obj.as_dict()\n", | |
" va_id = vr_obj_dict['id'].split(':')[-1]\n", | |
" vl_id = vr_obj_dict['location']['id'].split(':')[-1]\n", | |
" sq_id = vr_obj_dict['location']['sequence_id'].split(':')[-1]\n", | |
" if va_id not in feature_vr_set:\n", | |
" feature_vr_set.add(va_id)\n", | |
" feature_ga4gh.append(vr_obj_dict)\n", | |
" vr_set.update([va_id, vl_id, sq_id])\n", | |
" feature['ga4gh'] = feature_ga4gh\n", | |
" text_obj = models.Text(definition=record['feature_names'])\n", | |
" t_id = ga4gh_identify(text_obj)\n", | |
" record['text_object'] = text_obj.as_dict()\n", | |
" vr_set.add(t_id.split(':')[-1])\n", | |
" record['ga4gh'] = list(vr_set)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 75, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"pmkb_records = list()\n", | |
"with open('pmkb.json','r') as infile:\n", | |
" for record in infile:\n", | |
" pmkb_records.append(json.loads(record))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 78, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"from collections import Counter\n", | |
"\n", | |
"c = Counter()\n", | |
"for record in pmkb_records:\n", | |
" # vr_record_encode(record)\n", | |
" if not isinstance(record['feature_names'], list):\n", | |
" c['strings'] += 1\n", | |
" elif len(record['feature_names']) > 0:\n", | |
" c['terms'] += 1\n", | |
" else:\n", | |
" c[None] += 1" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 86, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"vr_record_encode(pmkb_records[1])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 88, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"{'definition': 'MPL W515K, MPL W515L, MPL codon(s) 515 missense',\n", | |
" 'id': 'ga4gh:VT.Bbyyg0vqd2ExVierd2pH1UvC6XwHSzTe',\n", | |
" 'type': 'Text'}" | |
] | |
}, | |
"execution_count": 88, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"pmkb_records[1]['text_object']" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"import multiprocessing as mp" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 96, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"pool = mp.Pool()\n", | |
"sources = ['brca','cgi','civic','jax','molecularmatch','oncokb','pmkb']\n", | |
"result = pool.map(vr_file_encode, sources)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 97, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"['brca complete.',\n", | |
" 'cgi complete.',\n", | |
" 'civic complete.',\n", | |
" 'jax complete.',\n", | |
" 'molecularmatch complete.',\n", | |
" 'oncokb complete.',\n", | |
" 'pmkb complete.']" | |
] | |
}, | |
"execution_count": 97, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"result" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# TODO: convert jax_trials to string for feature_names field" | |
] | |
} | |
], | |
"metadata": { | |
"anaconda-cloud": {}, | |
"kernelspec": { | |
"display_name": "Python [py37]", | |
"language": "python", | |
"name": "Python [py37]" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.7.3" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment