Skip to content

Instantly share code, notes, and snippets.

@ahwagner
Created July 25, 2019 20:46
Show Gist options
  • Save ahwagner/3a3be37c109c2d34464320a42fc6b792 to your computer and use it in GitHub Desktop.
Save ahwagner/3a3be37c109c2d34464320a42fc6b792 to your computer and use it in GitHub Desktop.
Add VR index to VICC records
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from ga4gh.core import ga4gh_digest\n",
"from ga4gh.vr import __version__, ga4gh_identify, ga4gh_serialize, models, normalize\n",
"from ga4gh.vr.extras.dataproxy import SeqRepoRESTDataProxy\n",
"from ga4gh.vr.extras.translator import Translator\n",
"\n",
"seqrepo_rest_service_url = \"http://localhost:5000/seqrepo\""
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"dp = SeqRepoRESTDataProxy(base_url=seqrepo_rest_service_url)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"tlr = Translator(data_proxy=dp)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import json"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from bioutils.exceptions import BioutilsError"
]
},
{
"cell_type": "code",
"execution_count": 95,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def vr_file_encode(sourcename):\n",
" with open('{}.json'.format(sourcename), 'r') as infile, open('{}.vr.json'.format(sourcename), 'w') as outfile:\n",
" for record in infile:\n",
" json_record = json.loads(record)\n",
" vr_record_encode(json_record)\n",
" json.dump(json_record, outfile)\n",
" outfile.write('\\n')\n",
" return '{} complete.'.format(sourcename)"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def vr_record_encode(record):\n",
" vr_set = set()\n",
" for feature in record['features']:\n",
" feature_vr_set = set()\n",
" feature_ga4gh = list()\n",
" if 'hgvs' not in feature:\n",
" continue\n",
" for hgvs_string in feature['hgvs']:\n",
" try:\n",
" vr_obj = tlr.from_hgvs(hgvs_string)\n",
" except (KeyError, ValueError, BioutilsError, AttributeError):\n",
" continue\n",
" vr_obj_dict = vr_obj.as_dict()\n",
" va_id = vr_obj_dict['id'].split(':')[-1]\n",
" vl_id = vr_obj_dict['location']['id'].split(':')[-1]\n",
" sq_id = vr_obj_dict['location']['sequence_id'].split(':')[-1]\n",
" if va_id not in feature_vr_set:\n",
" feature_vr_set.add(va_id)\n",
" feature_ga4gh.append(vr_obj_dict)\n",
" vr_set.update([va_id, vl_id, sq_id])\n",
" feature['ga4gh'] = feature_ga4gh\n",
" text_obj = models.Text(definition=record['feature_names'])\n",
" t_id = ga4gh_identify(text_obj)\n",
" record['text_object'] = text_obj.as_dict()\n",
" vr_set.add(t_id.split(':')[-1])\n",
" record['ga4gh'] = list(vr_set)"
]
},
{
"cell_type": "code",
"execution_count": 75,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"pmkb_records = list()\n",
"with open('pmkb.json','r') as infile:\n",
" for record in infile:\n",
" pmkb_records.append(json.loads(record))"
]
},
{
"cell_type": "code",
"execution_count": 78,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from collections import Counter\n",
"\n",
"c = Counter()\n",
"for record in pmkb_records:\n",
" # vr_record_encode(record)\n",
" if not isinstance(record['feature_names'], list):\n",
" c['strings'] += 1\n",
" elif len(record['feature_names']) > 0:\n",
" c['terms'] += 1\n",
" else:\n",
" c[None] += 1"
]
},
{
"cell_type": "code",
"execution_count": 86,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"vr_record_encode(pmkb_records[1])"
]
},
{
"cell_type": "code",
"execution_count": 88,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"{'definition': 'MPL W515K, MPL W515L, MPL codon(s) 515 missense',\n",
" 'id': 'ga4gh:VT.Bbyyg0vqd2ExVierd2pH1UvC6XwHSzTe',\n",
" 'type': 'Text'}"
]
},
"execution_count": 88,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pmkb_records[1]['text_object']"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import multiprocessing as mp"
]
},
{
"cell_type": "code",
"execution_count": 96,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"pool = mp.Pool()\n",
"sources = ['brca','cgi','civic','jax','molecularmatch','oncokb','pmkb']\n",
"result = pool.map(vr_file_encode, sources)"
]
},
{
"cell_type": "code",
"execution_count": 97,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"['brca complete.',\n",
" 'cgi complete.',\n",
" 'civic complete.',\n",
" 'jax complete.',\n",
" 'molecularmatch complete.',\n",
" 'oncokb complete.',\n",
" 'pmkb complete.']"
]
},
"execution_count": 97,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"result"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# TODO: convert the feature_names field of jax_trials records to a string (jax_trials is not among the sources encoded above)"
]
}
],
"metadata": {
"anaconda-cloud": {},
"kernelspec": {
"display_name": "Python [py37]",
"language": "python",
"name": "Python [py37]"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment