Last active
August 29, 2016 03:47
-
-
Save ahwagner/707dae34173b7e7251bb4f2d28d836d0 to your computer and use it in GitHub Desktop.
CIViC_term_mapping.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"import requests\n", | |
"import re\n", | |
"from Bio.Alphabet import IUPAC\n", | |
"import csv" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# Query-string parameters for the variants endpoint; 'count' presumably caps the\n", | |
"# number of records returned in one response -- TODO confirm against the CIViC API.\n", | |
"params = {'count': 1000}\n", | |
"# A timeout keeps a stalled connection from hanging the kernel indefinitely.\n", | |
"r = requests.get('http://civic.genome.wustl.edu/api/variants', params=params, timeout=30)\n", | |
"# Surface HTTP errors here rather than as a confusing failure when the body is parsed.\n", | |
"r.raise_for_status()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"580" | |
] | |
}, | |
"execution_count": 3, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Decode the response body as JSON and keep the value stored under 'records'.\n", | |
"response_payload = r.json()\n", | |
"variants = response_payload['records']\n", | |
"# Bare last expression so the notebook displays how many records came back.\n", | |
"len(variants)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# Letters of the IUPACProtein alphabet object imported from Bio.Alphabet; they are\n", | |
"# substituted into regex character classes by the pattern definitions in this notebook.\n", | |
"protein_alphabet = IUPAC.IUPACProtein\n", | |
"prot_alpha = protein_alphabet.letters" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Test which variants match examples" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# Outer keys are term labels; each inner dict maps an example variant name to the\n", | |
"# compiled regex for names of that shape. prot_alpha is formatted into the\n", | |
"# bracketed character classes.\n", | |
"regex_dict = dict()\n", | |
"\n", | |
"regex_dict['missense_variant'] = {\n", | |
"    'G12D': re.compile(\"[{0}]\\\\d+[{0}]\".format(prot_alpha)),\n", | |
"}\n", | |
"regex_dict['protein_altering_variant'] = {\n", | |
"    'G12': re.compile(\"[{0}]\\\\d+\".format(prot_alpha)),\n", | |
"    'KINASE DOMAIN MUTATION': re.compile(r'.* DOMAIN MUTATION'),\n", | |
"}\n", | |
"regex_dict['frameshift_truncation'] = {\n", | |
"    'V2288fs*1': re.compile(\"[{0}]\\\\d+fs\\\\*\\\\d+\".format(prot_alpha)),\n", | |
"}\n", | |
"regex_dict['inframe_deletion'] = {\n", | |
"    'DEL I843': re.compile(\"DEL [{0}]\\\\d+\".format(prot_alpha)),\n", | |
"    'V560DEL': re.compile(\"[{0}]\\\\d+DEL\".format(prot_alpha)),\n", | |
"    'DEL 755-759': re.compile(r'DEL \\d+-\\d+'),\n", | |
"}\n", | |
"regex_dict['inframe_insertion'] = {\n", | |
"    'P790INS': re.compile(\"[{0}]\\\\d+INS\".format(prot_alpha)),\n", | |
"    'M774INSAYVM': re.compile(\"[{0}]\\\\d+INS[{0}]+\".format(prot_alpha)),\n", | |
"    'ITD': re.compile(r'ITD'),\n", | |
"}\n", | |
"regex_dict['gene_variant'] = {\n", | |
"    'MUTATION': re.compile(r'MUTATION'),\n", | |
"}\n", | |
"regex_dict['exon_variant'] = {\n", | |
"    'EXON 10 MUTATION': re.compile(r'EXON \\d+ MUTATION'),\n", | |
"}\n", | |
"regex_dict['transcript_fusion'] = {\n", | |
"    'EML4-ALK': re.compile(r'\\w+-\\w+'),\n", | |
"    'ALK FUSIONS': re.compile(r'\\w+ FUSIONS'),\n", | |
"}\n", | |
"regex_dict['transcript_fusion and missense_variant'] = {\n", | |
"    'EML4-ALK G332K': re.compile(\"\\\\w+-\\\\w+ [{0}]\\\\d+[{0}]\".format(prot_alpha)),\n", | |
"}\n", | |
"regex_dict['transcript_translocation or feature_translocation'] = {\n", | |
"    'REARRANGEMENT': re.compile(r'REARRANGEMENT'),\n", | |
"}\n", | |
"regex_dict['wild_type'] = {\n", | |
"    'WILD TYPE': re.compile(r'WILD TYPE'),\n", | |
"}\n", | |
"regex_dict['loss_of_heterozygosity'] = {\n", | |
"    'LOH': re.compile(r'LOH'),\n", | |
"}\n", | |
"regex_dict['transcript_amplification'] = {\n", | |
"    'AMPLIFICATION': re.compile(r'AMPLIFICATION'),\n", | |
"}\n", | |
"regex_dict['transcript_ablation'] = {\n", | |
"    'DELETION': re.compile(r'DELETION'),\n", | |
"}\n", | |
"regex_dict['copy_number_change'] = {\n", | |
"    'COPY NUMBER VARIATION': re.compile(r'COPY NUMBER VARIATION'),\n", | |
"}\n", | |
"regex_dict['loss_of_function_variant'] = {\n", | |
"    'LOSS-OF-FUNCTION': re.compile(r'LOSS-OF-FUNCTION'),\n", | |
"    'LOSS': re.compile(r'LOSS'),\n", | |
"}\n", | |
"regex_dict['exon_loss_variant'] = {\n", | |
"    'EXON 14 SKIPPING MUTATION': re.compile(r'EXON \\d+ SKIPPING MUTATION'),\n", | |
"}\n", | |
"regex_dict['5_prime_UTR_variant'] = {\n", | |
"    \"5' UTR MUTATION\": re.compile(r\"5' UTR MUTATION\"),\n", | |
"}\n", | |
"regex_dict['3_prime_UTR_variant'] = {\n", | |
"    \"3' UTR MUTATION\": re.compile(r\"3' UTR MUTATION\"),\n", | |
"}\n", | |
"# Names filed under 'NA'; each one is compiled verbatim as its own pattern.\n", | |
"regex_dict['NA'] = dict(\n", | |
"    (literal_name, re.compile(literal_name))\n", | |
"    for literal_name in (\n", | |
"        'EXPRESSION',\n", | |
"        'NUCLEAR EXPRESSION',\n", | |
"        'CYTOPLASMIC EXPRESSION',\n", | |
"        'OVEREXPRESSION',\n", | |
"        'UNDEREXPRESSION',\n", | |
"        'METHYLATION',\n", | |
"        'PROMOTER METHYLATION',\n", | |
"        'HYPOMETHYLATION',\n", | |
"        'HYPERMETHYLATION',\n", | |
"    )\n", | |
")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"def count_matches(rd, variant_records=None):\n", | |
"    \"\"\"Tally the variants whose name is fully matched by each regex in ``rd``.\n", | |
"\n", | |
"    Prints one line per term and per example, then the overall coverage with\n", | |
"    and without the ids collected under the 'NA' term.\n", | |
"\n", | |
"    Parameters\n", | |
"    ----------\n", | |
"    rd : dict\n", | |
"        ``{term: {example: compiled regex}}``.\n", | |
"    variant_records : iterable of dict, optional\n", | |
"        Records that provide 'id' and 'name'. When omitted, the notebook-level\n", | |
"        ``variants`` is used, which is what the function always did before.\n", | |
"\n", | |
"    Returns\n", | |
"    -------\n", | |
"    dict\n", | |
"        ``results[term][example]`` is the set of matching ids,\n", | |
"        ``results[term]['all']`` the union for that term and ``results['all']``\n", | |
"        the union over every term.\n", | |
"    \"\"\"\n", | |
"    if variant_records is None:\n", | |
"        variant_records = variants\n", | |
"    # Materialise once: the records are scanned for every regex and counted below.\n", | |
"    variant_records = list(variant_records)\n", | |
"    total = len(variant_records)\n", | |
"\n", | |
"    def _percent(count):\n", | |
"        # An empty record list used to raise ZeroDivisionError here.\n", | |
"        return round(count / total * 100, 1) if total else 0.0\n", | |
"\n", | |
"    results = {'all': set()}\n", | |
"    for term, examples in rd.items():\n", | |
"        term_ids = set()\n", | |
"        results[term] = dict()\n", | |
"        print(\"{0}:\".format(term))\n", | |
"        for example, pattern in examples.items():\n", | |
"            # fullmatch: the entire variant name has to fit the pattern.\n", | |
"            matched = {v['id'] for v in variant_records if pattern.fullmatch(v['name'])}\n", | |
"            results[term][example] = matched\n", | |
"            print(\"\\t{0}: {1}\".format(example, len(matched)))\n", | |
"            term_ids |= matched\n", | |
"        results[term]['all'] = term_ids\n", | |
"        results['all'] |= term_ids\n", | |
"        print(\"\\tTotal: {0}\".format(len(term_ids)))\n", | |
"    matched_total = len(results['all'])\n", | |
"    print(\"Total: {0} / {1} ({2}%)\".format(matched_total, total, _percent(matched_total)))\n", | |
"    # rd is not required to define an 'NA' term; a missing one counts as empty\n", | |
"    # instead of raising KeyError.\n", | |
"    na_ids = results.get('NA', {}).get('all', set())\n", | |
"    non_na = results['all'] - na_ids\n", | |
"    print(\"Total (no NA): {0} / {1} ({2}%)\".format(len(non_na), total, _percent(len(non_na))))\n", | |
"    return results" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"inframe_insertion:\n", | |
"\tP790INS: 1\n", | |
"\tITD: 1\n", | |
"\tM774INSAYVM: 1\n", | |
"\tTotal: 3\n", | |
"inframe_deletion:\n", | |
"\tDEL 755-759: 1\n", | |
"\tV560DEL: 1\n", | |
"\tDEL I843: 1\n", | |
"\tTotal: 3\n", | |
"transcript_fusion:\n", | |
"\tEML4-ALK: 32\n", | |
"\tALK FUSIONS: 6\n", | |
"\tTotal: 38\n", | |
"transcript_amplification:\n", | |
"\tAMPLIFICATION: 29\n", | |
"\tTotal: 29\n", | |
"wild_type:\n", | |
"\tWILD TYPE: 3\n", | |
"\tTotal: 3\n", | |
"frameshift_truncation:\n", | |
"\tV2288fs*1: 2\n", | |
"\tTotal: 2\n", | |
"NA:\n", | |
"\tHYPERMETHYLATION: 0\n", | |
"\tEXPRESSION: 73\n", | |
"\tUNDEREXPRESSION: 14\n", | |
"\tMETHYLATION: 1\n", | |
"\tCYTOPLASMIC EXPRESSION: 1\n", | |
"\tPROMOTER METHYLATION: 1\n", | |
"\tNUCLEAR EXPRESSION: 7\n", | |
"\tOVEREXPRESSION: 35\n", | |
"\tHYPOMETHYLATION: 0\n", | |
"\tTotal: 132\n", | |
"loss_of_heterozygosity:\n", | |
"\tLOH: 1\n", | |
"\tTotal: 1\n", | |
"transcript_ablation:\n", | |
"\tDELETION: 3\n", | |
"\tTotal: 3\n", | |
"loss_of_function_variant:\n", | |
"\tLOSS-OF-FUNCTION: 10\n", | |
"\tLOSS: 9\n", | |
"\tTotal: 19\n", | |
"missense_variant:\n", | |
"\tG12D: 144\n", | |
"\tTotal: 144\n", | |
"protein_altering_variant:\n", | |
"\tG12: 18\n", | |
"\tKINASE DOMAIN MUTATION: 4\n", | |
"\tTotal: 22\n", | |
"exon_loss_variant:\n", | |
"\tEXON 14 SKIPPING MUTATION: 1\n", | |
"\tTotal: 1\n", | |
"copy_number_change:\n", | |
"\tCOPY NUMBER VARIATION: 1\n", | |
"\tTotal: 1\n", | |
"transcript_fusion and missense_variant:\n", | |
"\tEML4-ALK G332K: 15\n", | |
"\tTotal: 15\n", | |
"transcript_translocation or feature_translocation:\n", | |
"\tREARRANGEMENT: 2\n", | |
"\tTotal: 2\n", | |
"5_prime_UTR_variant:\n", | |
"\t5' UTR MUTATION: 1\n", | |
"\tTotal: 1\n", | |
"3_prime_UTR_variant:\n", | |
"\t3' UTR MUTATION: 2\n", | |
"\tTotal: 2\n", | |
"exon_variant:\n", | |
"\tEXON 10 MUTATION: 11\n", | |
"\tTotal: 11\n", | |
"gene_variant:\n", | |
"\tMUTATION: 49\n", | |
"\tTotal: 49\n", | |
"Total: 481 / 580 (82.9%)\n", | |
"Total (no NA): 349 / 580 (60.2%)\n" | |
] | |
} | |
], | |
"source": [ | |
"results = count_matches(regex_dict)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# Second pass: add or replace patterns in regex_dict. Every statement assigns by key,\n", | |
"# so re-running this cell leaves the dictionary in the same state.\n", | |
"# regex_dict['discussion'] = dict()\n", | |
"regex_dict['NA']['PROMOTER METHYLATION'] = re.compile(r'PROMOTER \\w*METHYLATION')\n", | |
"regex_dict['inframe_insertion']['ITD'] = re.compile(r'(ITD)|(INTERNAL DUPLICATION)')\n", | |
"regex_dict['NA']['PHOSPHORYLATION'] = re.compile(r'\\w*( )?PHOSPHORYLATION')\n", | |
"regex_dict['frameshift_truncation']['frameshift variance'] = re.compile(\"[{0}]\\\\d+((fs)|(FS))(\\\\*\\\\d)?\".format(prot_alpha))\n", | |
"regex_dict['frameshift_truncation']['FRAMESHIFT TRUNCATION'] = re.compile(\"FRAMESHIFT TRUNCATION\")\n", | |
"# Compile once; the pattern was previously wrapped in a second, redundant re.compile().\n", | |
"regex_dict['transcript_fusion']['fusion nomenclature'] = re.compile(r'\\w+-\\w+ FUSION')\n", | |
"# regex_dict['discussion']['polymorphisms'] = re.compile(r'(SNP)|(.*polymorphism)', flags=re.IGNORECASE)\n", | |
"# regex_dict['discussion']['serum'] = re.compile(r'SERUM LEVELS')\n", | |
"# regex_dict['discussion']['exon deletions'] = re.compile(r'EXON \\d+(-\\d+)? DELETION')\n", | |
"# Backslashes are doubled like in the other non-raw patterns: a bare \\w inside a normal\n", | |
"# string literal is an invalid escape sequence. The resulting regex is unchanged.\n", | |
"regex_dict['transcript_fusion']['EML4-ALK'] = re.compile(\"\\\\w+-\\\\w+( [{0}]\\\\d+;[{0}]\\\\d+)?\".format(prot_alpha))\n", | |
"regex_dict['missense_variant']['stars and slashes'] = re.compile(\"[{0}]\\\\d+[{0}*](/[{0}])?\".format(prot_alpha))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"inframe_insertion:\n", | |
"\tP790INS: 1\n", | |
"\tITD: 2\n", | |
"\tM774INSAYVM: 1\n", | |
"\tTotal: 4\n", | |
"inframe_deletion:\n", | |
"\tDEL 755-759: 1\n", | |
"\tV560DEL: 1\n", | |
"\tDEL I843: 1\n", | |
"\tTotal: 3\n", | |
"transcript_fusion:\n", | |
"\tEML4-ALK: 35\n", | |
"\tfusion nomenclature: 6\n", | |
"\tALK FUSIONS: 6\n", | |
"\tTotal: 47\n", | |
"transcript_amplification:\n", | |
"\tAMPLIFICATION: 29\n", | |
"\tTotal: 29\n", | |
"wild_type:\n", | |
"\tWILD TYPE: 3\n", | |
"\tTotal: 3\n", | |
"frameshift_truncation:\n", | |
"\tframeshift variance: 9\n", | |
"\tFRAMESHIFT TRUNCATION: 1\n", | |
"\tV2288fs*1: 2\n", | |
"\tTotal: 11\n", | |
"NA:\n", | |
"\tHYPERMETHYLATION: 0\n", | |
"\tEXPRESSION: 73\n", | |
"\tUNDEREXPRESSION: 14\n", | |
"\tMETHYLATION: 1\n", | |
"\tPHOSPHORYLATION: 5\n", | |
"\tCYTOPLASMIC EXPRESSION: 1\n", | |
"\tPROMOTER METHYLATION: 5\n", | |
"\tNUCLEAR EXPRESSION: 7\n", | |
"\tOVEREXPRESSION: 35\n", | |
"\tHYPOMETHYLATION: 0\n", | |
"\tTotal: 141\n", | |
"loss_of_heterozygosity:\n", | |
"\tLOH: 1\n", | |
"\tTotal: 1\n", | |
"transcript_ablation:\n", | |
"\tDELETION: 3\n", | |
"\tTotal: 3\n", | |
"loss_of_function_variant:\n", | |
"\tLOSS-OF-FUNCTION: 10\n", | |
"\tLOSS: 9\n", | |
"\tTotal: 19\n", | |
"missense_variant:\n", | |
"\tstars and slashes: 153\n", | |
"\tG12D: 144\n", | |
"\tTotal: 153\n", | |
"protein_altering_variant:\n", | |
"\tG12: 18\n", | |
"\tKINASE DOMAIN MUTATION: 4\n", | |
"\tTotal: 22\n", | |
"exon_loss_variant:\n", | |
"\tEXON 14 SKIPPING MUTATION: 1\n", | |
"\tTotal: 1\n", | |
"copy_number_change:\n", | |
"\tCOPY NUMBER VARIATION: 1\n", | |
"\tTotal: 1\n", | |
"transcript_fusion and missense_variant:\n", | |
"\tEML4-ALK G332K: 15\n", | |
"\tTotal: 15\n", | |
"transcript_translocation or feature_translocation:\n", | |
"\tREARRANGEMENT: 2\n", | |
"\tTotal: 2\n", | |
"5_prime_UTR_variant:\n", | |
"\t5' UTR MUTATION: 1\n", | |
"\tTotal: 1\n", | |
"3_prime_UTR_variant:\n", | |
"\t3' UTR MUTATION: 2\n", | |
"\tTotal: 2\n", | |
"exon_variant:\n", | |
"\tEXON 10 MUTATION: 11\n", | |
"\tTotal: 11\n", | |
"gene_variant:\n", | |
"\tMUTATION: 49\n", | |
"\tTotal: 49\n", | |
"Total: 518 / 580 (89.3%)\n", | |
"Total (no NA): 377 / 580 (65.0%)\n" | |
] | |
} | |
], | |
"source": [ | |
"results = count_matches(regex_dict)\n", | |
"# (name, id) pairs for the variants whose id was matched by no regex at all.\n", | |
"missed_variants = [(v['name'], v['id']) for v in variants if v['id'] not in results['all']]\n", | |
"# newline='' hands line-ending control to the csv module; without it, platforms that\n", | |
"# translate '\\n' on write end up with an extra blank line after every row.\n", | |
"with open('missed_variants.txt', 'w', newline='') as f:\n", | |
"    writer = csv.writer(f)\n", | |
"    writer.writerow(['variant_name', 'civic_id'])\n", | |
"    writer.writerows(missed_variants)" | |
] | |
} | |
], | |
"metadata": { | |
"gist": { | |
"data": { | |
"description": "CIViC_term_mapping.ipynb", | |
"public": true | |
}, | |
"id": "" | |
}, | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.5.1" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment