Skip to content

Instantly share code, notes, and snippets.

@ahwagner
Last active August 29, 2016 03:47
Show Gist options
  • Save ahwagner/707dae34173b7e7251bb4f2d28d836d0 to your computer and use it in GitHub Desktop.
Save ahwagner/707dae34173b7e7251bb4f2d28d836d0 to your computer and use it in GitHub Desktop.
CIViC_term_mapping.ipynb
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import requests\n",
"import re\n",
"from Bio.Alphabet import IUPAC\n",
"import csv"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Request variant records from the CIViC API; 'count' is presumably the page\n",
"# size, set high enough to get everything in one response -- confirm.\n",
"params = {'count': 1000}\n",
"# A timeout keeps a stalled connection from hanging Restart & Run All forever.\n",
"r = requests.get('http://civic.genome.wustl.edu/api/variants', params=params, timeout=60)\n",
"# Surface HTTP errors here instead of as an opaque JSON/KeyError in the next cell.\n",
"r.raise_for_status()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"580"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Keep only the list under 'records' from the JSON payload; the bare len()\n",
"# is the cell's displayed value.\n",
"variants = r.json()['records']\n",
"len(variants)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# String of letters that gets interpolated into the regex character classes\n",
"# below (presumably the one-letter amino-acid codes -- confirm).\n",
"# NOTE(review): Bio.Alphabet was removed in Biopython 1.78; newer installs\n",
"# presumably need Bio.Data.IUPACData.protein_letters instead -- confirm.\n",
"prot_alpha = IUPAC.IUPACProtein.letters"
]
},
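{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Test which variant names match the example patterns\n",
"\n",
"Each term maps example variant names to a regex; a variant counts as matched only when a regex matches its entire name."
]
},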
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Maps each term label (presumably Sequence Ontology terms -- confirm) to a dict of\n",
"# {example variant name: compiled regex}. count_matches applies every regex with\n",
"# fullmatch(), so a pattern must describe the whole variant name, not a substring.\n",
"# [{0}] is filled with prot_alpha to form a character class of protein letters.\n",
"# The 'NA' group uses each listed name verbatim as its own pattern.\n",
"regex_dict = \\\n",
" {'missense_variant': \n",
" {'G12D': re.compile(\"[{0}]\\\\d+[{0}]\".format(prot_alpha))},\n",
" 'protein_altering_variant':\n",
" {'G12': re.compile(\"[{0}]\\\\d+\".format(prot_alpha)),\n",
" 'KINASE DOMAIN MUTATION': re.compile(r'.* DOMAIN MUTATION')},\n",
" 'frameshift_truncation':\n",
" {'V2288fs*1': re.compile(\"[{0}]\\\\d+fs\\\\*\\\\d+\".format(prot_alpha))},\n",
" 'inframe_deletion': \n",
" {'DEL I843': re.compile(\"DEL [{0}]\\\\d+\".format(prot_alpha)),\n",
" 'V560DEL': re.compile(\"[{0}]\\\\d+DEL\".format(prot_alpha)),\n",
" 'DEL 755-759': re.compile(r'DEL \\d+-\\d+')},\n",
" 'inframe_insertion':\n",
" {'P790INS': re.compile(\"[{0}]\\\\d+INS\".format(prot_alpha)),\n",
" 'M774INSAYVM': re.compile(\"[{0}]\\\\d+INS[{0}]+\".format(prot_alpha)),\n",
" 'ITD': re.compile(r'ITD')},\n",
" 'gene_variant':\n",
" {'MUTATION': re.compile(r'MUTATION')},\n",
" 'exon_variant':\n",
" {'EXON 10 MUTATION': re.compile(r'EXON \\d+ MUTATION')},\n",
" 'transcript_fusion':\n",
" {'EML4-ALK': re.compile(r'\\w+-\\w+'),\n",
" 'ALK FUSIONS': re.compile(r'\\w+ FUSIONS')},\n",
" 'transcript_fusion and missense_variant':\n",
" {'EML4-ALK G332K': re.compile(\"\\\\w+-\\\\w+ [{0}]\\\\d+[{0}]\".format(prot_alpha))},\n",
" 'transcript_translocation or feature_translocation':\n",
" {'REARRANGEMENT': re.compile(r'REARRANGEMENT')},\n",
" 'wild_type':\n",
" {'WILD TYPE': re.compile(r'WILD TYPE')},\n",
" 'loss_of_heterozygosity':\n",
" {'LOH': re.compile(r'LOH')},\n",
" 'transcript_amplification':\n",
" {'AMPLIFICATION': re.compile(r'AMPLIFICATION')},\n",
" 'transcript_ablation':\n",
" {'DELETION': re.compile(r'DELETION')},\n",
" 'copy_number_change':\n",
" {'COPY NUMBER VARIATION': re.compile(r'COPY NUMBER VARIATION')},\n",
" 'loss_of_function_variant': \n",
" {'LOSS-OF-FUNCTION': re.compile(r'LOSS-OF-FUNCTION'),\n",
" 'LOSS': re.compile(r'LOSS')},\n",
" 'exon_loss_variant':\n",
" {'EXON 14 SKIPPING MUTATION': re.compile(r'EXON \\d+ SKIPPING MUTATION')},\n",
" '5_prime_UTR_variant':\n",
" {\"5' UTR MUTATION\": re.compile(r\"5' UTR MUTATION\")},\n",
" '3_prime_UTR_variant':\n",
" {\"3' UTR MUTATION\": re.compile(r\"3' UTR MUTATION\")},\n",
" 'NA':\n",
" { x: re.compile(x) for x in ['EXPRESSION',\n",
" 'NUCLEAR EXPRESSION',\n",
" 'CYTOPLASMIC EXPRESSION',\n",
" 'OVEREXPRESSION',\n",
" 'UNDEREXPRESSION',\n",
" 'METHYLATION',\n",
" 'PROMOTER METHYLATION',\n",
" 'HYPOMETHYLATION',\n",
" 'HYPERMETHYLATION']}\n",
" }"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def count_matches(rd, variant_records=None):\n",
"    \"\"\"Print per-term match counts and return the matched variant ids.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    rd : dict\n",
"        ``{term: {example_name: compiled_regex}}``; each regex is applied with\n",
"        ``fullmatch`` to the ``'name'`` of every record.\n",
"    variant_records : iterable of dict, optional\n",
"        Records carrying ``'id'`` and ``'name'`` keys. When omitted, the\n",
"        notebook-level ``variants`` list is used, so ``count_matches(regex_dict)``\n",
"        behaves as before.\n",
"\n",
"    Returns\n",
"    -------\n",
"    dict\n",
"        ``results[term][example]`` is the set of matched ids,\n",
"        ``results[term]['all']`` the union for that term, and\n",
"        ``results['all']`` the union over every term.\n",
"    \"\"\"\n",
"    if variant_records is None:\n",
"        variant_records = variants  # notebook-level list built from the API response\n",
"    # Materialise once: the records are scanned once per regex.\n",
"    variant_records = list(variant_records)\n",
"    y = len(variant_records)\n",
"\n",
"    def percent(count):\n",
"        # An empty record list would otherwise raise ZeroDivisionError.\n",
"        return round(count / y * 100, 1) if y else 0.0\n",
"\n",
"    results = {'all': set()}\n",
"    for term, examples in rd.items():\n",
"        term_ids = set()\n",
"        results[term] = {}\n",
"        print(\"{0}:\".format(term))\n",
"        for example, pattern in examples.items():\n",
"            matched = {v['id'] for v in variant_records if pattern.fullmatch(v['name'])}\n",
"            results[term][example] = matched\n",
"            print(\"\\t{0}: {1}\".format(example, len(matched)))\n",
"            term_ids |= matched\n",
"        results[term]['all'] = term_ids\n",
"        results['all'] |= term_ids\n",
"        print(\"\\tTotal: {0}\".format(len(term_ids)))\n",
"    x = len(results['all'])\n",
"    print(\"Total: {0} / {1} ({2}%)\".format(x, y, percent(x)))\n",
"    # The 'NA' term is optional: a dict without it simply has nothing to subtract.\n",
"    na_ids = results.get('NA', {}).get('all', set())\n",
"    non_na = results['all'] - na_ids\n",
"    print(\"Total (no NA): {0} / {1} ({2}%)\".format(len(non_na), y, percent(len(non_na))))\n",
"    return results"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"inframe_insertion:\n",
"\tP790INS: 1\n",
"\tITD: 1\n",
"\tM774INSAYVM: 1\n",
"\tTotal: 3\n",
"inframe_deletion:\n",
"\tDEL 755-759: 1\n",
"\tV560DEL: 1\n",
"\tDEL I843: 1\n",
"\tTotal: 3\n",
"transcript_fusion:\n",
"\tEML4-ALK: 32\n",
"\tALK FUSIONS: 6\n",
"\tTotal: 38\n",
"transcript_amplification:\n",
"\tAMPLIFICATION: 29\n",
"\tTotal: 29\n",
"wild_type:\n",
"\tWILD TYPE: 3\n",
"\tTotal: 3\n",
"frameshift_truncation:\n",
"\tV2288fs*1: 2\n",
"\tTotal: 2\n",
"NA:\n",
"\tHYPERMETHYLATION: 0\n",
"\tEXPRESSION: 73\n",
"\tUNDEREXPRESSION: 14\n",
"\tMETHYLATION: 1\n",
"\tCYTOPLASMIC EXPRESSION: 1\n",
"\tPROMOTER METHYLATION: 1\n",
"\tNUCLEAR EXPRESSION: 7\n",
"\tOVEREXPRESSION: 35\n",
"\tHYPOMETHYLATION: 0\n",
"\tTotal: 132\n",
"loss_of_heterozygosity:\n",
"\tLOH: 1\n",
"\tTotal: 1\n",
"transcript_ablation:\n",
"\tDELETION: 3\n",
"\tTotal: 3\n",
"loss_of_function_variant:\n",
"\tLOSS-OF-FUNCTION: 10\n",
"\tLOSS: 9\n",
"\tTotal: 19\n",
"missense_variant:\n",
"\tG12D: 144\n",
"\tTotal: 144\n",
"protein_altering_variant:\n",
"\tG12: 18\n",
"\tKINASE DOMAIN MUTATION: 4\n",
"\tTotal: 22\n",
"exon_loss_variant:\n",
"\tEXON 14 SKIPPING MUTATION: 1\n",
"\tTotal: 1\n",
"copy_number_change:\n",
"\tCOPY NUMBER VARIATION: 1\n",
"\tTotal: 1\n",
"transcript_fusion and missense_variant:\n",
"\tEML4-ALK G332K: 15\n",
"\tTotal: 15\n",
"transcript_translocation or feature_translocation:\n",
"\tREARRANGEMENT: 2\n",
"\tTotal: 2\n",
"5_prime_UTR_variant:\n",
"\t5' UTR MUTATION: 1\n",
"\tTotal: 1\n",
"3_prime_UTR_variant:\n",
"\t3' UTR MUTATION: 2\n",
"\tTotal: 2\n",
"exon_variant:\n",
"\tEXON 10 MUTATION: 11\n",
"\tTotal: 11\n",
"gene_variant:\n",
"\tMUTATION: 49\n",
"\tTotal: 49\n",
"Total: 481 / 580 (82.9%)\n",
"Total (no NA): 349 / 580 (60.2%)\n"
]
}
],
"source": [
"# Score the initial regex set; count_matches prints the per-term report shown\n",
"# below and returns the matched ids.\n",
"results = count_matches(regex_dict)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Widen or add patterns on top of the initial regex_dict. Assigning to an existing\n",
"# example key (e.g. 'ITD', 'EML4-ALK', 'PROMOTER METHYLATION') replaces that regex.\n",
"# regex_dict['discussion'] = dict()\n",
"regex_dict['NA']['PROMOTER METHYLATION'] = re.compile(r'PROMOTER \\w*METHYLATION')\n",
"regex_dict['inframe_insertion']['ITD'] = re.compile(r'(ITD)|(INTERNAL DUPLICATION)')\n",
"regex_dict['NA']['PHOSPHORYLATION'] = re.compile(r'\\w*( )?PHOSPHORYLATION')\n",
"# NOTE(review): the optional suffix here accepts one digit after '*', while the\n",
"# original 'V2288fs*1' pattern uses \\d+ -- confirm the difference is intended.\n",
"regex_dict['frameshift_truncation']['frameshift variance'] = re.compile(\"[{0}]\\\\d+((fs)|(FS))(\\\\*\\\\d)?\".format(prot_alpha))\n",
"regex_dict['frameshift_truncation']['FRAMESHIFT TRUNCATION'] = re.compile(\"FRAMESHIFT TRUNCATION\")\n",
"# Compile once; wrapping an already compiled pattern in re.compile() added nothing.\n",
"regex_dict['transcript_fusion']['fusion nomenclature'] = re.compile(r'\\w+-\\w+ FUSION')\n",
"# regex_dict['discussion']['polymorphisms'] = re.compile(r'(SNP)|(.*polymorphism)', flags=re.IGNORECASE)\n",
"# regex_dict['discussion']['serum'] = re.compile(r'SERUM LEVELS')\n",
"# regex_dict['discussion']['exon deletions'] = re.compile(r'EXON \\d+(-\\d+)? DELETION')\n",
"# Backslashes are doubled because this is a normal (non-raw) string passed through\n",
"# .format(); a bare \\w there is an invalid escape sequence.\n",
"regex_dict['transcript_fusion']['EML4-ALK'] = re.compile(\"\\\\w+-\\\\w+( [{0}]\\\\d+;[{0}]\\\\d+)?\".format(prot_alpha))\n",
"regex_dict['missense_variant']['stars and slashes'] = re.compile(\"[{0}]\\\\d+[{0}*](/[{0}])?\".format(prot_alpha))"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"inframe_insertion:\n",
"\tP790INS: 1\n",
"\tITD: 2\n",
"\tM774INSAYVM: 1\n",
"\tTotal: 4\n",
"inframe_deletion:\n",
"\tDEL 755-759: 1\n",
"\tV560DEL: 1\n",
"\tDEL I843: 1\n",
"\tTotal: 3\n",
"transcript_fusion:\n",
"\tEML4-ALK: 35\n",
"\tfusion nomenclature: 6\n",
"\tALK FUSIONS: 6\n",
"\tTotal: 47\n",
"transcript_amplification:\n",
"\tAMPLIFICATION: 29\n",
"\tTotal: 29\n",
"wild_type:\n",
"\tWILD TYPE: 3\n",
"\tTotal: 3\n",
"frameshift_truncation:\n",
"\tframeshift variance: 9\n",
"\tFRAMESHIFT TRUNCATION: 1\n",
"\tV2288fs*1: 2\n",
"\tTotal: 11\n",
"NA:\n",
"\tHYPERMETHYLATION: 0\n",
"\tEXPRESSION: 73\n",
"\tUNDEREXPRESSION: 14\n",
"\tMETHYLATION: 1\n",
"\tPHOSPHORYLATION: 5\n",
"\tCYTOPLASMIC EXPRESSION: 1\n",
"\tPROMOTER METHYLATION: 5\n",
"\tNUCLEAR EXPRESSION: 7\n",
"\tOVEREXPRESSION: 35\n",
"\tHYPOMETHYLATION: 0\n",
"\tTotal: 141\n",
"loss_of_heterozygosity:\n",
"\tLOH: 1\n",
"\tTotal: 1\n",
"transcript_ablation:\n",
"\tDELETION: 3\n",
"\tTotal: 3\n",
"loss_of_function_variant:\n",
"\tLOSS-OF-FUNCTION: 10\n",
"\tLOSS: 9\n",
"\tTotal: 19\n",
"missense_variant:\n",
"\tstars and slashes: 153\n",
"\tG12D: 144\n",
"\tTotal: 153\n",
"protein_altering_variant:\n",
"\tG12: 18\n",
"\tKINASE DOMAIN MUTATION: 4\n",
"\tTotal: 22\n",
"exon_loss_variant:\n",
"\tEXON 14 SKIPPING MUTATION: 1\n",
"\tTotal: 1\n",
"copy_number_change:\n",
"\tCOPY NUMBER VARIATION: 1\n",
"\tTotal: 1\n",
"transcript_fusion and missense_variant:\n",
"\tEML4-ALK G332K: 15\n",
"\tTotal: 15\n",
"transcript_translocation or feature_translocation:\n",
"\tREARRANGEMENT: 2\n",
"\tTotal: 2\n",
"5_prime_UTR_variant:\n",
"\t5' UTR MUTATION: 1\n",
"\tTotal: 1\n",
"3_prime_UTR_variant:\n",
"\t3' UTR MUTATION: 2\n",
"\tTotal: 2\n",
"exon_variant:\n",
"\tEXON 10 MUTATION: 11\n",
"\tTotal: 11\n",
"gene_variant:\n",
"\tMUTATION: 49\n",
"\tTotal: 49\n",
"Total: 518 / 580 (89.3%)\n",
"Total (no NA): 377 / 580 (65.0%)\n"
]
}
],
"source": [
"# Re-score with the current regex_dict, then write out every variant whose id is\n",
"# not in results['all'], i.e. whose name matched no pattern.\n",
"results = count_matches(regex_dict)\n",
"missed_variants = [(v['name'], v['id']) for v in variants if v['id'] not in results['all']]\n",
"# newline='' is what the csv module asks for; without it rows gain a blank line on Windows.\n",
"with open('missed_variants.txt', 'w', newline='') as f:\n",
"    writer = csv.writer(f)\n",
"    writer.writerow(['variant_name', 'civic_id'])\n",
"    writer.writerows(missed_variants)"
]
}
],
"metadata": {
"gist": {
"data": {
"description": "CIViC_term_mapping.ipynb",
"public": true
},
"id": ""
},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.1"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment