ahwagner/CIViC term mapping.ipynb

## CIViC term mapping.ipynb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import requests\n",
    "import re\n",
    "from Bio.Alphabet import IUPAC\n",
    "import csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Query parameters for the CIViC variants endpoint.\n",
    "# NOTE(review): 'count' presumably caps the number of records returned - confirm against the API.\n",
    "params = {'count': 1000}\n",
    "# A timeout keeps the kernel from hanging forever if the server never answers.\n",
    "r = requests.get('http://civic.genome.wustl.edu/api/variants', params=params, timeout=30)\n",
    "# Surface HTTP errors here rather than as a confusing failure when the body is parsed.\n",
    "r.raise_for_status()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "580"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Parse the response body and keep the value stored under its 'records' key;\n",
    "# the trailing expression displays how many records came back.\n",
    "payload = r.json()\n",
    "variants = payload['records']\n",
    "len(variants)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Letters of Biopython's IUPACProtein alphabet; they are substituted into the\n",
    "# [{0}] character classes of the regular expressions defined further down.\n",
    "protein_alphabet = IUPAC.IUPACProtein\n",
    "prot_alpha = protein_alphabet.letters"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Test which variants match examples"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# term label -> {example variant name: compiled pattern for names shaped like that example}\n",
    "regex_dict = dict()\n",
    "\n",
    "regex_dict['missense_variant'] = {\n",
    "    'G12D': re.compile(\"[{0}]\\\\d+[{0}]\".format(prot_alpha)),\n",
    "}\n",
    "regex_dict['protein_altering_variant'] = {\n",
    "    'G12': re.compile(\"[{0}]\\\\d+\".format(prot_alpha)),\n",
    "    'KINASE DOMAIN MUTATION': re.compile(r'.* DOMAIN MUTATION'),\n",
    "}\n",
    "regex_dict['frameshift_truncation'] = {\n",
    "    'V2288fs*1': re.compile(\"[{0}]\\\\d+fs\\\\*\\\\d+\".format(prot_alpha)),\n",
    "}\n",
    "regex_dict['inframe_deletion'] = {\n",
    "    'DEL I843': re.compile(\"DEL [{0}]\\\\d+\".format(prot_alpha)),\n",
    "    'V560DEL': re.compile(\"[{0}]\\\\d+DEL\".format(prot_alpha)),\n",
    "    'DEL 755-759': re.compile(r'DEL \\d+-\\d+'),\n",
    "}\n",
    "regex_dict['inframe_insertion'] = {\n",
    "    'P790INS': re.compile(\"[{0}]\\\\d+INS\".format(prot_alpha)),\n",
    "    'M774INSAYVM': re.compile(\"[{0}]\\\\d+INS[{0}]+\".format(prot_alpha)),\n",
    "    'ITD': re.compile(r'ITD'),\n",
    "}\n",
    "regex_dict['gene_variant'] = {'MUTATION': re.compile(r'MUTATION')}\n",
    "regex_dict['exon_variant'] = {'EXON 10 MUTATION': re.compile(r'EXON \\d+ MUTATION')}\n",
    "regex_dict['transcript_fusion'] = {\n",
    "    'EML4-ALK': re.compile(r'\\w+-\\w+'),\n",
    "    'ALK FUSIONS': re.compile(r'\\w+ FUSIONS'),\n",
    "}\n",
    "regex_dict['transcript_fusion and missense_variant'] = {\n",
    "    'EML4-ALK G332K': re.compile(\"\\\\w+-\\\\w+ [{0}]\\\\d+[{0}]\".format(prot_alpha)),\n",
    "}\n",
    "regex_dict['transcript_translocation or feature_translocation'] = {\n",
    "    'REARRANGEMENT': re.compile(r'REARRANGEMENT'),\n",
    "}\n",
    "regex_dict['wild_type'] = {'WILD TYPE': re.compile(r'WILD TYPE')}\n",
    "regex_dict['loss_of_heterozygosity'] = {'LOH': re.compile(r'LOH')}\n",
    "regex_dict['transcript_amplification'] = {'AMPLIFICATION': re.compile(r'AMPLIFICATION')}\n",
    "regex_dict['transcript_ablation'] = {'DELETION': re.compile(r'DELETION')}\n",
    "regex_dict['copy_number_change'] = {\n",
    "    'COPY NUMBER VARIATION': re.compile(r'COPY NUMBER VARIATION'),\n",
    "}\n",
    "regex_dict['loss_of_function_variant'] = {\n",
    "    'LOSS-OF-FUNCTION': re.compile(r'LOSS-OF-FUNCTION'),\n",
    "    'LOSS': re.compile(r'LOSS'),\n",
    "}\n",
    "regex_dict['exon_loss_variant'] = {\n",
    "    'EXON 14 SKIPPING MUTATION': re.compile(r'EXON \\d+ SKIPPING MUTATION'),\n",
    "}\n",
    "regex_dict['5_prime_UTR_variant'] = {\"5' UTR MUTATION\": re.compile(r\"5' UTR MUTATION\")}\n",
    "regex_dict['3_prime_UTR_variant'] = {\"3' UTR MUTATION\": re.compile(r\"3' UTR MUTATION\")}\n",
    "\n",
    "# Under 'NA' every example name doubles as its own literal pattern.\n",
    "regex_dict['NA'] = {\n",
    "    label: re.compile(label)\n",
    "    for label in ('EXPRESSION',\n",
    "                  'NUCLEAR EXPRESSION',\n",
    "                  'CYTOPLASMIC EXPRESSION',\n",
    "                  'OVEREXPRESSION',\n",
    "                  'UNDEREXPRESSION',\n",
    "                  'METHYLATION',\n",
    "                  'PROMOTER METHYLATION',\n",
    "                  'HYPOMETHYLATION',\n",
    "                  'HYPERMETHYLATION')\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def count_matches(rd, variant_records=None):\n",
    "    \"\"\"Report how many variant names each pattern in `rd` fully matches.\n",
    "\n",
    "    Prints one count per example, a total per term, and the overall share of\n",
    "    matched records, both including and excluding the 'NA' term.\n",
    "\n",
    "    Args:\n",
    "        rd: dict of {term: {example: compiled regex}}. Terms must not be named\n",
    "            'all'; that key is reserved for the overall set in the result.\n",
    "        variant_records: optional iterable of dicts with 'id' and 'name' keys.\n",
    "            When omitted, the notebook-level `variants` is used, so existing\n",
    "            count_matches(regex_dict) calls behave as before.\n",
    "\n",
    "    Returns:\n",
    "        dict where results['all'] is the set of ids matched by any pattern,\n",
    "        results[term][example] the ids matched by that example's pattern, and\n",
    "        results[term]['all'] the union over the term's examples.\n",
    "    \"\"\"\n",
    "    # Materialise the records: they are scanned once per pattern.\n",
    "    records = variants if variant_records is None else list(variant_records)\n",
    "\n",
    "    def percent(part, whole):\n",
    "        # Zero records would otherwise raise ZeroDivisionError.\n",
    "        return round(part / whole * 100, 1) if whole else 0.0\n",
    "\n",
    "    results = {'all': set()}\n",
    "    for term, examples in rd.items():\n",
    "        term_ids = set()\n",
    "        results[term] = dict()\n",
    "        print(\"{0}:\".format(term))\n",
    "        for example, pattern in examples.items():\n",
    "            # fullmatch: the pattern has to describe the whole name, not a substring.\n",
    "            matched = {v['id'] for v in records if pattern.fullmatch(v['name'])}\n",
    "            results[term][example] = matched\n",
    "            print(\"\\t{0}: {1}\".format(example, len(matched)))\n",
    "            term_ids |= matched\n",
    "        results[term]['all'] = term_ids\n",
    "        results['all'] |= term_ids\n",
    "        print(\"\\tTotal: {0}\".format(len(term_ids)))\n",
    "    n_matched, n_records = len(results['all']), len(records)\n",
    "    print(\"Total: {0} / {1} ({2}%)\".format(n_matched, n_records, percent(n_matched, n_records)))\n",
    "    # rd is not required to define an 'NA' term; a missing one counts as empty.\n",
    "    na_ids = results['NA']['all'] if 'NA' in rd else set()\n",
    "    non_na = results['all'] - na_ids\n",
    "    print(\"Total (no NA): {0} / {1} ({2}%)\".format(len(non_na), n_records, percent(len(non_na), n_records)))\n",
    "    return results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "inframe_insertion:\n",
      "\tP790INS: 1\n",
      "\tITD: 1\n",
      "\tM774INSAYVM: 1\n",
      "\tTotal: 3\n",
      "inframe_deletion:\n",
      "\tDEL 755-759: 1\n",
      "\tV560DEL: 1\n",
      "\tDEL I843: 1\n",
      "\tTotal: 3\n",
      "transcript_fusion:\n",
      "\tEML4-ALK: 32\n",
      "\tALK FUSIONS: 6\n",
      "\tTotal: 38\n",
      "transcript_amplification:\n",
      "\tAMPLIFICATION: 29\n",
      "\tTotal: 29\n",
      "wild_type:\n",
      "\tWILD TYPE: 3\n",
      "\tTotal: 3\n",
      "frameshift_truncation:\n",
      "\tV2288fs*1: 2\n",
      "\tTotal: 2\n",
      "NA:\n",
      "\tHYPERMETHYLATION: 0\n",
      "\tEXPRESSION: 73\n",
      "\tUNDEREXPRESSION: 14\n",
      "\tMETHYLATION: 1\n",
      "\tCYTOPLASMIC EXPRESSION: 1\n",
      "\tPROMOTER METHYLATION: 1\n",
      "\tNUCLEAR EXPRESSION: 7\n",
      "\tOVEREXPRESSION: 35\n",
      "\tHYPOMETHYLATION: 0\n",
      "\tTotal: 132\n",
      "loss_of_heterozygosity:\n",
      "\tLOH: 1\n",
      "\tTotal: 1\n",
      "transcript_ablation:\n",
      "\tDELETION: 3\n",
      "\tTotal: 3\n",
      "loss_of_function_variant:\n",
      "\tLOSS-OF-FUNCTION: 10\n",
      "\tLOSS: 9\n",
      "\tTotal: 19\n",
      "missense_variant:\n",
      "\tG12D: 144\n",
      "\tTotal: 144\n",
      "protein_altering_variant:\n",
      "\tG12: 18\n",
      "\tKINASE DOMAIN MUTATION: 4\n",
      "\tTotal: 22\n",
      "exon_loss_variant:\n",
      "\tEXON 14 SKIPPING MUTATION: 1\n",
      "\tTotal: 1\n",
      "copy_number_change:\n",
      "\tCOPY NUMBER VARIATION: 1\n",
      "\tTotal: 1\n",
      "transcript_fusion and missense_variant:\n",
      "\tEML4-ALK G332K: 15\n",
      "\tTotal: 15\n",
      "transcript_translocation or feature_translocation:\n",
      "\tREARRANGEMENT: 2\n",
      "\tTotal: 2\n",
      "5_prime_UTR_variant:\n",
      "\t5' UTR MUTATION: 1\n",
      "\tTotal: 1\n",
      "3_prime_UTR_variant:\n",
      "\t3' UTR MUTATION: 2\n",
      "\tTotal: 2\n",
      "exon_variant:\n",
      "\tEXON 10 MUTATION: 11\n",
      "\tTotal: 11\n",
      "gene_variant:\n",
      "\tMUTATION: 49\n",
      "\tTotal: 49\n",
      "Total: 481 / 580 (82.9%)\n",
      "Total (no NA): 349 / 580 (60.2%)\n"
     ]
    }
   ],
   "source": [
    "# Coverage of the initial pattern set; count_matches prints the per-term breakdown.\n",
    "results = count_matches(rd=regex_dict)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Add further patterns and replace three existing ones ('PROMOTER METHYLATION',\n",
    "# 'ITD', 'EML4-ALK') with broader versions. The commented-out 'discussion'\n",
    "# entries are kept disabled.\n",
    "# regex_dict['discussion'] = dict()\n",
    "regex_dict['NA']['PROMOTER METHYLATION'] = re.compile(r'PROMOTER \\w*METHYLATION')\n",
    "regex_dict['inframe_insertion']['ITD'] = re.compile(r'(ITD)|(INTERNAL DUPLICATION)')\n",
    "regex_dict['NA']['PHOSPHORYLATION'] = re.compile(r'\\w*( )?PHOSPHORYLATION')\n",
    "regex_dict['frameshift_truncation']['frameshift variance'] = re.compile(\"[{0}]\\\\d+((fs)|(FS))(\\\\*\\\\d)?\".format(prot_alpha))\n",
    "regex_dict['frameshift_truncation']['FRAMESHIFT TRUNCATION'] = re.compile(\"FRAMESHIFT TRUNCATION\")\n",
    "# Compiled exactly once; wrapping a compiled pattern in another re.compile() adds nothing.\n",
    "regex_dict['transcript_fusion']['fusion nomenclature'] = re.compile(r'\\w+-\\w+ FUSION')\n",
    "# regex_dict['discussion']['polymorphisms'] = re.compile(r'(SNP)|(.*polymorphism)', flags=re.IGNORECASE)\n",
    "# regex_dict['discussion']['serum'] = re.compile(r'SERUM LEVELS')\n",
    "# regex_dict['discussion']['exon deletions'] = re.compile(r'EXON \\d+(-\\d+)? DELETION')\n",
    "# Not a raw string, so every regex backslash is doubled; a single backslash\n",
    "# before w would be an invalid string escape sequence.\n",
    "regex_dict['transcript_fusion']['EML4-ALK'] = re.compile(\"\\\\w+-\\\\w+( [{0}]\\\\d+;[{0}]\\\\d+)?\".format(prot_alpha))\n",
    "regex_dict['missense_variant']['stars and slashes'] = re.compile(\"[{0}]\\\\d+[{0}*](/[{0}])?\".format(prot_alpha))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "inframe_insertion:\n",
      "\tP790INS: 1\n",
      "\tITD: 2\n",
      "\tM774INSAYVM: 1\n",
      "\tTotal: 4\n",
      "inframe_deletion:\n",
      "\tDEL 755-759: 1\n",
      "\tV560DEL: 1\n",
      "\tDEL I843: 1\n",
      "\tTotal: 3\n",
      "transcript_fusion:\n",
      "\tEML4-ALK: 35\n",
      "\tfusion nomenclature: 6\n",
      "\tALK FUSIONS: 6\n",
      "\tTotal: 47\n",
      "transcript_amplification:\n",
      "\tAMPLIFICATION: 29\n",
      "\tTotal: 29\n",
      "wild_type:\n",
      "\tWILD TYPE: 3\n",
      "\tTotal: 3\n",
      "frameshift_truncation:\n",
      "\tframeshift variance: 9\n",
      "\tFRAMESHIFT TRUNCATION: 1\n",
      "\tV2288fs*1: 2\n",
      "\tTotal: 11\n",
      "NA:\n",
      "\tHYPERMETHYLATION: 0\n",
      "\tEXPRESSION: 73\n",
      "\tUNDEREXPRESSION: 14\n",
      "\tMETHYLATION: 1\n",
      "\tPHOSPHORYLATION: 5\n",
      "\tCYTOPLASMIC EXPRESSION: 1\n",
      "\tPROMOTER METHYLATION: 5\n",
      "\tNUCLEAR EXPRESSION: 7\n",
      "\tOVEREXPRESSION: 35\n",
      "\tHYPOMETHYLATION: 0\n",
      "\tTotal: 141\n",
      "loss_of_heterozygosity:\n",
      "\tLOH: 1\n",
      "\tTotal: 1\n",
      "transcript_ablation:\n",
      "\tDELETION: 3\n",
      "\tTotal: 3\n",
      "loss_of_function_variant:\n",
      "\tLOSS-OF-FUNCTION: 10\n",
      "\tLOSS: 9\n",
      "\tTotal: 19\n",
      "missense_variant:\n",
      "\tstars and slashes: 153\n",
      "\tG12D: 144\n",
      "\tTotal: 153\n",
      "protein_altering_variant:\n",
      "\tG12: 18\n",
      "\tKINASE DOMAIN MUTATION: 4\n",
      "\tTotal: 22\n",
      "exon_loss_variant:\n",
      "\tEXON 14 SKIPPING MUTATION: 1\n",
      "\tTotal: 1\n",
      "copy_number_change:\n",
      "\tCOPY NUMBER VARIATION: 1\n",
      "\tTotal: 1\n",
      "transcript_fusion and missense_variant:\n",
      "\tEML4-ALK G332K: 15\n",
      "\tTotal: 15\n",
      "transcript_translocation or feature_translocation:\n",
      "\tREARRANGEMENT: 2\n",
      "\tTotal: 2\n",
      "5_prime_UTR_variant:\n",
      "\t5' UTR MUTATION: 1\n",
      "\tTotal: 1\n",
      "3_prime_UTR_variant:\n",
      "\t3' UTR MUTATION: 2\n",
      "\tTotal: 2\n",
      "exon_variant:\n",
      "\tEXON 10 MUTATION: 11\n",
      "\tTotal: 11\n",
      "gene_variant:\n",
      "\tMUTATION: 49\n",
      "\tTotal: 49\n",
      "Total: 518 / 580 (89.3%)\n",
      "Total (no NA): 377 / 580 (65.0%)\n"
     ]
    }
   ],
   "source": [
    "# Re-run the coverage report with the extended pattern set.\n",
    "results = count_matches(regex_dict)\n",
    "# (name, id) rows for every variant that no pattern matched.\n",
    "missed_variants = [(v['name'], v['id']) for v in variants if v['id'] not in results['all']]\n",
    "# newline='' is what the csv module asks for: without it the writer's own line\n",
    "# endings are translated a second time and rows come out double-spaced on Windows.\n",
    "with open('missed_variants.txt', 'w', newline='') as f:\n",
    "    writer = csv.writer(f)\n",
    "    writer.writerow(['variant_name', 'civic_id'])\n",
    "    writer.writerows(missed_variants)"
   ]
  }
 ],
 "metadata": {
  "gist": {
   "data": {
    "description": "CIViC_term_mapping.ipynb",
    "public": true
   },
   "id": ""
  },
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"import requests\n",
	"import re\n",
	"from Bio.Alphabet import IUPAC\n",
	"import csv"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"# Query parameters for the CIViC variants endpoint.\n",
	"# NOTE(review): 'count' presumably caps the number of records returned - confirm against the API.\n",
	"params = {'count': 1000}\n",
	"# A timeout keeps the kernel from hanging forever if the server never answers.\n",
	"r = requests.get('http://civic.genome.wustl.edu/api/variants', params=params, timeout=30)\n",
	"# Surface HTTP errors here rather than as a confusing failure when the body is parsed.\n",
	"r.raise_for_status()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"580"
	]
	},
	"execution_count": 3,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# Parse the response body and keep the value stored under its 'records' key;\n",
	"# the trailing expression displays how many records came back.\n",
	"payload = r.json()\n",
	"variants = payload['records']\n",
	"len(variants)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"# Letters of Biopython's IUPACProtein alphabet; they are substituted into the\n",
	"# [{0}] character classes of the regular expressions defined further down.\n",
	"protein_alphabet = IUPAC.IUPACProtein\n",
	"prot_alpha = protein_alphabet.letters"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Test which variants match examples"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"# term label -> {example variant name: compiled pattern for names shaped like that example}\n",
	"regex_dict = dict()\n",
	"\n",
	"regex_dict['missense_variant'] = {\n",
	"    'G12D': re.compile(\"[{0}]\\\\d+[{0}]\".format(prot_alpha)),\n",
	"}\n",
	"regex_dict['protein_altering_variant'] = {\n",
	"    'G12': re.compile(\"[{0}]\\\\d+\".format(prot_alpha)),\n",
	"    'KINASE DOMAIN MUTATION': re.compile(r'.* DOMAIN MUTATION'),\n",
	"}\n",
	"# The '*' is part of both the example name and the pattern (escaped as a\n",
	"# literal asterisk between 'fs' and the digits).\n",
	"regex_dict['frameshift_truncation'] = {\n",
	"    'V2288fs*1': re.compile(\"[{0}]\\\\d+fs\\\\*\\\\d+\".format(prot_alpha)),\n",
	"}\n",
	"regex_dict['inframe_deletion'] = {\n",
	"    'DEL I843': re.compile(\"DEL [{0}]\\\\d+\".format(prot_alpha)),\n",
	"    'V560DEL': re.compile(\"[{0}]\\\\d+DEL\".format(prot_alpha)),\n",
	"    'DEL 755-759': re.compile(r'DEL \\d+-\\d+'),\n",
	"}\n",
	"regex_dict['inframe_insertion'] = {\n",
	"    'P790INS': re.compile(\"[{0}]\\\\d+INS\".format(prot_alpha)),\n",
	"    'M774INSAYVM': re.compile(\"[{0}]\\\\d+INS[{0}]+\".format(prot_alpha)),\n",
	"    'ITD': re.compile(r'ITD'),\n",
	"}\n",
	"regex_dict['gene_variant'] = {'MUTATION': re.compile(r'MUTATION')}\n",
	"regex_dict['exon_variant'] = {'EXON 10 MUTATION': re.compile(r'EXON \\d+ MUTATION')}\n",
	"regex_dict['transcript_fusion'] = {\n",
	"    'EML4-ALK': re.compile(r'\\w+-\\w+'),\n",
	"    'ALK FUSIONS': re.compile(r'\\w+ FUSIONS'),\n",
	"}\n",
	"regex_dict['transcript_fusion and missense_variant'] = {\n",
	"    'EML4-ALK G332K': re.compile(\"\\\\w+-\\\\w+ [{0}]\\\\d+[{0}]\".format(prot_alpha)),\n",
	"}\n",
	"regex_dict['transcript_translocation or feature_translocation'] = {\n",
	"    'REARRANGEMENT': re.compile(r'REARRANGEMENT'),\n",
	"}\n",
	"regex_dict['wild_type'] = {'WILD TYPE': re.compile(r'WILD TYPE')}\n",
	"regex_dict['loss_of_heterozygosity'] = {'LOH': re.compile(r'LOH')}\n",
	"regex_dict['transcript_amplification'] = {'AMPLIFICATION': re.compile(r'AMPLIFICATION')}\n",
	"regex_dict['transcript_ablation'] = {'DELETION': re.compile(r'DELETION')}\n",
	"regex_dict['copy_number_change'] = {\n",
	"    'COPY NUMBER VARIATION': re.compile(r'COPY NUMBER VARIATION'),\n",
	"}\n",
	"regex_dict['loss_of_function_variant'] = {\n",
	"    'LOSS-OF-FUNCTION': re.compile(r'LOSS-OF-FUNCTION'),\n",
	"    'LOSS': re.compile(r'LOSS'),\n",
	"}\n",
	"regex_dict['exon_loss_variant'] = {\n",
	"    'EXON 14 SKIPPING MUTATION': re.compile(r'EXON \\d+ SKIPPING MUTATION'),\n",
	"}\n",
	"regex_dict['5_prime_UTR_variant'] = {\"5' UTR MUTATION\": re.compile(r\"5' UTR MUTATION\")}\n",
	"regex_dict['3_prime_UTR_variant'] = {\"3' UTR MUTATION\": re.compile(r\"3' UTR MUTATION\")}\n",
	"\n",
	"# Under 'NA' every example name doubles as its own literal pattern.\n",
	"regex_dict['NA'] = {\n",
	"    label: re.compile(label)\n",
	"    for label in ('EXPRESSION',\n",
	"                  'NUCLEAR EXPRESSION',\n",
	"                  'CYTOPLASMIC EXPRESSION',\n",
	"                  'OVEREXPRESSION',\n",
	"                  'UNDEREXPRESSION',\n",
	"                  'METHYLATION',\n",
	"                  'PROMOTER METHYLATION',\n",
	"                  'HYPOMETHYLATION',\n",
	"                  'HYPERMETHYLATION')\n",
	"}"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"def count_matches(rd, variant_records=None):\n",
	"    \"\"\"Report how many variant names each pattern in `rd` fully matches.\n",
	"\n",
	"    Prints one count per example, a total per term, and the overall share of\n",
	"    matched records, both including and excluding the 'NA' term.\n",
	"\n",
	"    Args:\n",
	"        rd: dict of {term: {example: compiled regex}}. Terms must not be named\n",
	"            'all'; that key is reserved for the overall set in the result.\n",
	"        variant_records: optional iterable of dicts with 'id' and 'name' keys.\n",
	"            When omitted, the notebook-level `variants` is used, so existing\n",
	"            count_matches(regex_dict) calls behave as before.\n",
	"\n",
	"    Returns:\n",
	"        dict where results['all'] is the set of ids matched by any pattern,\n",
	"        results[term][example] the ids matched by that example's pattern, and\n",
	"        results[term]['all'] the union over the term's examples.\n",
	"    \"\"\"\n",
	"    # Materialise the records: they are scanned once per pattern.\n",
	"    records = variants if variant_records is None else list(variant_records)\n",
	"\n",
	"    def percent(part, whole):\n",
	"        # Zero records would otherwise raise ZeroDivisionError.\n",
	"        return round(part / whole * 100, 1) if whole else 0.0\n",
	"\n",
	"    results = {'all': set()}\n",
	"    for term, examples in rd.items():\n",
	"        term_ids = set()\n",
	"        results[term] = dict()\n",
	"        print(\"{0}:\".format(term))\n",
	"        for example, pattern in examples.items():\n",
	"            # fullmatch: the pattern has to describe the whole name, not a substring.\n",
	"            matched = {v['id'] for v in records if pattern.fullmatch(v['name'])}\n",
	"            results[term][example] = matched\n",
	"            print(\"\\t{0}: {1}\".format(example, len(matched)))\n",
	"            term_ids |= matched\n",
	"        results[term]['all'] = term_ids\n",
	"        results['all'] |= term_ids\n",
	"        print(\"\\tTotal: {0}\".format(len(term_ids)))\n",
	"    n_matched, n_records = len(results['all']), len(records)\n",
	"    print(\"Total: {0} / {1} ({2}%)\".format(n_matched, n_records, percent(n_matched, n_records)))\n",
	"    # rd is not required to define an 'NA' term; a missing one counts as empty.\n",
	"    na_ids = results['NA']['all'] if 'NA' in rd else set()\n",
	"    non_na = results['all'] - na_ids\n",
	"    print(\"Total (no NA): {0} / {1} ({2}%)\".format(len(non_na), n_records, percent(len(non_na), n_records)))\n",
	"    return results"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 7,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"inframe_insertion:\n",
	"\tP790INS: 1\n",
	"\tITD: 1\n",
	"\tM774INSAYVM: 1\n",
	"\tTotal: 3\n",
	"inframe_deletion:\n",
	"\tDEL 755-759: 1\n",
	"\tV560DEL: 1\n",
	"\tDEL I843: 1\n",
	"\tTotal: 3\n",
	"transcript_fusion:\n",
	"\tEML4-ALK: 32\n",
	"\tALK FUSIONS: 6\n",
	"\tTotal: 38\n",
	"transcript_amplification:\n",
	"\tAMPLIFICATION: 29\n",
	"\tTotal: 29\n",
	"wild_type:\n",
	"\tWILD TYPE: 3\n",
	"\tTotal: 3\n",
	"frameshift_truncation:\n",
	"\tV2288fs*1: 2\n",
	"\tTotal: 2\n",
	"NA:\n",
	"\tHYPERMETHYLATION: 0\n",
	"\tEXPRESSION: 73\n",
	"\tUNDEREXPRESSION: 14\n",
	"\tMETHYLATION: 1\n",
	"\tCYTOPLASMIC EXPRESSION: 1\n",
	"\tPROMOTER METHYLATION: 1\n",
	"\tNUCLEAR EXPRESSION: 7\n",
	"\tOVEREXPRESSION: 35\n",
	"\tHYPOMETHYLATION: 0\n",
	"\tTotal: 132\n",
	"loss_of_heterozygosity:\n",
	"\tLOH: 1\n",
	"\tTotal: 1\n",
	"transcript_ablation:\n",
	"\tDELETION: 3\n",
	"\tTotal: 3\n",
	"loss_of_function_variant:\n",
	"\tLOSS-OF-FUNCTION: 10\n",
	"\tLOSS: 9\n",
	"\tTotal: 19\n",
	"missense_variant:\n",
	"\tG12D: 144\n",
	"\tTotal: 144\n",
	"protein_altering_variant:\n",
	"\tG12: 18\n",
	"\tKINASE DOMAIN MUTATION: 4\n",
	"\tTotal: 22\n",
	"exon_loss_variant:\n",
	"\tEXON 14 SKIPPING MUTATION: 1\n",
	"\tTotal: 1\n",
	"copy_number_change:\n",
	"\tCOPY NUMBER VARIATION: 1\n",
	"\tTotal: 1\n",
	"transcript_fusion and missense_variant:\n",
	"\tEML4-ALK G332K: 15\n",
	"\tTotal: 15\n",
	"transcript_translocation or feature_translocation:\n",
	"\tREARRANGEMENT: 2\n",
	"\tTotal: 2\n",
	"5_prime_UTR_variant:\n",
	"\t5' UTR MUTATION: 1\n",
	"\tTotal: 1\n",
	"3_prime_UTR_variant:\n",
	"\t3' UTR MUTATION: 2\n",
	"\tTotal: 2\n",
	"exon_variant:\n",
	"\tEXON 10 MUTATION: 11\n",
	"\tTotal: 11\n",
	"gene_variant:\n",
	"\tMUTATION: 49\n",
	"\tTotal: 49\n",
	"Total: 481 / 580 (82.9%)\n",
	"Total (no NA): 349 / 580 (60.2%)\n"
	]
	}
	],
	"source": [
	"# Coverage of the initial pattern set; count_matches prints the per-term breakdown.\n",
	"results = count_matches(rd=regex_dict)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 8,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"# Add further patterns and replace three existing ones ('PROMOTER METHYLATION',\n",
	"# 'ITD', 'EML4-ALK') with broader versions. The commented-out 'discussion'\n",
	"# entries are kept disabled.\n",
	"# regex_dict['discussion'] = dict()\n",
	"regex_dict['NA']['PROMOTER METHYLATION'] = re.compile(r'PROMOTER \\w*METHYLATION')\n",
	"regex_dict['inframe_insertion']['ITD'] = re.compile(r'(ITD)|(INTERNAL DUPLICATION)')\n",
	"regex_dict['NA']['PHOSPHORYLATION'] = re.compile(r'\\w*( )?PHOSPHORYLATION')\n",
	"regex_dict['frameshift_truncation']['frameshift variance'] = re.compile(\"[{0}]\\\\d+((fs)|(FS))(\\\\*\\\\d)?\".format(prot_alpha))\n",
	"regex_dict['frameshift_truncation']['FRAMESHIFT TRUNCATION'] = re.compile(\"FRAMESHIFT TRUNCATION\")\n",
	"# Compiled exactly once; wrapping a compiled pattern in another re.compile() adds nothing.\n",
	"regex_dict['transcript_fusion']['fusion nomenclature'] = re.compile(r'\\w+-\\w+ FUSION')\n",
	"# regex_dict['discussion']['polymorphisms'] = re.compile(r'(SNP)|(.*polymorphism)', flags=re.IGNORECASE)\n",
	"# regex_dict['discussion']['serum'] = re.compile(r'SERUM LEVELS')\n",
	"# regex_dict['discussion']['exon deletions'] = re.compile(r'EXON \\d+(-\\d+)? DELETION')\n",
	"# Not a raw string, so every regex backslash is doubled; a single backslash\n",
	"# before w would be an invalid string escape sequence.\n",
	"regex_dict['transcript_fusion']['EML4-ALK'] = re.compile(\"\\\\w+-\\\\w+( [{0}]\\\\d+;[{0}]\\\\d+)?\".format(prot_alpha))\n",
	"regex_dict['missense_variant']['stars and slashes'] = re.compile(\"[{0}]\\\\d+[{0}*](/[{0}])?\".format(prot_alpha))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 9,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"inframe_insertion:\n",
	"\tP790INS: 1\n",
	"\tITD: 2\n",
	"\tM774INSAYVM: 1\n",
	"\tTotal: 4\n",
	"inframe_deletion:\n",
	"\tDEL 755-759: 1\n",
	"\tV560DEL: 1\n",
	"\tDEL I843: 1\n",
	"\tTotal: 3\n",
	"transcript_fusion:\n",
	"\tEML4-ALK: 35\n",
	"\tfusion nomenclature: 6\n",
	"\tALK FUSIONS: 6\n",
	"\tTotal: 47\n",
	"transcript_amplification:\n",
	"\tAMPLIFICATION: 29\n",
	"\tTotal: 29\n",
	"wild_type:\n",
	"\tWILD TYPE: 3\n",
	"\tTotal: 3\n",
	"frameshift_truncation:\n",
	"\tframeshift variance: 9\n",
	"\tFRAMESHIFT TRUNCATION: 1\n",
	"\tV2288fs*1: 2\n",
	"\tTotal: 11\n",
	"NA:\n",
	"\tHYPERMETHYLATION: 0\n",
	"\tEXPRESSION: 73\n",
	"\tUNDEREXPRESSION: 14\n",
	"\tMETHYLATION: 1\n",
	"\tPHOSPHORYLATION: 5\n",
	"\tCYTOPLASMIC EXPRESSION: 1\n",
	"\tPROMOTER METHYLATION: 5\n",
	"\tNUCLEAR EXPRESSION: 7\n",
	"\tOVEREXPRESSION: 35\n",
	"\tHYPOMETHYLATION: 0\n",
	"\tTotal: 141\n",
	"loss_of_heterozygosity:\n",
	"\tLOH: 1\n",
	"\tTotal: 1\n",
	"transcript_ablation:\n",
	"\tDELETION: 3\n",
	"\tTotal: 3\n",
	"loss_of_function_variant:\n",
	"\tLOSS-OF-FUNCTION: 10\n",
	"\tLOSS: 9\n",
	"\tTotal: 19\n",
	"missense_variant:\n",
	"\tstars and slashes: 153\n",
	"\tG12D: 144\n",
	"\tTotal: 153\n",
	"protein_altering_variant:\n",
	"\tG12: 18\n",
	"\tKINASE DOMAIN MUTATION: 4\n",
	"\tTotal: 22\n",
	"exon_loss_variant:\n",
	"\tEXON 14 SKIPPING MUTATION: 1\n",
	"\tTotal: 1\n",
	"copy_number_change:\n",
	"\tCOPY NUMBER VARIATION: 1\n",
	"\tTotal: 1\n",
	"transcript_fusion and missense_variant:\n",
	"\tEML4-ALK G332K: 15\n",
	"\tTotal: 15\n",
	"transcript_translocation or feature_translocation:\n",
	"\tREARRANGEMENT: 2\n",
	"\tTotal: 2\n",
	"5_prime_UTR_variant:\n",
	"\t5' UTR MUTATION: 1\n",
	"\tTotal: 1\n",
	"3_prime_UTR_variant:\n",
	"\t3' UTR MUTATION: 2\n",
	"\tTotal: 2\n",
	"exon_variant:\n",
	"\tEXON 10 MUTATION: 11\n",
	"\tTotal: 11\n",
	"gene_variant:\n",
	"\tMUTATION: 49\n",
	"\tTotal: 49\n",
	"Total: 518 / 580 (89.3%)\n",
	"Total (no NA): 377 / 580 (65.0%)\n"
	]
	}
	],
	"source": [
	"# Re-run the coverage report with the extended pattern set.\n",
	"results = count_matches(regex_dict)\n",
	"# (name, id) rows for every variant that no pattern matched.\n",
	"missed_variants = [(v['name'], v['id']) for v in variants if v['id'] not in results['all']]\n",
	"# newline='' is what the csv module asks for: without it the writer's own line\n",
	"# endings are translated a second time and rows come out double-spaced on Windows.\n",
	"with open('missed_variants.txt', 'w', newline='') as f:\n",
	"    writer = csv.writer(f)\n",
	"    writer.writerow(['variant_name', 'civic_id'])\n",
	"    writer.writerows(missed_variants)"
	]
	}
	],
	"metadata": {
	"gist": {
	"data": {
	"description": "CIViC_term_mapping.ipynb",
	"public": true
	},
	"id": ""
	},
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.5.1"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 0
	}