Last active
November 6, 2018 05:30
-
-
Save astronomy88/94977cda0149f416be275e6d1a0a1755 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
#-- Needs nltk library - $python3 -m pip install nltk
import re

# Use a context manager so the file handle is closed after reading.
# (The original left text_file open for the lifetime of the kernel.)
with open("AP_ICD10.tsv", "r") as text_file:
    lines = text_file.read()
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
def clean_code(code):
    """Normalize an ICD-10 code string to a 3-character category prefix.

    Lowercases, strips every period, and keeps only the first three
    characters (e.g. 'M16.12' -> 'm16').  A plain str.replace is used
    instead of the original re.sub('\\.', ...) — no regex is needed to
    remove a literal character.
    """
    new_code = code.lower().replace('.', '')
    #-- We only want the first three characters (ICD-10 category)
    return new_code[:3]
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"m16\n", | |
"m16\n", | |
"s82\n", | |
"s82\n", | |
"m17\n", | |
"m17\n", | |
"m70\n", | |
"m70\n", | |
"s83\n", | |
"s83\n", | |
"m24\n", | |
"m24\n", | |
"m24\n", | |
"m17\n", | |
"m17\n", | |
"m17\n", | |
"m17\n", | |
"m16\n", | |
"m16\n", | |
"m70\n", | |
"m70\n", | |
"m17\n", | |
"m17\n", | |
"s46\n", | |
"s46\n", | |
"s83\n", | |
"s83\n", | |
"m25\n", | |
"m25\n", | |
"m17\n", | |
"m17\n", | |
"s83\n", | |
"s83\n", | |
"m75\n", | |
"m75\n", | |
"s46\n", | |
"s46\n", | |
"m17\n", | |
"s76\n", | |
"m17\n", | |
"m17\n", | |
"m17\n", | |
"s76\n", | |
"s76\n", | |
"s76\n", | |
"m17\n", | |
"m17\n", | |
"s82\n", | |
"s82\n", | |
"s93\n", | |
"s93\n", | |
"t84\n", | |
"t84\n", | |
"s72\n", | |
"s72\n", | |
"m00\n", | |
"m00\n", | |
"m17\n", | |
"m17\n", | |
"m70\n", | |
"m70\n", | |
"m24\n", | |
"m24\n", | |
"m19\n", | |
"m19\n", | |
"m16\n", | |
"m16\n", | |
"s63\n", | |
"s63\n", | |
"s46\n", | |
"s46\n", | |
"s52\n", | |
"s52\n", | |
"s46\n", | |
"s46\n", | |
"m17\n", | |
"m17\n", | |
"m17\n", | |
"m17\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"m23\n", | |
"m23\n", | |
"s83\n", | |
"s83\n", | |
"g56\n", | |
"g56\n", | |
"s82\n", | |
"s82\n", | |
"m17\n", | |
"m17\n", | |
"m23\n", | |
"m23\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"m23\n", | |
"m23\n", | |
"s82\n", | |
"s82\n", | |
"s52\n", | |
"s52\n", | |
"m22\n", | |
"m22\n", | |
"m24\n", | |
"m24\n", | |
"m23\n", | |
"m23\n", | |
"s83\n", | |
"s83\n", | |
"m17\n", | |
"m17\n", | |
"m67\n", | |
"m67\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"m17\n", | |
"m17\n", | |
"m17\n", | |
"m17\n", | |
"m16\n", | |
"m16\n", | |
"m22\n", | |
"m22\n", | |
"s83\n", | |
"s83\n", | |
"s82\n", | |
"s82\n", | |
"s52\n", | |
"s52\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"s68\n", | |
"s68\n", | |
"m22\n", | |
"m22\n", | |
"s92\n", | |
"s92\n", | |
"m23\n", | |
"m23\n", | |
"s83\n", | |
"s83\n", | |
"m16\n", | |
"m16\n", | |
"m75\n", | |
"m75\n", | |
"m84\n", | |
"m84\n", | |
"m25\n", | |
"m25\n", | |
"m77\n", | |
"m77\n", | |
"m17\n", | |
"m17\n", | |
"m17\n", | |
"m17\n", | |
"m75\n", | |
"m75\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"m17\n", | |
"m17\n", | |
"s83\n", | |
"s83\n", | |
"s93\n", | |
"s93\n", | |
"m17\n", | |
"m17\n", | |
"m17\n", | |
"m17\n", | |
"m25\n", | |
"m25\n", | |
"m17\n", | |
"m17\n", | |
"m25\n", | |
"m25\n", | |
"m17\n", | |
"m17\n", | |
"m70\n", | |
"m70\n", | |
"m75\n", | |
"m75\n", | |
"m22\n", | |
"m22\n", | |
"s42\n", | |
"s42\n", | |
"s43\n", | |
"s43\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"m23\n", | |
"m23\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"m75\n", | |
"m75\n", | |
"s52\n", | |
"s52\n", | |
"s52\n", | |
"s52\n", | |
"m16\n", | |
"m16\n", | |
"m23\n", | |
"m23\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"s63\n", | |
"s63\n", | |
"s52\n", | |
"s52\n", | |
"s42\n", | |
"s42\n", | |
"m76\n", | |
"m76\n", | |
"s42\n", | |
"s42\n", | |
"m76\n", | |
"m76\n", | |
"m17\n", | |
"m17\n", | |
"m23\n", | |
"m23\n", | |
"s83\n", | |
"s83\n", | |
"m17\n", | |
"m17\n", | |
"m25\n", | |
"m25\n", | |
"m21\n", | |
"m21\n", | |
"s72\n", | |
"s72\n", | |
"s52\n", | |
"s52\n", | |
"m75\n", | |
"m75\n", | |
"m17\n", | |
"m17\n", | |
"s52\n", | |
"s52\n", | |
"m23\n", | |
"m23\n", | |
"s82\n", | |
"s82\n", | |
"m16\n", | |
"m16\n", | |
"m16\n", | |
"m16\n", | |
"m17\n", | |
"m17\n", | |
"m23\n", | |
"m23\n", | |
"m17\n", | |
"m17\n", | |
"m75\n", | |
"m75\n", | |
"m17\n", | |
"m17\n", | |
"m17\n", | |
"m17\n", | |
"s82\n", | |
"s82\n", | |
"s46\n", | |
"s46\n", | |
"m17\n", | |
"m17\n", | |
"s43\n", | |
"s43\n", | |
"m87\n", | |
"m87\n", | |
"m16\n", | |
"m16\n", | |
"z89\n", | |
"z89\n", | |
"s72\n", | |
"s72\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"s42\n", | |
"s42\n", | |
"m16\n", | |
"m16\n", | |
"m25\n", | |
"m25\n", | |
"m70\n", | |
"m70\n", | |
"m16\n", | |
"m16\n", | |
"m16\n", | |
"m16\n", | |
"m16\n", | |
"m16\n", | |
"m17\n", | |
"m17\n", | |
"m18\n", | |
"m18\n", | |
"m17\n", | |
"m17\n", | |
"s82\n", | |
"s82\n", | |
"s82\n", | |
"s82\n", | |
"t84\n", | |
"t84\n", | |
"t84\n", | |
"t84\n", | |
"m16\n", | |
"m16\n", | |
"m16\n", | |
"m16\n", | |
"m17\n", | |
"m17\n", | |
"s46\n", | |
"s46\n", | |
"m13\n", | |
"m13\n", | |
"m75\n", | |
"m75\n", | |
"m65\n", | |
"m65\n", | |
"m19\n", | |
"m19\n", | |
"m17\n", | |
"m17\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"s43\n", | |
"s43\n", | |
"m23\n", | |
"m23\n", | |
"m23\n", | |
"m23\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"m22\n", | |
"m22\n", | |
"m17\n", | |
"m17\n", | |
"m75\n", | |
"m75\n", | |
"s43\n", | |
"s43\n", | |
"s82\n", | |
"s82\n", | |
"m76\n", | |
"m76\n", | |
"m76\n", | |
"m76\n", | |
"m17\n", | |
"m17\n", | |
"s46\n", | |
"s46\n", | |
"m84\n", | |
"m84\n", | |
"s82\n", | |
"s82\n", | |
"m17\n", | |
"m17\n", | |
"t84\n", | |
"t84\n", | |
"m75\n", | |
"m75\n", | |
"t84\n", | |
"t84\n", | |
"t84\n", | |
"t84\n", | |
"m17\n", | |
"m17\n", | |
"m17\n", | |
"m17\n", | |
"m23\n", | |
"m23\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"m17\n", | |
"m17\n", | |
"m17\n", | |
"m17\n", | |
"s82\n", | |
"s82\n", | |
"m17\n", | |
"m17\n", | |
"m17\n", | |
"m17\n", | |
"s43\n", | |
"s43\n", | |
"s83\n", | |
"s83\n", | |
"m25\n", | |
"m25\n", | |
"m17\n", | |
"m17\n", | |
"m17\n", | |
"m17\n", | |
"m67\n", | |
"m67\n", | |
"m23\n", | |
"m23\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"m16\n", | |
"m16\n", | |
"m16\n", | |
"m16\n", | |
"m17\n", | |
"m17\n", | |
"s82\n", | |
"s82\n", | |
"s82\n", | |
"s82\n", | |
"s93\n", | |
"s93\n", | |
"t84\n", | |
"t84\n", | |
"s82\n", | |
"s82\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"m23\n", | |
"m23\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"s92\n", | |
"s92\n", | |
"s83\n", | |
"s83\n", | |
"m00\n", | |
"m00\n", | |
"m23\n", | |
"m23\n", | |
"s83\n", | |
"s83\n", | |
"m70\n", | |
"m70\n" | |
] | |
} | |
], | |
"source": [ | |
filename = 'AP_ICD10.tsv'

#-- Regex for ICD10 (the period may not always be there)
rICD = re.compile(r'\b[a-zA-Z][0-9]{2}[\.a-zA-Z0-9]+', re.IGNORECASE)

#-- Create a list of all the different types of ICD-10 codes seen
icd_10_list = []

# Iterate the file directly instead of the manual readline()/while loop —
# same line-by-line behavior, less state to get wrong.
with open(filename) as fp:
    cnt = 1  # line counter (kept for parity with the original cell)
    for line in fp:
        #-- More than one ICD-10 code can be found per line
        for m in rICD.finditer(line):
            #-- Normalize: strip periods, lowercase, truncate to 3 chars
            code = clean_code(m.group(0))
            print(code)
            icd_10_list.append(code)
        cnt += 1
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
#-- Now we have a list of all icd-10 codes, but remove duplicates
# NOTE: set iteration order is arbitrary, so anything derived from this
# set (e.g. label numbering below) is not stable across kernel runs.
unique_icd_10 = set(icd_10_list)
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"34" | |
] | |
}, | |
"execution_count": 5, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"len(unique_icd_10)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
#-- Assign a number to each label.
# enumerate() over the set replaces the manual i counter; note that set
# order is arbitrary, so label ids differ between runs — fine for a single
# session, but pin an ordering (e.g. sorted()) if ids must be reproducible.
icd_10_dict = {code: i for i, code in enumerate(unique_icd_10)}
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
#-- For fun, let's find out how many times each label appeared
# (Ideally this import would live in the notebook's top import cell.)
from collections import Counter
cnt = Counter()
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
#-- Tally every code occurrence into the Counter created above.
# Counter.update() replaces the manual per-element increment loop and
# produces identical counts.
cnt.update(icd_10_list)
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"m16\n", | |
"s82\n", | |
"m17\n", | |
"m70\n", | |
"s83\n", | |
"m24\n", | |
"m17\n", | |
"m17\n", | |
"m16\n", | |
"m70\n", | |
"m17\n", | |
"s46\n", | |
"s83\n", | |
"m25\n", | |
"m17\n", | |
"s83\n", | |
"m75\n", | |
"s46\n", | |
"m17\n", | |
"m17\n", | |
"m17\n", | |
"s82\n", | |
"s93\n", | |
"t84\n", | |
"s72\n", | |
"m00\n", | |
"m17\n", | |
"m70\n", | |
"m24\n", | |
"m19\n", | |
"m16\n", | |
"s63\n", | |
"s46\n", | |
"s52\n", | |
"s46\n", | |
"m17\n", | |
"m17\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"m23\n", | |
"s83\n", | |
"g56\n", | |
"s82\n", | |
"m17\n", | |
"m23\n", | |
"s83\n", | |
"s83\n", | |
"m23\n", | |
"s82\n", | |
"s52\n", | |
"m22\n", | |
"m24\n", | |
"m23\n", | |
"s83\n", | |
"m17\n", | |
"m67\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"m17\n", | |
"m17\n", | |
"m16\n", | |
"m22\n", | |
"s83\n", | |
"s82\n", | |
"s52\n", | |
"s83\n", | |
"s83\n", | |
"s68\n", | |
"m22\n", | |
"s92\n", | |
"m23\n", | |
"s83\n", | |
"m16\n", | |
"m75\n", | |
"m84\n", | |
"m25\n", | |
"m77\n", | |
"m17\n", | |
"m17\n", | |
"m75\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"m17\n", | |
"s83\n", | |
"s93\n", | |
"m17\n", | |
"m17\n", | |
"m25\n", | |
"m17\n", | |
"m25\n", | |
"m17\n", | |
"m70\n", | |
"m75\n", | |
"m22\n", | |
"s42\n", | |
"s43\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"m23\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"m75\n", | |
"s52\n", | |
"s52\n", | |
"m16\n", | |
"m23\n", | |
"s83\n", | |
"s83\n", | |
"s63\n", | |
"s52\n", | |
"s42\n", | |
"m76\n", | |
"s42\n", | |
"m76\n", | |
"m17\n", | |
"m23\n", | |
"s83\n", | |
"m17\n", | |
"m25\n", | |
"m21\n", | |
"s72\n", | |
"s52\n", | |
"m75\n", | |
"m17\n", | |
"s52\n", | |
"m23\n", | |
"s82\n", | |
"m16\n", | |
"m16\n", | |
"m17\n", | |
"m23\n", | |
"m17\n", | |
"m75\n", | |
"m17\n", | |
"m17\n", | |
"s82\n", | |
"s46\n", | |
"m17\n", | |
"s43\n", | |
"m87\n", | |
"m16\n", | |
"z89\n", | |
"s72\n", | |
"s83\n", | |
"s83\n", | |
"s42\n", | |
"m16\n", | |
"m25\n", | |
"m70\n", | |
"m16\n", | |
"m16\n", | |
"m16\n", | |
"m17\n", | |
"m18\n", | |
"m17\n", | |
"s82\n", | |
"s82\n", | |
"t84\n", | |
"t84\n", | |
"m16\n", | |
"m16\n", | |
"m17\n", | |
"s46\n", | |
"m13\n", | |
"m75\n", | |
"m65\n", | |
"m19\n", | |
"m17\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"s43\n", | |
"m23\n", | |
"m23\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"m22\n", | |
"m17\n", | |
"m75\n", | |
"s43\n", | |
"s82\n", | |
"m76\n", | |
"m76\n", | |
"m17\n", | |
"s46\n", | |
"m84\n", | |
"s82\n", | |
"m17\n", | |
"t84\n", | |
"m75\n", | |
"t84\n", | |
"t84\n", | |
"m17\n", | |
"m17\n", | |
"m23\n", | |
"s83\n", | |
"s83\n", | |
"m17\n", | |
"m17\n", | |
"s82\n", | |
"m17\n", | |
"m17\n", | |
"s43\n", | |
"s83\n", | |
"m25\n", | |
"m17\n", | |
"m17\n", | |
"m67\n", | |
"m23\n", | |
"s83\n", | |
"s83\n", | |
"m16\n", | |
"m16\n", | |
"m17\n", | |
"s82\n", | |
"s82\n", | |
"s93\n", | |
"t84\n", | |
"s82\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"m23\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"s92\n", | |
"s83\n", | |
"m00\n", | |
"m23\n", | |
"s83\n", | |
"m70\n" | |
] | |
} | |
], | |
"source": [ | |
#-- Now assign a label to each line of the TSV (one label per line).
# Lines with no recognizable ICD-10 code get the sentinel label -1.
y = []
with open(filename) as fp:
    cnt = 0  # number of lines processed
    for line in fp:
        #-- More than one ICD-10 code can be found per line; we keep only
        #-- the first to keep things clean (optimize later).
        for m in rICD.finditer(line):
            #-- Normalize: strip periods, lowercase, truncate to 3 chars
            code = clean_code(m.group(0))
            print(code)
            y.append(icd_10_dict[code])
            break
        else:
            # for/else: runs only when the loop found no match at all,
            # replacing the original found_match boolean flag.
            y.append(-1)
        cnt += 1
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"249" | |
] | |
}, | |
"execution_count": 10, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
#-- Sanity check: one label per input line (should equal the line count)
len(y)
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"249" | |
] | |
}, | |
"execution_count": 11, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"cnt" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
#-- Now that we have our labels, let's design our features
# Create a vocabulary list
from nltk.stem import PorterStemmer
from string import punctuation

def strip_punctuation(s):
    """Return s with every ASCII punctuation character removed."""
    return ''.join(c for c in s if c not in punctuation)

#-- Regex for ICD10 (the period may not always be there)
rICD = re.compile(r'\b[a-zA-Z][0-9]{2}[\.a-zA-Z0-9]+', re.IGNORECASE)

ps = PorterStemmer()

def tokenize_clean_note(line):
    """Lowercase, normalize digit runs to 'number', strip punctuation and
    tab/newline characters, split on single spaces, and Porter-stem each
    token.  Returns the list of stemmed tokens.

    NOTE(review): split(' ') keeps empty-string tokens for runs of spaces
    (the vocabulary shows '' with count 59); kept as-is to preserve the
    original feature behavior.
    """
    new_line = line.lower()
    #-- Normalize numbers
    new_line = re.sub('[0-9]+', 'number', new_line)
    #-- Remove punctuation
    new_line = strip_punctuation(new_line)
    #-- Remove tab and newline characters
    new_line = re.sub('[\t\n]', '', new_line)

    #-- Tokenize
    tokens = new_line.split(' ')

    #-- Stemming
    return [ps.stem(word) for word in tokens]

#-- Find dictionary mapping as before
vocab_list = []

with open(filename) as fp:
    cnt = 1  # line counter (kept for parity with the original cell)
    for line in fp:
        #-- Strip the ICD-10 codes out so they don't leak into the vocab.
        # BUG FIX: the original used re.sub(f'{code}', ...); codes like
        # 'M16.12' contain '.', which the regex engine treats as a
        # wildcard.  str.replace removes the literal code text only.
        new_line = line
        for m in rICD.finditer(line):
            new_line = new_line.replace(m.group(0), '')

        #-- new_line is now a list of stemmed words for the feature vocab
        vocab_list.extend(tokenize_clean_note(new_line))
        cnt += 1
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"9078" | |
] | |
}, | |
"execution_count": 13, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"len(vocab_list)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"652" | |
] | |
}, | |
"execution_count": 14, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"len(set(vocab_list))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 15, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
#-- Count how often each stemmed token occurs.
# Counter(iterable) tallies in one call — identical result to the
# original empty-Counter-plus-increment loop.
vocab_cnt = Counter(vocab_list)
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 16, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[('of', 773),\n", | |
" ('knee', 614),\n", | |
" ('left', 285),\n", | |
" ('right', 220),\n", | |
" ('pain', 201),\n", | |
" ('tear', 189),\n", | |
" ('osteoarthr', 180),\n", | |
" ('meniscu', 157),\n", | |
" ('the', 146),\n", | |
" ('and', 129),\n", | |
" ('hip', 126),\n", | |
" ('medial', 124),\n", | |
" ('anterior', 124),\n", | |
" ('ligament', 122),\n", | |
" ('well', 110),\n", | |
" ('cruciat', 103),\n", | |
" ('post', 102),\n", | |
" ('no', 97),\n", | |
" ('fractur', 94),\n", | |
" ('shoulder', 93),\n", | |
" ('discuss', 89),\n", | |
" ('result', 89),\n", | |
" ('postop', 84),\n", | |
" ('interpret', 84),\n", | |
" ('xr', 84),\n", | |
" ('as', 83),\n", | |
" ('progress', 81),\n", | |
" ('expect', 81),\n", | |
" ('later', 75),\n", | |
" ('for', 73),\n", | |
" ('init', 70),\n", | |
" ('primari', 66),\n", | |
" ('normal', 60),\n", | |
" ('', 59),\n", | |
" ('with', 59),\n", | |
" ('injuri', 57),\n", | |
" ('unilater', 55),\n", | |
" ('therapi', 49),\n", | |
" ('rotat', 49),\n", | |
" ('cuff', 49),\n", | |
" ('fx', 49),\n", | |
" ('op', 49),\n", | |
" ('physic', 48),\n", | |
" ('send', 47),\n", | |
" ('referr', 47),\n", | |
" ('ankl', 47),\n", | |
" ('in', 46),\n", | |
" ('current', 41),\n", | |
" ('sprain', 41),\n", | |
" ('order', 39),\n", | |
" ('opdo', 36),\n", | |
" ('report', 35),\n", | |
" ('ruptur', 34),\n", | |
" ('joint', 34),\n", | |
" ('improv', 33),\n", | |
" ('heal', 32),\n", | |
" ('unsp', 32),\n", | |
" ('followup', 31),\n", | |
" ('instruct', 31),\n", | |
" ('symptom', 30),\n", | |
" ('care', 30),\n", | |
" ('provid', 30),\n", | |
" ('sub', 30),\n", | |
" ('close', 29),\n", | |
" ('prph', 28),\n", | |
" ('devic', 28),\n", | |
" ('wrist', 28),\n", | |
" ('derang', 27),\n", | |
" ('tendon', 26),\n", | |
" ('bursiti', 25),\n", | |
" ('do', 25),\n", | |
" ('he', 25),\n", | |
" ('lnumber', 25),\n", | |
" ('initi', 24),\n", | |
" ('bilater', 24),\n", | |
" ('r', 24),\n", | |
" ('patient', 23),\n", | |
" ('are', 23),\n", | |
" ('to', 23),\n", | |
" ('but', 21),\n", | |
" ('return', 21),\n", | |
" ('complet', 21),\n", | |
" ('good', 20),\n", | |
" ('align', 20),\n", | |
" ('prosthet', 20),\n", | |
" ('patellar', 20),\n", | |
" ('acut', 20),\n", | |
" ('is', 19),\n", | |
" ('now', 19),\n", | |
" ('not', 19),\n", | |
" ('complic', 19),\n", | |
" ('signific', 18),\n", | |
" ('number', 18),\n", | |
" ('malleolu', 18),\n", | |
" ('oth', 18),\n", | |
" ('pre', 18),\n", | |
" ('disloc', 17),\n", | |
" ('arthroscopi', 17),\n", | |
" ('intern', 17),\n", | |
" ('painosteoarthr', 16),\n", | |
" ('painright', 16),\n", | |
" ('collater', 16),\n", | |
" ('clo', 16),\n", | |
" ('mensc', 16),\n", | |
" ('patella', 15),\n", | |
" ('experienc', 15),\n", | |
" ('thick', 15),\n", | |
" ('on', 15),\n", | |
" ('possibl', 15),\n", | |
" ('lat', 15),\n", | |
" ('tender', 15),\n", | |
" ('acl', 15),\n", | |
" ('mmt', 15),\n", | |
" ('mri', 15),\n", | |
" ('trochanter', 14),\n", | |
" ('radiu', 14),\n", | |
" ('defici', 14),\n", | |
" ('l', 13),\n", | |
" ('motion', 13),\n", | |
" ('arthriti', 13),\n", | |
" ('gait', 13),\n", | |
" ('rightright', 13),\n", | |
" ('foot', 13),\n", | |
" ('unspecifi', 12),\n", | |
" ('femur', 12),\n", | |
" ('end', 12),\n", | |
" ('tibia', 12),\n", | |
" ('inject', 11),\n", | |
" ('strain', 11),\n", | |
" ('disrupt', 11),\n", | |
" ('syndesmosi', 11),\n", | |
" ('vs', 11),\n", | |
" ('degener', 11),\n", | |
" ('line', 11),\n", | |
" ('lower', 11),\n", | |
" ('chondromalacia', 11),\n", | |
" ('lmt', 11),\n", | |
" ('leftleft', 11),\n", | |
" ('humeru', 11),\n", | |
" ('full', 10),\n", | |
" ('rotatrcuff', 10),\n", | |
" ('tearruptr', 10),\n", | |
" ('trauma', 10),\n", | |
" ('encount', 10),\n", | |
" ('disp', 10),\n", | |
" ('leg', 10),\n", | |
" ('warmth', 10),\n", | |
" ('partial', 10),\n", | |
" ('palpat', 10),\n", | |
" ('numbernumb', 10),\n", | |
" ('snumber', 10),\n", | |
" ('oa', 10),\n", | |
" ('opleft', 9),\n", | |
" ('arthroplasti', 9),\n", | |
" ('loosen', 9),\n", | |
" ('syndrom', 9),\n", | |
" ('other', 9),\n", | |
" ('have', 9),\n", | |
" ('fibula', 9),\n", | |
" ('bone', 9),\n", | |
" ('finger', 9),\n", | |
" ('gave', 9),\n", | |
" ('rightdo', 9),\n", | |
" ('remov', 9),\n", | |
" ('elbow', 9),\n", | |
" ('paindo', 9),\n", | |
" ('opth', 8),\n", | |
" ('painleft', 8),\n", | |
" ('total', 8),\n", | |
" ('thumb', 8),\n", | |
" ('mild', 8),\n", | |
" ('tibial', 8),\n", | |
" ('plateau', 8),\n", | |
" ('distal', 8),\n", | |
" ('orthoped', 8),\n", | |
" ('activ', 7),\n", | |
" ('compon', 7),\n", | |
" ('posit', 7),\n", | |
" ('musctend', 7),\n", | |
" ('instabl', 7),\n", | |
" ('nsaid', 7),\n", | |
" ('rang', 7),\n", | |
" ('appear', 7),\n", | |
" ('treatment', 7),\n", | |
" ('recurr', 6),\n", | |
" ('up', 6),\n", | |
" ('sign', 6),\n", | |
" ('capsul', 6),\n", | |
" ('a', 6),\n", | |
" ('due', 6),\n", | |
" ('brace', 6),\n", | |
" ('help', 6),\n", | |
" ('week', 6),\n", | |
" ('at', 6),\n", | |
" ('effus', 6),\n", | |
" ('bodi', 6),\n", | |
" ('her', 6),\n", | |
" ('leftosteoarthr', 6),\n", | |
" ('stress', 6),\n", | |
" ('epicondyl', 6)]" | |
] | |
}, | |
"execution_count": 16, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"vocab_cnt.most_common(200)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 17, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
#-- Use around 200 features: keep vocab words with count >= 10 and
#-- word length > 3.  (The original comment said ">= 6" while the code
#-- used ">= 10"; the comment now matches the code.)
vocab_dict = {
    word: idx
    for idx, word in enumerate(
        w for w in vocab_cnt if vocab_cnt[w] >= 10 and len(w) > 3
    )
}
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 18, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"{'followup': 0,\n", | |
" 'osteoarthr': 1,\n", | |
" 'pain': 2,\n", | |
" 'post': 3,\n", | |
" 'opdo': 4,\n", | |
" 'well': 5,\n", | |
" 'postop': 6,\n", | |
" 'progress': 7,\n", | |
" 'expect': 8,\n", | |
" 'unilater': 9,\n", | |
" 'primari': 10,\n", | |
" 'left': 11,\n", | |
" 'knee': 12,\n", | |
" 'fractur': 13,\n", | |
" 'heal': 14,\n", | |
" 'good': 15,\n", | |
" 'align': 16,\n", | |
" 'close': 17,\n", | |
" 'patella': 18,\n", | |
" 'interpret': 19,\n", | |
" 'discuss': 20,\n", | |
" 'result': 21,\n", | |
" 'send': 22,\n", | |
" 'physic': 23,\n", | |
" 'therapi': 24,\n", | |
" 'referr': 25,\n", | |
" 'init': 26,\n", | |
" 'painosteoarthr': 27,\n", | |
" 'right': 28,\n", | |
" 'patient': 29,\n", | |
" 'experienc': 30,\n", | |
" 'signific': 31,\n", | |
" 'improv': 32,\n", | |
" 'initi': 33,\n", | |
" 'symptom': 34,\n", | |
" 'return': 35,\n", | |
" 'bursiti': 36,\n", | |
" 'trochanter': 37,\n", | |
" 'medial': 38,\n", | |
" 'meniscu': 39,\n", | |
" 'tear': 40,\n", | |
" 'prph': 41,\n", | |
" 'current': 42,\n", | |
" 'injuri': 43,\n", | |
" 'shoulder': 44,\n", | |
" 'with': 45,\n", | |
" 'tendon': 46,\n", | |
" 'number': 47,\n", | |
" 'disloc': 48,\n", | |
" 'unspecifi': 49,\n", | |
" 'instruct': 50,\n", | |
" 'inject': 51,\n", | |
" 'rotat': 52,\n", | |
" 'cuff': 53,\n", | |
" 'care': 54,\n", | |
" 'order': 55,\n", | |
" 'bilater': 56,\n", | |
" 'provid': 57,\n", | |
" 'prosthet': 58,\n", | |
" 'painright': 59,\n", | |
" 'strain': 60,\n", | |
" 'ruptur': 61,\n", | |
" 'anterior': 62,\n", | |
" 'cruciat': 63,\n", | |
" 'ligament': 64,\n", | |
" 'sprain': 65,\n", | |
" 'patellar': 66,\n", | |
" 'arthroscopi': 67,\n", | |
" 'later': 68,\n", | |
" 'full': 69,\n", | |
" 'thick': 70,\n", | |
" 'complet': 71,\n", | |
" 'rotatrcuff': 72,\n", | |
" 'tearruptr': 73,\n", | |
" 'unsp': 74,\n", | |
" 'trauma': 75,\n", | |
" 'encount': 76,\n", | |
" 'joint': 77,\n", | |
" 'motion': 78,\n", | |
" 'ankl': 79,\n", | |
" 'malleolu': 80,\n", | |
" 'acut': 81,\n", | |
" 'disrupt': 82,\n", | |
" 'syndesmosi': 83,\n", | |
" 'complic': 84,\n", | |
" 'intern': 85,\n", | |
" 'devic': 86,\n", | |
" 'disp': 87,\n", | |
" 'femur': 88,\n", | |
" 'arthriti': 89,\n", | |
" 'collater': 90,\n", | |
" 'possibl': 91,\n", | |
" 'wrist': 92,\n", | |
" 'radiu': 93,\n", | |
" 'mensc': 94,\n", | |
" 'degener': 95,\n", | |
" 'derang': 96,\n", | |
" 'tibia': 97,\n", | |
" 'report': 98,\n", | |
" 'warmth': 99,\n", | |
" 'partial': 100,\n", | |
" 'gait': 101,\n", | |
" 'normal': 102,\n", | |
" 'palpat': 103,\n", | |
" 'tender': 104,\n", | |
" 'line': 105,\n", | |
" 'numbernumb': 106,\n", | |
" 'lower': 107,\n", | |
" 'lnumber': 108,\n", | |
" 'snumber': 109,\n", | |
" 'chondromalacia': 110,\n", | |
" 'rightright': 111,\n", | |
" 'defici': 112,\n", | |
" 'leftleft': 113,\n", | |
" 'foot': 114,\n", | |
" 'humeru': 115}" | |
] | |
}, | |
"execution_count": 18, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"vocab_dict" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 19, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
#-- Sample note tokens for trying out the feature builder below
test = ['hip', 'painleft', 'hip', 'trocanter', 'bursiti', 'trochanter', 'bursiti', 'leftmnumbermnumb', 'trochanter', 'bursiti', 'left', 'hip']

#-- Time to build a feature vector.
def create_feature_vector(new_line, vocab=None):
    """Map each in-vocabulary token of new_line to its feature index.

    BUG FIX: the original iterated over the module-level `test` list
    instead of its `new_line` parameter, so every call returned the same
    vector regardless of input — which silently degraded the dataset X
    built later.  `vocab` defaults to the module-level vocab_dict for
    backward compatibility.
    """
    if vocab is None:
        vocab = vocab_dict
    feature_vector = []
    for word in new_line:
        #-- Only add if the vocab includes the word
        if word in vocab:
            feature_vector.append(vocab[word])
    return feature_vector
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 20, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"vector_test = create_feature_vector(test)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 21, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import numpy as np" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 22, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
#-- Let's convert this to 1s and 0s
def transform_feature_vector(feature_vector, size=None):
    """Turn a list of feature indices into a fixed-length binary vector.

    `size` defaults to len(vocab_dict) (the original behavior); passing it
    explicitly makes the function usable without the module-level dict.
    Duplicate indices simply set the same position to 1 again.
    """
    if size is None:
        size = len(vocab_dict)
    vector = np.zeros(size)
    for idx in feature_vector:
        vector[idx] = 1
    return vector
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 23, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,\n", | |
" 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", | |
" 0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", | |
" 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", | |
" 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", | |
" 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", | |
" 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])" | |
] | |
}, | |
"execution_count": 23, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"transform_feature_vector(vector_test)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 24, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
#-- Now let's create our dataset of X: one binary feature vector per line.
X = []
with open(filename) as fp:
    cnt = 1  # line counter (kept for parity with the original cell)
    for line in fp:
        #-- Strip the ICD-10 codes so labels don't leak into features.
        # BUG FIX: replace the code text literally instead of via
        # re.sub(f'{code}', ...), where '.' inside a code acts as a
        # regex wildcard.
        new_line = line
        for m in rICD.finditer(line):
            new_line = new_line.replace(m.group(0), '')

        tokens = tokenize_clean_note(new_line)
        feature_vector = create_feature_vector(tokens)
        X.append(transform_feature_vector(feature_vector))
        cnt += 1
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 25, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from sklearn.model_selection import train_test_split\n", | |
"from sklearn.utils import shuffle" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 26, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"X_s, y_s = shuffle(X, y)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 27, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"X_train, X_test, y_train, y_test = train_test_split(X_s, y_s, test_size=0.3)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 28, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,\n", | |
" decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',\n", | |
" max_iter=-1, probability=False, random_state=None, shrinking=True,\n", | |
" tol=0.001, verbose=False)" | |
] | |
}, | |
"execution_count": 28, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
#-- Baseline 1: linear-kernel SVM on the bag-of-words features
from sklearn import svm
clf_svm = svm.SVC(kernel='linear')
clf_svm.fit(X_train, y_train)
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 29, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"0.24" | |
] | |
}, | |
"execution_count": 29, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"clf_svm.score(X_test, y_test)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 30, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from sklearn.ensemble import RandomForestClassifier" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 31, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"clf=RandomForestClassifier(n_estimators=100)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 32, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',\n", | |
" max_depth=None, max_features='auto', max_leaf_nodes=None,\n", | |
" min_impurity_decrease=0.0, min_impurity_split=None,\n", | |
" min_samples_leaf=1, min_samples_split=2,\n", | |
" min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,\n", | |
" oob_score=False, random_state=None, verbose=0,\n", | |
" warm_start=False)" | |
] | |
}, | |
"execution_count": 32, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"clf.fit(X_train,y_train)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 33, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"0.24" | |
] | |
}, | |
"execution_count": 33, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"clf.score(X_test, y_test)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 34, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"new_X = X.copy" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.4" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment