Skip to content

Instantly share code, notes, and snippets.

@astronomy88
Last active November 6, 2018 05:30
Show Gist options
  • Save astronomy88/94977cda0149f416be275e6d1a0a1755 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"#-- Needs nltk library - $python3 -m pip install nltk\n",
"import re\n",
"\n",
"text_file = open(\"AP_ICD10.tsv\", \"r\")\n",
"lines = text_file.read()"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"def clean_code(code):\n",
" new_code = code\n",
" \n",
" #-- Normalize the list so that we remove periods and lowercase it (in case there are typos)\n",
" new_code = new_code.lower()\n",
" new_code = re.sub('\\.', '', new_code)\n",
" \n",
" #-- We only want the first three characters\n",
" new_code = new_code[:3]\n",
" return new_code\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"m16\n",
"m16\n",
"s82\n",
"s82\n",
"m17\n",
"m17\n",
"m70\n",
"m70\n",
"s83\n",
"s83\n",
"m24\n",
"m24\n",
"m24\n",
"m17\n",
"m17\n",
"m17\n",
"m17\n",
"m16\n",
"m16\n",
"m70\n",
"m70\n",
"m17\n",
"m17\n",
"s46\n",
"s46\n",
"s83\n",
"s83\n",
"m25\n",
"m25\n",
"m17\n",
"m17\n",
"s83\n",
"s83\n",
"m75\n",
"m75\n",
"s46\n",
"s46\n",
"m17\n",
"s76\n",
"m17\n",
"m17\n",
"m17\n",
"s76\n",
"s76\n",
"s76\n",
"m17\n",
"m17\n",
"s82\n",
"s82\n",
"s93\n",
"s93\n",
"t84\n",
"t84\n",
"s72\n",
"s72\n",
"m00\n",
"m00\n",
"m17\n",
"m17\n",
"m70\n",
"m70\n",
"m24\n",
"m24\n",
"m19\n",
"m19\n",
"m16\n",
"m16\n",
"s63\n",
"s63\n",
"s46\n",
"s46\n",
"s52\n",
"s52\n",
"s46\n",
"s46\n",
"m17\n",
"m17\n",
"m17\n",
"m17\n",
"s83\n",
"s83\n",
"s83\n",
"s83\n",
"s83\n",
"s83\n",
"m23\n",
"m23\n",
"s83\n",
"s83\n",
"g56\n",
"g56\n",
"s82\n",
"s82\n",
"m17\n",
"m17\n",
"m23\n",
"m23\n",
"s83\n",
"s83\n",
"s83\n",
"s83\n",
"m23\n",
"m23\n",
"s82\n",
"s82\n",
"s52\n",
"s52\n",
"m22\n",
"m22\n",
"m24\n",
"m24\n",
"m23\n",
"m23\n",
"s83\n",
"s83\n",
"m17\n",
"m17\n",
"m67\n",
"m67\n",
"s83\n",
"s83\n",
"s83\n",
"s83\n",
"s83\n",
"s83\n",
"m17\n",
"m17\n",
"m17\n",
"m17\n",
"m16\n",
"m16\n",
"m22\n",
"m22\n",
"s83\n",
"s83\n",
"s82\n",
"s82\n",
"s52\n",
"s52\n",
"s83\n",
"s83\n",
"s83\n",
"s83\n",
"s68\n",
"s68\n",
"m22\n",
"m22\n",
"s92\n",
"s92\n",
"m23\n",
"m23\n",
"s83\n",
"s83\n",
"m16\n",
"m16\n",
"m75\n",
"m75\n",
"m84\n",
"m84\n",
"m25\n",
"m25\n",
"m77\n",
"m77\n",
"m17\n",
"m17\n",
"m17\n",
"m17\n",
"m75\n",
"m75\n",
"s83\n",
"s83\n",
"s83\n",
"s83\n",
"s83\n",
"s83\n",
"m17\n",
"m17\n",
"s83\n",
"s83\n",
"s93\n",
"s93\n",
"m17\n",
"m17\n",
"m17\n",
"m17\n",
"m25\n",
"m25\n",
"m17\n",
"m17\n",
"m25\n",
"m25\n",
"m17\n",
"m17\n",
"m70\n",
"m70\n",
"m75\n",
"m75\n",
"m22\n",
"m22\n",
"s42\n",
"s42\n",
"s43\n",
"s43\n",
"s83\n",
"s83\n",
"s83\n",
"s83\n",
"s83\n",
"s83\n",
"m23\n",
"m23\n",
"s83\n",
"s83\n",
"s83\n",
"s83\n",
"s83\n",
"s83\n",
"m75\n",
"m75\n",
"s52\n",
"s52\n",
"s52\n",
"s52\n",
"m16\n",
"m16\n",
"m23\n",
"m23\n",
"s83\n",
"s83\n",
"s83\n",
"s83\n",
"s63\n",
"s63\n",
"s52\n",
"s52\n",
"s42\n",
"s42\n",
"m76\n",
"m76\n",
"s42\n",
"s42\n",
"m76\n",
"m76\n",
"m17\n",
"m17\n",
"m23\n",
"m23\n",
"s83\n",
"s83\n",
"m17\n",
"m17\n",
"m25\n",
"m25\n",
"m21\n",
"m21\n",
"s72\n",
"s72\n",
"s52\n",
"s52\n",
"m75\n",
"m75\n",
"m17\n",
"m17\n",
"s52\n",
"s52\n",
"m23\n",
"m23\n",
"s82\n",
"s82\n",
"m16\n",
"m16\n",
"m16\n",
"m16\n",
"m17\n",
"m17\n",
"m23\n",
"m23\n",
"m17\n",
"m17\n",
"m75\n",
"m75\n",
"m17\n",
"m17\n",
"m17\n",
"m17\n",
"s82\n",
"s82\n",
"s46\n",
"s46\n",
"m17\n",
"m17\n",
"s43\n",
"s43\n",
"m87\n",
"m87\n",
"m16\n",
"m16\n",
"z89\n",
"z89\n",
"s72\n",
"s72\n",
"s83\n",
"s83\n",
"s83\n",
"s83\n",
"s42\n",
"s42\n",
"m16\n",
"m16\n",
"m25\n",
"m25\n",
"m70\n",
"m70\n",
"m16\n",
"m16\n",
"m16\n",
"m16\n",
"m16\n",
"m16\n",
"m17\n",
"m17\n",
"m18\n",
"m18\n",
"m17\n",
"m17\n",
"s82\n",
"s82\n",
"s82\n",
"s82\n",
"t84\n",
"t84\n",
"t84\n",
"t84\n",
"m16\n",
"m16\n",
"m16\n",
"m16\n",
"m17\n",
"m17\n",
"s46\n",
"s46\n",
"m13\n",
"m13\n",
"m75\n",
"m75\n",
"m65\n",
"m65\n",
"m19\n",
"m19\n",
"m17\n",
"m17\n",
"s83\n",
"s83\n",
"s83\n",
"s83\n",
"s83\n",
"s83\n",
"s43\n",
"s43\n",
"m23\n",
"m23\n",
"m23\n",
"m23\n",
"s83\n",
"s83\n",
"s83\n",
"s83\n",
"s83\n",
"s83\n",
"s83\n",
"s83\n",
"m22\n",
"m22\n",
"m17\n",
"m17\n",
"m75\n",
"m75\n",
"s43\n",
"s43\n",
"s82\n",
"s82\n",
"m76\n",
"m76\n",
"m76\n",
"m76\n",
"m17\n",
"m17\n",
"s46\n",
"s46\n",
"m84\n",
"m84\n",
"s82\n",
"s82\n",
"m17\n",
"m17\n",
"t84\n",
"t84\n",
"m75\n",
"m75\n",
"t84\n",
"t84\n",
"t84\n",
"t84\n",
"m17\n",
"m17\n",
"m17\n",
"m17\n",
"m23\n",
"m23\n",
"s83\n",
"s83\n",
"s83\n",
"s83\n",
"m17\n",
"m17\n",
"m17\n",
"m17\n",
"s82\n",
"s82\n",
"m17\n",
"m17\n",
"m17\n",
"m17\n",
"s43\n",
"s43\n",
"s83\n",
"s83\n",
"m25\n",
"m25\n",
"m17\n",
"m17\n",
"m17\n",
"m17\n",
"m67\n",
"m67\n",
"m23\n",
"m23\n",
"s83\n",
"s83\n",
"s83\n",
"s83\n",
"m16\n",
"m16\n",
"m16\n",
"m16\n",
"m17\n",
"m17\n",
"s82\n",
"s82\n",
"s82\n",
"s82\n",
"s93\n",
"s93\n",
"t84\n",
"t84\n",
"s82\n",
"s82\n",
"s83\n",
"s83\n",
"s83\n",
"s83\n",
"s83\n",
"s83\n",
"m23\n",
"m23\n",
"s83\n",
"s83\n",
"s83\n",
"s83\n",
"s83\n",
"s83\n",
"s83\n",
"s83\n",
"s92\n",
"s92\n",
"s83\n",
"s83\n",
"m00\n",
"m00\n",
"m23\n",
"m23\n",
"s83\n",
"s83\n",
"m70\n",
"m70\n"
]
}
],
"source": [
"filename = 'AP_ICD10.tsv'\n",
"\n",
"#-- Regex for ICD10 (the period may not always be there)\n",
"rICD = re.compile(r'\\b[a-zA-Z][0-9]{2}[\\.a-zA-Z0-9]+', re.IGNORECASE)\n",
"\n",
"#-- Collect every ICD-10 code seen in the file (duplicates included)\n",
"icd_10_list = []\n",
"\n",
"with open(filename) as fp:\n",
"    cnt = 1\n",
"    #-- Iterate the file object directly instead of a readline()/while loop\n",
"    for line in fp:\n",
"        #-- More than one ICD-10 code can be found per line\n",
"        for m in rICD.finditer(line):\n",
"            #-- Normalize: remove periods, lowercase, keep the category\n",
"            code = clean_code(m.group(0))\n",
"            print(code)\n",
"            icd_10_list.append(code)\n",
"        cnt += 1"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"#-- Now we have a list of all icd-10 codes, but remove duplicates\n",
"unique_icd_10 = set(icd_10_list)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"34"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(unique_icd_10)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"#-- Assign a number to each label\n",
"icd_10_dict = {}\n",
"i = 0\n",
"for code in unique_icd_10:\n",
" icd_10_dict[code] = i\n",
" i += 1"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"#-- For fun, let's find out how many times each label appeared\n",
"from collections import Counter\n",
"cnt = Counter()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"for code in icd_10_list:\n",
" cnt[code] += 1"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"m16\n",
"s82\n",
"m17\n",
"m70\n",
"s83\n",
"m24\n",
"m17\n",
"m17\n",
"m16\n",
"m70\n",
"m17\n",
"s46\n",
"s83\n",
"m25\n",
"m17\n",
"s83\n",
"m75\n",
"s46\n",
"m17\n",
"m17\n",
"m17\n",
"s82\n",
"s93\n",
"t84\n",
"s72\n",
"m00\n",
"m17\n",
"m70\n",
"m24\n",
"m19\n",
"m16\n",
"s63\n",
"s46\n",
"s52\n",
"s46\n",
"m17\n",
"m17\n",
"s83\n",
"s83\n",
"s83\n",
"m23\n",
"s83\n",
"g56\n",
"s82\n",
"m17\n",
"m23\n",
"s83\n",
"s83\n",
"m23\n",
"s82\n",
"s52\n",
"m22\n",
"m24\n",
"m23\n",
"s83\n",
"m17\n",
"m67\n",
"s83\n",
"s83\n",
"s83\n",
"m17\n",
"m17\n",
"m16\n",
"m22\n",
"s83\n",
"s82\n",
"s52\n",
"s83\n",
"s83\n",
"s68\n",
"m22\n",
"s92\n",
"m23\n",
"s83\n",
"m16\n",
"m75\n",
"m84\n",
"m25\n",
"m77\n",
"m17\n",
"m17\n",
"m75\n",
"s83\n",
"s83\n",
"s83\n",
"m17\n",
"s83\n",
"s93\n",
"m17\n",
"m17\n",
"m25\n",
"m17\n",
"m25\n",
"m17\n",
"m70\n",
"m75\n",
"m22\n",
"s42\n",
"s43\n",
"s83\n",
"s83\n",
"s83\n",
"m23\n",
"s83\n",
"s83\n",
"s83\n",
"m75\n",
"s52\n",
"s52\n",
"m16\n",
"m23\n",
"s83\n",
"s83\n",
"s63\n",
"s52\n",
"s42\n",
"m76\n",
"s42\n",
"m76\n",
"m17\n",
"m23\n",
"s83\n",
"m17\n",
"m25\n",
"m21\n",
"s72\n",
"s52\n",
"m75\n",
"m17\n",
"s52\n",
"m23\n",
"s82\n",
"m16\n",
"m16\n",
"m17\n",
"m23\n",
"m17\n",
"m75\n",
"m17\n",
"m17\n",
"s82\n",
"s46\n",
"m17\n",
"s43\n",
"m87\n",
"m16\n",
"z89\n",
"s72\n",
"s83\n",
"s83\n",
"s42\n",
"m16\n",
"m25\n",
"m70\n",
"m16\n",
"m16\n",
"m16\n",
"m17\n",
"m18\n",
"m17\n",
"s82\n",
"s82\n",
"t84\n",
"t84\n",
"m16\n",
"m16\n",
"m17\n",
"s46\n",
"m13\n",
"m75\n",
"m65\n",
"m19\n",
"m17\n",
"s83\n",
"s83\n",
"s83\n",
"s43\n",
"m23\n",
"m23\n",
"s83\n",
"s83\n",
"s83\n",
"s83\n",
"m22\n",
"m17\n",
"m75\n",
"s43\n",
"s82\n",
"m76\n",
"m76\n",
"m17\n",
"s46\n",
"m84\n",
"s82\n",
"m17\n",
"t84\n",
"m75\n",
"t84\n",
"t84\n",
"m17\n",
"m17\n",
"m23\n",
"s83\n",
"s83\n",
"m17\n",
"m17\n",
"s82\n",
"m17\n",
"m17\n",
"s43\n",
"s83\n",
"m25\n",
"m17\n",
"m17\n",
"m67\n",
"m23\n",
"s83\n",
"s83\n",
"m16\n",
"m16\n",
"m17\n",
"s82\n",
"s82\n",
"s93\n",
"t84\n",
"s82\n",
"s83\n",
"s83\n",
"s83\n",
"m23\n",
"s83\n",
"s83\n",
"s83\n",
"s83\n",
"s92\n",
"s83\n",
"m00\n",
"m23\n",
"s83\n",
"m70\n"
]
}
],
"source": [
"#-- Now assign a label to each line\n",
"y = []\n",
"with open(filename) as fp: \n",
" line = fp.readline()\n",
" cnt = 0\n",
" while line:\n",
" matches = rICD.finditer(line)\n",
" \n",
" #-- More than one ICD-10 code can be found per line\n",
" found_match = False\n",
" for m in matches:\n",
" code = m.group(0)\n",
" #-- Normalize the list so that we remove periods and lowercase it (in case there are typos)\n",
" code = clean_code(code)\n",
" print(code)\n",
" \n",
" y.append(icd_10_dict[code]) \n",
" found_match = True\n",
" \n",
" #-- Only going to deal with 1 label per line for now to keep things clean. Optimize later.\n",
" break\n",
" if not found_match:\n",
" y.append(-1)\n",
" \n",
" \n",
" line = fp.readline()\n",
" cnt += 1"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"249"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#-- Sanity check\n",
"len(y)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"249"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cnt"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"#-- Now that we have our labels, let's design our features\n",
"# Create a vocabulary list\n",
"from nltk.stem import PorterStemmer\n",
"from string import punctuation\n",
"\n",
"def strip_punctuation(s):\n",
" return ''.join(c for c in s if c not in punctuation)\n",
"\n",
"#-- Regex for ICD10 (the period may not always be there)\n",
"rICD = re.compile(r'\\b[a-zA-Z][0-9]{2}[\\.a-zA-Z0-9]+', re.IGNORECASE)\n",
"\n",
"ps = PorterStemmer()\n",
"\n",
"def tokenize_clean_note(line):\n",
" new_line = line\n",
" new_line = new_line.lower()\n",
" #-- Normalize numbers\n",
" new_line = re.sub('[0-9]+', 'number', new_line)\n",
" #-- Remove punctuation\n",
" new_line = strip_punctuation(new_line)\n",
" #-- Remove tab key\n",
" new_line = re.sub('[\\t\\n]', '', new_line)\n",
"\n",
" #-- Tokenize\n",
" new_line = new_line.split(' ')\n",
"\n",
" #-- Stemming\n",
" new_word = []\n",
" for word in new_line:\n",
" new_word.append(ps.stem(word))\n",
" \n",
" return new_word\n",
"\n",
"#-- Find dictionary mapping as before\n",
"vocab_list = []\n",
"\n",
"with open(filename) as fp: \n",
" line = fp.readline()\n",
" cnt = 1\n",
" while line:\n",
" matches = rICD.finditer(line)\n",
" \n",
" #-- Create a new line to use for vocab list creation\n",
" new_line = line\n",
" \n",
" for m in matches:\n",
" code = m.group(0)\n",
" new_line = re.sub(f'{code}', '', new_line)\n",
" \n",
" new_line = tokenize_clean_note(new_line)\n",
" #-- Now, new_line is a list of stemmed words that we can use as feature vectors\n",
" vocab_list.extend(new_line)\n",
" \n",
" line = fp.readline()\n",
" cnt += 1"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"9078"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(vocab_list)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"652"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(set(vocab_list))"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"vocab_cnt = Counter()\n",
"for vocab in vocab_list:\n",
" vocab_cnt[vocab] += 1"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('of', 773),\n",
" ('knee', 614),\n",
" ('left', 285),\n",
" ('right', 220),\n",
" ('pain', 201),\n",
" ('tear', 189),\n",
" ('osteoarthr', 180),\n",
" ('meniscu', 157),\n",
" ('the', 146),\n",
" ('and', 129),\n",
" ('hip', 126),\n",
" ('medial', 124),\n",
" ('anterior', 124),\n",
" ('ligament', 122),\n",
" ('well', 110),\n",
" ('cruciat', 103),\n",
" ('post', 102),\n",
" ('no', 97),\n",
" ('fractur', 94),\n",
" ('shoulder', 93),\n",
" ('discuss', 89),\n",
" ('result', 89),\n",
" ('postop', 84),\n",
" ('interpret', 84),\n",
" ('xr', 84),\n",
" ('as', 83),\n",
" ('progress', 81),\n",
" ('expect', 81),\n",
" ('later', 75),\n",
" ('for', 73),\n",
" ('init', 70),\n",
" ('primari', 66),\n",
" ('normal', 60),\n",
" ('', 59),\n",
" ('with', 59),\n",
" ('injuri', 57),\n",
" ('unilater', 55),\n",
" ('therapi', 49),\n",
" ('rotat', 49),\n",
" ('cuff', 49),\n",
" ('fx', 49),\n",
" ('op', 49),\n",
" ('physic', 48),\n",
" ('send', 47),\n",
" ('referr', 47),\n",
" ('ankl', 47),\n",
" ('in', 46),\n",
" ('current', 41),\n",
" ('sprain', 41),\n",
" ('order', 39),\n",
" ('opdo', 36),\n",
" ('report', 35),\n",
" ('ruptur', 34),\n",
" ('joint', 34),\n",
" ('improv', 33),\n",
" ('heal', 32),\n",
" ('unsp', 32),\n",
" ('followup', 31),\n",
" ('instruct', 31),\n",
" ('symptom', 30),\n",
" ('care', 30),\n",
" ('provid', 30),\n",
" ('sub', 30),\n",
" ('close', 29),\n",
" ('prph', 28),\n",
" ('devic', 28),\n",
" ('wrist', 28),\n",
" ('derang', 27),\n",
" ('tendon', 26),\n",
" ('bursiti', 25),\n",
" ('do', 25),\n",
" ('he', 25),\n",
" ('lnumber', 25),\n",
" ('initi', 24),\n",
" ('bilater', 24),\n",
" ('r', 24),\n",
" ('patient', 23),\n",
" ('are', 23),\n",
" ('to', 23),\n",
" ('but', 21),\n",
" ('return', 21),\n",
" ('complet', 21),\n",
" ('good', 20),\n",
" ('align', 20),\n",
" ('prosthet', 20),\n",
" ('patellar', 20),\n",
" ('acut', 20),\n",
" ('is', 19),\n",
" ('now', 19),\n",
" ('not', 19),\n",
" ('complic', 19),\n",
" ('signific', 18),\n",
" ('number', 18),\n",
" ('malleolu', 18),\n",
" ('oth', 18),\n",
" ('pre', 18),\n",
" ('disloc', 17),\n",
" ('arthroscopi', 17),\n",
" ('intern', 17),\n",
" ('painosteoarthr', 16),\n",
" ('painright', 16),\n",
" ('collater', 16),\n",
" ('clo', 16),\n",
" ('mensc', 16),\n",
" ('patella', 15),\n",
" ('experienc', 15),\n",
" ('thick', 15),\n",
" ('on', 15),\n",
" ('possibl', 15),\n",
" ('lat', 15),\n",
" ('tender', 15),\n",
" ('acl', 15),\n",
" ('mmt', 15),\n",
" ('mri', 15),\n",
" ('trochanter', 14),\n",
" ('radiu', 14),\n",
" ('defici', 14),\n",
" ('l', 13),\n",
" ('motion', 13),\n",
" ('arthriti', 13),\n",
" ('gait', 13),\n",
" ('rightright', 13),\n",
" ('foot', 13),\n",
" ('unspecifi', 12),\n",
" ('femur', 12),\n",
" ('end', 12),\n",
" ('tibia', 12),\n",
" ('inject', 11),\n",
" ('strain', 11),\n",
" ('disrupt', 11),\n",
" ('syndesmosi', 11),\n",
" ('vs', 11),\n",
" ('degener', 11),\n",
" ('line', 11),\n",
" ('lower', 11),\n",
" ('chondromalacia', 11),\n",
" ('lmt', 11),\n",
" ('leftleft', 11),\n",
" ('humeru', 11),\n",
" ('full', 10),\n",
" ('rotatrcuff', 10),\n",
" ('tearruptr', 10),\n",
" ('trauma', 10),\n",
" ('encount', 10),\n",
" ('disp', 10),\n",
" ('leg', 10),\n",
" ('warmth', 10),\n",
" ('partial', 10),\n",
" ('palpat', 10),\n",
" ('numbernumb', 10),\n",
" ('snumber', 10),\n",
" ('oa', 10),\n",
" ('opleft', 9),\n",
" ('arthroplasti', 9),\n",
" ('loosen', 9),\n",
" ('syndrom', 9),\n",
" ('other', 9),\n",
" ('have', 9),\n",
" ('fibula', 9),\n",
" ('bone', 9),\n",
" ('finger', 9),\n",
" ('gave', 9),\n",
" ('rightdo', 9),\n",
" ('remov', 9),\n",
" ('elbow', 9),\n",
" ('paindo', 9),\n",
" ('opth', 8),\n",
" ('painleft', 8),\n",
" ('total', 8),\n",
" ('thumb', 8),\n",
" ('mild', 8),\n",
" ('tibial', 8),\n",
" ('plateau', 8),\n",
" ('distal', 8),\n",
" ('orthoped', 8),\n",
" ('activ', 7),\n",
" ('compon', 7),\n",
" ('posit', 7),\n",
" ('musctend', 7),\n",
" ('instabl', 7),\n",
" ('nsaid', 7),\n",
" ('rang', 7),\n",
" ('appear', 7),\n",
" ('treatment', 7),\n",
" ('recurr', 6),\n",
" ('up', 6),\n",
" ('sign', 6),\n",
" ('capsul', 6),\n",
" ('a', 6),\n",
" ('due', 6),\n",
" ('brace', 6),\n",
" ('help', 6),\n",
" ('week', 6),\n",
" ('at', 6),\n",
" ('effus', 6),\n",
" ('bodi', 6),\n",
" ('her', 6),\n",
" ('leftosteoarthr', 6),\n",
" ('stress', 6),\n",
" ('epicondyl', 6)]"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"vocab_cnt.most_common(200)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"#-- Use around 200 features. Use vocab words that are greater than or equal to 6 only. And word length > 3\n",
"vocab_dict = {}\n",
"i = 0\n",
"for vocab in vocab_cnt:\n",
" if vocab_cnt[vocab] >= 10 and len(vocab) > 3:\n",
" vocab_dict[vocab] = i\n",
" i +=1"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'followup': 0,\n",
" 'osteoarthr': 1,\n",
" 'pain': 2,\n",
" 'post': 3,\n",
" 'opdo': 4,\n",
" 'well': 5,\n",
" 'postop': 6,\n",
" 'progress': 7,\n",
" 'expect': 8,\n",
" 'unilater': 9,\n",
" 'primari': 10,\n",
" 'left': 11,\n",
" 'knee': 12,\n",
" 'fractur': 13,\n",
" 'heal': 14,\n",
" 'good': 15,\n",
" 'align': 16,\n",
" 'close': 17,\n",
" 'patella': 18,\n",
" 'interpret': 19,\n",
" 'discuss': 20,\n",
" 'result': 21,\n",
" 'send': 22,\n",
" 'physic': 23,\n",
" 'therapi': 24,\n",
" 'referr': 25,\n",
" 'init': 26,\n",
" 'painosteoarthr': 27,\n",
" 'right': 28,\n",
" 'patient': 29,\n",
" 'experienc': 30,\n",
" 'signific': 31,\n",
" 'improv': 32,\n",
" 'initi': 33,\n",
" 'symptom': 34,\n",
" 'return': 35,\n",
" 'bursiti': 36,\n",
" 'trochanter': 37,\n",
" 'medial': 38,\n",
" 'meniscu': 39,\n",
" 'tear': 40,\n",
" 'prph': 41,\n",
" 'current': 42,\n",
" 'injuri': 43,\n",
" 'shoulder': 44,\n",
" 'with': 45,\n",
" 'tendon': 46,\n",
" 'number': 47,\n",
" 'disloc': 48,\n",
" 'unspecifi': 49,\n",
" 'instruct': 50,\n",
" 'inject': 51,\n",
" 'rotat': 52,\n",
" 'cuff': 53,\n",
" 'care': 54,\n",
" 'order': 55,\n",
" 'bilater': 56,\n",
" 'provid': 57,\n",
" 'prosthet': 58,\n",
" 'painright': 59,\n",
" 'strain': 60,\n",
" 'ruptur': 61,\n",
" 'anterior': 62,\n",
" 'cruciat': 63,\n",
" 'ligament': 64,\n",
" 'sprain': 65,\n",
" 'patellar': 66,\n",
" 'arthroscopi': 67,\n",
" 'later': 68,\n",
" 'full': 69,\n",
" 'thick': 70,\n",
" 'complet': 71,\n",
" 'rotatrcuff': 72,\n",
" 'tearruptr': 73,\n",
" 'unsp': 74,\n",
" 'trauma': 75,\n",
" 'encount': 76,\n",
" 'joint': 77,\n",
" 'motion': 78,\n",
" 'ankl': 79,\n",
" 'malleolu': 80,\n",
" 'acut': 81,\n",
" 'disrupt': 82,\n",
" 'syndesmosi': 83,\n",
" 'complic': 84,\n",
" 'intern': 85,\n",
" 'devic': 86,\n",
" 'disp': 87,\n",
" 'femur': 88,\n",
" 'arthriti': 89,\n",
" 'collater': 90,\n",
" 'possibl': 91,\n",
" 'wrist': 92,\n",
" 'radiu': 93,\n",
" 'mensc': 94,\n",
" 'degener': 95,\n",
" 'derang': 96,\n",
" 'tibia': 97,\n",
" 'report': 98,\n",
" 'warmth': 99,\n",
" 'partial': 100,\n",
" 'gait': 101,\n",
" 'normal': 102,\n",
" 'palpat': 103,\n",
" 'tender': 104,\n",
" 'line': 105,\n",
" 'numbernumb': 106,\n",
" 'lower': 107,\n",
" 'lnumber': 108,\n",
" 'snumber': 109,\n",
" 'chondromalacia': 110,\n",
" 'rightright': 111,\n",
" 'defici': 112,\n",
" 'leftleft': 113,\n",
" 'foot': 114,\n",
" 'humeru': 115}"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"vocab_dict"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"test = ['hip', 'painleft', 'hip', 'trocanter', 'bursiti', 'trochanter', 'bursiti', 'leftmnumbermnumb', 'trochanter', 'bursiti', 'left', 'hip']\n",
"#-- Time to build a feature vector. \n",
"def create_feature_vector(new_line):\n",
"    \"\"\"Map each in-vocabulary token to its vocab_dict index.\"\"\"\n",
"    feature_vector = []\n",
"    #-- BUG FIX: iterate the argument, not the global `test` list --\n",
"    #-- the original returned the same vector for every input line\n",
"    for word in new_line:\n",
"        #-- Only add if the vocab includes the word\n",
"        if word in vocab_dict:\n",
"            feature_vector.append(vocab_dict[word])\n",
"    return feature_vector"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"vector_test = create_feature_vector(test)"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"#-- Let's convert this to 1s and 0s \n",
"def transform_feature_vector(feature_vector):\n",
"    \"\"\"One-hot encode vocab indices into a fixed-length 0/1 vector.\"\"\"\n",
"    vector = np.zeros(len(vocab_dict))\n",
"    #-- Fancy indexing sets every listed position to 1 in one step\n",
"    vector[feature_vector] = 1\n",
"    return vector"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,\n",
" 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
" 0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
" 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
" 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
" 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
" 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"transform_feature_vector(vector_test)"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"#-- Now let's create our dataset of X\n",
"X = []\n",
"with open(filename) as fp:\n",
"    for line in fp:\n",
"        #-- Remove the ICD-10 codes from the note text.\n",
"        #-- BUG FIX: re.escape so '.' inside a code is matched literally\n",
"        #-- instead of acting as a regex wildcard\n",
"        new_line = line\n",
"        for m in rICD.finditer(line):\n",
"            new_line = re.sub(re.escape(m.group(0)), '', new_line)\n",
"        tokens = tokenize_clean_note(new_line)\n",
"        feature_vector = create_feature_vector(tokens)\n",
"        X.append(transform_feature_vector(feature_vector))"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.model_selection import train_test_split\n",
"from sklearn.utils import shuffle"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"X_s, y_s = shuffle(X, y)"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"X_train, X_test, y_train, y_test = train_test_split(X_s, y_s, test_size=0.3)"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,\n",
" decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',\n",
" max_iter=-1, probability=False, random_state=None, shrinking=True,\n",
" tol=0.001, verbose=False)"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn import svm\n",
"clf_svm = svm.SVC(kernel='linear')\n",
"clf_svm.fit(X_train, y_train)"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.24"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"clf_svm.score(X_test, y_test)"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.ensemble import RandomForestClassifier"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [],
"source": [
"clf=RandomForestClassifier(n_estimators=100)"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',\n",
" max_depth=None, max_features='auto', max_leaf_nodes=None,\n",
" min_impurity_decrease=0.0, min_impurity_split=None,\n",
" min_samples_leaf=1, min_samples_split=2,\n",
" min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,\n",
" oob_score=False, random_state=None, verbose=0,\n",
" warm_start=False)"
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"clf.fit(X_train,y_train)"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.24"
]
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"clf.score(X_test, y_test)"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [],
"source": [
"new_X = X.copy"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment