Last active
November 6, 2018 05:30
-
-
Save astronomy88/94977cda0149f416be275e6d1a0a1755 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
#-- Needs nltk library - $python3 -m pip install nltk
import re

# Use a context manager so the file handle is closed after reading.
# (The original left text_file open for the lifetime of the kernel.)
with open("AP_ICD10.tsv", "r") as text_file:
    lines = text_file.read()
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
def clean_code(code):
    """Normalize an ICD-10 code string to a 3-character category prefix.

    Lowercases, strips every period, and keeps only the first three
    characters (e.g. 'M16.12' -> 'm16').  A plain str.replace is used
    instead of the original re.sub('\\.', ...) — no regex is needed to
    remove a literal character.
    """
    new_code = code.lower().replace('.', '')
    #-- We only want the first three characters (ICD-10 category)
    return new_code[:3]
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"m16\n", | |
"m16\n", | |
"s82\n", | |
"s82\n", | |
"m17\n", | |
"m17\n", | |
"m70\n", | |
"m70\n", | |
"s83\n", | |
"s83\n", | |
"m24\n", | |
"m24\n", | |
"m24\n", | |
"m17\n", | |
"m17\n", | |
"m17\n", | |
"m17\n", | |
"m16\n", | |
"m16\n", | |
"m70\n", | |
"m70\n", | |
"m17\n", | |
"m17\n", | |
"s46\n", | |
"s46\n", | |
"s83\n", | |
"s83\n", | |
"m25\n", | |
"m25\n", | |
"m17\n", | |
"m17\n", | |
"s83\n", | |
"s83\n", | |
"m75\n", | |
"m75\n", | |
"s46\n", | |
"s46\n", | |
"m17\n", | |
"s76\n", | |
"m17\n", | |
"m17\n", | |
"m17\n", | |
"s76\n", | |
"s76\n", | |
"s76\n", | |
"m17\n", | |
"m17\n", | |
"s82\n", | |
"s82\n", | |
"s93\n", | |
"s93\n", | |
"t84\n", | |
"t84\n", | |
"s72\n", | |
"s72\n", | |
"m00\n", | |
"m00\n", | |
"m17\n", | |
"m17\n", | |
"m70\n", | |
"m70\n", | |
"m24\n", | |
"m24\n", | |
"m19\n", | |
"m19\n", | |
"m16\n", | |
"m16\n", | |
"s63\n", | |
"s63\n", | |
"s46\n", | |
"s46\n", | |
"s52\n", | |
"s52\n", | |
"s46\n", | |
"s46\n", | |
"m17\n", | |
"m17\n", | |
"m17\n", | |
"m17\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"m23\n", | |
"m23\n", | |
"s83\n", | |
"s83\n", | |
"g56\n", | |
"g56\n", | |
"s82\n", | |
"s82\n", | |
"m17\n", | |
"m17\n", | |
"m23\n", | |
"m23\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"m23\n", | |
"m23\n", | |
"s82\n", | |
"s82\n", | |
"s52\n", | |
"s52\n", | |
"m22\n", | |
"m22\n", | |
"m24\n", | |
"m24\n", | |
"m23\n", | |
"m23\n", | |
"s83\n", | |
"s83\n", | |
"m17\n", | |
"m17\n", | |
"m67\n", | |
"m67\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"m17\n", | |
"m17\n", | |
"m17\n", | |
"m17\n", | |
"m16\n", | |
"m16\n", | |
"m22\n", | |
"m22\n", | |
"s83\n", | |
"s83\n", | |
"s82\n", | |
"s82\n", | |
"s52\n", | |
"s52\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"s68\n", | |
"s68\n", | |
"m22\n", | |
"m22\n", | |
"s92\n", | |
"s92\n", | |
"m23\n", | |
"m23\n", | |
"s83\n", | |
"s83\n", | |
"m16\n", | |
"m16\n", | |
"m75\n", | |
"m75\n", | |
"m84\n", | |
"m84\n", | |
"m25\n", | |
"m25\n", | |
"m77\n", | |
"m77\n", | |
"m17\n", | |
"m17\n", | |
"m17\n", | |
"m17\n", | |
"m75\n", | |
"m75\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"m17\n", | |
"m17\n", | |
"s83\n", | |
"s83\n", | |
"s93\n", | |
"s93\n", | |
"m17\n", | |
"m17\n", | |
"m17\n", | |
"m17\n", | |
"m25\n", | |
"m25\n", | |
"m17\n", | |
"m17\n", | |
"m25\n", | |
"m25\n", | |
"m17\n", | |
"m17\n", | |
"m70\n", | |
"m70\n", | |
"m75\n", | |
"m75\n", | |
"m22\n", | |
"m22\n", | |
"s42\n", | |
"s42\n", | |
"s43\n", | |
"s43\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"m23\n", | |
"m23\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"m75\n", | |
"m75\n", | |
"s52\n", | |
"s52\n", | |
"s52\n", | |
"s52\n", | |
"m16\n", | |
"m16\n", | |
"m23\n", | |
"m23\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"s63\n", | |
"s63\n", | |
"s52\n", | |
"s52\n", | |
"s42\n", | |
"s42\n", | |
"m76\n", | |
"m76\n", | |
"s42\n", | |
"s42\n", | |
"m76\n", | |
"m76\n", | |
"m17\n", | |
"m17\n", | |
"m23\n", | |
"m23\n", | |
"s83\n", | |
"s83\n", | |
"m17\n", | |
"m17\n", | |
"m25\n", | |
"m25\n", | |
"m21\n", | |
"m21\n", | |
"s72\n", | |
"s72\n", | |
"s52\n", | |
"s52\n", | |
"m75\n", | |
"m75\n", | |
"m17\n", | |
"m17\n", | |
"s52\n", | |
"s52\n", | |
"m23\n", | |
"m23\n", | |
"s82\n", | |
"s82\n", | |
"m16\n", | |
"m16\n", | |
"m16\n", | |
"m16\n", | |
"m17\n", | |
"m17\n", | |
"m23\n", | |
"m23\n", | |
"m17\n", | |
"m17\n", | |
"m75\n", | |
"m75\n", | |
"m17\n", | |
"m17\n", | |
"m17\n", | |
"m17\n", | |
"s82\n", | |
"s82\n", | |
"s46\n", | |
"s46\n", | |
"m17\n", | |
"m17\n", | |
"s43\n", | |
"s43\n", | |
"m87\n", | |
"m87\n", | |
"m16\n", | |
"m16\n", | |
"z89\n", | |
"z89\n", | |
"s72\n", | |
"s72\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"s42\n", | |
"s42\n", | |
"m16\n", | |
"m16\n", | |
"m25\n", | |
"m25\n", | |
"m70\n", | |
"m70\n", | |
"m16\n", | |
"m16\n", | |
"m16\n", | |
"m16\n", | |
"m16\n", | |
"m16\n", | |
"m17\n", | |
"m17\n", | |
"m18\n", | |
"m18\n", | |
"m17\n", | |
"m17\n", | |
"s82\n", | |
"s82\n", | |
"s82\n", | |
"s82\n", | |
"t84\n", | |
"t84\n", | |
"t84\n", | |
"t84\n", | |
"m16\n", | |
"m16\n", | |
"m16\n", | |
"m16\n", | |
"m17\n", | |
"m17\n", | |
"s46\n", | |
"s46\n", | |
"m13\n", | |
"m13\n", | |
"m75\n", | |
"m75\n", | |
"m65\n", | |
"m65\n", | |
"m19\n", | |
"m19\n", | |
"m17\n", | |
"m17\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"s43\n", | |
"s43\n", | |
"m23\n", | |
"m23\n", | |
"m23\n", | |
"m23\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"m22\n", | |
"m22\n", | |
"m17\n", | |
"m17\n", | |
"m75\n", | |
"m75\n", | |
"s43\n", | |
"s43\n", | |
"s82\n", | |
"s82\n", | |
"m76\n", | |
"m76\n", | |
"m76\n", | |
"m76\n", | |
"m17\n", | |
"m17\n", | |
"s46\n", | |
"s46\n", | |
"m84\n", | |
"m84\n", | |
"s82\n", | |
"s82\n", | |
"m17\n", | |
"m17\n", | |
"t84\n", | |
"t84\n", | |
"m75\n", | |
"m75\n", | |
"t84\n", | |
"t84\n", | |
"t84\n", | |
"t84\n", | |
"m17\n", | |
"m17\n", | |
"m17\n", | |
"m17\n", | |
"m23\n", | |
"m23\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"m17\n", | |
"m17\n", | |
"m17\n", | |
"m17\n", | |
"s82\n", | |
"s82\n", | |
"m17\n", | |
"m17\n", | |
"m17\n", | |
"m17\n", | |
"s43\n", | |
"s43\n", | |
"s83\n", | |
"s83\n", | |
"m25\n", | |
"m25\n", | |
"m17\n", | |
"m17\n", | |
"m17\n", | |
"m17\n", | |
"m67\n", | |
"m67\n", | |
"m23\n", | |
"m23\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"m16\n", | |
"m16\n", | |
"m16\n", | |
"m16\n", | |
"m17\n", | |
"m17\n", | |
"s82\n", | |
"s82\n", | |
"s82\n", | |
"s82\n", | |
"s93\n", | |
"s93\n", | |
"t84\n", | |
"t84\n", | |
"s82\n", | |
"s82\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"m23\n", | |
"m23\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"s92\n", | |
"s92\n", | |
"s83\n", | |
"s83\n", | |
"m00\n", | |
"m00\n", | |
"m23\n", | |
"m23\n", | |
"s83\n", | |
"s83\n", | |
"m70\n", | |
"m70\n" | |
] | |
} | |
], | |
"source": [ | |
filename = 'AP_ICD10.tsv'

#-- Regex for ICD10 (the period may not always be there)
rICD = re.compile(r'\b[a-zA-Z][0-9]{2}[\.a-zA-Z0-9]+', re.IGNORECASE)

#-- Create a list of all the different types of ICD-10 codes seen
icd_10_list = []

# Iterate the file directly instead of the manual readline()/while loop —
# same line-by-line behavior, less state to get wrong.
with open(filename) as fp:
    cnt = 1  # line counter (kept for parity with the original cell)
    for line in fp:
        #-- More than one ICD-10 code can be found per line
        for m in rICD.finditer(line):
            #-- Normalize: strip periods, lowercase, truncate to 3 chars
            code = clean_code(m.group(0))
            print(code)
            icd_10_list.append(code)
        cnt += 1
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
#-- Now we have a list of all icd-10 codes, but remove duplicates
# NOTE: set iteration order is arbitrary, so anything derived from this
# set (e.g. label numbering below) is not stable across kernel runs.
unique_icd_10 = set(icd_10_list)
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"34" | |
] | |
}, | |
"execution_count": 5, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"len(unique_icd_10)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
#-- Assign a number to each label.
# enumerate() over the set replaces the manual i counter; note that set
# order is arbitrary, so label ids differ between runs — fine for a single
# session, but pin an ordering (e.g. sorted()) if ids must be reproducible.
icd_10_dict = {code: i for i, code in enumerate(unique_icd_10)}
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
#-- For fun, let's find out how many times each label appeared
# (Ideally this import would live in the notebook's top import cell.)
from collections import Counter
cnt = Counter()
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
#-- Tally every code occurrence into the Counter created above.
# Counter.update() replaces the manual per-element increment loop and
# produces identical counts.
cnt.update(icd_10_list)
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"m16\n", | |
"s82\n", | |
"m17\n", | |
"m70\n", | |
"s83\n", | |
"m24\n", | |
"m17\n", | |
"m17\n", | |
"m16\n", | |
"m70\n", | |
"m17\n", | |
"s46\n", | |
"s83\n", | |
"m25\n", | |
"m17\n", | |
"s83\n", | |
"m75\n", | |
"s46\n", | |
"m17\n", | |
"m17\n", | |
"m17\n", | |
"s82\n", | |
"s93\n", | |
"t84\n", | |
"s72\n", | |
"m00\n", | |
"m17\n", | |
"m70\n", | |
"m24\n", | |
"m19\n", | |
"m16\n", | |
"s63\n", | |
"s46\n", | |
"s52\n", | |
"s46\n", | |
"m17\n", | |
"m17\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"m23\n", | |
"s83\n", | |
"g56\n", | |
"s82\n", | |
"m17\n", | |
"m23\n", | |
"s83\n", | |
"s83\n", | |
"m23\n", | |
"s82\n", | |
"s52\n", | |
"m22\n", | |
"m24\n", | |
"m23\n", | |
"s83\n", | |
"m17\n", | |
"m67\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"m17\n", | |
"m17\n", | |
"m16\n", | |
"m22\n", | |
"s83\n", | |
"s82\n", | |
"s52\n", | |
"s83\n", | |
"s83\n", | |
"s68\n", | |
"m22\n", | |
"s92\n", | |
"m23\n", | |
"s83\n", | |
"m16\n", | |
"m75\n", | |
"m84\n", | |
"m25\n", | |
"m77\n", | |
"m17\n", | |
"m17\n", | |
"m75\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"m17\n", | |
"s83\n", | |
"s93\n", | |
"m17\n", | |
"m17\n", | |
"m25\n", | |
"m17\n", | |
"m25\n", | |
"m17\n", | |
"m70\n", | |
"m75\n", | |
"m22\n", | |
"s42\n", | |
"s43\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"m23\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"m75\n", | |
"s52\n", | |
"s52\n", | |
"m16\n", | |
"m23\n", | |
"s83\n", | |
"s83\n", | |
"s63\n", | |
"s52\n", | |
"s42\n", | |
"m76\n", | |
"s42\n", | |
"m76\n", | |
"m17\n", | |
"m23\n", | |
"s83\n", | |
"m17\n", | |
"m25\n", | |
"m21\n", | |
"s72\n", | |
"s52\n", | |
"m75\n", | |
"m17\n", | |
"s52\n", | |
"m23\n", | |
"s82\n", | |
"m16\n", | |
"m16\n", | |
"m17\n", | |
"m23\n", | |
"m17\n", | |
"m75\n", | |
"m17\n", | |
"m17\n", | |
"s82\n", | |
"s46\n", | |
"m17\n", | |
"s43\n", | |
"m87\n", | |
"m16\n", | |
"z89\n", | |
"s72\n", | |
"s83\n", | |
"s83\n", | |
"s42\n", | |
"m16\n", | |
"m25\n", | |
"m70\n", | |
"m16\n", | |
"m16\n", | |
"m16\n", | |
"m17\n", | |
"m18\n", | |
"m17\n", | |
"s82\n", | |
"s82\n", | |
"t84\n", | |
"t84\n", | |
"m16\n", | |
"m16\n", | |
"m17\n", | |
"s46\n", | |
"m13\n", | |
"m75\n", | |
"m65\n", | |
"m19\n", | |
"m17\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"s43\n", | |
"m23\n", | |
"m23\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"m22\n", | |
"m17\n", | |
"m75\n", | |
"s43\n", | |
"s82\n", | |
"m76\n", | |
"m76\n", | |
"m17\n", | |
"s46\n", | |
"m84\n", | |
"s82\n", | |
"m17\n", | |
"t84\n", | |
"m75\n", | |
"t84\n", | |
"t84\n", | |
"m17\n", | |
"m17\n", | |
"m23\n", | |
"s83\n", | |
"s83\n", | |
"m17\n", | |
"m17\n", | |
"s82\n", | |
"m17\n", | |
"m17\n", | |
"s43\n", | |
"s83\n", | |
"m25\n", | |
"m17\n", | |
"m17\n", | |
"m67\n", | |
"m23\n", | |
"s83\n", | |
"s83\n", | |
"m16\n", | |
"m16\n", | |
"m17\n", | |
"s82\n", | |
"s82\n", | |
"s93\n", | |
"t84\n", | |
"s82\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"m23\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"s83\n", | |
"s92\n", | |
"s83\n", | |
"m00\n", | |
"m23\n", | |
"s83\n", | |
"m70\n" | |
] | |
} | |
], | |
"source": [ | |
#-- Now assign a label to each line of the TSV (one label per line).
# Lines with no recognizable ICD-10 code get the sentinel label -1.
y = []
with open(filename) as fp:
    cnt = 0  # number of lines processed
    for line in fp:
        #-- More than one ICD-10 code can be found per line; we keep only
        #-- the first to keep things clean (optimize later).
        for m in rICD.finditer(line):
            #-- Normalize: strip periods, lowercase, truncate to 3 chars
            code = clean_code(m.group(0))
            print(code)
            y.append(icd_10_dict[code])
            break
        else:
            # for/else: runs only when the loop found no match at all,
            # replacing the original found_match boolean flag.
            y.append(-1)
        cnt += 1
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"249" | |
] | |
}, | |
"execution_count": 10, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
#-- Sanity check: one label per input line (should equal the line count)
len(y)
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"249" | |
] | |
}, | |
"execution_count": 11, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"cnt" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
#-- Now that we have our labels, let's design our features
# Create a vocabulary list
from nltk.stem import PorterStemmer
from string import punctuation

def strip_punctuation(s):
    """Return s with every ASCII punctuation character removed."""
    return ''.join(c for c in s if c not in punctuation)

#-- Regex for ICD10 (the period may not always be there)
rICD = re.compile(r'\b[a-zA-Z][0-9]{2}[\.a-zA-Z0-9]+', re.IGNORECASE)

ps = PorterStemmer()

def tokenize_clean_note(line):
    """Lowercase, normalize digit runs to 'number', strip punctuation and
    tab/newline characters, split on single spaces, and Porter-stem each
    token.  Returns the list of stemmed tokens.

    NOTE(review): split(' ') keeps empty-string tokens for runs of spaces
    (the vocabulary shows '' with count 59); kept as-is to preserve the
    original feature behavior.
    """
    new_line = line.lower()
    #-- Normalize numbers
    new_line = re.sub('[0-9]+', 'number', new_line)
    #-- Remove punctuation
    new_line = strip_punctuation(new_line)
    #-- Remove tab and newline characters
    new_line = re.sub('[\t\n]', '', new_line)

    #-- Tokenize
    tokens = new_line.split(' ')

    #-- Stemming
    return [ps.stem(word) for word in tokens]

#-- Find dictionary mapping as before
vocab_list = []

with open(filename) as fp:
    cnt = 1  # line counter (kept for parity with the original cell)
    for line in fp:
        #-- Strip the ICD-10 codes out so they don't leak into the vocab.
        # BUG FIX: the original used re.sub(f'{code}', ...); codes like
        # 'M16.12' contain '.', which the regex engine treats as a
        # wildcard.  str.replace removes the literal code text only.
        new_line = line
        for m in rICD.finditer(line):
            new_line = new_line.replace(m.group(0), '')

        #-- new_line is now a list of stemmed words for the feature vocab
        vocab_list.extend(tokenize_clean_note(new_line))
        cnt += 1
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"9078" | |
] | |
}, | |
"execution_count": 13, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"len(vocab_list)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"652" | |
] | |
}, | |
"execution_count": 14, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"len(set(vocab_list))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 15, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
#-- Count how often each stemmed token occurs.
# Counter(iterable) tallies in one call — identical result to the
# original empty-Counter-plus-increment loop.
vocab_cnt = Counter(vocab_list)
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 16, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[('of', 773),\n", | |
" ('knee', 614),\n", | |
" ('left', 285),\n", | |
" ('right', 220),\n", | |
" ('pain', 201),\n", | |
" ('tear', 189),\n", | |
" ('osteoarthr', 180),\n", | |
" ('meniscu', 157),\n", | |
" ('the', 146),\n", | |
" ('and', 129),\n", | |
" ('hip', 126),\n", | |
" ('medial', 124),\n", | |
" ('anterior', 124),\n", | |
" ('ligament', 122),\n", | |
" ('well', 110),\n", | |
" ('cruciat', 103),\n", | |
" ('post', 102),\n", | |
" ('no', 97),\n", | |
" ('fractur', 94),\n", | |
" ('shoulder', 93),\n", | |
" ('discuss', 89),\n", | |
" ('result', 89),\n", | |
" ('postop', 84),\n", | |
" ('interpret', 84),\n", | |
" ('xr', 84),\n", | |
" ('as', 83),\n", | |
" ('progress', 81),\n", | |
" ('expect', 81),\n", | |
" ('later', 75),\n", | |
" ('for', 73),\n", | |
" ('init', 70),\n", | |
" ('primari', 66),\n", | |
" ('normal', 60),\n", | |
" ('', 59),\n", | |
" ('with', 59),\n", | |
" ('injuri', 57),\n", | |
" ('unilater', 55),\n", | |
" ('therapi', 49),\n", | |
" ('rotat', 49),\n", | |
" ('cuff', 49),\n", | |
" ('fx', 49),\n", | |
" ('op', 49),\n", | |
" ('physic', 48),\n", | |
" ('send', 47),\n", | |
" ('referr', 47),\n", | |
" ('ankl', 47),\n", | |
" ('in', 46),\n", | |
" ('current', 41),\n", | |
" ('sprain', 41),\n", | |
" ('order', 39),\n", | |
" ('opdo', 36),\n", | |
" ('report', 35),\n", | |
" ('ruptur', 34),\n", | |
" ('joint', 34),\n", | |
" ('improv', 33),\n", | |
" ('heal', 32),\n", | |
" ('unsp', 32),\n", | |
" ('followup', 31),\n", | |
" ('instruct', 31),\n", | |
" ('symptom', 30),\n", | |
" ('care', 30),\n", | |
" ('provid', 30),\n", | |
" ('sub', 30),\n", | |
" ('close', 29),\n", | |
" ('prph', 28),\n", | |
" ('devic', 28),\n", | |
" ('wrist', 28),\n", | |
" ('derang', 27),\n", | |
" ('tendon', 26),\n", | |
" ('bursiti', 25),\n", | |
" ('do', 25),\n", | |
" ('he', 25),\n", | |
" ('lnumber', 25),\n", | |
" ('initi', 24),\n", | |
" ('bilater', 24),\n", | |
" ('r', 24),\n", | |
" ('patient', 23),\n", | |
" ('are', 23),\n", | |
" ('to', 23),\n", | |
" ('but', 21),\n", | |
" ('return', 21),\n", | |
" ('complet', 21),\n", | |
" ('good', 20),\n", | |
" ('align', 20),\n", | |
" ('prosthet', 20),\n", | |
" ('patellar', 20),\n", | |
" ('acut', 20),\n", | |
" ('is', 19),\n", | |
" ('now', 19),\n", | |
" ('not', 19),\n", | |
" ('complic', 19),\n", | |
" ('signific', 18),\n", | |
" ('number', 18),\n", | |
" ('malleolu', 18),\n", | |
" ('oth', 18),\n", | |
" ('pre', 18),\n", | |
" ('disloc', 17),\n", | |
" ('arthroscopi', 17),\n", | |
" ('intern', 17),\n", | |
" ('painosteoarthr', 16),\n", | |
" ('painright', 16),\n", | |
" ('collater', 16),\n", | |
" ('clo', 16),\n", | |
" ('mensc', 16),\n", | |
" ('patella', 15),\n", | |
" ('experienc', 15),\n", | |
" ('thick', 15),\n", | |
" ('on', 15),\n", | |
" ('possibl', 15),\n", | |
" ('lat', 15),\n", | |
" ('tender', 15),\n", | |
" ('acl', 15),\n", | |
" ('mmt', 15),\n", | |
" ('mri', 15),\n", | |
" ('trochanter', 14),\n", | |
" ('radiu', 14),\n", | |
" ('defici', 14),\n", | |
" ('l', 13),\n", | |
" ('motion', 13),\n", | |
" ('arthriti', 13),\n", | |
" ('gait', 13),\n", | |
" ('rightright', 13),\n", | |
" ('foot', 13),\n", | |
" ('unspecifi', 12),\n", | |
" ('femur', 12),\n", | |
" ('end', 12),\n", | |
" ('tibia', 12),\n", | |
" ('inject', 11),\n", | |
" ('strain', 11),\n", | |
" ('disrupt', 11),\n", | |
" ('syndesmosi', 11),\n", | |
" ('vs', 11),\n", | |
" ('degener', 11),\n", | |
" ('line', 11),\n", | |
" ('lower', 11),\n", | |
" ('chondromalacia', 11),\n", | |
" ('lmt', 11),\n", | |
" ('leftleft', 11),\n", | |
" ('humeru', 11),\n", | |
" ('full', 10),\n", | |
" ('rotatrcuff', 10),\n", | |
" ('tearruptr', 10),\n", | |
" ('trauma', 10),\n", | |
" ('encount', 10),\n", | |
" ('disp', 10),\n", | |
" ('leg', 10),\n", | |
" ('warmth', 10),\n", | |
" ('partial', 10),\n", | |
" ('palpat', 10),\n", | |
" ('numbernumb', 10),\n", | |
" ('snumber', 10),\n", | |
" ('oa', 10),\n", | |
" ('opleft', 9),\n", | |
" ('arthroplasti', 9),\n", | |
" ('loosen', 9),\n", | |
" ('syndrom', 9),\n", | |
" ('other', 9),\n", | |
" ('have', 9),\n", | |
" ('fibula', 9),\n", | |
" ('bone', 9),\n", | |
" ('finger', 9),\n", | |
" ('gave', 9),\n", | |
" ('rightdo', 9),\n", | |
" ('remov', 9),\n", | |
" ('elbow', 9),\n", | |
" ('paindo', 9),\n", | |
" ('opth', 8),\n", | |
" ('painleft', 8),\n", | |
" ('total', 8),\n", | |
" ('thumb', 8),\n", | |
" ('mild', 8),\n", | |
" ('tibial', 8),\n", | |
" ('plateau', 8),\n", | |
" ('distal', 8),\n", | |
" ('orthoped', 8),\n", | |
" ('activ', 7),\n", | |
" ('compon', 7),\n", | |
" ('posit', 7),\n", | |
" ('musctend', 7),\n", | |
" ('instabl', 7),\n", | |
" ('nsaid', 7),\n", | |
" ('rang', 7),\n", | |
" ('appear', 7),\n", | |
" ('treatment', 7),\n", | |
" ('recurr', 6),\n", | |
" ('up', 6),\n", | |
" ('sign', 6),\n", | |
" ('capsul', 6),\n", | |
" ('a', 6),\n", | |
" ('due', 6),\n", | |
" ('brace', 6),\n", | |
" ('help', 6),\n", | |
" ('week', 6),\n", | |
" ('at', 6),\n", | |
" ('effus', 6),\n", | |
" ('bodi', 6),\n", | |
" ('her', 6),\n", | |
" ('leftosteoarthr', 6),\n", | |
" ('stress', 6),\n", | |
" ('epicondyl', 6)]" | |
] | |
}, | |
"execution_count": 16, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"vocab_cnt.most_common(200)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 17, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
#-- Use around 200 features: keep vocab words with count >= 10 and
#-- word length > 3.  (The original comment said ">= 6" while the code
#-- used ">= 10"; the comment now matches the code.)
vocab_dict = {
    word: idx
    for idx, word in enumerate(
        w for w in vocab_cnt if vocab_cnt[w] >= 10 and len(w) > 3
    )
}
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 18, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"{'followup': 0,\n", | |
" 'osteoarthr': 1,\n", | |
" 'pain': 2,\n", | |
" 'post': 3,\n", | |
" 'opdo': 4,\n", | |
" 'well': 5,\n", | |
" 'postop': 6,\n", | |
" 'progress': 7,\n", | |
" 'expect': 8,\n", | |
" 'unilater': 9,\n", | |
" 'primari': 10,\n", | |
" 'left': 11,\n", | |
" 'knee': 12,\n", | |
" 'fractur': 13,\n", | |
" 'heal': 14,\n", | |
" 'good': 15,\n", | |
" 'align': 16,\n", | |
" 'close': 17,\n", | |
" 'patella': 18,\n", | |
" 'interpret': 19,\n", | |
" 'discuss': 20,\n", | |
" 'result': 21,\n", | |
" 'send': 22,\n", | |
" 'physic': 23,\n", | |
" 'therapi': 24,\n", | |
" 'referr': 25,\n", | |
" 'init': 26,\n", | |
" 'painosteoarthr': 27,\n", | |
" 'right': 28,\n", | |
" 'patient': 29,\n", | |
" 'experienc': 30,\n", | |
" 'signific': 31,\n", | |
" 'improv': 32,\n", | |
" 'initi': 33,\n", | |
" 'symptom': 34,\n", | |
" 'return': 35,\n", | |
" 'bursiti': 36,\n", | |
" 'trochanter': 37,\n", | |
" 'medial': 38,\n", | |
" 'meniscu': 39,\n", | |
" 'tear': 40,\n", | |
" 'prph': 41,\n", | |
" 'current': 42,\n", | |
" 'injuri': 43,\n", | |
" 'shoulder': 44,\n", | |
" 'with': 45,\n", | |
" 'tendon': 46,\n", | |
" 'number': 47,\n", | |
" 'disloc': 48,\n", | |
" 'unspecifi': 49,\n", | |
" 'instruct': 50,\n", | |
" 'inject': 51,\n", | |
" 'rotat': 52,\n", | |
" 'cuff': 53,\n", | |
" 'care': 54,\n", | |
" 'order': 55,\n", | |
" 'bilater': 56,\n", | |
" 'provid': 57,\n", | |
" 'prosthet': 58,\n", | |
" 'painright': 59,\n", | |
" 'strain': 60,\n", | |
" 'ruptur': 61,\n", | |
" 'anterior': 62,\n", | |
" 'cruciat': 63,\n", | |
" 'ligament': 64,\n", | |
" 'sprain': 65,\n", | |
" 'patellar': 66,\n", | |
" 'arthroscopi': 67,\n", | |
" 'later': 68,\n", | |
" 'full': 69,\n", | |
" 'thick': 70,\n", | |
" 'complet': 71,\n", | |
" 'rotatrcuff': 72,\n", | |
" 'tearruptr': 73,\n", | |
" 'unsp': 74,\n", | |
" 'trauma': 75,\n", | |
" 'encount': 76,\n", | |
" 'joint': 77,\n", | |
" 'motion': 78,\n", | |
" 'ankl': 79,\n", | |
" 'malleolu': 80,\n", | |
" 'acut': 81,\n", | |
" 'disrupt': 82,\n", | |
" 'syndesmosi': 83,\n", | |
" 'complic': 84,\n", | |
" 'intern': 85,\n", | |
" 'devic': 86,\n", | |
" 'disp': 87,\n", | |
" 'femur': 88,\n", | |
" 'arthriti': 89,\n", | |
" 'collater': 90,\n", | |
" 'possibl': 91,\n", | |
" 'wrist': 92,\n", | |
" 'radiu': 93,\n", | |
" 'mensc': 94,\n", | |
" 'degener': 95,\n", | |
" 'derang': 96,\n", | |
" 'tibia': 97,\n", | |
" 'report': 98,\n", | |
" 'warmth': 99,\n", | |
" 'partial': 100,\n", | |
" 'gait': 101,\n", | |
" 'normal': 102,\n", | |
" 'palpat': 103,\n", | |
" 'tender': 104,\n", | |
" 'line': 105,\n", | |
" 'numbernumb': 106,\n", | |
" 'lower': 107,\n", | |
" 'lnumber': 108,\n", | |
" 'snumber': 109,\n", | |
" 'chondromalacia': 110,\n", | |
" 'rightright': 111,\n", | |
" 'defici': 112,\n", | |
" 'leftleft': 113,\n", | |
" 'foot': 114,\n", | |
" 'humeru': 115}" | |
] | |
}, | |
"execution_count": 18, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"vocab_dict" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 19, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
#-- Sample note tokens for trying out the feature builder below
test = ['hip', 'painleft', 'hip', 'trocanter', 'bursiti', 'trochanter', 'bursiti', 'leftmnumbermnumb', 'trochanter', 'bursiti', 'left', 'hip']

#-- Time to build a feature vector.
def create_feature_vector(new_line, vocab=None):
    """Map each in-vocabulary token of new_line to its feature index.

    BUG FIX: the original iterated over the module-level `test` list
    instead of its `new_line` parameter, so every call returned the same
    vector regardless of input — which silently degraded the dataset X
    built later.  `vocab` defaults to the module-level vocab_dict for
    backward compatibility.
    """
    if vocab is None:
        vocab = vocab_dict
    feature_vector = []
    for word in new_line:
        #-- Only add if the vocab includes the word
        if word in vocab:
            feature_vector.append(vocab[word])
    return feature_vector
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 20, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"vector_test = create_feature_vector(test)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 21, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import numpy as np" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 22, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
#-- Let's convert this to 1s and 0s
def transform_feature_vector(feature_vector, size=None):
    """Turn a list of feature indices into a fixed-length binary vector.

    `size` defaults to len(vocab_dict) (the original behavior); passing it
    explicitly makes the function usable without the module-level dict.
    Duplicate indices simply set the same position to 1 again.
    """
    if size is None:
        size = len(vocab_dict)
    vector = np.zeros(size)
    for idx in feature_vector:
        vector[idx] = 1
    return vector
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 23, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,\n", | |
" 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", | |
" 0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", | |
" 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", | |
" 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", | |
" 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", | |
" 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])" | |
] | |
}, | |
"execution_count": 23, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"transform_feature_vector(vector_test)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 24, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
#-- Now let's create our dataset of X: one binary feature vector per line.
X = []
with open(filename) as fp:
    cnt = 1  # line counter (kept for parity with the original cell)
    for line in fp:
        #-- Strip the ICD-10 codes so labels don't leak into features.
        # BUG FIX: replace the code text literally instead of via
        # re.sub(f'{code}', ...), where '.' inside a code acts as a
        # regex wildcard.
        new_line = line
        for m in rICD.finditer(line):
            new_line = new_line.replace(m.group(0), '')

        tokens = tokenize_clean_note(new_line)
        feature_vector = create_feature_vector(tokens)
        X.append(transform_feature_vector(feature_vector))
        cnt += 1
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 25, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from sklearn.model_selection import train_test_split\n", | |
"from sklearn.utils import shuffle" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 26, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"X_s, y_s = shuffle(X, y)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 27, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"X_train, X_test, y_train, y_test = train_test_split(X_s, y_s, test_size=0.3)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 28, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,\n", | |
" decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',\n", | |
" max_iter=-1, probability=False, random_state=None, shrinking=True,\n", | |
" tol=0.001, verbose=False)" | |
] | |
}, | |
"execution_count": 28, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
#-- Baseline 1: linear-kernel SVM on the bag-of-words features
from sklearn import svm
clf_svm = svm.SVC(kernel='linear')
clf_svm.fit(X_train, y_train)
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 29, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"0.24" | |
] | |
}, | |
"execution_count": 29, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"clf_svm.score(X_test, y_test)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 30, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from sklearn.ensemble import RandomForestClassifier" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 31, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"clf=RandomForestClassifier(n_estimators=100)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 32, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',\n", | |
" max_depth=None, max_features='auto', max_leaf_nodes=None,\n", | |
" min_impurity_decrease=0.0, min_impurity_split=None,\n", | |
" min_samples_leaf=1, min_samples_split=2,\n", | |
" min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,\n", | |
" oob_score=False, random_state=None, verbose=0,\n", | |
" warm_start=False)" | |
] | |
}, | |
"execution_count": 32, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"clf.fit(X_train,y_train)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 33, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"0.24" | |
] | |
}, | |
"execution_count": 33, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"clf.score(X_test, y_test)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 34, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"new_X = X.copy" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.4" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment