This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"worksheets": [ | |
{ | |
"cells": [ | |
{ | |
"metadata": {}, | |
"cell_type": "markdown", | |
"source": "## Load modules" | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "from __future__ import division\n\n# basic NLP\nimport nltk, codecs, string, random, math, cPickle as pickle, re, datetime\nfrom collections import Counter\n\n# scikit-learn\nfrom sklearn.feature_extraction.text import TfidfVectorizer\nfrom sklearn.linear_model import LogisticRegression\nimport numpy as np\nfrom sklearn.metrics.pairwise import linear_kernel\n\nsent_tokenizer=nltk.data.load('tokenizers/punkt/english.pickle')\nstopset = set(nltk.corpus.stopwords.words('english'))", | |
"prompt_number": 1, | |
"outputs": [], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "markdown", | |
"source": "## Load data" | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "corrections = {\"Sarcoma, Ewing's\": 'Sarcoma, Ewing',\n 'Beta-Thalassemia': 'beta-Thalassemia',\n 'Von Willebrand Disease, Type 3': 'von Willebrand Disease, Type 3',\n 'Von Willebrand Disease, Type 2': 'von Willebrand Disease, Type 2',\n 'Von Willebrand Disease, Type 1': 'von Willebrand Disease, Type 1',\n \"Felty's Syndrome\": 'Felty Syndrome',\n 'Von Hippel-Lindau Disease': 'von Hippel-Lindau Disease',\n 'Retrognathism': 'Retrognathia',\n 'Regurgitation, Gastric': 'Laryngopharyngeal Reflux',\n 'Persistent Hyperinsulinemia Hypoglycemia of Infancy': 'Congenital Hyperinsulinism',\n 'Von Willebrand Diseases': 'von Willebrand Diseases',\n 'Pontine Glioma': 'Brain Stem Neoplasms',\n 'Mental Retardation': 'Intellectual Disability',\n 'Overdose': 'Drug Overdose',\n 'Beta-Mannosidosis': 'beta-Mannosidosis',\n 'Alpha 1-Antitrypsin Deficiency': 'alpha 1-Antitrypsin Deficiency',\n 'Intervertebral Disk Displacement': 'Intervertebral Disc Displacement',\n 'Alpha-Thalassemia': 'alpha-Thalassemia',\n 'Mycobacterium Infections, Atypical': 'Mycobacterium Infections, Nontuberculous',\n 'Legg-Perthes Disease': 'Legg-Calve-Perthes Disease',\n 'Intervertebral Disk Degeneration': 'Intervertebral Disc Degeneration',\n 'Alpha-Mannosidosis': 'alpha-Mannosidosis',\n 'Gestational Trophoblastic Disease': 'Gestational Trophoblastic Neoplasms'\n }\ncond = {}\ncond_r = {}\nfor row in codecs.open('../data/condition_browse.txt','r','utf-8').readlines():\n row_id, trial_id, mesh_term = row.strip().split('|')\n if mesh_term in corrections: mesh_term = corrections[mesh_term]\n if mesh_term not in cond: cond[mesh_term] = []\n cond[mesh_term].append(trial_id)\n if trial_id not in cond_r: cond_r[trial_id] = []\n cond_r[trial_id].append(mesh_term)\n\nmesh_codes = {}\nmesh_codes_r = {}\nfor row in codecs.open('../data/mesh_thesaurus.txt','r','utf-8').readlines():\n row_id, mesh_id, mesh_term = row.strip().split('|')\n mesh_codes[mesh_id] = mesh_term\n if mesh_term not in mesh_codes_r: mesh_codes_r[mesh_term] = []\n mesh_codes_r[mesh_term].append(mesh_id)\n\n# limiting to conditions that appear in ten or more trials\ntop_cond = {c for c in cond if len(cond[c]) >= 10}\ntrials = {t for c in top_cond for t in cond[c]}", | |
"prompt_number": 2, | |
"outputs": [], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "trial_desc = {}\nfor row in codecs.open('../data/clinical_study.txt','r','utf-8').readlines():\n data = row.split('|')\n brief_desc, detail_desc = (data[9],\n data[10] if len(data[10]) > 50 else '')\n trial_desc[data[0]] = brief_desc, detail_desc\n\nto_classify = [t for t in trial_desc if t not in trials]", | |
"outputs": [], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "pickle.dump(trial_desc,open('../data/trial_desc.pkl','wb'))", | |
"outputs": [], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "trial_desc = pickle.load(open('../data/trial_desc.pkl','rb'))\nto_classify = [t for t in trial_desc if t not in trials]", | |
"prompt_number": 3, | |
"outputs": [], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "markdown", | |
"source": "### Analyze data" | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "print 'Total MeSH terms: %d' % len(cond)\nprint 'Total MeSH terms (level 1): %d' % len([mesh_codes[m] for m in set([mr[:3] for c in cond if c in mesh_codes_r for mr in mesh_codes_r[c]])])\nprint 'Total MeSH terms (level 2): %d' % len([mesh_codes[m] for m in set([mr[:7] for c in cond if c in mesh_codes_r for mr in mesh_codes_r[c]])])", | |
"outputs": [], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "markdown", | |
"source": "Create trial lookup for MeSH term hypernyms in the second level of the hierarchy" | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "cond_l2 = {}\nfor m in cond.keys():\n if m in mesh_codes_r:\n m_l2 = set([mr[:7] for mr in mesh_codes_r[m]])\n for l2 in m_l2:\n if l2 not in cond_l2: cond_l2[l2] = set()\n cond_l2[l2] |= set(cond[m])", | |
"outputs": [], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "markdown", | |
"source": "## Process text" | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "def process_text(text):\n return [word.lower() \n for sent in sent_tokenizer.tokenize(text) \n for word in nltk.word_tokenize(sent)\n if word.lower() not in stopset and\n sum(1 for char in word if char not in string.punctuation) > 0]", | |
"prompt_number": 34, | |
"outputs": [], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "cond_text = {cond: Counter([word\n for trial_id in cond_l2[cond] \n for desc in trial_desc[trial_id]\n if len(desc) > 0\n for word in process_text(desc)])\n for cond in cond_l2.keys()}", | |
"prompt_number": 20, | |
"outputs": [], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "total_text = sum(cond_text.values(),Counter())", | |
"prompt_number": 21, | |
"outputs": [], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "pickle.dump(cond_text,open('../data/mesh_level2_textcount.pkl','wb'))\npickle.dump(total_text,open('../data/mesh_level2_alltextcount.pkl','wb'))", | |
"prompt_number": 22, | |
"outputs": [], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "cond_text = pickle.load(open('../data/mesh_level2_textcount.pkl','rb'))\ntotal_text = pickle.load(open('../data/mesh_level2_alltextcount.pkl','rb'))", | |
"prompt_number": 4, | |
"outputs": [], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "markdown", | |
"source": "## Building series of individual level-2 MeSH classifiers" | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "# initializing values\nmesh_models = {}\n\ntotal_text_keys, total_text_values = zip(*[(k, v)\n for k, v in total_text.items() \n if len(k) > 2 and sum([1 \n for char in k \n if char not in '1234567890']) > 0])\n\nother_text_len = sum(total_text_values)", | |
"prompt_number": 19, | |
"outputs": [], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "i = len(mesh_models) + 1\n\nfor c in cond_text.keys():\n if c not in mesh_models and len(c) > 3:\n # get total number of words for that term and for everything else that isn't that term\n cond_text_len = sum([v \n for k, v in cond_text[c].items() \n if len(k) > 2 and sum([1 \n for char in k \n if char not in '1234567890']) > 0])\n cur_other_text_len = other_text_len - cond_text_len\n \n # create set of tuples (term % of target MeSH descriptor text, term % of other MeSH descriptor text)\n vecs = [(cond_text[c][t] / cond_text_len, (total_text[t] - cond_text[c][t]) / cur_other_text_len)\n for t in total_text.keys()\n if len(t) > 2 and sum([1\n for char in t\n if char not in '1234567890']) > 0]\n\n # fit logistic model\n model = LogisticRegression()\n mesh_models[c] = model.fit(zip(*vecs),[1,0])\n\n print '%-3d %s (%s)' % (i, c, mesh_codes[c])\n i += 1", | |
"outputs": [], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "pickle.dump(mesh_models,open('../data/mesh_models_series.pkl','wb'))", | |
"prompt_number": 29, | |
"outputs": [], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "mesh_models = pickle.load(open('../data/mesh_models_series.pkl','rb'))", | |
"outputs": [], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "markdown", | |
"source": "Applying models to each unclassified trial" | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "classify_text = {trial_id: Counter([word\n for desc in trial_desc[trial_id]\n if len(desc) > 0\n for word in process_text(desc)])\n for trial_id in to_classify}", | |
"prompt_number": 35, | |
"outputs": [], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "guesses = {}\ntotal_text_keys, total_text_values = zip(*[(k, v)\n for k, v in total_text.items() \n if len(k) > 2 and sum([1 \n for char in k \n if char not in '1234567890']) > 0])\n\nother_text_len = sum(total_text_values)", | |
"prompt_number": 72, | |
"outputs": [], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "i = len(guesses) + 1\n\nfor c in classify_text.keys():\n if c not in guesses:\n text_len = sum([v\n for k, v in classify_text[c].items()\n if len(k) > 2 and sum([1\n for char in k\n if char not in '1234567890']) > 0])\n \n if text_len > 0:\n # create set of tuples (term % of target descriptor text, term % of other MeSH descriptor text)\n vecs = [classify_text[c][t] / text_len\n for t in total_text.keys()\n if len(t) > 2 and sum([1\n for char in t\n if char not in '1234567890']) > 0]\n\n # predict logistic models\n predictions = {}\n for term, model in mesh_models.items():\n predictions[term] = model.predict_proba(vecs)[0][1]\n\n guesses[c] = predictions\n\n i += 1\n if i % 10 == 0: print i", | |
"outputs": [], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "pickle.dump(guesses,open('../data/mesh_guesses.pkl','wb'))", | |
"prompt_number": 128, | |
"outputs": [], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "markdown", | |
"source": "## Single-prediction maxent classifier" | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "cond_text = {c: ' '.join(' '.join(trial_desc[t]) for t in cond[c])\n for c in top_cond}", | |
"prompt_number": 129, | |
"outputs": [], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "tfidf = TfidfVectorizer(stop_words=stopset)\ntrain_mat = tfidf.fit_transform(cond_text.values())\napply_mat = tfidf.transform(' '.join(trial_desc[t]) for t in to_classify)", | |
"prompt_number": 130, | |
"outputs": [], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "model = LogisticRegression()\nmodel.fit(train_mat,cond_text.keys())\nsingle_preds = dict(zip(to_classify,model.predict(apply_mat)))", | |
"prompt_number": 131, | |
"outputs": [], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "pickle.dump(single_preds,open('../data/mesh_guesses_maxent.pkl','wb'))", | |
"prompt_number": 132, | |
"outputs": [], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "markdown", | |
"source": "## K Nearest Neighbors suggestions" | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "trial_text = {t: ' '.join(trial_desc[t])\n for t in trials \n if len(trial_desc[t][0] + trial_desc[t][1]) > 50}\ntrial_text_other = {t: ' '.join(trial_desc[t]) \n for t in to_classify\n if len(trial_desc[t][0] + trial_desc[t][1]) > 50}", | |
"prompt_number": 4, | |
"outputs": [], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "tfidf = TfidfVectorizer(stop_words=stopset)\ntrain_mat = tfidf.fit_transform(trial_text.values())\napply_mat = tfidf.transform(trial_text_other.values())", | |
"prompt_number": 5, | |
"outputs": [], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "from sklearn.neighbors import NearestNeighbors\nneigh = NearestNeighbors(n_neighbors=10,radius=5)\nneigh.fit(train_mat)", | |
"prompt_number": 6, | |
"outputs": [ | |
{ | |
"output_type": "pyout", | |
"prompt_number": 6, | |
"metadata": {}, | |
"text": "NearestNeighbors(algorithm='auto', leaf_size=30, metric='minkowski',\n n_neighbors=10, radius=5)" | |
} | |
], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "knn_guesses = {}", | |
"prompt_number": 7, | |
"outputs": [], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "for i in range(len(trial_text_other.keys())):\n trial_id = trial_text_other.keys()[i]\n if trial_id not in knn_guesses:\n dist, idx = (arr.flatten() for arr in neigh.kneighbors(apply_mat[i]))\n\n this_guess = {}\n for j in range(len(idx)):\n k_trial_id = trial_text.keys()[idx[j]]\n for mterm in cond_r[k_trial_id]:\n if mterm not in this_guess: this_guess[mterm] = []\n this_guess[mterm].append(dist[j])\n\n knn_guesses[trial_id] = this_guess\n if i % 100 == 0: print i, datetime.datetime.now().time()", | |
"outputs": [], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "pickle.dump(knn_guesses,open('../data/mesh_guesses_knn.pkl','wb'))", | |
"prompt_number": 11, | |
"outputs": [], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"input": "", | |
"outputs": [], | |
"language": "python", | |
"trusted": true, | |
"collapsed": false | |
} | |
], | |
"metadata": {} | |
} | |
], | |
"metadata": { | |
"name": "", | |
"signature": "sha256:9f33c580da651759059a04c2ac643a0bcbdf96caa178ecfa467e9f54fd5754be" | |
}, | |
"nbformat": 3 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment