Skip to content

Instantly share code, notes, and snippets.

@jasonost
Created December 16, 2014 01:31
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
Star You must be signed in to star a gist
Save jasonost/54efeeea6ec1f5bd58c9 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"worksheets": [
{
"cells": [
{
"metadata": {},
"cell_type": "markdown",
"source": "## Load modules"
},
{
"metadata": {},
"cell_type": "code",
"input": "from __future__ import division\n\n# basic NLP\nimport nltk, codecs, string, random, math, cPickle as pickle, re, datetime\nfrom collections import Counter\n\n# scikit-learn\nfrom sklearn.feature_extraction.text import TfidfVectorizer\nfrom sklearn.linear_model import LogisticRegression\nimport numpy as np\nfrom sklearn.metrics.pairwise import linear_kernel\n\nsent_tokenizer=nltk.data.load('tokenizers/punkt/english.pickle')\nstopset = set(nltk.corpus.stopwords.words('english'))",
"prompt_number": 1,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "markdown",
"source": "## Load data"
},
{
"metadata": {},
"cell_type": "code",
"input": "corrections = {\"Sarcoma, Ewing's\": 'Sarcoma, Ewing',\n 'Beta-Thalassemia': 'beta-Thalassemia',\n 'Von Willebrand Disease, Type 3': 'von Willebrand Disease, Type 3',\n 'Von Willebrand Disease, Type 2': 'von Willebrand Disease, Type 2',\n 'Von Willebrand Disease, Type 1': 'von Willebrand Disease, Type 1',\n \"Felty's Syndrome\": 'Felty Syndrome',\n 'Von Hippel-Lindau Disease': 'von Hippel-Lindau Disease',\n 'Retrognathism': 'Retrognathia',\n 'Regurgitation, Gastric': 'Laryngopharyngeal Reflux',\n 'Persistent Hyperinsulinemia Hypoglycemia of Infancy': 'Congenital Hyperinsulinism',\n 'Von Willebrand Diseases': 'von Willebrand Diseases',\n 'Pontine Glioma': 'Brain Stem Neoplasms',\n 'Mental Retardation': 'Intellectual Disability',\n 'Overdose': 'Drug Overdose',\n 'Beta-Mannosidosis': 'beta-Mannosidosis',\n 'Alpha 1-Antitrypsin Deficiency': 'alpha 1-Antitrypsin Deficiency',\n 'Intervertebral Disk Displacement': 'Intervertebral Disc Displacement',\n 'Alpha-Thalassemia': 'alpha-Thalassemia',\n 'Mycobacterium Infections, Atypical': 'Mycobacterium Infections, Nontuberculous',\n 'Legg-Perthes Disease': 'Legg-Calve-Perthes Disease',\n 'Intervertebral Disk Degeneration': 'Intervertebral Disc Degeneration',\n 'Alpha-Mannosidosis': 'alpha-Mannosidosis',\n 'Gestational Trophoblastic Disease': 'Gestational Trophoblastic Neoplasms'\n }\ncond = {}\ncond_r = {}\nfor row in codecs.open('../data/condition_browse.txt','r','utf-8').readlines():\n row_id, trial_id, mesh_term = row.strip().split('|')\n if mesh_term in corrections: mesh_term = corrections[mesh_term]\n if mesh_term not in cond: cond[mesh_term] = []\n cond[mesh_term].append(trial_id)\n if trial_id not in cond_r: cond_r[trial_id] = []\n cond_r[trial_id].append(mesh_term)\n\nmesh_codes = {}\nmesh_codes_r = {}\nfor row in codecs.open('../data/mesh_thesaurus.txt','r','utf-8').readlines():\n row_id, mesh_id, mesh_term = row.strip().split('|')\n mesh_codes[mesh_id] = mesh_term\n if mesh_term not in mesh_codes_r: mesh_codes_r[mesh_term] = []\n mesh_codes_r[mesh_term].append(mesh_id)\n\n# limiting to conditions that appear in ten or more trials\ntop_cond = {c for c in cond if len(cond[c]) >= 10}\ntrials = {t for c in top_cond for t in cond[c]}",
"prompt_number": 2,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "trial_desc = {}\nfor row in codecs.open('../data/clinical_study.txt','r','utf-8').readlines():\n data = row.split('|')\n brief_desc, detail_desc = (data[9],\n data[10] if len(data[10]) > 50 else '')\n trial_desc[data[0]] = brief_desc, detail_desc\n\nto_classify = [t for t in trial_desc if t not in trials]",
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "pickle.dump(trial_desc,open('../data/trial_desc.pkl','wb'))",
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "trial_desc = pickle.load(open('../data/trial_desc.pkl','rb'))\nto_classify = [t for t in trial_desc if t not in trials]",
"prompt_number": 3,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "markdown",
"source": "### Analyze data"
},
{
"metadata": {},
"cell_type": "code",
"input": "print 'Total MeSH terms: %d' % len(cond)\nprint 'Total MeSH terms (level 1): %d' % len([mesh_codes[m] for m in set([mr[:3] for c in cond if c in mesh_codes_r for mr in mesh_codes_r[c]])])\nprint 'Total MeSH terms (level 2): %d' % len([mesh_codes[m] for m in set([mr[:7] for c in cond if c in mesh_codes_r for mr in mesh_codes_r[c]])])",
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "markdown",
"source": "Create trial lookup for MeSH term hypernyms in the second level of the hierarchy"
},
{
"metadata": {},
"cell_type": "code",
"input": "cond_l2 = {}\nfor m in cond.keys():\n if m in mesh_codes_r:\n m_l2 = set([mr[:7] for mr in mesh_codes_r[m]])\n for l2 in m_l2:\n if l2 not in cond_l2: cond_l2[l2] = set()\n cond_l2[l2] |= set(cond[m])",
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "markdown",
"source": "## Process text"
},
{
"metadata": {},
"cell_type": "code",
"input": "def process_text(text):\n return [word.lower() \n for sent in sent_tokenizer.tokenize(text) \n for word in nltk.word_tokenize(sent)\n if word.lower() not in stopset and\n sum(1 for char in word if char not in string.punctuation) > 0]",
"prompt_number": 34,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "cond_text = {cond: Counter([word\n for trial_id in cond_l2[cond] \n for desc in trial_desc[trial_id]\n if len(desc) > 0\n for word in process_text(desc)])\n for cond in cond_l2.keys()}",
"prompt_number": 20,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "total_text = sum(cond_text.values(),Counter())",
"prompt_number": 21,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "pickle.dump(cond_text,open('../data/mesh_level2_textcount.pkl','wb'))\npickle.dump(total_text,open('../data/mesh_level2_alltextcount.pkl','wb'))",
"prompt_number": 22,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "cond_text = pickle.load(open('../data/mesh_level2_textcount.pkl','rb'))\ntotal_text = pickle.load(open('../data/mesh_level2_alltextcount.pkl','rb'))",
"prompt_number": 4,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "markdown",
"source": "## Building series of individual level-2 MeSH classifiers"
},
{
"metadata": {},
"cell_type": "code",
"input": "# initializing values\nmesh_models = {}\n\ntotal_text_keys, total_text_values = zip(*[(k, v)\n for k, v in total_text.items() \n if len(k) > 2 and sum([1 \n for char in k \n if char not in '1234567890']) > 0])\n\nother_text_len = sum(total_text_values)",
"prompt_number": 19,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "i = len(mesh_models) + 1\n\nfor c in cond_text.keys():\n if c not in mesh_models and len(c) > 3:\n # get total number of words for that term and for everything else that isn't that term\n cond_text_len = sum([v \n for k, v in cond_text[c].items() \n if len(k) > 2 and sum([1 \n for char in k \n if char not in '1234567890']) > 0])\n cur_other_text_len = other_text_len - cond_text_len\n \n # create set of tuples (term % of target MeSH descriptor text, term % of other MeSH descriptor text)\n vecs = [(cond_text[c][t] / cond_text_len, (total_text[t] - cond_text[c][t]) / cur_other_text_len)\n for t in total_text.keys()\n if len(t) > 2 and sum([1\n for char in t\n if char not in '1234567890']) > 0]\n\n # fit logistic model\n model = LogisticRegression()\n mesh_models[c] = model.fit(zip(*vecs),[1,0])\n\n print '%-3d %s (%s)' % (i, c, mesh_codes[c])\n i += 1",
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "pickle.dump(mesh_models,open('../data/mesh_models_series.pkl','wb'))",
"prompt_number": 29,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "mesh_models = pickle.load(open('../data/mesh_models_series.pkl','rb'))",
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "markdown",
"source": "Applying models to each unclassified trial"
},
{
"metadata": {},
"cell_type": "code",
"input": "classify_text = {trial_id: Counter([word\n for desc in trial_desc[trial_id]\n if len(desc) > 0\n for word in process_text(desc)])\n for trial_id in to_classify}",
"prompt_number": 35,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "guesses = {}\ntotal_text_keys, total_text_values = zip(*[(k, v)\n for k, v in total_text.items() \n if len(k) > 2 and sum([1 \n for char in k \n if char not in '1234567890']) > 0])\n\nother_text_len = sum(total_text_values)",
"prompt_number": 72,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "i = len(guesses) + 1\n\nfor c in classify_text.keys():\n if c not in guesses:\n text_len = sum([v\n for k, v in classify_text[c].items()\n if len(k) > 2 and sum([1\n for char in k\n if char not in '1234567890']) > 0])\n \n if text_len > 0:\n # create set of tuples (term % of target descriptor text, term % of other MeSH descriptor text)\n vecs = [classify_text[c][t] / text_len\n for t in total_text.keys()\n if len(t) > 2 and sum([1\n for char in t\n if char not in '1234567890']) > 0]\n\n # predict logistic models\n predictions = {}\n for term, model in mesh_models.items():\n predictions[term] = model.predict_proba(vecs)[0][1]\n\n guesses[c] = predictions\n\n i += 1\n if i % 10 == 0: print i",
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "pickle.dump(guesses,open('../data/mesh_guesses.pkl','wb'))",
"prompt_number": 128,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "markdown",
"source": "## Single-prediction maxent classifier"
},
{
"metadata": {},
"cell_type": "code",
"input": "cond_text = {c: ' '.join(' '.join(trial_desc[t]) for t in cond[c])\n for c in top_cond}",
"prompt_number": 129,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "tfidf = TfidfVectorizer(stop_words=stopset)\ntrain_mat = tfidf.fit_transform(cond_text.values())\napply_mat = tfidf.transform(' '.join(trial_desc[t]) for t in to_classify)",
"prompt_number": 130,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "model = LogisticRegression()\nmodel.fit(train_mat,cond_text.keys())\nsingle_preds = dict(zip(to_classify,model.predict(apply_mat)))",
"prompt_number": 131,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "pickle.dump(single_preds,open('../data/mesh_guesses_maxent.pkl','wb'))",
"prompt_number": 132,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "markdown",
"source": "## K Nearest Neighbors suggestions"
},
{
"metadata": {},
"cell_type": "code",
"input": "trial_text = {t: ' '.join(trial_desc[t])\n for t in trials \n if len(trial_desc[t][0] + trial_desc[t][1]) > 50}\ntrial_text_other = {t: ' '.join(trial_desc[t]) \n for t in to_classify\n if len(trial_desc[t][0] + trial_desc[t][1]) > 50}",
"prompt_number": 4,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "tfidf = TfidfVectorizer(stop_words=stopset)\ntrain_mat = tfidf.fit_transform(trial_text.values())\napply_mat = tfidf.transform(trial_text_other.values())",
"prompt_number": 5,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "from sklearn.neighbors import NearestNeighbors\nneigh = NearestNeighbors(n_neighbors=10,radius=5)\nneigh.fit(train_mat)",
"prompt_number": 6,
"outputs": [
{
"output_type": "pyout",
"prompt_number": 6,
"metadata": {},
"text": "NearestNeighbors(algorithm='auto', leaf_size=30, metric='minkowski',\n n_neighbors=10, radius=5)"
}
],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "knn_guesses = {}",
"prompt_number": 7,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "for i in range(len(trial_text_other.keys())):\n trial_id = trial_text_other.keys()[i]\n if trial_id not in knn_guesses:\n dist, idx = (arr.flatten() for arr in neigh.kneighbors(apply_mat[i]))\n\n this_guess = {}\n for j in range(len(idx)):\n k_trial_id = trial_text.keys()[idx[j]]\n for mterm in cond_r[k_trial_id]:\n if mterm not in this_guess: this_guess[mterm] = []\n this_guess[mterm].append(dist[j])\n\n knn_guesses[trial_id] = this_guess\n if i % 100 == 0: print i, datetime.datetime.now().time()",
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "pickle.dump(knn_guesses,open('../data/mesh_guesses_knn.pkl','wb'))",
"prompt_number": 11,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "",
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
}
],
"metadata": {}
}
],
"metadata": {
"name": "",
"signature": "sha256:9f33c580da651759059a04c2ac643a0bcbdf96caa178ecfa467e9f54fd5754be"
},
"nbformat": 3
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment