cmgerber/Finding_criteria_concepts_and_terms.ipynb

## Finding_criteria_concepts_and_terms.ipynb
{
 "worksheets": [
  {
   "cells": [
    {
     "metadata": {},
     "cell_type": "code",
     "input": "import pandas as pd\nimport nltk\nimport codecs\nimport unicodedata\nimport re\nfrom copy import deepcopy\nfrom pyUtil import easyPickle as pickle\nfrom pyUtil import flattenList as flatten",
     "prompt_number": 2,
     "outputs": [],
     "language": "python",
     "trusted": true,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "heading",
     "source": "Data Input",
     "level": 3
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "criteria_text = codecs.open('data/ct_criteria_colin.txt',\n                            encoding=\"utf-8\")\ncriteria_text = criteria_text.readlines()",
     "prompt_number": 7,
     "outputs": [],
     "language": "python",
     "trusted": true,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "heading",
     "source": "Chunck sentences and tokens",
     "level": 3
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "#break sentences on '-'\ncriteria_text_sent = [re.split(' - ', line) for line in criteria_text]\n\n#get sentence tokenizer\nsent_tokenizer=nltk.data.load('tokenizers/punkt/english.pickle')\n\n#run the sentence tokenizer over all the documents\ndef sent_token(text):\n    sentence_groups = []\n    for sent_group in text:\n        group_holder = []\n        for sent in sent_group:\n            group_holder.append(sent_tokenizer.tokenize(sent))\n        sentence_groups.append(group_holder)\n        del group_holder\n    return sentence_groups\n\ncriteria_text_sent = sent_token(criteria_text_sent)\n\n\n#Flatten the documents to contain just a list of strings where each string is a sentence\ndef flatten_docs(text):\n    result = []\n    for doc in text:\n        result.append(flatten.flatten(doc))\n    return result\n\ncriteria_text_docs = flatten_docs(criteria_text_sent)\n\n#create a list of all sentences\ncriteria_text_sents = flatten.flatten(criteria_text_docs)\n\n#CREATING TOKENS\n\n#patter for tokenizing\npattern = r'''(?x)    # set flag to allow verbose regexps\n        ([A-Z]\\.)+        # abbreviations, e.g. U.S.A\n        | \\w+([-‘]\\w+)*        # words with optional internal hyphens\n        | \\$?\\d+(\\.\\d+)?%?  # currency and percentages, e.g. $12.40, 82%\n        | \\.\\.\\.            # ellipsis...   \n        | [][.,;\"'?():\\-_`]+  # these are separate tokens\n        '''\n#create tokens for the sentence list\ncriteria_text_sent_tokens = [nltk.regexp_tokenize(sent, pattern) for sent\n                         in criteria_text_sents]\n\n#use this for creating tokens for the documents\ndef doc_token(text):\n    result = []\n    for doc in text:\n        doc_text = []\n        for sent in doc:\n            doc_text.append(nltk.regexp_tokenize(sent, pattern))\n        result.append(doc_text)\n    return result\n#criteria_text_docs_token = doc_token(criteria_text_docs)\n\n",
     "prompt_number": 8,
     "outputs": [],
     "language": "python",
     "trusted": true,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "heading",
     "source": "Tag tokens",
     "level": 3
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "#tag document structured criteria text\ndef doc_tagger_pos(text):\n    result = []\n    for doc in text:\n        doc_text = []\n        for sent in doc:\n            doc_text.append(nltk.pos_tag(sent))\n        result.append(doc_text)\n    return result\n\n#criteria_text_docs_tagged_pos = doc_tagger_pos(criteria_text_docs_token)\n\n#tag sentence structured criteria text\ncriteria_text_sent_tag = []\nfor sent in criteria_text_sent_tokens:\n    criteria_text_sent_tag.append(nltk.pos_tag(sent))",
     "prompt_number": 9,
     "outputs": [],
     "language": "python",
     "trusted": true,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "heading",
     "source": "Save and load tagged corpus",
     "level": 3
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "#save tagged corpus\npickle.save_object(criteria_text_sent_tag,\n                   'data/criteria_corpus_pos_tagged.pkl')",
     "prompt_number": 10,
     "outputs": [],
     "language": "python",
     "trusted": true,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "#load tagged corpus\ncriteria_text_sent_tag = pickle.open_object('data/criteria_corpus_pos_tagged.pkl')",
     "prompt_number": 4,
     "outputs": [],
     "language": "python",
     "trusted": true,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "heading",
     "source": "Keyphrases for criteria",
     "level": 3
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "#imports\nfrom nltk.util import ngrams\nfrom nltk import FreqDist\nimport string\nfrom nltk.corpus import stopwords",
     "prompt_number": 5,
     "outputs": [],
     "language": "python",
     "trusted": true,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "#remove stopwords and punctuation\ndef remove_punct(text):\n    return [[word for word in sent if word[0] not in string.punctuation] for sent in text]\ndef remove_stop(text):\n    return [[word for word in sent if word.lower() not in stopwords.words('english')] for sent in text]\n\n#create non-tagged corpus\ncriteria_text_sent_tokens = [[w[0] for w in sent] for sent in criteria_text_sent_tag]\ncriteria_sents_no_stop = remove_punct(criteria_text_sent_tokens)\ncriteria_sents_no_stop = remove_stop(criteria_sents_no_stop)",
     "prompt_number": 13,
     "outputs": [],
     "language": "python",
     "trusted": true,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "heading",
     "source": "Chunker Approach",
     "level": 3
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "def get_specific_sent(text, spec_words):\n\n    specific_sents = []\n    spec_words = map(lambda x: x.lower(), spec_words)\n    for sent in text:\n        for word in sent:\n            if word[0].lower() in spec_words:\n                specific_sents.append(sent)\n                break\n    return specific_sents",
     "prompt_number": 33,
     "outputs": [],
     "language": "python",
     "trusted": true,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "#create subsection of sentences to run the chunker on that contain cerain phrases\nsmoker_list = ['Non-smoker', 'smoker']\nsmoker_sents = get_specific_sent(criteria_text_sent_tag, smoker_list)\npregnancy_list = ['Pregnancy', 'pregnant']\npregnancy_sents = get_specific_sent(criteria_text_sent_tag, pregnancy_list)\nbirth_control_list = ['Birth control', 'contraception']\nbirth_control_sents = get_specific_sent(criteria_text_sent_tag, birth_control_list)\ndrug_list = ['Illicit drugs', 'Alcohol abuse', 'illegal', 'illicit']\ndrug_sents = get_specific_sent(criteria_text_sent_tag, drug_list)\nheart_failure_list = ['Congestive Heart Failure', 'heart failure']\nheart_failure_sents = get_specific_sent(criteria_text_sent_tag, heart_failure_list)\nhiv_list = ['HIV', 'aids', 'human immunodeficiency virus']\nhiv_sents = get_specific_sent(criteria_text_sent_tag, hiv_list)\nallergy_list = ['Allergies', 'allergy', 'hypersensitivity']\nallergy_sents = get_specific_sent(criteria_text_sent_tag, allergy_list)",
     "prompt_number": 37,
     "outputs": [],
     "language": "python",
     "trusted": true,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "term_sents_list = [smoker_sents, pregnancy_sents, birth_control_sents, drug_sents,\n                   heart_failure_sents, hiv_sents, allergy_sents]\nterm_list = [smoker_list, pregnancy_list, birth_control_list, drug_list, heart_failure_list,\n             hiv_list, allergy_list]",
     "prompt_number": 40,
     "outputs": [],
     "language": "python",
     "trusted": true,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "#get chunks\ndef chunker(tagged_corpus, chunk_reg):\n    \n    cp = nltk.RegexpParser(chunk_reg)\n    \n    results = []\n    \n    for sents in tagged_corpus:\n        tree = cp.parse(sents)\n        for subtree in tree.subtrees():\n            if subtree.label() == 'CHUNK':\n                results.append(subtree[:])\n    return results\n\nchunk_reg = r\"\"\"\n                  CHUNK: {(<NN.*><POS>)?<RB>?<JJ.*>*<NN.*>+}\n             \"\"\"\n\n\ndef get_doc_desc(num, terms, text):\n    print\n    print 'For terms: ' + ', '.join(terms)\n    for sent in [[word[0] for word in sent] for sent in text[:num]]:\n        print ' '.join(sent)\n    \n",
     "prompt_number": 30,
     "outputs": [],
     "language": "python",
     "trusted": true,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "for idx, term in enumerate(term_sents_list):\n    chunks_dict_criteria = chunker(term, chunk_reg)\n    get_doc_desc(20, term_list[idx], chunks_dict_criteria)",
     "prompt_number": 41,
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": "\nFor terms: Non-smoker, smoker\nNon-smoker\nCurrent smoker\nnicotine patches\ngum\nCurrent cigarette smoker\ncigarettes day\nsmoking\nyear\nCurrent smoker\nsubject\nsmoker\nuse\ntobacco\nnicotine\nproducts\nmonths\nScreening\nLDL\nrisk factor\nLDL\n\n"
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": "For terms: Pregnancy, pregnant\nPRIOR CONCURRENT THERAPY\nBiologic therapy\nPregnancy\nuse\ndouble barrier method\npregnancy\ne\ncondom\ndiaphragm\ncervical cap\npositive pregnancy test\nbreast feeding\nscreening\nWomen\nnegative pregnancy test\nBaseline Month\nFemale subjects\nnegative pregnancy\ntest\nchild\n\n"
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": "For terms: Birth control, contraception\nMust\nmethod\ncontraception\nstudy\nFemale subjects\nuse\neffective nonhormonal birth control methods\npracticing\nbirth control methods\ndays\nend\ntreatment period\nNote\nEstrogen-based hormonal contraception\nPrezista\ntrial\nwomen\nFertile patients\neffective contraception PRIOR CONCURRENT THERAPY\nBiologic therapy\n\nFor terms: Illicit drugs, Alcohol abuse, illegal, illicit\nKnown history\nalcohol abuse\nillicit drugs\nsteroids\nalcoholic beverages day\nAlcohol abuse\ndrug addiction\nuse\nillegal drugs\nhistory\ndrug abuse\nillicit drug use\nhistory\nalcohol abuse\ndaily consumption\nalcoholic drinks\nday\nyears\nalcohol\ndrugs\n\nFor terms: Congestive Heart Failure, heart failure\n\n"
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": "For terms: HIV, aids, human immunodeficiency virus\nHIV\nnegative test\nSubjects\nlaboratory abnormalities\nDivision\nAIDS Table\nGrading\nSeverity\nAdult\nPediatric Adverse Events (\" DAIDS grading table\naccordance\nnormal ranges\ntrial\nclinical laboratory\nSubjects\nHIV\nsigns\nactive Hepatitis B\nHepatitis C.\nmultiple sclerosis\n\n"
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": "For terms: Allergies, allergy, hypersensitivity\nKnown\nhypersensitivity\nvaccine components\nvaccine\nsame substances\neruptions\ndrug allergies\nfood allergy\neczema\npsoriasis\nurticaria\nopinion\ninvestigator\ncontraindication\nstudy enrollment\nclinically significant allergy\nhypersensitivity\nexcipients\nmedications\ntrial\n"
      }
     ],
     "language": "python",
     "trusted": true,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "heading",
     "source": "Ngram Approach",
     "level": 3
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "#look at multigrams as well for the specific sentences\ndef multinNgram(n, text):\n    '''This funciton loops through ngrams of length 1 to n.'''\n    text = remove_punct(text)\n    text = remove_stop(text)\n    result = {}\n    flat_list = flatten.flatten(text)\n    for num in range(n, 0, -1):\n        result[num] = []\n        ngram = ngrams(flat_list, num)\n        result[num] = [' '.join(gram) for gram in ngram]\n    return result\n\ndef get_top_mulitgrams(multiGrams, terms, num):\n    print 'For terms: ' + ', '.join(terms)\n    for ngram in multiGrams:\n        fd = FreqDist(multiGrams[ngram]).most_common(num)\n        for key in fd:\n            print key",
     "prompt_number": 43,
     "outputs": [],
     "language": "python",
     "trusted": true,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "for idx, term in enumerate(term_sents_list):\n    multiGrams = multinNgram(4, [[word[0] for word in sent] for sent in term])\n    get_top_mulitgrams(multiGrams, term_list[idx], 10)",
     "prompt_number": 45,
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": "For terms: Non-smoker, smoker\n(u'smoker', 35)\n(u'Current', 12)\n(u'history', 11)\n(u'years', 9)\n(u'day', 8)\n(u'pack', 8)\n(u'smoking', 8)\n(u'cigarettes', 8)\n(u'10', 7)\n(u'Non-smoker', 6)\n(u'Current smoker', 9)\n(u'pack years', 5)\n(u'Exclusion Criteria', 4)\n(u'per day', 4)\n(u'1 year', 4)\n(u'cigarettes day', 4)\n(u'5 cigarettes', 3)\n(u'smoker defined', 3)\n(u'defined smoked', 3)\n(u'6 months', 3)\n(u'smoker defined smoked', 3)\n(u'smoker Current smoker', 3)\n(u'preceding 1 year', 2)\n(u'6 months pack', 2)\n(u'Current cigarette smoker', 2)\n(u'year Current smoker', 2)\n(u'5 cigarettes day', 2)\n(u'defined smoked preceding', 2)\n(u'non-smoker 18 years', 2)\n(u'10 pack years', 2)\n(u'defined smoked preceding 1', 2)\n(u'smoked preceding 1 year', 2)\n(u'Current smoker defined smoked', 2)\n(u'smoker defined smoked preceding', 2)\n(u'18 years age older', 2)\n(u'non-smoker 18 years age', 2)\n(u'1 year Current smoker', 2)\n(u'within last 2 years', 1)\n(u'containing products within 6', 1)\n(u'within past year prior', 1)\nFor terms: Pregnancy, pregnant"
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": "\n(u'pregnant', 679)\n(u'pregnancy', 673)\n(u'test', 451)\n(u'study', 341)\n(u'potential', 336)\n(u'Pregnant', 314)\n(u'women', 314)\n(u'must', 267)\n(u'negative', 235)\n(u'lactating', 217)\n(u'pregnancy test', 430)\n(u'childbearing potential', 192)\n(u'pregnant nursing', 126)\n(u'must negative', 113)\n(u'become pregnant', 110)\n(u'urine pregnancy', 100)\n(u'pregnant lactating', 100)\n(u'breast feeding', 99)\n(u'negative pregnancy', 99)\n(u'potential must', 97)\n(u'urine pregnancy test', 95)\n(u'negative pregnancy test', 91)\n(u'Negative pregnancy test', 84)\n(u'potential must negative', 79)\n(u'childbearing potential must', 69)\n(u'serum pregnancy test', 63)\n(u'nursing Negative pregnancy', 59)\n(u'pregnant nursing Negative', 59)\n(u'Women childbearing potential', 50)\n(u'must negative pregnancy', 49)\n(u'nursing Negative pregnancy test', 59)"
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": "\n(u'pregnant nursing Negative pregnancy', 59)\n(u'childbearing potential must negative', 58)\n(u'must negative pregnancy test', 48)\n(u'negative urine pregnancy test', 46)\n(u'potential must negative pregnancy', 39)\n(u'Women childbearing potential must', 33)\n(u'negative serum pregnancy test', 31)\n(u'potential must negative serum', 21)\n(u'must negative serum pregnancy', 21)\nFor terms: Birth control, contraception"
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": "\n(u'contraception', 511)\n(u'use', 296)\n(u'must', 272)\n(u'study', 259)\n(u'potential', 203)\n(u'effective', 201)\n(u'patients', 135)\n(u'childbearing', 129)\n(u'method', 128)\n(u'least', 117)\n(u'effective contraception', 138)\n(u'use effective', 129)\n(u'childbearing potential', 118)\n(u'must use', 107)\n(u'patients must', 93)\n(u'method contraception', 83)\n(u'Fertile patients', 82)\n(u'adequate contraception', 72)\n(u'must agree', 63)\n(u'agree use', 59)\n(u'use effective contraception', 101)\n(u'must use effective', 89)\n(u'patients must use', 84)\n(u'Fertile patients must', 81)\n(u'must agree use', 51)\n(u'use adequate contraception', 37)\n(u'childbearing potential must', 36)\n(u'women childbearing potential', 30)\n(u'PRIOR CONCURRENT THERAPY', 28)\n(u'Women childbearing potential', 27)\n(u'patients must use effective', 81)\n(u'Fertile patients must use', 81)\n(u'must use effective contraception', 80)\n(u'PRIOR CONCURRENT THERAPY Biologic', 25)\n(u'must agree use adequate', 24)\n(u'CONCURRENT THERAPY Biologic therapy', 24)\n(u'agree use adequate contraception', 23)\n(u'contraception Fertile patients must', 18)\n(u'contraception PRIOR CONCURRENT THERAPY', 15)\n(u'use effective contraception PRIOR', 15)\nFor terms: Illicit drugs, Alcohol abuse, illegal, illicit"
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": "\n(u'illicit', 39)\n(u'drug', 29)\n(u'drugs', 28)\n(u'alcohol', 27)\n(u'use', 26)\n(u'abuse', 20)\n(u'within', 16)\n(u'study', 12)\n(u'months', 11)\n(u'history', 10)\n(u'illicit drug', 19)\n(u'illicit drugs', 17)\n(u'alcohol illicit', 11)\n(u'drug use', 11)\n(u'drug abuse', 9)\n(u'prior first', 6)\n(u'illegal drugs', 6)\n(u'first dose', 5)\n(u'days prior', 5)\n(u'history alcohol', 5)\n(u'illicit drug use', 10)\n(u'alcohol illicit drug', 8)\n(u'illicit drug abuse', 6)\n(u'first dose study', 5)\n(u'days prior first', 5)\n(u'dose study medication', 5)\n(u'use illicit drugs', 5)\n(u'prior first dose', 5)\n(u'illicit drugs alcohol', 4)\n(u'drug abuse within', 3)\n(u'prior first dose study', 5)\n(u'first dose study medication', 5)\n(u'days prior first dose', 4)\n(u'alcohol illicit drug abuse', 4)\n(u'illicit drug abuse within', 3)\n(u'Current use illicit drugs', 3)\n(u'alcohol illicit drug use', 3)\n(u'illicit drug use within', 2)\n(u'abuse drug addiction use', 2)\n(u'90 days prior first', 2)\nFor terms: Congestive Heart Failure, heart failure\nFor terms: HIV, aids, human immunodeficiency virus"
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": "\n(u'HIV', 517)\n(u'hepatitis', 163)\n(u'infection', 146)\n(u'B', 127)\n(u'positive', 126)\n(u'C', 121)\n(u'virus', 118)\n(u'immunodeficiency', 97)\n(u'human', 84)\n(u'Hepatitis', 73)\n(u'HIV infection', 84)\n(u'immunodeficiency virus', 81)\n(u'virus HIV', 80)\n(u'human immunodeficiency', 79)\n(u'hepatitis B', 79)\n(u'hepatitis C', 60)\n(u'HIV positive', 56)\n(u'Hepatitis B', 41)\n(u'B C', 33)\n(u'HIV hepatitis', 28)\n(u'immunodeficiency virus HIV', 78)\n(u'human immunodeficiency virus', 78)\n(u'B hepatitis C', 25)\n(u'hepatitis B hepatitis', 23)\n(u'B surface antigen', 22)\n(u'HIV hepatitis B', 22)\n(u'hepatitis B C', 21)\n(u'virus HIV infection', 17)\n(u'HIV Hepatitis B', 16)\n(u'Hepatitis B surface', 13)\n(u'human immunodeficiency virus HIV', 75)\n(u'hepatitis B hepatitis C', 23)\n(u'immunodeficiency virus HIV infection', 17)\n(u'immunodeficiency virus HIV positive', 13)\n(u'hepatitis B surface antigen', 11)\n(u'Hepatitis B surface antigen', 11)\n(u'immunodeficiency virus HIV hepatitis', 9)\n(u'positive human immunodeficiency virus', 9)\n(u'B hepatitis C HIV', 9)\n(u'Known human immunodeficiency virus', 8)\nFor terms: Allergies, allergy, hypersensitivity"
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": "\n(u'hypersensitivity', 331)\n(u'allergy', 272)\n(u'known', 157)\n(u'Known', 150)\n(u'history', 139)\n(u'study', 112)\n(u'History', 94)\n(u'drug', 88)\n(u'allergies', 76)\n(u'drugs', 70)\n(u'known hypersensitivity', 72)\n(u'Known hypersensitivity', 65)\n(u'Known allergy', 43)\n(u'History hypersensitivity', 35)\n(u'Patients known', 29)\n(u'known allergy', 28)\n(u'history allergy', 27)\n(u'allergy hypersensitivity', 23)\n(u'hypersensitivity reaction', 21)\n(u'study drug', 20)\n(u'Patients known hypersensitivity', 18)\n(u'PRIOR CONCURRENT THERAPY', 14)\n(u'CONCURRENT THERAPY Biologic', 11)\n(u'THERAPY Biologic therapy', 11)\n(u'history drug allergy', 10)\n(u'Known suspected allergy', 8)\n(u'history allergy hypersensitivity', 7)\n(u'Known suspected hypersensitivity', 7)\n(u'Patient known hypersensitivity', 7)\n(u'known suspected allergy', 7)\n(u'PRIOR CONCURRENT THERAPY Biologic', 11)\n(u'CONCURRENT THERAPY Biologic therapy', 11)\n(u'CHARACTERISTICS Age 18 Performance', 5)\n(u'PATIENT CHARACTERISTICS Age 18', 5)\n(u'drugs formulated polysorbate 80', 5)\n(u'Age 18 Performance status', 5)\n(u'times upper limit normal', 4)\n(u'sensitivity study medications components', 4)\n(u'least 100 000 mm3', 4)\n(u'study medications components thereof', 4)\n"
      }
     ],
     "language": "python",
     "trusted": true,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "multiGrams = multinNgram(4, [[word[0] for word in sent] for sent in fertile_sents])\nget_top_mulitgrams(multiGrams, fertile_terms, 10)",
     "prompt_number": 75,
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": "For terms: fertile\n(u'contraception', 101)\n(u'use', 100)\n(u'patients', 95)\n(u'effective', 92)\n(u'must', 91)\n(u'Fertile', 88)\n(u'least', 63)\n(u'therapy', 55)\n(u'study', 53)\n(u'prior', 47)\n(u'use effective', 88)\n(u'patients must', 84)\n(u'must use', 84)\n(u'Fertile patients', 82)\n(u'effective contraception', 76)\n(u'PRIOR CONCURRENT', 29)\n(u'CONCURRENT THERAPY', 29)\n(u'contraception Fertile', 29)\n(u'Biologic therapy', 25)\n(u'THERAPY Biologic', 25)\n(u'patients must use', 83)\n(u'must use effective', 82)\n(u'Fertile patients must', 81)\n(u'use effective contraception', 75)\n(u'PRIOR CONCURRENT THERAPY', 29)\n(u'contraception Fertile patients', 28)\n(u'effective contraception Fertile', 26)\n(u'CONCURRENT THERAPY Biologic', 25)\n(u'THERAPY Biologic therapy', 24)\n(u'contraception PRIOR CONCURRENT', 15)\n(u'patients must use effective', 82)\n(u'Fertile patients must use', 81)\n(u'must use effective contraception', 74)\n(u'contraception Fertile patients must', 28)\n(u'use effective contraception Fertile', 26)\n(u'effective contraception Fertile patients', 25)\n(u'PRIOR CONCURRENT THERAPY Biologic', 25)\n(u'CONCURRENT THERAPY Biologic therapy', 24)\n(u'contraception PRIOR CONCURRENT THERAPY', 15)\n(u'use effective contraception PRIOR', 15)\n"
      }
     ],
     "language": "python",
     "trusted": true,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "heading",
     "source": "Look at full sentences",
     "level": 3
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "def check_sents(text):\n    for sent in text:\n        print ' '.join([word[0] for word in sent])",
     "prompt_number": 77,
     "outputs": [],
     "language": "python",
     "trusted": true,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "check_sents(fertile_sents[:10])",
     "prompt_number": 79,
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": "Fertile patients must use effective contraception PRIOR CONCURRENT THERAPY : Biologic therapy\nFertile patients must use effective contraception PRIOR CONCURRENT THERAPY : Biologic therapy :\nFertile patients must use effective contraception during and for 4 weeks after study participation\nFertile patients must use effective contraception PRIOR CONCURRENT THERAPY : Biologic therapy :\nAgreement to use a condom , and with a fertile female partner , another form of contraception .\nDISEASE CHARACTERISTICS : Histologically proven epithelial adenocarcinoma of the ovary , fallopian tube , or peritoneum CA 125 greater than 35 U mL No conclusive radiological or clinical evidence of disease No disease recurrence Must have received only 1 prior platinum based chemotherapy regimen No tumors of low malignant potential or noninvasive disease PATIENT CHARACTERISTICS : Age : 18 and over Performance status : ECOG 0-2 Life expectancy : At least 6 months Hematopoietic : Hemoglobin at least 8 . 0 g dL Lymphocyte count at least 1 , 000 mm3 Neutrophil count at least 1 , 500 mm3 Platelet count at least 100 , 000 mm3 Hepatic : Bilirubin no greater than 1 . 5 times normal Renal : Creatinine no greater than 2 mg dL Cardiovascular : No uncontrolled hypertension No congestive heart failure No arrhythmias Other : Not pregnant or nursing Negative pregnancy test Fertile patients must use effective contraception No active autoimmune disease requiring chronic treatment No allergy to murine proteins No documented anaphylactic reaction to any drug No active infection causing fever No immunodeficiency disease No uncontrolled nonmalignant diseases No other malignancy ( except nonmelanomatous skin cancer or carcinoma in situ of the cervix ) unless curatively treated and free of disease for at least 5 years PRIOR CONCURRENT THERAPY : Biologic therapy : No prior murine monoclonal antibodies Chemotherapy : See Disease Characteristics At least 4 weeks since prior platinum based chemotherapy No concurrent chemotherapy Endocrine therapy : Not specified Radiotherapy : At least 6 months since prior limited field ( i . e ., abdominal or pelvic ) radiotherapy No prior whole abdominal radiotherapy Surgery : At least 4 weeks since prior surgery No prior splenectomy Other : At least 4 weeks since prior immunosuppressive drugs No concurrent immunosuppressive drugs At least 30 days since other prior investigational drugs\nPatients who are fertile must agree to use an effective method of contraception during participation in the study\nFertile patients must use effective contraception PRIOR CONCURRENT THERAPY : Biologic therapy :\nFertile patients must use effective contraception\nFertile patients must use effective barrier contraception during and for 3 months after study\n"
      }
     ],
     "language": "python",
     "trusted": true,
     "collapsed": false
    },
    {
     "metadata": {},
     "cell_type": "markdown",
     "source": "##chosen categories\n* Non-smoker\n* Pregnancy\n* Birth control\n* Illicit drugs/Alcohol abuse\n* Congestive Heart Failure\n* HIV\n* Allergies/hypersensitivity"
    },
    {
     "metadata": {},
     "cell_type": "code",
     "input": "",
     "outputs": [],
     "language": "python",
     "trusted": true,
     "collapsed": false
    }
   ],
   "metadata": {}
  }
 ],
 "metadata": {
  "name": "",
  "signature": "sha256:91f289de0a65b6e311810771200c5819dcfe865d8b586036a333bed091fa5d50"
 },
 "nbformat": 3
}