Skip to content

Instantly share code, notes, and snippets.

@jerielizabeth
Created January 27, 2017 22:42
Show Gist options
  • Save jerielizabeth/b64d07319bde08958e8dd7e7d15b97c7 to your computer and use it in GitHub Desktop.
iliff named entities
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"metadata": {},
"cell_type": "markdown",
"source": "Steps:\n+ load in file\n+ tokenize\n+ POS tag\n+ extract PERSON named entities\n+ get frequency of entities (? this could be complicated)\n+ tuple: file_id, entity, frequency"
},
{
"metadata": {
"ExecuteTime": {
"start_time": "2017-01-27T14:36:52.551647",
"end_time": "2017-01-27T14:36:53.870071"
},
"trusted": true,
"collapsed": false
},
"cell_type": "code",
"source": "# Using example at https://gist.github.com/onyxfish/322906\n\nimport nltk\nimport itertools",
"execution_count": 1,
"outputs": []
},
{
"metadata": {
"ExecuteTime": {
"start_time": "2017-01-27T14:36:53.871711",
"end_time": "2017-01-27T14:36:53.877018"
},
"trusted": true,
"collapsed": true
},
"cell_type": "code",
"source": "# Read the raw OCR text of one corpus document into memory as `content`.\nwith open(\"iliff_corpus/59_16_1_1.txt\") as infile:\n    content = infile.read()",
"execution_count": 2,
"outputs": []
},
{
"metadata": {
"ExecuteTime": {
"start_time": "2017-01-27T14:36:53.878582",
"end_time": "2017-01-27T14:36:53.953688"
},
"trusted": true,
"collapsed": false
},
"cell_type": "code",
"source": "# First, split the document into sentences with NLTK's sentence tokenizer;\n# show the first sentence to sanity-check the split.\nsentences = nltk.sent_tokenize(content)\nsentences[:1]",
"execution_count": 3,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "['Vneology, and the VUion of Qteatneââ HARVEY H. POTTHOFF THE general theme of this series of articles is \"Theology In A Space Age.\"']"
},
"metadata": {},
"execution_count": 3
}
]
},
{
"metadata": {
"ExecuteTime": {
"start_time": "2017-01-27T14:36:53.955135",
"end_time": "2017-01-27T14:36:54.059996"
},
"trusted": true,
"collapsed": false
},
"cell_type": "code",
"source": "# Then break each sentence into word tokens with NLTK's word tokenizer;\n# preview the tokens of the first sentence.\ntokenized_sentences = [nltk.word_tokenize(s) for s in sentences]\ntokenized_sentences[:1]",
"execution_count": 4,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "[['Vneology',\n ',',\n 'and',\n 'the',\n 'VUion',\n 'of',\n 'Qteatneââ',\n 'HARVEY',\n 'H.',\n 'POTTHOFF',\n 'THE',\n 'general',\n 'theme',\n 'of',\n 'this',\n 'series',\n 'of',\n 'articles',\n 'is',\n '``',\n 'Theology',\n 'In',\n 'A',\n 'Space',\n 'Age',\n '.',\n \"''\"]]"
},
"metadata": {},
"execution_count": 4
}
]
},
{
"metadata": {
"ExecuteTime": {
"start_time": "2017-01-27T14:36:54.063563",
"end_time": "2017-01-27T14:36:54.633692"
},
"trusted": true,
"collapsed": false
},
"cell_type": "code",
"source": "# Attach a part-of-speech tag to every token; preview the first tagged sentence.\ntagged_sentences = [nltk.pos_tag(s) for s in tokenized_sentences]\ntagged_sentences[:1]",
"execution_count": 5,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "[[('Vneology', 'NNP'),\n (',', ','),\n ('and', 'CC'),\n ('the', 'DT'),\n ('VUion', 'NNP'),\n ('of', 'IN'),\n ('Qteatneââ', 'NNP'),\n ('HARVEY', 'NNP'),\n ('H.', 'NNP'),\n ('POTTHOFF', 'NNP'),\n ('THE', 'NNP'),\n ('general', 'JJ'),\n ('theme', 'NN'),\n ('of', 'IN'),\n ('this', 'DT'),\n ('series', 'NN'),\n ('of', 'IN'),\n ('articles', 'NNS'),\n ('is', 'VBZ'),\n ('``', '``'),\n ('Theology', 'NNP'),\n ('In', 'IN'),\n ('A', 'NNP'),\n ('Space', 'NNP'),\n ('Age', 'NNP'),\n ('.', '.'),\n (\"''\", \"''\")]]"
},
"metadata": {},
"execution_count": 5
}
]
},
{
"metadata": {
"ExecuteTime": {
"start_time": "2017-01-27T14:36:54.635241",
"end_time": "2017-01-27T14:36:54.813813"
},
"trusted": true,
"collapsed": false
},
"cell_type": "code",
"source": "# Finally, run the named-entity \"chunker\" over the tagged sentences.\n# binary=False keeps the entity categories (PERSON, GPE, ORGANIZATION)\n# rather than collapsing everything to a single NE label.\nchunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=False)",
"execution_count": 6,
"outputs": []
},
{
"metadata": {
"ExecuteTime": {
"start_time": "2017-01-27T14:36:54.816032",
"end_time": "2017-01-27T14:36:54.825419"
},
"trusted": true,
"collapsed": true
},
"cell_type": "code",
"source": "def extract_entity_names(t):\n    \"\"\"Recursively collect PERSON named-entity strings from an NLTK chunk tree.\n\n    Subtrees labeled 'PERSON' contribute their leaf words joined into one\n    name string; any other subtree is searched recursively. Swap 'PERSON'\n    for 'GPE' (GeoPolitical Entity) or 'ORGANIZATION' to extract those\n    entity types instead. Returns a list of names (duplicates possible).\n    \"\"\"\n    entity_names = []\n    # Leaf nodes are plain (word, tag) tuples with no .label attribute;\n    # only Tree nodes carry a label, so hasattr distinguishes the two.\n    # NOTE: the original also truth-tested `t.label` (the bound method),\n    # which is always True and therefore a dead check -- removed.\n    if hasattr(t, 'label'):\n        if t.label() == 'PERSON':\n            # Join the subtree's leaf words into a single entity name.\n            entity_names.append(' '.join([child[0] for child in t]))\n        else:\n            for child in t:\n                entity_names.extend(extract_entity_names(child))\n\n    return entity_names",
"execution_count": 7,
"outputs": []
},
{
"metadata": {
"ExecuteTime": {
"start_time": "2017-01-27T14:36:54.827165",
"end_time": "2017-01-27T14:36:54.923457"
},
"trusted": true,
"collapsed": false,
"scrolled": false
},
"cell_type": "code",
"source": "\"\"\"Demonstration: extract PERSON entities from the first ten sentences.\"\"\"\n\nentity_names = []\n# chunked_sentences is a lazy generator, so islice consumes (and prints)\n# only the first 10 chunked sentence trees.\n# (The original comment said \"first five\" while the code takes 10 -- fixed.)\nfor tree in itertools.islice(chunked_sentences, 10):\n    print(tree)\n    entity_names.extend(extract_entity_names(tree))\n",
"execution_count": 8,
"outputs": [
{
"output_type": "stream",
"text": "(S\n (GPE Vneology/NNP)\n ,/,\n and/CC\n the/DT\n (ORGANIZATION VUion/NNP)\n of/IN\n (PERSON Qteatneââ/NNP HARVEY/NNP)\n H./NNP\n POTTHOFF/NNP\n THE/NNP\n general/JJ\n theme/NN\n of/IN\n this/DT\n series/NN\n of/IN\n articles/NNS\n is/VBZ\n ``/``\n Theology/NNP\n In/IN\n A/NNP\n Space/NNP\n Age/NNP\n ./.\n ''/'')\n(S\n Our/PRP$\n concern/NN\n is/VBZ\n not/RB\n simply/RB\n with/IN\n satellites/NNS\n and/CC\n space/NN\n rockets/NNS\n ,/,\n but/CC\n with/IN\n the/DT\n fact/NN\n that/IN\n much/JJ\n new/JJ\n information/NN\n concerning/VBG\n man/NN\n and/CC\n the/DT\n universe/NN\n has/VBZ\n become/VBN\n available/JJ\n to/TO\n us/PRP\n in/IN\n recent/JJ\n years/NNS\n ./.)\n(S\n One/CD\n scientist/NN\n writes/VBZ\n ,/,\n ``/``\n The/DT\n new/JJ\n knowledge/NN\n built/VBN\n up/RP\n by/IN\n the/DT\n world/NN\n 's/POS\n astronomers/NNS\n in/IN\n the/DT\n past/JJ\n forty/NN\n years/NNS\n is/VBZ\n many/JJ\n times/NNS\n that/IN\n of/IN\n all/DT\n time/NN\n before/IN\n ./.)\n(S\n ``/``\n 1/CD\n In/IN\n many/JJ\n fields/NNS\n men/NNS\n are/VBP\n rethinking/VBG\n basic/JJ\n presuppositions/NNS\n and/CC\n reformulating/VBG\n concepts/NNS\n with/IN\n amazing/JJ\n results/NNS\n ./.)\n(S\n (PERSON Someone/NN)\n asked/VBD\n (PERSON Einstein/NNP)\n how/WRB\n he/PRP\n discovered/VBD\n relativity/NN\n ./.)\n(S\n He/PRP\n replied/VBD\n ,/,\n ``/``\n I/PRP\n challenged/VBD\n an/DT\n axiom/NN\n ./.\n ''/'')\n(S\n In/IN\n this/DT\n series/NN\n of/IN\n discussions/NNS\n we/PRP\n are/VBP\n asking/VBG\n if/IN\n the/DT\n spirit/NN\n of/IN\n inquiry/NN\n and/CC\n our/PRP$\n new/JJ\n information/NN\n concerning/VBG\n man/NN\n and/CC\n the/DT\n universe/NN\n have/VBP\n implications/NNS\n for/IN\n religious/JJ\n life/NN\n and/CC\n thought/NN\n ./.)\n(S\n In/IN\n this/DT\n article/NN\n we/PRP\n shall/MD\n consider/VB\n certain/JJ\n historical/JJ\n backgrounds/NNS\n which/WDT\n are/VBP\n essential/JJ\n for/IN\n an/DT\n understanding/NN\n of/IN\n our/PRP$\n current/JJ\n theological/JJ\n 
situation/NN\n ./.)\n(S\n We/PRP\n shall/MD\n suggest/VB\n that/IN\n in/IN\n entering/VBG\n the/DT\n space/NN\n age/NN\n we/PRP\n are/VBP\n entering/VBG\n an/DT\n era/NN\n of/IN\n history/NN\n calling/VBG\n for/IN\n a/DT\n ``/``\n vision/NN\n of/IN\n greatness/NN\n ''/''\n in/IN\n religious/JJ\n life/NN\n and/CC\n thought/NN\n ./.)\n(S\n The/DT\n call/NN\n of/IN\n today/NN\n is/VBZ\n not/RB\n for/IN\n regression/NN\n ,/,\n but/CC\n for/IN\n creative/JJ\n ,/,\n imaginative/JJ\n advance/NN\n ./.)\n",
"name": "stdout"
}
]
},
{
"metadata": {
"ExecuteTime": {
"start_time": "2017-01-27T14:36:54.925622",
"end_time": "2017-01-27T14:36:54.932756"
},
"trusted": true,
"collapsed": false,
"scrolled": true
},
"cell_type": "code",
"source": "# Deduplicate the collected entity names before displaying them.\nunique_entities = set(entity_names)\nprint(unique_entities)",
"execution_count": 9,
"outputs": [
{
"output_type": "stream",
"text": "{'Someone', 'Einstein', 'Qteatneââ HARVEY'}\n",
"name": "stdout"
}
]
},
{
"metadata": {
"trusted": true,
"collapsed": true
},
"cell_type": "code",
"source": "",
"execution_count": null,
"outputs": []
}
],
"metadata": {
"kernelspec": {
"name": "python3",
"display_name": "Python 3",
"language": "python"
},
"language_info": {
"file_extension": ".py",
"nbconvert_exporter": "python",
"version": "3.5.2",
"mimetype": "text/x-python",
"pygments_lexer": "ipython3",
"codemirror_mode": {
"version": 3,
"name": "ipython"
},
"name": "python"
},
"gist": {
"id": "",
"data": {
"description": "iliff named entities",
"public": true
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment