Skip to content

Instantly share code, notes, and snippets.

@jerielizabeth
Created January 27, 2017 22:42
Show Gist options
  • Save jerielizabeth/b64d07319bde08958e8dd7e7d15b97c7 to your computer and use it in GitHub Desktop.
iliff named entities
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"metadata": {},
"cell_type": "markdown",
"source": "Steps:\n+ load in file\n+ tokenize\n+ POS tag\n+ extract PERSON named entities\n+ get frequency of entities (? this could be complicated)\n+ tuple: file_id, entity, frequency"
},
{
"metadata": {
"ExecuteTime": {
"start_time": "2017-01-27T14:36:52.551647",
"end_time": "2017-01-27T14:36:53.870071"
},
"trusted": true,
"collapsed": false
},
"cell_type": "code",
"source": "# Using example at https://gist.github.com/onyxfish/322906\n\nimport nltk\nimport itertools",
"execution_count": 1,
"outputs": []
},
{
"metadata": {
"ExecuteTime": {
"start_time": "2017-01-27T14:36:53.871711",
"end_time": "2017-01-27T14:36:53.877018"
},
"trusted": true,
"collapsed": true
},
"cell_type": "code",
"source": "# Read the raw OCR text of one corpus document into memory as `content`.\nwith open(\"iliff_corpus/59_16_1_1.txt\") as infile:\n    content = infile.read()",
"execution_count": 2,
"outputs": []
},
{
"metadata": {
"ExecuteTime": {
"start_time": "2017-01-27T14:36:53.878582",
"end_time": "2017-01-27T14:36:53.953688"
},
"trusted": true,
"collapsed": false
},
"cell_type": "code",
"source": "# First, split the document into sentences with NLTK's sentence tokenizer;\n# show the first sentence to sanity-check the split.\nsentences = nltk.sent_tokenize(content)\nsentences[:1]",
"execution_count": 3,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "['Vneology, and the VUion of Qteatneââ HARVEY H. POTTHOFF THE general theme of this series of articles is \"Theology In A Space Age.\"']"
},
"metadata": {},
"execution_count": 3
}
]
},
{
"metadata": {
"ExecuteTime": {
"start_time": "2017-01-27T14:36:53.955135",
"end_time": "2017-01-27T14:36:54.059996"
},
"trusted": true,
"collapsed": false
},
"cell_type": "code",
"source": "# Then break each sentence into word tokens with NLTK's word tokenizer;\n# preview the tokens of the first sentence.\ntokenized_sentences = [nltk.word_tokenize(s) for s in sentences]\ntokenized_sentences[:1]",
"execution_count": 4,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "[['Vneology',\n ',',\n 'and',\n 'the',\n 'VUion',\n 'of',\n 'Qteatneââ',\n 'HARVEY',\n 'H.',\n 'POTTHOFF',\n 'THE',\n 'general',\n 'theme',\n 'of',\n 'this',\n 'series',\n 'of',\n 'articles',\n 'is',\n '``',\n 'Theology',\n 'In',\n 'A',\n 'Space',\n 'Age',\n '.',\n \"''\"]]"
},
"metadata": {},
"execution_count": 4
}
]
},
{
"metadata": {
"ExecuteTime": {
"start_time": "2017-01-27T14:36:54.063563",
"end_time": "2017-01-27T14:36:54.633692"
},
"trusted": true,
"collapsed": false
},
"cell_type": "code",
"source": "# Attach a part-of-speech tag to every token; preview the first tagged sentence.\ntagged_sentences = [nltk.pos_tag(s) for s in tokenized_sentences]\ntagged_sentences[:1]",
"execution_count": 5,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "[[('Vneology', 'NNP'),\n (',', ','),\n ('and', 'CC'),\n ('the', 'DT'),\n ('VUion', 'NNP'),\n ('of', 'IN'),\n ('Qteatneââ', 'NNP'),\n ('HARVEY', 'NNP'),\n ('H.', 'NNP'),\n ('POTTHOFF', 'NNP'),\n ('THE', 'NNP'),\n ('general', 'JJ'),\n ('theme', 'NN'),\n ('of', 'IN'),\n ('this', 'DT'),\n ('series', 'NN'),\n ('of', 'IN'),\n ('articles', 'NNS'),\n ('is', 'VBZ'),\n ('``', '``'),\n ('Theology', 'NNP'),\n ('In', 'IN'),\n ('A', 'NNP'),\n ('Space', 'NNP'),\n ('Age', 'NNP'),\n ('.', '.'),\n (\"''\", \"''\")]]"
},
"metadata": {},
"execution_count": 5
}
]
},
{
"metadata": {
"ExecuteTime": {
"start_time": "2017-01-27T14:36:54.635241",
"end_time": "2017-01-27T14:36:54.813813"
},
"trusted": true,
"collapsed": false
},
"cell_type": "code",
"source": "# Finally, run the named-entity \"chunker\" over the tagged sentences.\n# binary=False keeps the entity categories (PERSON, GPE, ORGANIZATION)\n# rather than collapsing everything to a single NE label.\nchunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=False)",
"execution_count": 6,
"outputs": []
},
{
"metadata": {
"ExecuteTime": {
"start_time": "2017-01-27T14:36:54.816032",
"end_time": "2017-01-27T14:36:54.825419"
},
"trusted": true,
"collapsed": true
},
"cell_type": "code",
"source": "def extract_entity_names(t):\n    \"\"\"Recursively collect PERSON named-entity strings from an NLTK chunk tree.\n\n    Subtrees labeled 'PERSON' contribute their leaf words joined into one\n    name string; any other subtree is searched recursively. Swap 'PERSON'\n    for 'GPE' (GeoPolitical Entity) or 'ORGANIZATION' to extract those\n    entity types instead. Returns a list of names (duplicates possible).\n    \"\"\"\n    entity_names = []\n    # Leaf nodes are plain (word, tag) tuples with no .label attribute;\n    # only Tree nodes carry a label, so hasattr distinguishes the two.\n    # NOTE: the original also truth-tested `t.label` (the bound method),\n    # which is always True and therefore a dead check -- removed.\n    if hasattr(t, 'label'):\n        if t.label() == 'PERSON':\n            # Join the subtree's leaf words into a single entity name.\n            entity_names.append(' '.join([child[0] for child in t]))\n        else:\n            for child in t:\n                entity_names.extend(extract_entity_names(child))\n\n    return entity_names",
"execution_count": 7,
"outputs": []
},
{
"metadata": {
"ExecuteTime": {
"start_time": "2017-01-27T14:36:54.827165",
"end_time": "2017-01-27T14:36:54.923457"
},
"trusted": true,
"collapsed": false,
"scrolled": false
},
"cell_type": "code",
"source": "\"\"\"Demonstration: extract PERSON entities from the first ten sentences.\"\"\"\n\nentity_names = []\n# chunked_sentences is a lazy generator, so islice consumes (and prints)\n# only the first 10 chunked sentence trees.\n# (The original comment said \"first five\" while the code takes 10 -- fixed.)\nfor tree in itertools.islice(chunked_sentences, 10):\n    print(tree)\n    entity_names.extend(extract_entity_names(tree))\n",
"execution_count": 8,
"outputs": [
{
"output_type": "stream",
"text": "(S\n (GPE Vneology/NNP)\n ,/,\n and/CC\n the/DT\n (ORGANIZATION VUion/NNP)\n of/IN\n (PERSON Qteatneââ/NNP HARVEY/NNP)\n H./NNP\n POTTHOFF/NNP\n THE/NNP\n general/JJ\n theme/NN\n of/IN\n this/DT\n series/NN\n of/IN\n articles/NNS\n is/VBZ\n ``/``\n Theology/NNP\n In/IN\n A/NNP\n Space/NNP\n Age/NNP\n ./.\n ''/'')\n(S\n Our/PRP$\n concern/NN\n is/VBZ\n not/RB\n simply/RB\n with/IN\n satellites/NNS\n and/CC\n space/NN\n rockets/NNS\n ,/,\n but/CC\n with/IN\n the/DT\n fact/NN\n that/IN\n much/JJ\n new/JJ\n information/NN\n concerning/VBG\n man/NN\n and/CC\n the/DT\n universe/NN\n has/VBZ\n become/VBN\n available/JJ\n to/TO\n us/PRP\n in/IN\n recent/JJ\n years/NNS\n ./.)\n(S\n One/CD\n scientist/NN\n writes/VBZ\n ,/,\n ``/``\n The/DT\n new/JJ\n knowledge/NN\n built/VBN\n up/RP\n by/IN\n the/DT\n world/NN\n 's/POS\n astronomers/NNS\n in/IN\n the/DT\n past/JJ\n forty/NN\n years/NNS\n is/VBZ\n many/JJ\n times/NNS\n that/IN\n of/IN\n all/DT\n time/NN\n before/IN\n ./.)\n(S\n ``/``\n 1/CD\n In/IN\n many/JJ\n fields/NNS\n men/NNS\n are/VBP\n rethinking/VBG\n basic/JJ\n presuppositions/NNS\n and/CC\n reformulating/VBG\n concepts/NNS\n with/IN\n amazing/JJ\n results/NNS\n ./.)\n(S\n (PERSON Someone/NN)\n asked/VBD\n (PERSON Einstein/NNP)\n how/WRB\n he/PRP\n discovered/VBD\n relativity/NN\n ./.)\n(S\n He/PRP\n replied/VBD\n ,/,\n ``/``\n I/PRP\n challenged/VBD\n an/DT\n axiom/NN\n ./.\n ''/'')\n(S\n In/IN\n this/DT\n series/NN\n of/IN\n discussions/NNS\n we/PRP\n are/VBP\n asking/VBG\n if/IN\n the/DT\n spirit/NN\n of/IN\n inquiry/NN\n and/CC\n our/PRP$\n new/JJ\n information/NN\n concerning/VBG\n man/NN\n and/CC\n the/DT\n universe/NN\n have/VBP\n implications/NNS\n for/IN\n religious/JJ\n life/NN\n and/CC\n thought/NN\n ./.)\n(S\n In/IN\n this/DT\n article/NN\n we/PRP\n shall/MD\n consider/VB\n certain/JJ\n historical/JJ\n backgrounds/NNS\n which/WDT\n are/VBP\n essential/JJ\n for/IN\n an/DT\n understanding/NN\n of/IN\n our/PRP$\n current/JJ\n theological/JJ\n 
situation/NN\n ./.)\n(S\n We/PRP\n shall/MD\n suggest/VB\n that/IN\n in/IN\n entering/VBG\n the/DT\n space/NN\n age/NN\n we/PRP\n are/VBP\n entering/VBG\n an/DT\n era/NN\n of/IN\n history/NN\n calling/VBG\n for/IN\n a/DT\n ``/``\n vision/NN\n of/IN\n greatness/NN\n ''/''\n in/IN\n religious/JJ\n life/NN\n and/CC\n thought/NN\n ./.)\n(S\n The/DT\n call/NN\n of/IN\n today/NN\n is/VBZ\n not/RB\n for/IN\n regression/NN\n ,/,\n but/CC\n for/IN\n creative/JJ\n ,/,\n imaginative/JJ\n advance/NN\n ./.)\n",
"name": "stdout"
}
]
},
{
"metadata": {
"ExecuteTime": {
"start_time": "2017-01-27T14:36:54.925622",
"end_time": "2017-01-27T14:36:54.932756"
},
"trusted": true,
"collapsed": false,
"scrolled": true
},
"cell_type": "code",
"source": "# Deduplicate the collected entity names before displaying them.\nunique_entities = set(entity_names)\nprint(unique_entities)",
"execution_count": 9,
"outputs": [
{
"output_type": "stream",
"text": "{'Someone', 'Einstein', 'Qteatneââ HARVEY'}\n",
"name": "stdout"
}
]
},
{
"metadata": {
"trusted": true,
"collapsed": true
},
"cell_type": "code",
"source": "",
"execution_count": null,
"outputs": []
}
],
"metadata": {
"kernelspec": {
"name": "python3",
"display_name": "Python 3",
"language": "python"
},
"language_info": {
"file_extension": ".py",
"nbconvert_exporter": "python",
"version": "3.5.2",
"mimetype": "text/x-python",
"pygments_lexer": "ipython3",
"codemirror_mode": {
"version": 3,
"name": "ipython"
},
"name": "python"
},
"gist": {
"id": "",
"data": {
"description": "iliff named entities",
"public": true
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment