Created
January 28, 2017 02:11
-
-
Save jerielizabeth/d7d1f166199a9698408b41420485e250 to your computer and use it in GitHub Desktop.
iliff named entities per decade
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"start_time": "2017-01-27T17:25:24.176877", | |
"end_time": "2017-01-27T17:25:25.442707" | |
}, | |
"trusted": true, | |
"collapsed": true | |
}, | |
"cell_type": "code", | |
"source": "import nltk\nimport os\nfrom collections import Counter\nimport operator", | |
"execution_count": 1, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"start_time": "2017-01-27T17:25:25.943146", | |
"end_time": "2017-01-27T17:25:25.950279" | |
}, | |
"trusted": true, | |
"collapsed": true | |
}, | |
"cell_type": "code", | |
"source": "def extract_entity_names(t, entity_label='PERSON'):\n    \"\"\"Collect the text of every subtree of `t` that carries a given chunk label.\n\n    A node counts as a tree only if it exposes a callable `label`; anything else\n    (presumably a (token, tag) leaf -- confirm against the chunker output)\n    contributes nothing. A subtree whose label equals `entity_label` is reported\n    as the first element of each of its children, joined with single spaces.\n    Subtrees with any other label are searched recursively.\n\n    Args:\n        t: chunk tree or leaf to inspect.\n        entity_label: label to collect. Defaults to 'PERSON'; 'GPE'\n            (GeoPolitical Entity) or 'ORGANIZATION' are possible alternatives.\n\n    Returns:\n        List of entity strings in the order they are encountered.\n    \"\"\"\n    entity_names = []\n\n    if callable(getattr(t, 'label', None)):\n        if t.label() == entity_label:\n            entity_names.append(' '.join([child[0] for child in t]))\n        else:\n            # Not the wanted label: look for matches nested further down.\n            for child in t:\n                entity_names.extend(extract_entity_names(child, entity_label))\n\n    return entity_names", | |
"execution_count": 2, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"start_time": "2017-01-27T17:25:27.085843", | |
"end_time": "2017-01-27T17:25:27.098006" | |
}, | |
"trusted": true, | |
"collapsed": false | |
}, | |
"cell_type": "code", | |
"source": "def identify_chunks(directory, file, encoding=None):\n    \"\"\"Read one text file and run it through the NLTK named-entity chunker.\n\n    Args:\n        directory: folder that contains the file.\n        file: name of the file inside `directory`.\n        encoding: text encoding handed to open(). None (the default) keeps\n            the platform default encoding.\n\n    Returns:\n        Whatever nltk.ne_chunk_sents returns for the tagged sentences of the file.\n    \"\"\"\n    with open(os.path.join(directory, file), encoding=encoding) as f:\n        content = f.read()\n\n    # First break the content into sentences, using the sentence tokenizer in NLTK.\n    sentences = nltk.sent_tokenize(content)\n\n    # Then break each sentence into word tokens, using the word tokenizer in NLTK.\n    tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]\n\n    # Tag each token with a part of speech tag.\n    tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]\n\n    # Finally, use the ne \"chunker\" to identify the named entities.\n    return nltk.ne_chunk_sents(tagged_sentences, binary=False)", | |
"execution_count": 3, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"start_time": "2017-01-27T17:25:28.163626", | |
"end_time": "2017-01-27T17:25:28.204531" | |
}, | |
"trusted": true, | |
"collapsed": true | |
}, | |
"cell_type": "code", | |
"source": "def process_chunks(chunks):\n    \"\"\"Gather the entity names of every chunk tree into one flat list.\n\n    Args:\n        chunks: iterable of chunk trees, each handed to extract_entity_names.\n\n    Returns:\n        List with the entity names of all trees, in iteration order.\n    \"\"\"\n    entity_names = []\n    for tree in chunks:\n        entity_names.extend(extract_entity_names(tree))\n    return entity_names", | |
"execution_count": 4, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"start_time": "2017-01-27T17:25:29.089016", | |
"end_time": "2017-01-27T17:25:29.093643" | |
}, | |
"trusted": true, | |
"collapsed": true | |
}, | |
"cell_type": "code", | |
"source": "def get_entity_frequency(entity_names):\n    \"\"\"Count how often each entity name occurs.\n\n    Args:\n        entity_names: iterable of entity strings; repeats are expected.\n\n    Returns:\n        Plain dict mapping each distinct name to its number of occurrences.\n    \"\"\"\n    return dict(Counter(entity_names))", | |
"execution_count": 5, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"start_time": "2017-01-27T17:25:30.236004", | |
"end_time": "2017-01-27T17:25:30.240120" | |
}, | |
"trusted": true, | |
"collapsed": false | |
}, | |
"cell_type": "code", | |
"source": "def process_file(directory, file):\n    \"\"\"Summarise the named entities found in a single file.\n\n    Args:\n        directory: folder that contains the file.\n        file: name of the file; also used as the document id.\n\n    Returns:\n        Dict with 'doc_id' (the file name) and 'entities' (the entity\n        frequency mapping built by get_entity_frequency).\n    \"\"\"\n    chunks = identify_chunks(directory, file)\n    entity_names = process_chunks(chunks)\n    entity_summary = get_entity_frequency(entity_names)\n\n    return {'doc_id': file, 'entities': entity_summary}", | |
"execution_count": 6, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"start_time": "2017-01-27T17:25:30.987331", | |
"end_time": "2017-01-27T17:25:30.996201" | |
}, | |
"trusted": true, | |
"collapsed": false | |
}, | |
"cell_type": "code", | |
"source": "def get_decade_data(decade, files=None, data_directory=None):\n    \"\"\"Process every file whose name starts with the given prefix.\n\n    Args:\n        decade: file-name prefix selecting the files to process (the\n            notebook calls this with '5', '6', '7' and '8').\n        files: file names to consider. Defaults to the notebook-level `corpus`.\n        data_directory: folder holding the files. Defaults to the\n            notebook-level `directory`.\n\n    Returns:\n        List with one process_file result per matching file.\n    \"\"\"\n    # The globals are only looked up when no explicit value is supplied, so the\n    # function can also be used without the notebook-level configuration cells.\n    if files is None:\n        files = corpus\n    if data_directory is None:\n        data_directory = directory\n\n    return [\n        process_file(data_directory, file)\n        for file in files\n        if file.startswith(decade)\n    ]", | |
"execution_count": 7, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"start_time": "2017-01-27T17:51:23.469632", | |
"end_time": "2017-01-27T17:51:23.492059" | |
}, | |
"trusted": true, | |
"collapsed": true | |
}, | |
"cell_type": "code", | |
"source": "def summary_data(decade_data):\n    '''Generates an overview of the named entities for a collection of documents.\n\n    Adds up the per-document entity frequencies into one total per entity.\n\n    Args:\n        decade_data: iterable of dicts that each hold an 'entities' mapping\n            of entity name to count.\n\n    Returns:\n        List of (entity, total count) tuples sorted by ascending count, so the\n        most frequent entities are at the end of the list.\n    '''\n    entity_summary = Counter()\n    for document in decade_data:\n        entity_summary += Counter(dict(document['entities']))\n\n    return sorted(entity_summary.items(), key=operator.itemgetter(1))", | |
"execution_count": 15, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"start_time": "2017-01-27T17:25:32.517791", | |
"end_time": "2017-01-27T17:25:32.520754" | |
}, | |
"trusted": true, | |
"collapsed": true | |
}, | |
"cell_type": "code", | |
"source": "# Folder with the plain-text corpus. Set the ILIFF_DATA_DIR environment\n# variable to run the notebook on another machine; the original location\n# remains the default.\ndirectory = os.environ.get(\n    \"ILIFF_DATA_DIR\",\n    \"/Users/jeriwieringa/Documents/nlp-group/iliff_review/data/ir_txt/\",\n)", | |
"execution_count": 9, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"start_time": "2017-01-27T17:25:33.373209", | |
"end_time": "2017-01-27T17:25:33.387250" | |
}, | |
"trusted": true, | |
"collapsed": true | |
}, | |
"cell_type": "code", | |
"source": "# Names of the regular, non-hidden files located directly inside `directory`.\ncorpus = [\n    entry\n    for entry in os.listdir(directory)\n    if not entry.startswith('.')\n    and os.path.isfile(os.path.join(directory, entry))\n]", | |
"execution_count": 10, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"start_time": "2017-01-27T17:51:28.694901", | |
"end_time": "2017-01-27T17:51:57.365970" | |
}, | |
"trusted": true, | |
"collapsed": false | |
}, | |
"cell_type": "code", | |
"source": "fifties_decade = get_decade_data(\"5\")\nfifties_summary = summary_data(fifties_decade)", | |
"execution_count": 16, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"start_time": "2017-01-27T18:04:09.970702", | |
"end_time": "2017-01-27T18:04:09.985387" | |
}, | |
"trusted": true, | |
"collapsed": false, | |
"scrolled": true | |
}, | |
"cell_type": "code", | |
"source": "fifties_summary[-50:]", | |
"execution_count": 25, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": "[('Bowne', 3),\n ('Plato', 3),\n ('Knudson', 3),\n ('Existentialism', 3),\n ('Simon', 3),\n ('Value', 3),\n ('Empty', 3),\n ('Sir', 3),\n ('Holt', 3),\n ('Einstein', 3),\n ('Williams', 3),\n ('Donahue', 3),\n ('Karl', 3),\n ('Mr.', 4),\n ('Cimarron', 4),\n ('Christ', 4),\n ('Cardinas', 4),\n ('Hume', 4),\n ('Thomas Harwood', 4),\n ('Cruz Vega', 4),\n ('Emerson', 4),\n ('Schleiermacher', 4),\n ('Bernhardt', 4),\n ('Augustine', 5),\n ('Whitehead', 5),\n ('Mills', 5),\n ('Parkinson', 5),\n ('Garden', 5),\n ('Macmillan', 5),\n ('Glencoe', 5),\n ('Christian Theology', 5),\n ('Adam', 6),\n ('Durkheim', 6),\n ('Shapley', 6),\n ('Aristotle', 6),\n ('Galileo', 6),\n ('William James', 6),\n ('James', 6),\n ('John', 6),\n ('Kant', 7),\n ('Vol', 10),\n ('Paul', 11),\n ('Harper', 12),\n ('Jesus', 14),\n ('Man', 15),\n ('Etzler', 16),\n ('Freud', 18),\n ('Popper', 21),\n ('Thoreau', 21),\n ('God', 78)]" | |
}, | |
"metadata": {}, | |
"execution_count": 25 | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"start_time": "2017-01-27T17:54:09.526562", | |
"end_time": "2017-01-27T17:58:37.622907" | |
}, | |
"trusted": true, | |
"collapsed": true | |
}, | |
"cell_type": "code", | |
"source": "sixties = get_decade_data(\"6\")\nsixties_summary = summary_data(sixties)", | |
"execution_count": 18, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"start_time": "2017-01-27T18:04:18.420398", | |
"end_time": "2017-01-27T18:04:18.431740" | |
}, | |
"trusted": true, | |
"collapsed": false | |
}, | |
"cell_type": "code", | |
"source": "sixties_summary[-50:]", | |
"execution_count": 26, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": "[('History', 24),\n ('Jews', 25),\n ('Robinson', 25),\n ('Macmillan', 25),\n ('Douglas', 28),\n ('Sweden', 28),\n ('Jeremiah', 28),\n ('Nabopolassar', 29),\n ('Church', 29),\n ('Ogden', 29),\n ('Adam Pastor', 30),\n ('Daniel', 30),\n ('Teilhard', 31),\n ('Pastor', 31),\n ('Job', 33),\n ('Him', 34),\n ('Mark', 35),\n ('Calvin', 36),\n ('Matthew', 36),\n ('Jason Lee', 36),\n ('Aristotle', 36),\n ('Pietism', 38),\n ('John Wesley', 39),\n ('Williams', 41),\n ('Harwood', 41),\n ('Wieman', 42),\n ('Thomas', 44),\n ('Bultmann', 45),\n ('Emerson', 46),\n ('Martin Rist', 47),\n ('Man', 47),\n ('Babylon', 50),\n ('Harper', 52),\n ('Iliff Review', 52),\n ('Solomon', 54),\n ('David', 56),\n ('Wesley', 57),\n ('Lee', 63),\n ('Niebuhr', 66),\n ('Vol', 78),\n ('Tillich', 95),\n ('Scott', 100),\n ('Luther', 102),\n ('Jesus Christ', 102),\n ('Christ', 105),\n ('Lincoln', 133),\n ('John', 163),\n ('Paul', 241),\n ('Jesus', 299),\n ('God', 631)]" | |
}, | |
"metadata": {}, | |
"execution_count": 26 | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"start_time": "2017-01-27T17:58:37.674962", | |
"end_time": "2017-01-27T18:03:37.084161" | |
}, | |
"trusted": true, | |
"collapsed": true | |
}, | |
"cell_type": "code", | |
"source": "seventies = get_decade_data(\"7\")\nseventies_summary = summary_data(seventies)", | |
"execution_count": 20, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"start_time": "2017-01-27T18:04:43.270229", | |
"end_time": "2017-01-27T18:04:43.280674" | |
}, | |
"trusted": true, | |
"collapsed": false | |
}, | |
"cell_type": "code", | |
"source": "seventies_summary[-50:]", | |
"execution_count": 27, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": "[('Barrett', 28),\n ('Francis', 29),\n ('Faith', 31),\n ('Salzburg', 32),\n ('Whitehead', 32),\n ('Wittgenstein', 36),\n ('Knox', 36),\n ('Mencius', 36),\n ('Martin', 37),\n ('Kierkegaard', 39),\n ('Dostoevsky', 42),\n ('Rahner', 42),\n ('Job', 46),\n ('Macmillan', 46),\n ('Sartre', 49),\n ('St. Francis', 49),\n ('Wesley', 50),\n ('Husserl', 50),\n ('Pistorius', 51),\n ('Camus', 53),\n ('Hegel', 53),\n ('Smith', 58),\n ('Gordon', 59),\n ('Williams', 60),\n ('Jesus Christ', 60),\n ('Church', 61),\n ('Jung', 62),\n ('Campbell', 64),\n ('Van', 64),\n ('Bonhoeffer', 64),\n ('Niebuhr', 66),\n ('Dasein', 66),\n ('Man', 66),\n ('Bultmann', 68),\n ('Row', 69),\n ('Christ', 76),\n ('Luther', 76),\n ('Bernhardt', 82),\n ('John', 90),\n ('Harper', 93),\n ('Heidegger', 100),\n ('Iliff', 114),\n ('Wieman', 119),\n ('Paul', 127),\n ('Vol', 134),\n ('Lowell', 179),\n ('Tillich', 189),\n ('Ricoeur', 200),\n ('Jesus', 317),\n ('God', 737)]" | |
}, | |
"metadata": {}, | |
"execution_count": 27 | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"start_time": "2017-01-27T18:04:58.904488", | |
"end_time": "2017-01-27T18:08:40.712988" | |
}, | |
"trusted": true, | |
"collapsed": false | |
}, | |
"cell_type": "code", | |
"source": "eighties = get_decade_data(\"8\")\neighties_summary = summary_data(eighties)", | |
"execution_count": 28, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"start_time": "2017-01-27T18:08:40.715068", | |
"end_time": "2017-01-27T18:08:40.723500" | |
}, | |
"trusted": true, | |
"collapsed": false | |
}, | |
"cell_type": "code", | |
"source": "eighties_summary[-50:]", | |
"execution_count": 29, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": "[('Himself', 23),\n ('Petersen', 23),\n ('Mapple', 24),\n ('Charles', 24),\n ('Williams', 25),\n ('Bernhardt', 25),\n ('King', 25),\n ('Macmillan', 26),\n ('Martin', 26),\n ('Wilbanks', 27),\n ('Sontag', 28),\n ('Milligan', 29),\n ('Order', 29),\n ('Bob', 29),\n ('Faith', 29),\n ('Holler', 30),\n ('Tracy', 32),\n ('Jews', 32),\n ('Campbell', 32),\n ('Strong', 33),\n ('Jesus Christ', 34),\n ('Whitehead', 35),\n ('Luther', 37),\n ('Smith', 38),\n ('Rist', 39),\n ('Tillich', 40),\n ('James', 42),\n ('Haggai', 44),\n ('Herbert', 44),\n ('Melville', 45),\n ('Schweitzer', 45),\n ('Gandhi', 50),\n ('Vol', 51),\n ('Thomas', 57),\n ('Fowler', 60),\n ('Harper', 62),\n ('Row', 63),\n ('Freud', 71),\n ('Christ', 71),\n ('Pannenberg', 74),\n ('Black Elk', 78),\n ('Wieman', 84),\n ('John', 99),\n ('Potthoff', 103),\n ('Oecolampadius', 107),\n ('Matthew', 126),\n ('Paul', 152),\n ('David', 185),\n ('Jesus', 230),\n ('God', 722)]" | |
}, | |
"metadata": {}, | |
"execution_count": 29 | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true, | |
"collapsed": true | |
}, | |
"cell_type": "code", | |
"source": "", | |
"execution_count": null, | |
"outputs": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"name": "python3", | |
"display_name": "Python 3", | |
"language": "python" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"version": 3, | |
"name": "ipython" | |
}, | |
"nbconvert_exporter": "python", | |
"version": "3.5.2", | |
"mimetype": "text/x-python", | |
"file_extension": ".py", | |
"pygments_lexer": "ipython3", | |
"name": "python" | |
}, | |
"gist": { | |
"id": "", | |
"data": { | |
"description": "iliff named entities per decade", | |
"public": true | |
} | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment