Skip to content

Instantly share code, notes, and snippets.

@jerielizabeth
Created January 28, 2017 02:11
Show Gist options
  • Save jerielizabeth/d7d1f166199a9698408b41420485e250 to your computer and use it in GitHub Desktop.
Save jerielizabeth/d7d1f166199a9698408b41420485e250 to your computer and use it in GitHub Desktop.
iliff named entities per decade
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"metadata": {
"ExecuteTime": {
"start_time": "2017-01-27T17:25:24.176877",
"end_time": "2017-01-27T17:25:25.442707"
},
"trusted": true,
"collapsed": true
},
"cell_type": "code",
"source": "import nltk\nimport os\nfrom collections import Counter\nimport operator",
"execution_count": 1,
"outputs": []
},
{
"metadata": {
"ExecuteTime": {
"start_time": "2017-01-27T17:25:25.943146",
"end_time": "2017-01-27T17:25:25.950279"
},
"trusted": true,
"collapsed": true
},
"cell_type": "code",
"source": "def extract_entity_names(t):\n entity_names = []\n \n \"\"\" Cycle through the different tags.\n First identify those with a label, then isolate those labeled 'PERSON'. This could also be\n GPE (GeoPolitical Entity) or ORGANIZATION. \n Combine and return the names, or check if there are nested attributes.\n \"\"\"\n \n if hasattr(t, 'label') and t.label:\n if t.label() == 'PERSON':\n entity_names.append(' '.join([child[0] for child in t]))\n else:\n for child in t:\n entity_names.extend(extract_entity_names(child))\n\n return( entity_names)",
"execution_count": 2,
"outputs": []
},
{
"metadata": {
"ExecuteTime": {
"start_time": "2017-01-27T17:25:27.085843",
"end_time": "2017-01-27T17:25:27.098006"
},
"trusted": true,
"collapsed": false
},
"cell_type": "code",
"source": "def identify_chunks(directory, file):\n with open(os.path.join(directory, file)) as f:\n content = f.read()\n \"\"\"First break the content into sentences, using the sentence tokenizer in NLTK.\"\"\"\n sentences = nltk.sent_tokenize(content)\n \n \"\"\"Then break each sentence into word tokens, using the word tokenizer in NLTK. \"\"\"\n tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]\n \n \"\"\"Tag each token with a part of speech tag.\"\"\"\n tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]\n \n \"\"\"Finally, use the ne \"chunker\" to identify the named entities\"\"\"\n chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=False)\n \n return(chunked_sentences)",
"execution_count": 3,
"outputs": []
},
{
"metadata": {
"ExecuteTime": {
"start_time": "2017-01-27T17:25:28.163626",
"end_time": "2017-01-27T17:25:28.204531"
},
"trusted": true,
"collapsed": true
},
"cell_type": "code",
"source": "def process_chunks(chunks):\n entity_names = []\n for tree in chunks:\n entity_names.extend(extract_entity_names(tree))\n return(entity_names)",
"execution_count": 4,
"outputs": []
},
{
"metadata": {
"ExecuteTime": {
"start_time": "2017-01-27T17:25:29.089016",
"end_time": "2017-01-27T17:25:29.093643"
},
"trusted": true,
"collapsed": true
},
"cell_type": "code",
"source": "def get_entity_frequency(entity_names):\n counts = Counter(entity_names)\n return(dict(counts))",
"execution_count": 5,
"outputs": []
},
{
"metadata": {
"ExecuteTime": {
"start_time": "2017-01-27T17:25:30.236004",
"end_time": "2017-01-27T17:25:30.240120"
},
"trusted": true,
"collapsed": false
},
"cell_type": "code",
"source": "def process_file(directory, file):\n \n chunks = identify_chunks(directory, file)\n entity_names = process_chunks(chunks)\n entity_summary = get_entity_frequency(entity_names)\n \n return({'doc_id': file, 'entities': entity_summary})",
"execution_count": 6,
"outputs": []
},
{
"metadata": {
"ExecuteTime": {
"start_time": "2017-01-27T17:25:30.987331",
"end_time": "2017-01-27T17:25:30.996201"
},
"trusted": true,
"collapsed": false
},
"cell_type": "code",
"source": "def get_decade_data(decade):\n decade_statistics = []\n for file in corpus:\n if file.startswith(decade):\n file_data = process_file(directory, file)\n decade_statistics.append(file_data)\n return(decade_statistics)",
"execution_count": 7,
"outputs": []
},
{
"metadata": {
"ExecuteTime": {
"start_time": "2017-01-27T17:51:23.469632",
"end_time": "2017-01-27T17:51:23.492059"
},
"trusted": true,
"collapsed": true
},
"cell_type": "code",
"source": "def summary_data(decade_data):\n '''Generates an overview of the named entities for a collection of data. Creates a dictionary (entity_summary) \n from all the reported entities/frequencies\n and records the entity (as key) and the total count for that entity (as value).\n Returns the (entity, total count) pairs sorted from lowest to highest count.\n '''\n all_entities = [ document['entities'] for document in decade_data]\n\n inp = [dict(x) for x in all_entities]\n \n entity_summary = Counter()\n for y in inp:\n entity_summary += Counter(y)\n entity_dict = dict(entity_summary)\n \n return(sorted(entity_summary.items(), key=operator.itemgetter(1)))",
"execution_count": 15,
"outputs": []
},
{
"metadata": {
"ExecuteTime": {
"start_time": "2017-01-27T17:25:32.517791",
"end_time": "2017-01-27T17:25:32.520754"
},
"trusted": true,
"collapsed": true
},
"cell_type": "code",
"source": "directory = \"/Users/jeriwieringa/Documents/nlp-group/iliff_review/data/ir_txt/\"",
"execution_count": 9,
"outputs": []
},
{
"metadata": {
"ExecuteTime": {
"start_time": "2017-01-27T17:25:33.373209",
"end_time": "2017-01-27T17:25:33.387250"
},
"trusted": true,
"collapsed": true
},
"cell_type": "code",
"source": "corpus = [f for f in os.listdir(directory) if not f.startswith('.') and os.path.isfile(os.path.join(directory, f))]",
"execution_count": 10,
"outputs": []
},
{
"metadata": {
"ExecuteTime": {
"start_time": "2017-01-27T17:51:28.694901",
"end_time": "2017-01-27T17:51:57.365970"
},
"trusted": true,
"collapsed": false
},
"cell_type": "code",
"source": "fifties_decade = get_decade_data(\"5\")\nfifties_summary = summary_data(fifties_decade)",
"execution_count": 16,
"outputs": []
},
{
"metadata": {
"ExecuteTime": {
"start_time": "2017-01-27T18:04:09.970702",
"end_time": "2017-01-27T18:04:09.985387"
},
"trusted": true,
"collapsed": false,
"scrolled": true
},
"cell_type": "code",
"source": "fifties_summary[-50:]",
"execution_count": 25,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "[('Bowne', 3),\n ('Plato', 3),\n ('Knudson', 3),\n ('Existentialism', 3),\n ('Simon', 3),\n ('Value', 3),\n ('Empty', 3),\n ('Sir', 3),\n ('Holt', 3),\n ('Einstein', 3),\n ('Williams', 3),\n ('Donahue', 3),\n ('Karl', 3),\n ('Mr.', 4),\n ('Cimarron', 4),\n ('Christ', 4),\n ('Cardinas', 4),\n ('Hume', 4),\n ('Thomas Harwood', 4),\n ('Cruz Vega', 4),\n ('Emerson', 4),\n ('Schleiermacher', 4),\n ('Bernhardt', 4),\n ('Augustine', 5),\n ('Whitehead', 5),\n ('Mills', 5),\n ('Parkinson', 5),\n ('Garden', 5),\n ('Macmillan', 5),\n ('Glencoe', 5),\n ('Christian Theology', 5),\n ('Adam', 6),\n ('Durkheim', 6),\n ('Shapley', 6),\n ('Aristotle', 6),\n ('Galileo', 6),\n ('William James', 6),\n ('James', 6),\n ('John', 6),\n ('Kant', 7),\n ('Vol', 10),\n ('Paul', 11),\n ('Harper', 12),\n ('Jesus', 14),\n ('Man', 15),\n ('Etzler', 16),\n ('Freud', 18),\n ('Popper', 21),\n ('Thoreau', 21),\n ('God', 78)]"
},
"metadata": {},
"execution_count": 25
}
]
},
{
"metadata": {
"ExecuteTime": {
"start_time": "2017-01-27T17:54:09.526562",
"end_time": "2017-01-27T17:58:37.622907"
},
"trusted": true,
"collapsed": true
},
"cell_type": "code",
"source": "sixties = get_decade_data(\"6\")\nsixties_summary = summary_data(sixties)",
"execution_count": 18,
"outputs": []
},
{
"metadata": {
"ExecuteTime": {
"start_time": "2017-01-27T18:04:18.420398",
"end_time": "2017-01-27T18:04:18.431740"
},
"trusted": true,
"collapsed": false
},
"cell_type": "code",
"source": "sixties_summary[-50:]",
"execution_count": 26,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "[('History', 24),\n ('Jews', 25),\n ('Robinson', 25),\n ('Macmillan', 25),\n ('Douglas', 28),\n ('Sweden', 28),\n ('Jeremiah', 28),\n ('Nabopolassar', 29),\n ('Church', 29),\n ('Ogden', 29),\n ('Adam Pastor', 30),\n ('Daniel', 30),\n ('Teilhard', 31),\n ('Pastor', 31),\n ('Job', 33),\n ('Him', 34),\n ('Mark', 35),\n ('Calvin', 36),\n ('Matthew', 36),\n ('Jason Lee', 36),\n ('Aristotle', 36),\n ('Pietism', 38),\n ('John Wesley', 39),\n ('Williams', 41),\n ('Harwood', 41),\n ('Wieman', 42),\n ('Thomas', 44),\n ('Bultmann', 45),\n ('Emerson', 46),\n ('Martin Rist', 47),\n ('Man', 47),\n ('Babylon', 50),\n ('Harper', 52),\n ('Iliff Review', 52),\n ('Solomon', 54),\n ('David', 56),\n ('Wesley', 57),\n ('Lee', 63),\n ('Niebuhr', 66),\n ('Vol', 78),\n ('Tillich', 95),\n ('Scott', 100),\n ('Luther', 102),\n ('Jesus Christ', 102),\n ('Christ', 105),\n ('Lincoln', 133),\n ('John', 163),\n ('Paul', 241),\n ('Jesus', 299),\n ('God', 631)]"
},
"metadata": {},
"execution_count": 26
}
]
},
{
"metadata": {
"ExecuteTime": {
"start_time": "2017-01-27T17:58:37.674962",
"end_time": "2017-01-27T18:03:37.084161"
},
"trusted": true,
"collapsed": true
},
"cell_type": "code",
"source": "seventies = get_decade_data(\"7\")\nseventies_summary = summary_data(seventies)",
"execution_count": 20,
"outputs": []
},
{
"metadata": {
"ExecuteTime": {
"start_time": "2017-01-27T18:04:43.270229",
"end_time": "2017-01-27T18:04:43.280674"
},
"trusted": true,
"collapsed": false
},
"cell_type": "code",
"source": "seventies_summary[-50:]",
"execution_count": 27,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "[('Barrett', 28),\n ('Francis', 29),\n ('Faith', 31),\n ('Salzburg', 32),\n ('Whitehead', 32),\n ('Wittgenstein', 36),\n ('Knox', 36),\n ('Mencius', 36),\n ('Martin', 37),\n ('Kierkegaard', 39),\n ('Dostoevsky', 42),\n ('Rahner', 42),\n ('Job', 46),\n ('Macmillan', 46),\n ('Sartre', 49),\n ('St. Francis', 49),\n ('Wesley', 50),\n ('Husserl', 50),\n ('Pistorius', 51),\n ('Camus', 53),\n ('Hegel', 53),\n ('Smith', 58),\n ('Gordon', 59),\n ('Williams', 60),\n ('Jesus Christ', 60),\n ('Church', 61),\n ('Jung', 62),\n ('Campbell', 64),\n ('Van', 64),\n ('Bonhoeffer', 64),\n ('Niebuhr', 66),\n ('Dasein', 66),\n ('Man', 66),\n ('Bultmann', 68),\n ('Row', 69),\n ('Christ', 76),\n ('Luther', 76),\n ('Bernhardt', 82),\n ('John', 90),\n ('Harper', 93),\n ('Heidegger', 100),\n ('Iliff', 114),\n ('Wieman', 119),\n ('Paul', 127),\n ('Vol', 134),\n ('Lowell', 179),\n ('Tillich', 189),\n ('Ricoeur', 200),\n ('Jesus', 317),\n ('God', 737)]"
},
"metadata": {},
"execution_count": 27
}
]
},
{
"metadata": {
"ExecuteTime": {
"start_time": "2017-01-27T18:04:58.904488",
"end_time": "2017-01-27T18:08:40.712988"
},
"trusted": true,
"collapsed": false
},
"cell_type": "code",
"source": "eighties = get_decade_data(\"8\")\neighties_summary = summary_data(eighties)",
"execution_count": 28,
"outputs": []
},
{
"metadata": {
"ExecuteTime": {
"start_time": "2017-01-27T18:08:40.715068",
"end_time": "2017-01-27T18:08:40.723500"
},
"trusted": true,
"collapsed": false
},
"cell_type": "code",
"source": "eighties_summary[-50:]",
"execution_count": 29,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "[('Himself', 23),\n ('Petersen', 23),\n ('Mapple', 24),\n ('Charles', 24),\n ('Williams', 25),\n ('Bernhardt', 25),\n ('King', 25),\n ('Macmillan', 26),\n ('Martin', 26),\n ('Wilbanks', 27),\n ('Sontag', 28),\n ('Milligan', 29),\n ('Order', 29),\n ('Bob', 29),\n ('Faith', 29),\n ('Holler', 30),\n ('Tracy', 32),\n ('Jews', 32),\n ('Campbell', 32),\n ('Strong', 33),\n ('Jesus Christ', 34),\n ('Whitehead', 35),\n ('Luther', 37),\n ('Smith', 38),\n ('Rist', 39),\n ('Tillich', 40),\n ('James', 42),\n ('Haggai', 44),\n ('Herbert', 44),\n ('Melville', 45),\n ('Schweitzer', 45),\n ('Gandhi', 50),\n ('Vol', 51),\n ('Thomas', 57),\n ('Fowler', 60),\n ('Harper', 62),\n ('Row', 63),\n ('Freud', 71),\n ('Christ', 71),\n ('Pannenberg', 74),\n ('Black Elk', 78),\n ('Wieman', 84),\n ('John', 99),\n ('Potthoff', 103),\n ('Oecolampadius', 107),\n ('Matthew', 126),\n ('Paul', 152),\n ('David', 185),\n ('Jesus', 230),\n ('God', 722)]"
},
"metadata": {},
"execution_count": 29
}
]
},
{
"metadata": {
"trusted": true,
"collapsed": true
},
"cell_type": "code",
"source": "",
"execution_count": null,
"outputs": []
}
],
"metadata": {
"kernelspec": {
"name": "python3",
"display_name": "Python 3",
"language": "python"
},
"language_info": {
"codemirror_mode": {
"version": 3,
"name": "ipython"
},
"nbconvert_exporter": "python",
"version": "3.5.2",
"mimetype": "text/x-python",
"file_extension": ".py",
"pygments_lexer": "ipython3",
"name": "python"
},
"gist": {
"id": "",
"data": {
"description": "iliff named entities per decade",
"public": true
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment