Last active
July 22, 2020 00:12
-
-
Save fsartoris/400b5a5e6bdc1a637a3cc4d45f9869b9 to your computer and use it in GitHub Desktop.
NLP_Spacy_EntityRuler.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"nbformat": 4, | |
"nbformat_minor": 0, | |
"metadata": { | |
"colab": { | |
"name": "NLP_Spacy_EntityRuler.ipynb", | |
"provenance": [], | |
"collapsed_sections": [], | |
"authorship_tag": "ABX9TyOfU5xw+rgpL44PymRVvvOZ", | |
"include_colab_link": true | |
}, | |
"kernelspec": { | |
"name": "python3", | |
"display_name": "Python 3" | |
} | |
}, | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "view-in-github", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"<a href=\"https://colab.research.google.com/gist/fsartoris/400b5a5e6bdc1a637a3cc4d45f9869b9/nlp_spacy_entityruler.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "LzCkbLuJTuro", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"import spacy\n", | |
"import feedparser\n", | |
"from spacy.pipeline import EntityRuler" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "Eiw8kJUiT8Dl", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"# Model and feed locations are named up front so they are easy to swap out.\n", | |
"model_name = \"en_core_web_sm\"\n", | |
"feed_url = \"https://rss.nytimes.com/services/xml/rss/nyt/HomePage.xml\"\n", | |
"\n", | |
"# Load the spaCy pipeline, then parse the New York Times home-page RSS feed.\n", | |
"nlp = spacy.load(model_name)\n", | |
"feed = feedparser.parse(feed_url)" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "Q66hKKkSUKNM", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"# Rule-based override: label the exact phrase \"Trump\" as PER.\n", | |
"# overwrite_ents=True is passed so the ruler may replace overlapping entities\n", | |
"# already set by earlier pipeline components.\n", | |
"patterns = [{\"label\": \"PER\", \"pattern\": \"Trump\"}]\n", | |
"\n", | |
"# Register the ruler only once: adding a second component with the same name\n", | |
"# raises an error, which would make this cell fail whenever it is re-run.\n", | |
"if \"entity_ruler\" not in nlp.pipe_names:\n", | |
"    ruler = EntityRuler(nlp, overwrite_ents=True)\n", | |
"    ruler.add_patterns(patterns)\n", | |
"    nlp.add_pipe(ruler)" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "iXQUSe4XUNFO", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"def return_all_entities(doc):\n", | |
"    \"\"\"Return one {'entity', 'type'} record per entity in ``doc.ents``.\n", | |
"\n", | |
"    'entity' holds the entity object itself and 'type' holds its ``label_``;\n", | |
"    records keep the order in which ``doc.ents`` yields them.\n", | |
"    \"\"\"\n", | |
"    return [{'entity': ent, 'type': ent.label_} for ent in doc.ents]" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "ZRMhpqQhUQJE", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 1000 | |
}, | |
"outputId": "461afed7-b200-4302-883d-f2a69f8564ea" | |
}, | |
"source": [ | |
"# Print each headline followed by the entity records found in it.\n", | |
"for item in feed.entries:\n", | |
"    headline = item.title\n", | |
"    parsed = nlp(headline)\n", | |
"    print(headline)\n", | |
"    print(return_all_entities(parsed))\n", | |
"    print(\"\\n\")" | |
], | |
"execution_count": null, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"Coronavirus Live Updates\n", | |
"[{'entity': Coronavirus Live Updates, 'type': 'ORG'}]\n", | |
"\n", | |
"\n", | |
"U.S. Accuses Hackers of Trying to Steal Coronavirus Vaccine Data for China\n", | |
"[{'entity': U.S., 'type': 'GPE'}, {'entity': China, 'type': 'GPE'}]\n", | |
"\n", | |
"\n", | |
"End of $600 Unemployment Bonus Could Push Millions Past the Brink\n", | |
"[{'entity': 600, 'type': 'MONEY'}]\n", | |
"\n", | |
"\n", | |
"Chaotic Scenes in Portland as Backlash to Federal Deployment Grows\n", | |
"[{'entity': Portland, 'type': 'GPE'}]\n", | |
"\n", | |
"\n", | |
"The Virus Found a Crowded Houston Neighborhood, Sparing One Nearby\n", | |
"[{'entity': Virus, 'type': 'ORG'}]\n", | |
"\n", | |
"\n", | |
"Patient, Can You Spare a Dime?\n", | |
"[]\n", | |
"\n", | |
"\n", | |
"Business Updates: United Airlines Revenue Dropped Nearly 90%\n", | |
"[{'entity': Business Updates, 'type': 'ORG'}, {'entity': United Airlines Revenue, 'type': 'ORG'}, {'entity': Nearly 90%, 'type': 'PERCENT'}]\n", | |
"\n", | |
"\n", | |
"Andrew Gillum and the Long Shadow of the Florida Governor’s Race\n", | |
"[{'entity': Andrew Gillum, 'type': 'PERSON'}, {'entity': Florida, 'type': 'GPE'}]\n", | |
"\n", | |
"\n", | |
"Biden vs. Trump: Live 2020 Election Updates\n", | |
"[{'entity': Trump, 'type': 'PER'}]\n", | |
"\n", | |
"\n", | |
"Trump’s Tulsa Rally Drew Sparse Crowd, but It Cost $2.2 Million\n", | |
"[{'entity': Trump, 'type': 'PER'}, {'entity': Drew Sparse Crowd, 'type': 'PERSON'}, {'entity': $2.2 Million, 'type': 'MONEY'}]\n", | |
"\n", | |
"\n", | |
"Where the Wild Things Play\n", | |
"[]\n", | |
"\n", | |
"\n", | |
"The Vaccine Trust Problem\n", | |
"[]\n", | |
"\n", | |
"\n", | |
"Corporate Political Donations Undermine Pledges\n", | |
"[]\n", | |
"\n", | |
"\n", | |
"Sign Up: ‘At Home’\n", | |
"[]\n", | |
"\n", | |
"\n", | |
"Anthony Fauci Is Not an Alarmist. He Is a Realist.\n", | |
"[{'entity': Anthony Fauci, 'type': 'PERSON'}]\n", | |
"\n", | |
"\n", | |
"Elizabeth Warren: My Coronavirus Must-Do List\n", | |
"[{'entity': Elizabeth Warren, 'type': 'PERSON'}]\n", | |
"\n", | |
"\n", | |
"Trump’s Occupation of American Cities Has Begun\n", | |
"[{'entity': Trump, 'type': 'PER'}]\n", | |
"\n", | |
"\n", | |
"We Searched for Covid-19 Data. Here’s What We Couldn’t Find.\n", | |
"[]\n", | |
"\n", | |
"\n", | |
"Arsenal Legend Bob Wilson on the Loneliest Role in Soccer\n", | |
"[{'entity': Legend Bob Wilson, 'type': 'PERSON'}]\n", | |
"\n", | |
"\n", | |
"What You Don’t Know About the Coronavirus Can’t Hurt Trump\n", | |
"[{'entity': Trump, 'type': 'PER'}]\n", | |
"\n", | |
"\n", | |
"The Border War in Portland\n", | |
"[{'entity': The Border War, 'type': 'EVENT'}, {'entity': Portland, 'type': 'GPE'}]\n", | |
"\n", | |
"\n", | |
"Trump, Unleashed\n", | |
"[{'entity': Trump, 'type': 'PER'}]\n", | |
"\n", | |
"\n", | |
"Should We Cancel Aristotle?\n", | |
"[{'entity': Cancel Aristotle, 'type': 'PERSON'}]\n", | |
"\n", | |
"\n", | |
"Can Trump Come Back?\n", | |
"[{'entity': Trump, 'type': 'PER'}]\n", | |
"\n", | |
"\n", | |
"Stay Safe, Justice Ginsburg\n", | |
"[{'entity': Stay Safe, 'type': 'ORG'}, {'entity': Ginsburg, 'type': 'PERSON'}]\n", | |
"\n", | |
"\n", | |
"Worried About Crowded Flights? Know Where Your Airline Stands\n", | |
"[]\n", | |
"\n", | |
"\n", | |
"Zooming In on Bill Buford’s Latest Obsession\n", | |
"[{'entity': Bill Buford’s, 'type': 'PERSON'}]\n", | |
"\n", | |
"\n", | |
"Who Is Behind Those N.B.A. ‘Bubble Life’ Tweets?\n", | |
"[{'entity': Bubble Life’ Tweets, 'type': 'WORK_OF_ART'}]\n", | |
"\n", | |
"\n", | |
"Shelton Clears Senate Committee, Moving Closer to Fed Board\n", | |
"[{'entity': Shelton, 'type': 'ORG'}, {'entity': Fed Board, 'type': 'ORG'}]\n", | |
"\n", | |
"\n", | |
"Pompeo Praises U.K. for Getting Tough on China\n", | |
"[{'entity': Pompeo Praises, 'type': 'PERSON'}, {'entity': U.K., 'type': 'GPE'}, {'entity': China, 'type': 'GPE'}]\n", | |
"\n", | |
"\n", | |
"Tapestry C.E.O. Jide Zeitlin Resigns After Misconduct Allegation\n", | |
"[]\n", | |
"\n", | |
"\n", | |
"ABC Executive Barbara Fedida Out Over 'Racially Insensitive' Remarks\n", | |
"[{'entity': ABC, 'type': 'ORG'}, {'entity': Barbara Fedida, 'type': 'PERSON'}]\n", | |
"\n", | |
"\n", | |
"Danny Meyer’s Restaurants Will End Their No-Tipping Policy\n", | |
"[{'entity': Danny Meyer, 'type': 'PERSON'}]\n", | |
"\n", | |
"\n", | |
"Minneapolis Police Experience Surge of Departures in Aftermath of George Floyd Protests\n", | |
"[{'entity': Minneapolis, 'type': 'GPE'}, {'entity': George Floyd Protests, 'type': 'PERSON'}]\n", | |
"\n", | |
"\n", | |
"When John Lewis Cosplayed at Comic-Con as His Younger Self\n", | |
"[{'entity': John Lewis Cosplayed, 'type': 'PERSON'}, {'entity': Comic-Con, 'type': 'ORG'}]\n", | |
"\n", | |
"\n", | |
"Apple, Facing Pressure on Climate, Says It'll Go Carbon-Neutral\n", | |
"[{'entity': Apple, 'type': 'ORG'}]\n", | |
"\n", | |
"\n", | |
"At the Hirshhorn, a Battle Over Plans for Its Sculpture Garden\n", | |
"[{'entity': Hirshhorn, 'type': 'FAC'}]\n", | |
"\n", | |
"\n", | |
"Should You Say Yes to That Favor? Well …\n", | |
"[{'entity': Favor, 'type': 'ORG'}]\n", | |
"\n", | |
"\n", | |
"Breathe Better With These Nine Exercises\n", | |
"[]\n", | |
"\n", | |
"\n", | |
"There Are Wasps in the Yard. You’d Better Get to Know Them.\n", | |
"[]\n", | |
"\n", | |
"\n", | |
"New Tools for Home Buyers as the Pandemic Upends Real Estate\n", | |
"[]\n", | |
"\n", | |
"\n", | |
"Keeping Kids Curious About Their Bodies Without Shame\n", | |
"[]\n", | |
"\n", | |
"\n", | |
"They’re Used to Tapping. Now They’re Talking.\n", | |
"[]\n", | |
"\n", | |
"\n", | |
"Hotels Nix Mints and Breakfast Buffets. Hand Sanitizer, Anyone?\n", | |
"[{'entity': Hotels Nix Mints, 'type': 'PERSON'}, {'entity': Breakfast Buffets, 'type': 'PERSON'}, {'entity': Hand Sanitizer, 'type': 'PERSON'}]\n", | |
"\n", | |
"\n", | |
"Facing a Season of Uncertainty, Max Scherzer Is Still Tinkering\n", | |
"[{'entity': Max Scherzer, 'type': 'PERSON'}]\n", | |
"\n", | |
"\n", | |
"How Michaela Coel Shaped ‘I May Destroy You’\n", | |
"[]\n", | |
"\n", | |
"\n", | |
"Lianne La Havas Traces the Arc of a Romance\n", | |
"[{'entity': Lianne La Havas, 'type': 'PERSON'}]\n", | |
"\n", | |
"\n", | |
"A Covid-19 Lesson: Some Seriously Ill Patients Can Be Treated at Home\n", | |
"[]\n", | |
"\n", | |
"\n", | |
"The Wilderness of Rare Genetic Diseases and the Parents Navigating It\n", | |
"[{'entity': The Wilderness of Rare Genetic Diseases and the Parents Navigating It, 'type': 'WORK_OF_ART'}]\n", | |
"\n", | |
"\n", | |
"Trump Says He ‘Aced’ a Cognitive Test. What Does That Really Mean?\n", | |
"[{'entity': Trump, 'type': 'PER'}]\n", | |
"\n", | |
"\n", | |
"Scientific Panel Urges That Schools Reopen\n", | |
"[{'entity': Scientific Panel, 'type': 'ORG'}]\n", | |
"\n", | |
"\n", | |
"Family-Friendly Movies Made by Diverse Filmmakers\n", | |
"[]\n", | |
"\n", | |
"\n", | |
"New York Reporter Nina Kapur Dies After Revel Moped Accident\n", | |
"[{'entity': New York, 'type': 'GPE'}, {'entity': Nina Kapur, 'type': 'PERSON'}]\n", | |
"\n", | |
"\n", | |
"Trevor Noah Praises Fox News Host for Actually Questioning Trump\n", | |
"[{'entity': Trevor Noah Praises Fox News Host, 'type': 'ORG'}, {'entity': Trump, 'type': 'PER'}]\n", | |
"\n", | |
"\n", | |
"As Trump Ignores Virus Crisis, Republicans Start to Break Ranks\n", | |
"[{'entity': Trump, 'type': 'PER'}, {'entity': Republicans, 'type': 'NORP'}]\n", | |
"\n", | |
"\n", | |
"Try Spelling Bee\n", | |
"[]\n", | |
"\n", | |
"\n", | |
"The Crossword, Vertex and More\n", | |
"[{'entity': Crossword, 'type': 'ORG'}, {'entity': Vertex, 'type': 'ORG'}]\n", | |
"\n", | |
"\n", | |
"Try Tiles\n", | |
"[]\n", | |
"\n", | |
"\n" | |
], | |
"name": "stdout" | |
} | |
] | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment