Created
June 20, 2020 06:08
-
-
Save thebishorup/fa55c8000080b90b5c54bf09e1425923 to your computer and use it in GitHub Desktop.
spacy_and_custom_rules.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "import spacy\nfrom spacy.matcher import Matcher\nfrom spacy.tokens import Span\nfrom spacy import displacy", | |
"execution_count": 1, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "nlp = spacy.load('en_core_web_lg')", | |
"execution_count": 2, | |
"outputs": [] | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "markdown", | |
"source": "## Expanding named entities" | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "doc = nlp('Dr. Jay Jannel completed is PHD this summer from Oregon State University.')", | |
"execution_count": 37, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "doc", | |
"execution_count": 38, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"execution_count": 38, | |
"data": { | |
"text/plain": "Dr. Jay Jannel completed is PHD this summer from Oregon State University." | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "print([(ent.text, ent.label_) for ent in doc.ents] )", | |
"execution_count": 39, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": "[('Dr. Jay Jannel', 'PERSON')]\n", | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "### PROBLEM: NER detected Jay Jannel as a PERSON but omitted the title 'Dr.'\ndef add_title(doc):\n    \"\"\"Pipeline component: widen PERSON entities to include a preceding title.\n\n    A PERSON entity is extended one token to the left when that token is one\n    of the known titles. Every other entity is kept unchanged, so no entity\n    is lost from doc.ents.\n\n    Args:\n        doc: spaCy Doc whose doc.ents are already set (add after 'ner').\n\n    Returns:\n        The same Doc with doc.ents replaced by the updated entity list.\n    \"\"\"\n    titles = ('Dr', 'Dr.', 'Mr', 'Mr.')\n    new_ents = []\n    for ent in doc.ents:\n        expanded = ent\n        # ent.start != 0: there is no previous token to inspect at the start of the doc\n        if ent.label_ == 'PERSON' and ent.start != 0:\n            prev_token = doc[ent.start - 1]\n            # skip tokens already inside another entity: doc.ents rejects overlapping spans\n            if prev_token.text in titles and not prev_token.ent_type_:\n                expanded = Span(doc, ent.start - 1, ent.end, label=ent.label_)\n        # always keep the entity (expanded or original) instead of dropping it\n        new_ents.append(expanded)\n    doc.ents = new_ents\n    return doc", | |
"execution_count": 35, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "nlp.add_pipe(add_title, after='ner')", | |
"execution_count": 36, | |
"outputs": [] | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "markdown", | |
"source": "### Use of Part of Speech (POS) and Dependency Parsing" | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "nlp = spacy.load('en_core_web_lg')", | |
"execution_count": 40, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "doc = nlp('Jay Jannel is working at Oregon State University.')", | |
"execution_count": 44, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "doc", | |
"execution_count": 45, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"execution_count": 45, | |
"data": { | |
"text/plain": "Jay Jannel is working at Oregon State University." | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "displacy.render(doc, style='dep', options = {'compact': True, 'distance': 100})", | |
"execution_count": 46, | |
"outputs": [ | |
{ | |
"output_type": "display_data", | |
"data": { | |
"text/plain": "<IPython.core.display.HTML object>", | |
"text/html": "<span class=\"tex2jax_ignore\"><svg xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\" xml:lang=\"en\" id=\"5e0af996ab4b4489a64e572a54dd7c1b-0\" class=\"displacy\" width=\"850\" height=\"287.0\" direction=\"ltr\" style=\"max-width: none; height: 287.0px; color: #000000; background: #ffffff; font-family: Arial; direction: ltr\">\n<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"197.0\">\n <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"50\">Jay</tspan>\n <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"50\">PROPN</tspan>\n</text>\n\n<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"197.0\">\n <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"150\">Jannel</tspan>\n <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"150\">PROPN</tspan>\n</text>\n\n<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"197.0\">\n <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"250\">is</tspan>\n <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"250\">AUX</tspan>\n</text>\n\n<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"197.0\">\n <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"350\">working</tspan>\n <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"350\">VERB</tspan>\n</text>\n\n<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"197.0\">\n <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"450\">at</tspan>\n <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"450\">ADP</tspan>\n</text>\n\n<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"197.0\">\n <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"550\">Oregon</tspan>\n <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"550\">PROPN</tspan>\n</text>\n\n<text 
class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"197.0\">\n <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"650\">State</tspan>\n <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"650\">PROPN</tspan>\n</text>\n\n<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"197.0\">\n <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"750\">University.</tspan>\n <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"750\">PROPN</tspan>\n</text>\n\n<g class=\"displacy-arrow\">\n <path class=\"displacy-arc\" id=\"arrow-5e0af996ab4b4489a64e572a54dd7c1b-0-0\" stroke-width=\"2px\" d=\"M62,152.0 62,135.33333333333334 144.0,135.33333333333334 144.0,152.0\" fill=\"none\" stroke=\"currentColor\"/>\n <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n <textPath xlink:href=\"#arrow-5e0af996ab4b4489a64e572a54dd7c1b-0-0\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">compound</textPath>\n </text>\n <path class=\"displacy-arrowhead\" d=\"M62,154.0 L58,146.0 66,146.0\" fill=\"currentColor\"/>\n</g>\n\n<g class=\"displacy-arrow\">\n <path class=\"displacy-arc\" id=\"arrow-5e0af996ab4b4489a64e572a54dd7c1b-0-1\" stroke-width=\"2px\" d=\"M162,152.0 162,118.66666666666666 347.0,118.66666666666666 347.0,152.0\" fill=\"none\" stroke=\"currentColor\"/>\n <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n <textPath xlink:href=\"#arrow-5e0af996ab4b4489a64e572a54dd7c1b-0-1\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">nsubj</textPath>\n </text>\n <path class=\"displacy-arrowhead\" d=\"M162,154.0 L158,146.0 166,146.0\" fill=\"currentColor\"/>\n</g>\n\n<g class=\"displacy-arrow\">\n <path class=\"displacy-arc\" id=\"arrow-5e0af996ab4b4489a64e572a54dd7c1b-0-2\" stroke-width=\"2px\" d=\"M262,152.0 262,135.33333333333334 344.0,135.33333333333334 
344.0,152.0\" fill=\"none\" stroke=\"currentColor\"/>\n <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n <textPath xlink:href=\"#arrow-5e0af996ab4b4489a64e572a54dd7c1b-0-2\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">aux</textPath>\n </text>\n <path class=\"displacy-arrowhead\" d=\"M262,154.0 L258,146.0 266,146.0\" fill=\"currentColor\"/>\n</g>\n\n<g class=\"displacy-arrow\">\n <path class=\"displacy-arc\" id=\"arrow-5e0af996ab4b4489a64e572a54dd7c1b-0-3\" stroke-width=\"2px\" d=\"M362,152.0 362,135.33333333333334 444.0,135.33333333333334 444.0,152.0\" fill=\"none\" stroke=\"currentColor\"/>\n <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n <textPath xlink:href=\"#arrow-5e0af996ab4b4489a64e572a54dd7c1b-0-3\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">prep</textPath>\n </text>\n <path class=\"displacy-arrowhead\" d=\"M444.0,154.0 L448.0,146.0 440.0,146.0\" fill=\"currentColor\"/>\n</g>\n\n<g class=\"displacy-arrow\">\n <path class=\"displacy-arc\" id=\"arrow-5e0af996ab4b4489a64e572a54dd7c1b-0-4\" stroke-width=\"2px\" d=\"M562,152.0 562,118.66666666666666 747.0,118.66666666666666 747.0,152.0\" fill=\"none\" stroke=\"currentColor\"/>\n <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n <textPath xlink:href=\"#arrow-5e0af996ab4b4489a64e572a54dd7c1b-0-4\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">compound</textPath>\n </text>\n <path class=\"displacy-arrowhead\" d=\"M562,154.0 L558,146.0 566,146.0\" fill=\"currentColor\"/>\n</g>\n\n<g class=\"displacy-arrow\">\n <path class=\"displacy-arc\" id=\"arrow-5e0af996ab4b4489a64e572a54dd7c1b-0-5\" stroke-width=\"2px\" d=\"M662,152.0 662,135.33333333333334 744.0,135.33333333333334 744.0,152.0\" fill=\"none\" stroke=\"currentColor\"/>\n <text dy=\"1.25em\" style=\"font-size: 0.8em; 
letter-spacing: 1px\">\n <textPath xlink:href=\"#arrow-5e0af996ab4b4489a64e572a54dd7c1b-0-5\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">compound</textPath>\n </text>\n <path class=\"displacy-arrowhead\" d=\"M662,154.0 L658,146.0 666,146.0\" fill=\"currentColor\"/>\n</g>\n\n<g class=\"displacy-arrow\">\n <path class=\"displacy-arc\" id=\"arrow-5e0af996ab4b4489a64e572a54dd7c1b-0-6\" stroke-width=\"2px\" d=\"M462,152.0 462,102.0 750.0,102.0 750.0,152.0\" fill=\"none\" stroke=\"currentColor\"/>\n <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n <textPath xlink:href=\"#arrow-5e0af996ab4b4489a64e572a54dd7c1b-0-6\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">pobj</textPath>\n </text>\n <path class=\"displacy-arrowhead\" d=\"M750.0,154.0 L754.0,146.0 746.0,146.0\" fill=\"currentColor\"/>\n</g>\n</svg></span>" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "# Goal: Get the past and present work place.\ndef get_person_org(doc):\n    \"\"\"Pipeline component: print where each PERSON entity works or worked.\n\n    For every PERSON entity whose syntactic head has the lemma 'work', the\n    prepositions attached to that verb are collected and the ORG tokens\n    under each preposition are printed together with a 'past' flag.\n    Only the tag VBD on the head counts as past here. Matches are single\n    tokens, so a multi-word organisation appears whole only if entities\n    were merged into one token earlier in the pipeline.\n\n    Args:\n        doc: spaCy Doc with entities, tags and a dependency parse.\n\n    Returns:\n        The same Doc, unmodified; results are only printed.\n    \"\"\"\n    # get all person entities\n    person_entities = [ent for ent in doc.ents if ent.label_ == 'PERSON']\n    print(person_entities)\n    for ent in person_entities:\n        head = ent.root.head\n        # lemma_ is the readable text; lemma is only its integer hash ID\n        print(head.lemma_)\n        if head.lemma_ == 'work':\n            # prepositions that depend on the verb, e.g. 'at' in 'worked at X'\n            preps = [token for token in head.children if token.dep_ == 'prep']\n            print(preps)\n            for prep in preps:\n                # ORG tokens governed by the preposition\n                orgs = [token for token in prep.children if token.ent_type_ == 'ORG']\n                print({'person': ent, 'orgs': orgs, 'past': head.tag_ == 'VBD'})\n    return doc", | |
"execution_count": 79, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "from spacy.pipeline import merge_entities", | |
"execution_count": 48, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "nlp = spacy.load('en_core_web_lg')", | |
"execution_count": 98, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "#nlp.add_pipe(merge_entities)\nnlp.add_pipe(get_person_org)", | |
"execution_count": 81, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "nlp.remove_pipe('get_person_org')", | |
"execution_count": 80, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"execution_count": 80, | |
"data": { | |
"text/plain": "('get_person_org', <function __main__.get_person_org(doc)>)" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "doc = nlp('Jay Jannel worked at Oregon State University.')", | |
"execution_count": 84, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": "[Jay Jannel]\n10038440415813069799\n[at]\n{'person': Jay Jannel, 'orgs': [Oregon State University], 'past': True}\n", | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "markdown", | |
"source": "### Modify Model" | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "# Goal: Get the past and present work place.\ndef get_person_org_modified(doc):\n    \"\"\"Pipeline component: print where each PERSON works or worked, with tense.\n\n    Like the simple version, but the 'past' flag also covers the past\n    progressive: a VBG head ('working') with a VBD auxiliary ('was').\n    Matches are single tokens, so a multi-word organisation appears whole\n    only if entities were merged into one token earlier in the pipeline.\n\n    Args:\n        doc: spaCy Doc with entities, tags and a dependency parse.\n\n    Returns:\n        The same Doc, unmodified; results are only printed.\n    \"\"\"\n    # get all person entities\n    person_entities = [ent for ent in doc.ents if ent.label_ == 'PERSON']\n    print(person_entities)\n    for ent in person_entities:\n        head = ent.root.head\n        # lemma_ is the readable text; lemma is only its integer hash ID\n        print(head.lemma_)\n        if head.lemma_ == 'work':\n            # check for auxiliary verbs of the head, e.g. 'was' in 'was working'\n            aux = [token for token in head.children if token.dep_ == 'aux']\n            past_aux = any(t.tag_ == 'VBD' for t in aux)\n            # past = simple past ('worked') or past progressive ('was working')\n            past = head.tag_ == 'VBD' or (head.tag_ == 'VBG' and past_aux)\n            # prepositions that depend on the verb, e.g. 'at' in 'working at X'\n            preps = [token for token in head.children if token.dep_ == 'prep']\n            print(preps)\n            for prep in preps:\n                # head.children is a generator; materialise it so the print is readable\n                print(list(head.children))\n                # ORG tokens governed by the preposition\n                orgs = [token for token in prep.children if token.ent_type_ == 'ORG']\n                # report the computed tense flag, not just head.tag_ == 'VBD'\n                print({'person': ent, 'orgs': orgs, 'past': past})\n    return doc", | |
"execution_count": 86, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "nlp.add_pipe(merge_entities)\nnlp.add_pipe(get_person_org_modified)", | |
"execution_count": 99, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "doc = nlp('Jay Baleno was working at Nissan.')", | |
"execution_count": 100, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": "[Jay Baleno]\n10038440415813069799\n[at]\n<generator object at 0x0000024D48CA2A48>\n{'person': Jay Baleno, 'orgs': [Nissan], 'past': False}\n", | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "nlp.pipe_factories", | |
"execution_count": 101, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"execution_count": 101, | |
"data": { | |
"text/plain": "{'tagger': 'tagger',\n 'parser': 'parser',\n 'ner': 'ner',\n 'merge_entities': 'merge_entities',\n 'get_person_org_modified': 'get_person_org_modified'}" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "", | |
"execution_count": null, | |
"outputs": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"name": "python3", | |
"display_name": "Python 3", | |
"language": "python" | |
}, | |
"toc": { | |
"nav_menu": {}, | |
"number_sections": true, | |
"sideBar": true, | |
"skip_h1_title": false, | |
"base_numbering": 1, | |
"title_cell": "Table of Contents", | |
"title_sidebar": "Contents", | |
"toc_cell": false, | |
"toc_position": {}, | |
"toc_section_display": true, | |
"toc_window_display": false | |
}, | |
"language_info": { | |
"name": "python", | |
"version": "3.7.6", | |
"mimetype": "text/x-python", | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"pygments_lexer": "ipython3", | |
"nbconvert_exporter": "python", | |
"file_extension": ".py" | |
}, | |
"gist": { | |
"id": "", | |
"data": { | |
"description": "spacy_and_custom_rules.ipynb", | |
"public": true | |
} | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 4 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment