Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save thebishorup/fa55c8000080b90b5c54bf09e1425923 to your computer and use it in GitHub Desktop.
Save thebishorup/fa55c8000080b90b5c54bf09e1425923 to your computer and use it in GitHub Desktop.
spacy_and_custom_rules.ipynb
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "import spacy\nfrom spacy.matcher import Matcher\nfrom spacy.tokens import Span\nfrom spacy import displacy",
"execution_count": 1,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "nlp = spacy.load('en_core_web_lg')",
"execution_count": 2,
"outputs": []
},
{
"metadata": {},
"cell_type": "markdown",
"source": "## Expanding named entities"
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "doc = nlp('Dr. Jay Jannel completed is PHD this summer from Oregon State University.')",
"execution_count": 37,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "doc",
"execution_count": 38,
"outputs": [
{
"output_type": "execute_result",
"execution_count": 38,
"data": {
"text/plain": "Dr. Jay Jannel completed is PHD this summer from Oregon State University."
},
"metadata": {}
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "print([(ent.text, ent.label_) for ent in doc.ents] )",
"execution_count": 39,
"outputs": [
{
"output_type": "stream",
"text": "[('Dr. Jay Jannel', 'PERSON')]\n",
"name": "stdout"
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "### PROBLEM: It detected Jay Jannel as person but it opted Dr.\ndef add_title(doc):\n new_ents = []\n for ent in doc.ents:\n if ent.label_ == 'PERSON' and ent.start != 0:\n prev_token = doc[ent.start - 1]\n if prev_token.text in ('Dr', 'Dr.', 'Mr', 'Mr.'):\n new_ent = Span(doc, ent.start - 1, ent.end, label = ent.label_)\n new_ents.append(new_ent)\n else:\n new_ents.append(ent)\n doc.ents = new_ents\n return doc",
"execution_count": 35,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "nlp.add_pipe(add_title, after='ner')",
"execution_count": 36,
"outputs": []
},
{
"metadata": {},
"cell_type": "markdown",
"source": "### User of Past of Speech (POS) and Dependency Parsing"
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "nlp = spacy.load('en_core_web_lg')",
"execution_count": 40,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "doc = nlp('Jay Jannel is working at Oregon State University.')",
"execution_count": 44,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "doc",
"execution_count": 45,
"outputs": [
{
"output_type": "execute_result",
"execution_count": 45,
"data": {
"text/plain": "Jay Jannel is working at Oregon State University."
},
"metadata": {}
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "displacy.render(doc, style='dep', options = {'compact': True, 'distance': 100})",
"execution_count": 46,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/plain": "<IPython.core.display.HTML object>",
"text/html": "<span class=\"tex2jax_ignore\"><svg xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\" xml:lang=\"en\" id=\"5e0af996ab4b4489a64e572a54dd7c1b-0\" class=\"displacy\" width=\"850\" height=\"287.0\" direction=\"ltr\" style=\"max-width: none; height: 287.0px; color: #000000; background: #ffffff; font-family: Arial; direction: ltr\">\n<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"197.0\">\n <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"50\">Jay</tspan>\n <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"50\">PROPN</tspan>\n</text>\n\n<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"197.0\">\n <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"150\">Jannel</tspan>\n <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"150\">PROPN</tspan>\n</text>\n\n<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"197.0\">\n <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"250\">is</tspan>\n <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"250\">AUX</tspan>\n</text>\n\n<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"197.0\">\n <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"350\">working</tspan>\n <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"350\">VERB</tspan>\n</text>\n\n<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"197.0\">\n <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"450\">at</tspan>\n <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"450\">ADP</tspan>\n</text>\n\n<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"197.0\">\n <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"550\">Oregon</tspan>\n <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"550\">PROPN</tspan>\n</text>\n\n<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"197.0\">\n <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"650\">State</tspan>\n <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"650\">PROPN</tspan>\n</text>\n\n<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"197.0\">\n <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"750\">University.</tspan>\n <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"750\">PROPN</tspan>\n</text>\n\n<g class=\"displacy-arrow\">\n <path class=\"displacy-arc\" id=\"arrow-5e0af996ab4b4489a64e572a54dd7c1b-0-0\" stroke-width=\"2px\" d=\"M62,152.0 62,135.33333333333334 144.0,135.33333333333334 144.0,152.0\" fill=\"none\" stroke=\"currentColor\"/>\n <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n <textPath xlink:href=\"#arrow-5e0af996ab4b4489a64e572a54dd7c1b-0-0\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">compound</textPath>\n </text>\n <path class=\"displacy-arrowhead\" d=\"M62,154.0 L58,146.0 66,146.0\" fill=\"currentColor\"/>\n</g>\n\n<g class=\"displacy-arrow\">\n <path class=\"displacy-arc\" id=\"arrow-5e0af996ab4b4489a64e572a54dd7c1b-0-1\" stroke-width=\"2px\" d=\"M162,152.0 162,118.66666666666666 347.0,118.66666666666666 347.0,152.0\" fill=\"none\" stroke=\"currentColor\"/>\n <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n <textPath xlink:href=\"#arrow-5e0af996ab4b4489a64e572a54dd7c1b-0-1\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">nsubj</textPath>\n </text>\n <path class=\"displacy-arrowhead\" d=\"M162,154.0 L158,146.0 166,146.0\" fill=\"currentColor\"/>\n</g>\n\n<g class=\"displacy-arrow\">\n <path class=\"displacy-arc\" id=\"arrow-5e0af996ab4b4489a64e572a54dd7c1b-0-2\" stroke-width=\"2px\" d=\"M262,152.0 262,135.33333333333334 344.0,135.33333333333334 344.0,152.0\" fill=\"none\" stroke=\"currentColor\"/>\n <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n <textPath xlink:href=\"#arrow-5e0af996ab4b4489a64e572a54dd7c1b-0-2\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">aux</textPath>\n </text>\n <path class=\"displacy-arrowhead\" d=\"M262,154.0 L258,146.0 266,146.0\" fill=\"currentColor\"/>\n</g>\n\n<g class=\"displacy-arrow\">\n <path class=\"displacy-arc\" id=\"arrow-5e0af996ab4b4489a64e572a54dd7c1b-0-3\" stroke-width=\"2px\" d=\"M362,152.0 362,135.33333333333334 444.0,135.33333333333334 444.0,152.0\" fill=\"none\" stroke=\"currentColor\"/>\n <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n <textPath xlink:href=\"#arrow-5e0af996ab4b4489a64e572a54dd7c1b-0-3\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">prep</textPath>\n </text>\n <path class=\"displacy-arrowhead\" d=\"M444.0,154.0 L448.0,146.0 440.0,146.0\" fill=\"currentColor\"/>\n</g>\n\n<g class=\"displacy-arrow\">\n <path class=\"displacy-arc\" id=\"arrow-5e0af996ab4b4489a64e572a54dd7c1b-0-4\" stroke-width=\"2px\" d=\"M562,152.0 562,118.66666666666666 747.0,118.66666666666666 747.0,152.0\" fill=\"none\" stroke=\"currentColor\"/>\n <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n <textPath xlink:href=\"#arrow-5e0af996ab4b4489a64e572a54dd7c1b-0-4\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">compound</textPath>\n </text>\n <path class=\"displacy-arrowhead\" d=\"M562,154.0 L558,146.0 566,146.0\" fill=\"currentColor\"/>\n</g>\n\n<g class=\"displacy-arrow\">\n <path class=\"displacy-arc\" id=\"arrow-5e0af996ab4b4489a64e572a54dd7c1b-0-5\" stroke-width=\"2px\" d=\"M662,152.0 662,135.33333333333334 744.0,135.33333333333334 744.0,152.0\" fill=\"none\" stroke=\"currentColor\"/>\n <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n <textPath xlink:href=\"#arrow-5e0af996ab4b4489a64e572a54dd7c1b-0-5\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">compound</textPath>\n </text>\n <path class=\"displacy-arrowhead\" d=\"M662,154.0 L658,146.0 666,146.0\" fill=\"currentColor\"/>\n</g>\n\n<g class=\"displacy-arrow\">\n <path class=\"displacy-arc\" id=\"arrow-5e0af996ab4b4489a64e572a54dd7c1b-0-6\" stroke-width=\"2px\" d=\"M462,152.0 462,102.0 750.0,102.0 750.0,152.0\" fill=\"none\" stroke=\"currentColor\"/>\n <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n <textPath xlink:href=\"#arrow-5e0af996ab4b4489a64e572a54dd7c1b-0-6\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">pobj</textPath>\n </text>\n <path class=\"displacy-arrowhead\" d=\"M750.0,154.0 L754.0,146.0 746.0,146.0\" fill=\"currentColor\"/>\n</g>\n</svg></span>"
},
"metadata": {}
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "# Goal: Get the past and present work place.\ndef get_person_org(doc):\n # get all person entities\n person_entities = [ent for ent in doc.ents if ent.label_ == 'PERSON']\n print(person_entities)\n for ent in person_entities:\n head = ent.root.head\n print(head.lemma)\n if head.lemma_ == 'work':\n # get preposition which has dependcy of preposition of head\n preps = [token for token in head.children if token.dep_ == 'prep']\n print(preps)\n for prep in preps:\n # get org followed by preposition\n orgs = [token for token in prep.children if token.ent_type_ == 'ORG']\n print({'person': ent, 'orgs': orgs, 'past': head.tag_ == 'VBD'})\n return doc",
"execution_count": 79,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "from spacy.pipeline import merge_entities",
"execution_count": 48,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "nlp = spacy.load('en_core_web_lg')",
"execution_count": 98,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "#nlp.add_pipe(merge_entities)\nnlp.add_pipe(get_person_org)",
"execution_count": 81,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "nlp.remove_pipe('get_person_org')",
"execution_count": 80,
"outputs": [
{
"output_type": "execute_result",
"execution_count": 80,
"data": {
"text/plain": "('get_person_org', <function __main__.get_person_org(doc)>)"
},
"metadata": {}
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "doc = nlp('Jay Jannel worked at Oregon State University.')",
"execution_count": 84,
"outputs": [
{
"output_type": "stream",
"text": "[Jay Jannel]\n10038440415813069799\n[at]\n{'person': Jay Jannel, 'orgs': [Oregon State University], 'past': True}\n",
"name": "stdout"
}
]
},
{
"metadata": {},
"cell_type": "markdown",
"source": "### Modify Model"
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "# Goal: Get the past and present work place.\ndef get_person_org_modified(doc):\n # get all person entities\n person_entities = [ent for ent in doc.ents if ent.label_ == 'PERSON']\n print(person_entities)\n for ent in person_entities:\n head = ent.root.head\n print(head.lemma)\n if head.lemma_ == 'work':\n # get preposition which has dependcy of preposition of head\n preps = [token for token in head.children if token.dep_ == 'prep']\n print(preps)\n for prep in preps:\n # get org followed by preposition\n print(head.children)\n orgs = [token for token in prep.children if token.ent_type_ == 'ORG']\n \n # check for auxulary verb\n aux = [token for token in head.children if token.dep_ == 'aux']\n past_aux = any(t.tag_ == 'VBD' for t in aux)\n past = head.tag_ == 'VBD' or head.tag_ == 'VBG' and past_aux\n print({'person': ent, 'orgs': orgs, 'past': head.tag_ == 'VBD'})\n return doc",
"execution_count": 86,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "nlp.add_pipe(merge_entities)\nnlp.add_pipe(get_person_org_modified)",
"execution_count": 99,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "doc = nlp('Jay Baleno was working at Nissan.')",
"execution_count": 100,
"outputs": [
{
"output_type": "stream",
"text": "[Jay Baleno]\n10038440415813069799\n[at]\n<generator object at 0x0000024D48CA2A48>\n{'person': Jay Baleno, 'orgs': [Nissan], 'past': False}\n",
"name": "stdout"
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "nlp.pipe_factories",
"execution_count": 101,
"outputs": [
{
"output_type": "execute_result",
"execution_count": 101,
"data": {
"text/plain": "{'tagger': 'tagger',\n 'parser': 'parser',\n 'ner': 'ner',\n 'merge_entities': 'merge_entities',\n 'get_person_org_modified': 'get_person_org_modified'}"
},
"metadata": {}
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "",
"execution_count": null,
"outputs": []
}
],
"metadata": {
"kernelspec": {
"name": "python3",
"display_name": "Python 3",
"language": "python"
},
"toc": {
"nav_menu": {},
"number_sections": true,
"sideBar": true,
"skip_h1_title": false,
"base_numbering": 1,
"title_cell": "Table of Contents",
"title_sidebar": "Contents",
"toc_cell": false,
"toc_position": {},
"toc_section_display": true,
"toc_window_display": false
},
"language_info": {
"name": "python",
"version": "3.7.6",
"mimetype": "text/x-python",
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"pygments_lexer": "ipython3",
"nbconvert_exporter": "python",
"file_extension": ".py"
},
"gist": {
"id": "",
"data": {
"description": "spacy_and_custom_rules.ipynb",
"public": true
}
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment