Last active
June 17, 2020 07:05
-
-
Save thebishorup/d3d1a1fbed931d38cec427a062af56d1 to your computer and use it in GitHub Desktop.
Untitled.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"metadata": {}, | |
"cell_type": "markdown", | |
"source": "### Lingustic Annotation" | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "import spacy\nfrom spacy.matcher import Matcher\nfrom spacy.tokens import Span\nfrom spacy import displacy", | |
"execution_count": 1, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "nlp = spacy.load('en_core_web_lg')", | |
"execution_count": 2, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "matcher = Matcher(nlp.vocab)", | |
"execution_count": 3, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "pattern = [{'LOWER':'facebook'}, {'LEMMA':'be'}, {'POS':'ADV', 'OP':'*'}, {'POS':'ADJ'}]", | |
"execution_count": 4, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "matched_sentences = []", | |
"execution_count": 21, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "def callback_fb(matcher, doc, i, matches):\n matched_id, start, end = matches[i]\n span = doc[start:end]\n sent = span.sent\n \n match_ent = [{\n 'start':span.start_char - sent.start_char,\n 'end':span.end_char - sent.start_char,\n 'label': 'MATCH'\n }]\n \n matched_sentences.append({'text': sent.text, 'ents': match_ent})", | |
"execution_count": 22, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "matcher.add('fb', callback_fb, pattern)", | |
"execution_count": 23, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "doc = nlp('Facebook is best place to find friends. Facebook is good for mental health. Facebook has placed a community into a global village.')", | |
"execution_count": 24, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "matches = matcher(doc)", | |
"execution_count": 25, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "matches", | |
"execution_count": 26, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": "[(8017838677478259815, 0, 3), (8017838677478259815, 8, 11)]" | |
}, | |
"execution_count": 26, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "matched_sentences", | |
"execution_count": 27, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": "[{'text': 'Facebook is best place to find friends.',\n 'ents': [{'start': 0, 'end': 16, 'label': 'MATCH'}]},\n {'text': 'Facebook is good for mental health.',\n 'ents': [{'start': 0, 'end': 16, 'label': 'MATCH'}]}]" | |
}, | |
"execution_count": 27, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "displacy.render(matched_sentences, style='ent', manual=True)", | |
"execution_count": 28, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": "<span class=\"tex2jax_ignore\"><div class=\"entities\" style=\"line-height: 2.5; direction: ltr\">\n<mark class=\"entity\" style=\"background: #ddd; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;\">\n Facebook is best\n <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">MATCH</span>\n</mark>\n place to find friends.</div>\n\n<div class=\"entities\" style=\"line-height: 2.5; direction: ltr\">\n<mark class=\"entity\" style=\"background: #ddd; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;\">\n Facebook is good\n <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">MATCH</span>\n</mark>\n for mental health.</div></span>", | |
"text/plain": "<IPython.core.display.HTML object>" | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
] | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "markdown", | |
"source": "#### Extract Phone Number" | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "### (123) 456 7890 OR (123) 456-8901\npattern_phone = [{'ORTH': \"(\"}, {'SHAPE': 'ddd'}, {'ORTH': \")\"}, {'SHAPE': 'ddd'}, {'ORTH': '-', 'OP': '?'}, {'SHAPE': 'dddd'}]", | |
"execution_count": 67, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "matcher = Matcher(nlp.vocab)", | |
"execution_count": 66, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "matcher.add('phone_number', None, pattern_phone)", | |
"execution_count": 68, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "doc = nlp('My phone number is (123) 456-7890')", | |
"execution_count": 69, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "mathces = matcher(doc)", | |
"execution_count": 70, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "print([t.text for t in doc])", | |
"execution_count": 71, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": "['My', 'phone', 'number', 'is', '(', '123', ')', '456', '-', '7890']\n" | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "matches", | |
"execution_count": 72, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": "[(8017838677478259815, 0, 3), (8017838677478259815, 8, 11)]" | |
}, | |
"execution_count": 72, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"code_folding": [], | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "for match_id, start, end in matches:\n print(start, end)\n span = doc[start:end]\n print(span.text)", | |
"execution_count": 73, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": "0 3\nMy phone number\n8 11\n-7890\n" | |
} | |
] | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "markdown", | |
"source": "#### Email Address matching" | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "pattern_email = [{'TEXT': {'REGEX': '[a-zA-Z0-9-_.]+@[a-zA-Z0-9-_.]+'}}]", | |
"execution_count": 74, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "matcher = Matcher(nlp.vocab)\nmatcher.add('email', None, pattern_email)", | |
"execution_count": 75, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "text = 'Email me at jack_bisho@gmail.com.'\ndoc = nlp(text)", | |
"execution_count": 76, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "matches = matcher(doc)", | |
"execution_count": 77, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "matches", | |
"execution_count": 78, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": "[(7320900731437023467, 3, 4)]" | |
}, | |
"execution_count": 78, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "for match_id, start, end in matches:\n span = doc[start:end]\n print(span.text)", | |
"execution_count": 79, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": "jack_bisho@gmail.com\n" | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "markdown", | |
"source": "#### Efficient Phrase Matching" | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "from spacy.matcher import PhraseMatcher", | |
"execution_count": 80, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "matcher = PhraseMatcher(nlp.vocab)", | |
"execution_count": 81, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "terms = ['BARAC OBAMA', 'ANGELA MARKEL', 'NEW YORK']", | |
"execution_count": 83, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "pattern = [nlp.make_doc(text) for text in terms]", | |
"execution_count": 84, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "pattern", | |
"execution_count": 85, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"execution_count": 85, | |
"data": { | |
"text/plain": "[BARAC OBAMA, ANGELA MARKEL, NEW YORK]" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "matcher.add('pm', None, *pattern)", | |
"execution_count": 88, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "doc = nlp('U.S president BARAC OBAMA and German chancelor ANGELA MARKEL have a private meeting during U.N. meeting in NEW YORK')", | |
"execution_count": 93, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "doc", | |
"execution_count": 94, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"execution_count": 94, | |
"data": { | |
"text/plain": "U.S president BARAC OBAMA and German chancelor ANGELA MARKEL have a private meeting during U.N. meeting in NEW YORK" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "matches = matcher(doc)", | |
"execution_count": 95, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "matches", | |
"execution_count": 96, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"execution_count": 96, | |
"data": { | |
"text/plain": "[(10701989183306053849, 2, 4),\n (10701989183306053849, 7, 9),\n (10701989183306053849, 17, 19)]" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "for match_id, start, end in matches:\n span = doc[start:end]\n print(span.text)", | |
"execution_count": 98, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": "BARAC OBAMA\nANGELA MARKEL\nNEW YORK\n", | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "markdown", | |
"source": "#### Custom Rules Based Entity Recognition" | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "from spacy.pipeline import EntityRuler", | |
"execution_count": 99, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "ruler = EntityRuler(nlp)", | |
"execution_count": 100, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "pattern = [{'label': 'ORG', 'pattern': 'theBisho'}, {'label': 'GPE', 'pattern': [{'LOWER': 'san'}, {'LOWER': 'francisco'}]}]", | |
"execution_count": 101, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "pattern", | |
"execution_count": 102, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"execution_count": 102, | |
"data": { | |
"text/plain": "[{'label': 'ORG', 'pattern': 'theBisho'},\n {'label': 'GPE', 'pattern': [{'LOWER': 'san'}, {'LOWER': 'francisco'}]}]" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "ruler.add_patterns(pattern)\nnlp.add_pipe(ruler)", | |
"execution_count": 103, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "doc = nlp('My new organization is theBisho.')", | |
"execution_count": 104, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "for ent in doc.ents:\n print(ent.text, ent.label_)", | |
"execution_count": 105, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": "theBisho ORG\n", | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "", | |
"execution_count": null, | |
"outputs": [] | |
} | |
], | |
"metadata": { | |
"_draft": { | |
"nbviewer_url": "https://gist.github.com/d3d1a1fbed931d38cec427a062af56d1" | |
}, | |
"gist": { | |
"id": "d3d1a1fbed931d38cec427a062af56d1", | |
"data": { | |
"description": "Untitled.ipynb", | |
"public": true | |
} | |
}, | |
"kernelspec": { | |
"name": "python3", | |
"display_name": "Python 3", | |
"language": "python" | |
}, | |
"language_info": { | |
"name": "python", | |
"version": "3.7.6", | |
"mimetype": "text/x-python", | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"pygments_lexer": "ipython3", | |
"nbconvert_exporter": "python", | |
"file_extension": ".py" | |
}, | |
"toc": { | |
"nav_menu": {}, | |
"number_sections": true, | |
"sideBar": true, | |
"skip_h1_title": false, | |
"base_numbering": 1, | |
"title_cell": "Table of Contents", | |
"title_sidebar": "Contents", | |
"toc_cell": false, | |
"toc_position": {}, | |
"toc_section_display": true, | |
"toc_window_display": false | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 4 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment