Skip to content

Instantly share code, notes, and snippets.

@thebishorup
Last active June 17, 2020 07:05
Show Gist options
  • Save thebishorup/d3d1a1fbed931d38cec427a062af56d1 to your computer and use it in GitHub Desktop.
Save thebishorup/d3d1a1fbed931d38cec427a062af56d1 to your computer and use it in GitHub Desktop.
Untitled.ipynb
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"metadata": {},
"cell_type": "markdown",
"source": "### Linguistic Annotation"
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "import spacy\nfrom spacy.matcher import Matcher\nfrom spacy.tokens import Span\nfrom spacy import displacy",
"execution_count": 1,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "nlp = spacy.load('en_core_web_lg')",
"execution_count": 2,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "matcher = Matcher(nlp.vocab)",
"execution_count": 3,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "pattern = [{'LOWER':'facebook'}, {'LEMMA':'be'}, {'POS':'ADV', 'OP':'*'}, {'POS':'ADJ'}]",
"execution_count": 4,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "matched_sentences = []",
"execution_count": 21,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "def callback_fb(matcher, doc, i, matches):\n    \"\"\"Collect the sentence around match `i` in displaCy's manual 'ent' format.\n\n    Registered as the on-match callback for the 'fb' pattern; each call\n    appends one entry to the module-level `matched_sentences` list.\n\n    Args:\n        matcher: the Matcher that produced the match (unused).\n        doc: the Doc being matched.\n        i: index of the current match within `matches`.\n        matches: list of (match_id, start, end) token-index tuples.\n    \"\"\"\n    _, start, end = matches[i]  # the match id is not needed here\n    span = doc[start:end]\n    sent = span.sent\n\n    # Offsets are made relative to the sentence because the sentence text,\n    # not the whole document, is what gets rendered.\n    match_ent = [{\n        'start': span.start_char - sent.start_char,\n        'end': span.end_char - sent.start_char,\n        'label': 'MATCH'\n    }]\n\n    matched_sentences.append({'text': sent.text, 'ents': match_ent})",
"execution_count": 22,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "matcher.add('fb', callback_fb, pattern)",
"execution_count": 23,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "doc = nlp('Facebook is best place to find friends. Facebook is good for mental health. Facebook has placed a community into a global village.')",
"execution_count": 24,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "matches = matcher(doc)",
"execution_count": 25,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "matches",
"execution_count": 26,
"outputs": [
{
"data": {
"text/plain": "[(8017838677478259815, 0, 3), (8017838677478259815, 8, 11)]"
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "matched_sentences",
"execution_count": 27,
"outputs": [
{
"data": {
"text/plain": "[{'text': 'Facebook is best place to find friends.',\n 'ents': [{'start': 0, 'end': 16, 'label': 'MATCH'}]},\n {'text': 'Facebook is good for mental health.',\n 'ents': [{'start': 0, 'end': 16, 'label': 'MATCH'}]}]"
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "displacy.render(matched_sentences, style='ent', manual=True)",
"execution_count": 28,
"outputs": [
{
"data": {
"text/html": "<span class=\"tex2jax_ignore\"><div class=\"entities\" style=\"line-height: 2.5; direction: ltr\">\n<mark class=\"entity\" style=\"background: #ddd; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;\">\n Facebook is best\n <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">MATCH</span>\n</mark>\n place to find friends.</div>\n\n<div class=\"entities\" style=\"line-height: 2.5; direction: ltr\">\n<mark class=\"entity\" style=\"background: #ddd; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;\">\n Facebook is good\n <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">MATCH</span>\n</mark>\n for mental health.</div></span>",
"text/plain": "<IPython.core.display.HTML object>"
},
"metadata": {},
"output_type": "display_data"
}
]
},
{
"metadata": {},
"cell_type": "markdown",
"source": "#### Extract Phone Number"
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "### (123) 456 7890 OR (123) 456-8901\npattern_phone = [{'ORTH': \"(\"}, {'SHAPE': 'ddd'}, {'ORTH': \")\"}, {'SHAPE': 'ddd'}, {'ORTH': '-', 'OP': '?'}, {'SHAPE': 'dddd'}]",
"execution_count": 67,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "matcher = Matcher(nlp.vocab)",
"execution_count": 66,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "matcher.add('phone_number', None, pattern_phone)",
"execution_count": 68,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "doc = nlp('My phone number is (123) 456-7890')",
"execution_count": 69,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "matches = matcher(doc)",
"execution_count": 70,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "print([t.text for t in doc])",
"execution_count": 71,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": "['My', 'phone', 'number', 'is', '(', '123', ')', '456', '-', '7890']\n"
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "matches",
"execution_count": 72,
"outputs": [
{
"data": {
"text/plain": "[(8017838677478259815, 0, 3), (8017838677478259815, 8, 11)]"
},
"execution_count": 72,
"metadata": {},
"output_type": "execute_result"
}
]
},
{
"metadata": {
"code_folding": [],
"trusted": true
},
"cell_type": "code",
"source": "for match_id, start, end in matches:\n print(start, end)\n span = doc[start:end]\n print(span.text)",
"execution_count": 73,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": "0 3\nMy phone number\n8 11\n-7890\n"
}
]
},
{
"metadata": {},
"cell_type": "markdown",
"source": "#### Email Address matching"
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "pattern_email = [{'TEXT': {'REGEX': '[a-zA-Z0-9-_.]+@[a-zA-Z0-9-_.]+'}}]",
"execution_count": 74,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "matcher = Matcher(nlp.vocab)\nmatcher.add('email', None, pattern_email)",
"execution_count": 75,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "text = 'Email me at jack_bisho@gmail.com.'\ndoc = nlp(text)",
"execution_count": 76,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "matches = matcher(doc)",
"execution_count": 77,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "matches",
"execution_count": 78,
"outputs": [
{
"data": {
"text/plain": "[(7320900731437023467, 3, 4)]"
},
"execution_count": 78,
"metadata": {},
"output_type": "execute_result"
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "for match_id, start, end in matches:\n span = doc[start:end]\n print(span.text)",
"execution_count": 79,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": "jack_bisho@gmail.com\n"
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "markdown",
"source": "#### Efficient Phrase Matching"
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "from spacy.matcher import PhraseMatcher",
"execution_count": 80,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "matcher = PhraseMatcher(nlp.vocab)",
"execution_count": 81,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "terms = ['BARAC OBAMA', 'ANGELA MARKEL', 'NEW YORK']",
"execution_count": 83,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "pattern = [nlp.make_doc(text) for text in terms]",
"execution_count": 84,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "pattern",
"execution_count": 85,
"outputs": [
{
"output_type": "execute_result",
"execution_count": 85,
"data": {
"text/plain": "[BARAC OBAMA, ANGELA MARKEL, NEW YORK]"
},
"metadata": {}
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "matcher.add('pm', None, *pattern)",
"execution_count": 88,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "doc = nlp('U.S president BARAC OBAMA and German chancelor ANGELA MARKEL have a private meeting during U.N. meeting in NEW YORK')",
"execution_count": 93,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "doc",
"execution_count": 94,
"outputs": [
{
"output_type": "execute_result",
"execution_count": 94,
"data": {
"text/plain": "U.S president BARAC OBAMA and German chancelor ANGELA MARKEL have a private meeting during U.N. meeting in NEW YORK"
},
"metadata": {}
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "matches = matcher(doc)",
"execution_count": 95,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "matches",
"execution_count": 96,
"outputs": [
{
"output_type": "execute_result",
"execution_count": 96,
"data": {
"text/plain": "[(10701989183306053849, 2, 4),\n (10701989183306053849, 7, 9),\n (10701989183306053849, 17, 19)]"
},
"metadata": {}
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "for match_id, start, end in matches:\n span = doc[start:end]\n print(span.text)",
"execution_count": 98,
"outputs": [
{
"output_type": "stream",
"text": "BARAC OBAMA\nANGELA MARKEL\nNEW YORK\n",
"name": "stdout"
}
]
},
{
"metadata": {},
"cell_type": "markdown",
"source": "#### Custom Rule-Based Entity Recognition"
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "from spacy.pipeline import EntityRuler",
"execution_count": 99,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "ruler = EntityRuler(nlp)",
"execution_count": 100,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "pattern = [{'label': 'ORG', 'pattern': 'theBisho'}, {'label': 'GPE', 'pattern': [{'LOWER': 'san'}, {'LOWER': 'francisco'}]}]",
"execution_count": 101,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "pattern",
"execution_count": 102,
"outputs": [
{
"output_type": "execute_result",
"execution_count": 102,
"data": {
"text/plain": "[{'label': 'ORG', 'pattern': 'theBisho'},\n {'label': 'GPE', 'pattern': [{'LOWER': 'san'}, {'LOWER': 'francisco'}]}]"
},
"metadata": {}
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "ruler.add_patterns(pattern)\nnlp.add_pipe(ruler)",
"execution_count": 103,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "doc = nlp('My new organization is theBisho.')",
"execution_count": 104,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "for ent in doc.ents:\n print(ent.text, ent.label_)",
"execution_count": 105,
"outputs": [
{
"output_type": "stream",
"text": "theBisho ORG\n",
"name": "stdout"
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "",
"execution_count": null,
"outputs": []
}
],
"metadata": {
"_draft": {
"nbviewer_url": "https://gist.github.com/d3d1a1fbed931d38cec427a062af56d1"
},
"gist": {
"id": "d3d1a1fbed931d38cec427a062af56d1",
"data": {
"description": "Untitled.ipynb",
"public": true
}
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3",
"language": "python"
},
"language_info": {
"name": "python",
"version": "3.7.6",
"mimetype": "text/x-python",
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"pygments_lexer": "ipython3",
"nbconvert_exporter": "python",
"file_extension": ".py"
},
"toc": {
"nav_menu": {},
"number_sections": true,
"sideBar": true,
"skip_h1_title": false,
"base_numbering": 1,
"title_cell": "Table of Contents",
"title_sidebar": "Contents",
"toc_cell": false,
"toc_position": {},
"toc_section_display": true,
"toc_window_display": false
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment