thebishorup/get_phone_email_from_text.ipynb

## get_phone_email_from_text.ipynb
{
  "cells": [
    {
      "metadata": {},
      "cell_type": "markdown",
      "source": "### Linguistic Annotation"
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "import spacy\nfrom spacy.matcher import Matcher\nfrom spacy.tokens import Span\nfrom spacy import displacy",
      "execution_count": 1,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "nlp = spacy.load('en_core_web_lg')",
      "execution_count": 2,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "matcher = Matcher(nlp.vocab)",
      "execution_count": 3,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "pattern = [{'LOWER':'facebook'}, {'LEMMA':'be'}, {'POS':'ADV', 'OP':'*'}, {'POS':'ADJ'}]",
      "execution_count": 4,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "matched_sentences = []",
      "execution_count": 21,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "def callback_fb(matcher, doc, i, matches):\n    \"\"\"Record the sentence around match `i` in displaCy's manual 'ent' format.\n\n    Appends one dict to the notebook-level `matched_sentences` list: the\n    sentence text plus a single 'MATCH' entity whose character offsets are\n    measured from the start of that sentence.\n    \"\"\"\n    _, tok_start, tok_end = matches[i]\n    hit = doc[tok_start:tok_end]\n    sentence = hit.sent\n    base = sentence.start_char  # shift doc-level offsets to sentence-level\n    entity = {\n        'start': hit.start_char - base,\n        'end': hit.end_char - base,\n        'label': 'MATCH',\n    }\n    matched_sentences.append({'text': sentence.text, 'ents': [entity]})",
      "execution_count": 22,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "matcher.add('fb', callback_fb, pattern)",
      "execution_count": 23,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "doc = nlp('Facebook is best place to find friends. Facebook is good for mental health. Facebook has placed a community into a global village.')",
      "execution_count": 24,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "matches = matcher(doc)",
      "execution_count": 25,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "matches",
      "execution_count": 26,
      "outputs": [
        {
          "data": {
            "text/plain": "[(8017838677478259815, 0, 3), (8017838677478259815, 8, 11)]"
          },
          "execution_count": 26,
          "metadata": {},
          "output_type": "execute_result"
        }
      ]
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "matched_sentences",
      "execution_count": 27,
      "outputs": [
        {
          "data": {
            "text/plain": "[{'text': 'Facebook is best place to find friends.',\n  'ents': [{'start': 0, 'end': 16, 'label': 'MATCH'}]},\n {'text': 'Facebook is good for mental health.',\n  'ents': [{'start': 0, 'end': 16, 'label': 'MATCH'}]}]"
          },
          "execution_count": 27,
          "metadata": {},
          "output_type": "execute_result"
        }
      ]
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "displacy.render(matched_sentences, style='ent', manual=True)",
      "execution_count": 28,
      "outputs": [
        {
          "data": {
            "text/html": "<span class=\"tex2jax_ignore\"><div class=\"entities\" style=\"line-height: 2.5; direction: ltr\">\n<mark class=\"entity\" style=\"background: #ddd; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;\">\n    Facebook is best\n    <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">MATCH</span>\n</mark>\n place to find friends.</div>\n\n<div class=\"entities\" style=\"line-height: 2.5; direction: ltr\">\n<mark class=\"entity\" style=\"background: #ddd; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;\">\n    Facebook is good\n    <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">MATCH</span>\n</mark>\n for mental health.</div></span>",
            "text/plain": "<IPython.core.display.HTML object>"
          },
          "metadata": {},
          "output_type": "display_data"
        }
      ]
    },
    {
      "metadata": {},
      "cell_type": "markdown",
      "source": "#### Extract Phone Number"
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "### (123) 456 7890 OR (123) 456-8901\npattern_phone = [{'ORTH': \"(\"}, {'SHAPE': 'ddd'}, {'ORTH': \")\"}, {'SHAPE': 'ddd'}, {'ORTH': '-', 'OP': '?'}, {'SHAPE': 'dddd'}]",
      "execution_count": 67,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "matcher = Matcher(nlp.vocab)",
      "execution_count": 66,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "matcher.add('phone_number', None, pattern_phone)",
      "execution_count": 68,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "doc = nlp('My phone number is (123) 456-7890')",
      "execution_count": 69,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "# Bind the result to `matches`: the following cells display and iterate over that name.\nmatches = matcher(doc)",
      "execution_count": 70,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "print([t.text for t in doc])",
      "execution_count": 71,
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": "['My', 'phone', 'number', 'is', '(', '123', ')', '456', '-', '7890']\n"
        }
      ]
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "matches",
      "execution_count": 72,
      "outputs": [
        {
          "data": {
            "text/plain": "[(8017838677478259815, 0, 3), (8017838677478259815, 8, 11)]"
          },
          "execution_count": 72,
          "metadata": {},
          "output_type": "execute_result"
        }
      ]
    },
    {
      "metadata": {
        "code_folding": [],
        "trusted": true
      },
      "cell_type": "code",
      "source": "for match_id, start, end in matches:\n    print(start, end)\n    span = doc[start:end]\n    print(span.text)",
      "execution_count": 73,
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": "0 3\nMy phone number\n8 11\n-7890\n"
        }
      ]
    },
    {
      "metadata": {},
      "cell_type": "markdown",
      "source": "#### Email Address matching"
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "pattern_email = [{'TEXT': {'REGEX': '[a-zA-Z0-9-_.]+@[a-zA-Z0-9-_.]+'}}]",
      "execution_count": 74,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "matcher = Matcher(nlp.vocab)\nmatcher.add('email', None, pattern_email)",
      "execution_count": 75,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "text = 'Email me at jack_bisho@gmail.com.'\ndoc = nlp(text)",
      "execution_count": 76,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "matches = matcher(doc)",
      "execution_count": 77,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "matches",
      "execution_count": 78,
      "outputs": [
        {
          "data": {
            "text/plain": "[(7320900731437023467, 3, 4)]"
          },
          "execution_count": 78,
          "metadata": {},
          "output_type": "execute_result"
        }
      ]
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "for match_id, start, end in matches:\n    span = doc[start:end]\n    print(span.text)",
      "execution_count": 79,
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": "jack_bisho@gmail.com\n"
        }
      ]
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "markdown",
      "source": "#### Efficient Phrase Matching"
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "from spacy.matcher import PhraseMatcher",
      "execution_count": 80,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "matcher = PhraseMatcher(nlp.vocab)",
      "execution_count": 81,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "terms = ['BARAC OBAMA', 'ANGELA MARKEL', 'NEW YORK']",
      "execution_count": 83,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "pattern = [nlp.make_doc(text) for text in terms]",
      "execution_count": 84,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "pattern",
      "execution_count": 85,
      "outputs": [
        {
          "output_type": "execute_result",
          "execution_count": 85,
          "data": {
            "text/plain": "[BARAC OBAMA, ANGELA MARKEL, NEW YORK]"
          },
          "metadata": {}
        }
      ]
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "matcher.add('pm', None, *pattern)",
      "execution_count": 88,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "doc = nlp('U.S president BARAC OBAMA and German chancelor ANGELA MARKEL have a private meeting during U.N. meeting in NEW YORK')",
      "execution_count": 93,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "doc",
      "execution_count": 94,
      "outputs": [
        {
          "output_type": "execute_result",
          "execution_count": 94,
          "data": {
            "text/plain": "U.S president BARAC OBAMA and German chancelor ANGELA MARKEL have a private meeting during U.N. meeting in NEW YORK"
          },
          "metadata": {}
        }
      ]
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "matches = matcher(doc)",
      "execution_count": 95,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "matches",
      "execution_count": 96,
      "outputs": [
        {
          "output_type": "execute_result",
          "execution_count": 96,
          "data": {
            "text/plain": "[(10701989183306053849, 2, 4),\n (10701989183306053849, 7, 9),\n (10701989183306053849, 17, 19)]"
          },
          "metadata": {}
        }
      ]
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "for match_id, start, end in matches:\n    span = doc[start:end]\n    print(span.text)",
      "execution_count": 98,
      "outputs": [
        {
          "output_type": "stream",
          "text": "BARAC OBAMA\nANGELA MARKEL\nNEW YORK\n",
          "name": "stdout"
        }
      ]
    },
    {
      "metadata": {},
      "cell_type": "markdown",
      "source": "#### Custom Rule-Based Entity Recognition"
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "from spacy.pipeline import EntityRuler",
      "execution_count": 99,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "ruler = EntityRuler(nlp)",
      "execution_count": 100,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "pattern = [{'label': 'ORG', 'pattern': 'theBisho'}, {'label': 'GPE', 'pattern': [{'LOWER': 'san'}, {'LOWER': 'francisco'}]}]",
      "execution_count": 101,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "pattern",
      "execution_count": 102,
      "outputs": [
        {
          "output_type": "execute_result",
          "execution_count": 102,
          "data": {
            "text/plain": "[{'label': 'ORG', 'pattern': 'theBisho'},\n {'label': 'GPE', 'pattern': [{'LOWER': 'san'}, {'LOWER': 'francisco'}]}]"
          },
          "metadata": {}
        }
      ]
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "ruler.add_patterns(pattern)\nnlp.add_pipe(ruler)",
      "execution_count": 103,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "doc = nlp('My new organization is theBisho.')",
      "execution_count": 104,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "for ent in doc.ents:\n    print(ent.text, ent.label_)",
      "execution_count": 105,
      "outputs": [
        {
          "output_type": "stream",
          "text": "theBisho ORG\n",
          "name": "stdout"
        }
      ]
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "",
      "execution_count": null,
      "outputs": []
    }
  ],
  "metadata": {
    "_draft": {
      "nbviewer_url": "https://gist.github.com/d3d1a1fbed931d38cec427a062af56d1"
    },
    "gist": {
      "id": "d3d1a1fbed931d38cec427a062af56d1",
      "data": {
        "description": "Untitled.ipynb",
        "public": true
      }
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3",
      "language": "python"
    },
    "language_info": {
      "name": "python",
      "version": "3.7.6",
      "mimetype": "text/x-python",
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "pygments_lexer": "ipython3",
      "nbconvert_exporter": "python",
      "file_extension": ".py"
    },
    "toc": {
      "nav_menu": {},
      "number_sections": true,
      "sideBar": true,
      "skip_h1_title": false,
      "base_numbering": 1,
      "title_cell": "Table of Contents",
      "title_sidebar": "Contents",
      "toc_cell": false,
      "toc_position": {},
      "toc_section_display": true,
      "toc_window_display": false
    }
  },
  "nbformat": 4,
  "nbformat_minor": 4
}
	{
	"cells": [
	{
	"metadata": {},
	"cell_type": "markdown",
	"source": "### Linguistic Annotation"
	},
	{
	"metadata": {
	"trusted": true
	},
	"cell_type": "code",
	"source": "import spacy\nfrom spacy.matcher import Matcher\nfrom spacy.tokens import Span\nfrom spacy import displacy",
	"execution_count": 1,
	"outputs": []
	},
	{
	"metadata": {
	"trusted": true
	},
	"cell_type": "code",
	"source": "nlp = spacy.load('en_core_web_lg')",
	"execution_count": 2,
	"outputs": []
	},
	{
	"metadata": {
	"trusted": true
	},
	"cell_type": "code",
	"source": "matcher = Matcher(nlp.vocab)",
	"execution_count": 3,
	"outputs": []
	},
	{
	"metadata": {
	"trusted": true
	},
	"cell_type": "code",
	"source": "pattern = [{'LOWER':'facebook'}, {'LEMMA':'be'}, {'POS':'ADV', 'OP':'*'}, {'POS':'ADJ'}]",
	"execution_count": 4,
	"outputs": []
	},
	{
	"metadata": {
	"trusted": true
	},
	"cell_type": "code",
	"source": "matched_sentences = []",
	"execution_count": 21,
	"outputs": []
	},
	{
	"metadata": {
	"trusted": true
	},
	"cell_type": "code",
	"source": "def callback_fb(matcher, doc, i, matches):\n    \"\"\"Record the sentence around match `i` in displaCy's manual 'ent' format.\n\n    Appends one dict to the notebook-level `matched_sentences` list: the\n    sentence text plus a single 'MATCH' entity whose character offsets are\n    measured from the start of that sentence.\n    \"\"\"\n    _, tok_start, tok_end = matches[i]\n    hit = doc[tok_start:tok_end]\n    sentence = hit.sent\n    base = sentence.start_char  # shift doc-level offsets to sentence-level\n    entity = {\n        'start': hit.start_char - base,\n        'end': hit.end_char - base,\n        'label': 'MATCH',\n    }\n    matched_sentences.append({'text': sentence.text, 'ents': [entity]})",
	"execution_count": 22,
	"outputs": []
	},
	{
	"metadata": {
	"trusted": true
	},
	"cell_type": "code",
	"source": "matcher.add('fb', callback_fb, pattern)",
	"execution_count": 23,
	"outputs": []
	},
	{
	"metadata": {
	"trusted": true
	},
	"cell_type": "code",
	"source": "doc = nlp('Facebook is best place to find friends. Facebook is good for mental health. Facebook has placed a community into a global village.')",
	"execution_count": 24,
	"outputs": []
	},
	{
	"metadata": {
	"trusted": true
	},
	"cell_type": "code",
	"source": "matches = matcher(doc)",
	"execution_count": 25,
	"outputs": []
	},
	{
	"metadata": {
	"trusted": true
	},
	"cell_type": "code",
	"source": "matches",
	"execution_count": 26,
	"outputs": [
	{
	"data": {
	"text/plain": "[(8017838677478259815, 0, 3), (8017838677478259815, 8, 11)]"
	},
	"execution_count": 26,
	"metadata": {},
	"output_type": "execute_result"
	}
	]
	},
	{
	"metadata": {
	"trusted": true
	},
	"cell_type": "code",
	"source": "matched_sentences",
	"execution_count": 27,
	"outputs": [
	{
	"data": {
	"text/plain": "[{'text': 'Facebook is best place to find friends.',\n 'ents': [{'start': 0, 'end': 16, 'label': 'MATCH'}]},\n {'text': 'Facebook is good for mental health.',\n 'ents': [{'start': 0, 'end': 16, 'label': 'MATCH'}]}]"
	},
	"execution_count": 27,
	"metadata": {},
	"output_type": "execute_result"
	}
	]
	},
	{
	"metadata": {
	"trusted": true
	},
	"cell_type": "code",
	"source": "displacy.render(matched_sentences, style='ent', manual=True)",
	"execution_count": 28,
	"outputs": [
	{
	"data": {
	"text/html": "<span class=\"tex2jax_ignore\"><div class=\"entities\" style=\"line-height: 2.5; direction: ltr\">\n<mark class=\"entity\" style=\"background: #ddd; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;\">\n Facebook is best\n <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">MATCH</span>\n</mark>\n place to find friends.</div>\n\n<div class=\"entities\" style=\"line-height: 2.5; direction: ltr\">\n<mark class=\"entity\" style=\"background: #ddd; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;\">\n Facebook is good\n <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">MATCH</span>\n</mark>\n for mental health.</div></span>",
	"text/plain": "<IPython.core.display.HTML object>"
	},
	"metadata": {},
	"output_type": "display_data"
	}
	]
	},
	{
	"metadata": {},
	"cell_type": "markdown",
	"source": "#### Extract Phone Number"
	},
	{
	"metadata": {
	"trusted": true
	},
	"cell_type": "code",
	"source": "### (123) 456 7890 OR (123) 456-8901\npattern_phone = [{'ORTH': \"(\"}, {'SHAPE': 'ddd'}, {'ORTH': \")\"}, {'SHAPE': 'ddd'}, {'ORTH': '-', 'OP': '?'}, {'SHAPE': 'dddd'}]",
	"execution_count": 67,
	"outputs": []
	},
	{
	"metadata": {
	"trusted": true
	},
	"cell_type": "code",
	"source": "matcher = Matcher(nlp.vocab)",
	"execution_count": 66,
	"outputs": []
	},
	{
	"metadata": {
	"trusted": true
	},
	"cell_type": "code",
	"source": "matcher.add('phone_number', None, pattern_phone)",
	"execution_count": 68,
	"outputs": []
	},
	{
	"metadata": {
	"trusted": true
	},
	"cell_type": "code",
	"source": "doc = nlp('My phone number is (123) 456-7890')",
	"execution_count": 69,
	"outputs": []
	},
	{
	"metadata": {
	"trusted": true
	},
	"cell_type": "code",
	"source": "# Bind the result to `matches`: the following cells display and iterate over that name.\nmatches = matcher(doc)",
	"execution_count": 70,
	"outputs": []
	},
	{
	"metadata": {
	"trusted": true
	},
	"cell_type": "code",
	"source": "print([t.text for t in doc])",
	"execution_count": 71,
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": "['My', 'phone', 'number', 'is', '(', '123', ')', '456', '-', '7890']\n"
	}
	]
	},
	{
	"metadata": {
	"trusted": true
	},
	"cell_type": "code",
	"source": "matches",
	"execution_count": 72,
	"outputs": [
	{
	"data": {
	"text/plain": "[(8017838677478259815, 0, 3), (8017838677478259815, 8, 11)]"
	},
	"execution_count": 72,
	"metadata": {},
	"output_type": "execute_result"
	}
	]
	},
	{
	"metadata": {
	"code_folding": [],
	"trusted": true
	},
	"cell_type": "code",
	"source": "for match_id, start, end in matches:\n print(start, end)\n span = doc[start:end]\n print(span.text)",
	"execution_count": 73,
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": "0 3\nMy phone number\n8 11\n-7890\n"
	}
	]
	},
	{
	"metadata": {},
	"cell_type": "markdown",
	"source": "#### Email Address matching"
	},
	{
	"metadata": {
	"trusted": true
	},
	"cell_type": "code",
	"source": "pattern_email = [{'TEXT': {'REGEX': '[a-zA-Z0-9-_.]+@[a-zA-Z0-9-_.]+'}}]",
	"execution_count": 74,
	"outputs": []
	},
	{
	"metadata": {
	"trusted": true
	},
	"cell_type": "code",
	"source": "matcher = Matcher(nlp.vocab)\nmatcher.add('email', None, pattern_email)",
	"execution_count": 75,
	"outputs": []
	},
	{
	"metadata": {
	"trusted": true
	},
	"cell_type": "code",
	"source": "text = 'Email me at jack_bisho@gmail.com.'\ndoc = nlp(text)",
	"execution_count": 76,
	"outputs": []
	},
	{
	"metadata": {
	"trusted": true
	},
	"cell_type": "code",
	"source": "matches = matcher(doc)",
	"execution_count": 77,
	"outputs": []
	},
	{
	"metadata": {
	"trusted": true
	},
	"cell_type": "code",
	"source": "matches",
	"execution_count": 78,
	"outputs": [
	{
	"data": {
	"text/plain": "[(7320900731437023467, 3, 4)]"
	},
	"execution_count": 78,
	"metadata": {},
	"output_type": "execute_result"
	}
	]
	},
	{
	"metadata": {
	"trusted": true
	},
	"cell_type": "code",
	"source": "for match_id, start, end in matches:\n span = doc[start:end]\n print(span.text)",
	"execution_count": 79,
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": "jack_bisho@gmail.com\n"
	}
	]
	},
	{
	"metadata": {
	"trusted": true
	},
	"cell_type": "markdown",
	"source": "#### Efficient Phrase Matching"
	},
	{
	"metadata": {
	"trusted": true
	},
	"cell_type": "code",
	"source": "from spacy.matcher import PhraseMatcher",
	"execution_count": 80,
	"outputs": []
	},
	{
	"metadata": {
	"trusted": true
	},
	"cell_type": "code",
	"source": "matcher = PhraseMatcher(nlp.vocab)",
	"execution_count": 81,
	"outputs": []
	},
	{
	"metadata": {
	"trusted": true
	},
	"cell_type": "code",
	"source": "terms = ['BARAC OBAMA', 'ANGELA MARKEL', 'NEW YORK']",
	"execution_count": 83,
	"outputs": []
	},
	{
	"metadata": {
	"trusted": true
	},
	"cell_type": "code",
	"source": "pattern = [nlp.make_doc(text) for text in terms]",
	"execution_count": 84,
	"outputs": []
	},
	{
	"metadata": {
	"trusted": true
	},
	"cell_type": "code",
	"source": "pattern",
	"execution_count": 85,
	"outputs": [
	{
	"output_type": "execute_result",
	"execution_count": 85,
	"data": {
	"text/plain": "[BARAC OBAMA, ANGELA MARKEL, NEW YORK]"
	},
	"metadata": {}
	}
	]
	},
	{
	"metadata": {
	"trusted": true
	},
	"cell_type": "code",
	"source": "matcher.add('pm', None, *pattern)",
	"execution_count": 88,
	"outputs": []
	},
	{
	"metadata": {
	"trusted": true
	},
	"cell_type": "code",
	"source": "doc = nlp('U.S president BARAC OBAMA and German chancelor ANGELA MARKEL have a private meeting during U.N. meeting in NEW YORK')",
	"execution_count": 93,
	"outputs": []
	},
	{
	"metadata": {
	"trusted": true
	},
	"cell_type": "code",
	"source": "doc",
	"execution_count": 94,
	"outputs": [
	{
	"output_type": "execute_result",
	"execution_count": 94,
	"data": {
	"text/plain": "U.S president BARAC OBAMA and German chancelor ANGELA MARKEL have a private meeting during U.N. meeting in NEW YORK"
	},
	"metadata": {}
	}
	]
	},
	{
	"metadata": {
	"trusted": true
	},
	"cell_type": "code",
	"source": "matches = matcher(doc)",
	"execution_count": 95,
	"outputs": []
	},
	{
	"metadata": {
	"trusted": true
	},
	"cell_type": "code",
	"source": "matches",
	"execution_count": 96,
	"outputs": [
	{
	"output_type": "execute_result",
	"execution_count": 96,
	"data": {
	"text/plain": "[(10701989183306053849, 2, 4),\n (10701989183306053849, 7, 9),\n (10701989183306053849, 17, 19)]"
	},
	"metadata": {}
	}
	]
	},
	{
	"metadata": {
	"trusted": true
	},
	"cell_type": "code",
	"source": "for match_id, start, end in matches:\n span = doc[start:end]\n print(span.text)",
	"execution_count": 98,
	"outputs": [
	{
	"output_type": "stream",
	"text": "BARAC OBAMA\nANGELA MARKEL\nNEW YORK\n",
	"name": "stdout"
	}
	]
	},
	{
	"metadata": {},
	"cell_type": "markdown",
	"source": "#### Custom Rule-Based Entity Recognition"
	},
	{
	"metadata": {
	"trusted": true
	},
	"cell_type": "code",
	"source": "from spacy.pipeline import EntityRuler",
	"execution_count": 99,
	"outputs": []
	},
	{
	"metadata": {
	"trusted": true
	},
	"cell_type": "code",
	"source": "ruler = EntityRuler(nlp)",
	"execution_count": 100,
	"outputs": []
	},
	{
	"metadata": {
	"trusted": true
	},
	"cell_type": "code",
	"source": "pattern = [{'label': 'ORG', 'pattern': 'theBisho'}, {'label': 'GPE', 'pattern': [{'LOWER': 'san'}, {'LOWER': 'francisco'}]}]",
	"execution_count": 101,
	"outputs": []
	},
	{
	"metadata": {
	"trusted": true
	},
	"cell_type": "code",
	"source": "pattern",
	"execution_count": 102,
	"outputs": [
	{
	"output_type": "execute_result",
	"execution_count": 102,
	"data": {
	"text/plain": "[{'label': 'ORG', 'pattern': 'theBisho'},\n {'label': 'GPE', 'pattern': [{'LOWER': 'san'}, {'LOWER': 'francisco'}]}]"
	},
	"metadata": {}
	}
	]
	},
	{
	"metadata": {
	"trusted": true
	},
	"cell_type": "code",
	"source": "ruler.add_patterns(pattern)\nnlp.add_pipe(ruler)",
	"execution_count": 103,
	"outputs": []
	},
	{
	"metadata": {
	"trusted": true
	},
	"cell_type": "code",
	"source": "doc = nlp('My new organization is theBisho.')",
	"execution_count": 104,
	"outputs": []
	},
	{
	"metadata": {
	"trusted": true
	},
	"cell_type": "code",
	"source": "for ent in doc.ents:\n print(ent.text, ent.label_)",
	"execution_count": 105,
	"outputs": [
	{
	"output_type": "stream",
	"text": "theBisho ORG\n",
	"name": "stdout"
	}
	]
	},
	{
	"metadata": {
	"trusted": true
	},
	"cell_type": "code",
	"source": "",
	"execution_count": null,
	"outputs": []
	}
	],
	"metadata": {
	"_draft": {
	"nbviewer_url": "https://gist.github.com/d3d1a1fbed931d38cec427a062af56d1"
	},
	"gist": {
	"id": "d3d1a1fbed931d38cec427a062af56d1",
	"data": {
	"description": "Untitled.ipynb",
	"public": true
	}
	},
	"kernelspec": {
	"name": "python3",
	"display_name": "Python 3",
	"language": "python"
	},
	"language_info": {
	"name": "python",
	"version": "3.7.6",
	"mimetype": "text/x-python",
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"pygments_lexer": "ipython3",
	"nbconvert_exporter": "python",
	"file_extension": ".py"
	},
	"toc": {
	"nav_menu": {},
	"number_sections": true,
	"sideBar": true,
	"skip_h1_title": false,
	"base_numbering": 1,
	"title_cell": "Table of Contents",
	"title_sidebar": "Contents",
	"toc_cell": false,
	"toc_position": {},
	"toc_section_display": true,
	"toc_window_display": false
	}
	},
	"nbformat": 4,
	"nbformat_minor": 4
	}