Last active
June 16, 2020 06:57
-
-
Save thebishorup/46854e255673344a9c52b42328fd0065 to your computer and use it in GitHub Desktop.
Rule Based Text Phrase Extraction and Matching.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"metadata": {}, | |
"cell_type": "markdown", | |
"source": "## Rule based matching" | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "markdown", | |
"source": "" | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "import spacy\nfrom spacy.matcher import Matcher\nfrom spacy.tokens import Span\nfrom spacy import displacy", | |
"execution_count": 38, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "nlp = spacy.load('en_core_web_lg')", | |
"execution_count": 40, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": false | |
}, | |
"cell_type": "code", | |
"source": "doc = nlp('Hello World!')", | |
"execution_count": 6, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": false | |
}, | |
"cell_type": "code", | |
"source": "doc", | |
"execution_count": 7, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": "Hello World!" | |
}, | |
"execution_count": 7, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": false | |
}, | |
"cell_type": "code", | |
"source": "for token in doc:\n print(token)", | |
"execution_count": 8, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": "Hello\nWorld\n!\n" | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": false | |
}, | |
"cell_type": "code", | |
"source": "pattern = [{\"LOWER\": 'hello'}, {'IS_PUNCT': True, 'OP': '?'}, {'LOWER': 'world'}]", | |
"execution_count": 17, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": false | |
}, | |
"cell_type": "code", | |
"source": "matcher = Matcher(nlp.vocab)\nmatcher.add('HelloWorld', None, pattern)", | |
"execution_count": 18, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": false | |
}, | |
"cell_type": "code", | |
"source": "doc = nlp('Hello world!')", | |
"execution_count": 19, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": false | |
}, | |
"cell_type": "code", | |
"source": "matches = matcher(doc)", | |
"execution_count": 20, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": false | |
}, | |
"cell_type": "code", | |
"source": "matches", | |
"execution_count": 21, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": "[(15578876784678163569, 0, 2)]" | |
}, | |
"execution_count": 21, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": false | |
}, | |
"cell_type": "code", | |
"source": "for match_id, start, end in matches:\n string_id = nlp.vocab.strings[match_id]\n span = doc[start: end]\n print(match_id, string_id, start, end, span.text)", | |
"execution_count": 22, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": "15578876784678163569 HelloWorld 0 2 Hello world\n" | |
} | |
] | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "markdown", | |
"source": "## Regular Expression" | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "text = 'Please reach me at 5554446666 for further discussion. Also my extention to the number is 456.'", | |
"execution_count": 8, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "import re", | |
"execution_count": 9, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "# identify phone number with (10 characters long and digit)\nre.search(r'\\d{10}', text)", | |
"execution_count": 10, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"execution_count": 10, | |
"data": { | |
"text/plain": "<re.Match object; span=(19, 29), match='5554446666'>" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "## find all the digits length ranging from 3 to 10\nre.findall(r'\\d{3,10}', text)", | |
"execution_count": 12, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"execution_count": 12, | |
"data": { | |
"text/plain": "['5554446666', '456']" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "## find all the words that has at least 4 charactrs\nre.findall(r'\\w{4,}', text)", | |
"execution_count": 14, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"execution_count": 14, | |
"data": { | |
"text/plain": "['Please',\n 'reach',\n '5554446666',\n 'further',\n 'discussion',\n 'Also',\n 'extention',\n 'number']" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "markdown", | |
"source": "### Wild cared text" | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "## find all the words that starts with p\nre.findall(r'P....', text)", | |
"execution_count": 18, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"execution_count": 18, | |
"data": { | |
"text/plain": "['Pleas']" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "## find the word that has character u\nre.findall(r'.u.', text)", | |
"execution_count": 20, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"execution_count": 20, | |
"data": { | |
"text/plain": "['fur', 'cus', 'num']" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "text = 'Please notice my phone ends with 8'", | |
"execution_count": 21, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "## find of sentence contains number at the end\nre.findall(r'\\d$', text)", | |
"execution_count": 24, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"execution_count": 24, | |
"data": { | |
"text/plain": "['8']" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "text = '7 is the first digit of my phone number and last is 8'", | |
"execution_count": 25, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "## find the digits at the start of the sentence\nre.findall(r'^\\d', text)", | |
"execution_count": 26, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"execution_count": 26, | |
"data": { | |
"text/plain": "['7']" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "markdown", | |
"source": "### Exclusion" | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "## get the content of sentece without digits\nre.findall(r'[^\\d]+', text)", | |
"execution_count": 27, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"execution_count": 27, | |
"data": { | |
"text/plain": "[' is the first digit of my phone number and last is ']" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "## get all the digits from sentence\nre.findall(r'[^\\D]', text)", | |
"execution_count": 30, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"execution_count": 30, | |
"data": { | |
"text/plain": "['7', '8']" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "text = 'This number has text-free plan as well as unlimited-call.'", | |
"execution_count": 31, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "## get the grouped word (hyphened)\nre.findall(r'[\\w]+-[\\w]+', text)", | |
"execution_count": 32, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"execution_count": 32, | |
"data": { | |
"text/plain": "['text-free', 'unlimited-call']" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "", | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "markdown", | |
"source": "## Regular Expression in spacy" | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "text = 'Other than gist-it, I’m using the following extensions: Code Font Size, Codefolding, Collapsible Headings, Hide Header, plus the ones that are ticked by default. If you want more, here’s one post on the top 5 extensions from Eliot Andres, a machine learning engineer. What are your favorite extensions?'", | |
"execution_count": 33, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "text", | |
"execution_count": 34, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"execution_count": 34, | |
"data": { | |
"text/plain": "'Other than gist-it, I’m using the following extensions: Code Font Size, Codefolding, Collapsible Headings, Hide Header, plus the ones that are ticked by default. If you want more, here’s one post on the top 5 extensions from Eliot Andres, a machine learning engineer. What are your favorite extensions?'" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "pattern = [{'TEXT':'Code'}]", | |
"execution_count": 73, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "def callback_custom_match(matcher, doc, i, matches):\n match_id, start, end = matches[i]\n entity = doc[start:end]\n print(entity.text)", | |
"execution_count": 74, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "matcher = Matcher(nlp.vocab)", | |
"execution_count": 75, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "matcher.add('Code', callback_custom_match, pattern)", | |
"execution_count": 76, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "doc = nlp(text)", | |
"execution_count": 77, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "matcher(doc)", | |
"execution_count": 78, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": "Code\n", | |
"name": "stdout" | |
}, | |
{ | |
"output_type": "execute_result", | |
"execution_count": 78, | |
"data": { | |
"text/plain": "[(12652780974462928065, 13, 14)]" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "", | |
"execution_count": null, | |
"outputs": [] | |
} | |
], | |
"metadata": { | |
"gist": { | |
"id": "46854e255673344a9c52b42328fd0065", | |
"data": { | |
"description": "Rule Based Text Phrase Extraction and Matching.ipynb", | |
"public": true | |
} | |
}, | |
"kernelspec": { | |
"name": "python3", | |
"display_name": "Python 3", | |
"language": "python" | |
}, | |
"language_info": { | |
"name": "python", | |
"version": "3.7.6", | |
"mimetype": "text/x-python", | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"pygments_lexer": "ipython3", | |
"nbconvert_exporter": "python", | |
"file_extension": ".py" | |
}, | |
"toc": { | |
"nav_menu": {}, | |
"number_sections": true, | |
"sideBar": true, | |
"skip_h1_title": false, | |
"base_numbering": 1, | |
"title_cell": "Table of Contents", | |
"title_sidebar": "Contents", | |
"toc_cell": false, | |
"toc_position": {}, | |
"toc_section_display": true, | |
"toc_window_display": false | |
}, | |
"_draft": { | |
"nbviewer_url": "https://gist.github.com/46854e255673344a9c52b42328fd0065" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 4 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment