Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save thebishorup/46854e255673344a9c52b42328fd0065 to your computer and use it in GitHub Desktop.
Save thebishorup/46854e255673344a9c52b42328fd0065 to your computer and use it in GitHub Desktop.
Rule Based Text Phrase Extraction and Matching.ipynb
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"metadata": {},
"cell_type": "markdown",
"source": "## Rule based matching"
},
{
"metadata": {},
"cell_type": "markdown",
"source": ""
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "import spacy\nfrom spacy.matcher import Matcher\nfrom spacy.tokens import Span\nfrom spacy import displacy",
"execution_count": 38,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "nlp = spacy.load('en_core_web_lg')",
"execution_count": 40,
"outputs": []
},
{
"metadata": {
"trusted": false
},
"cell_type": "code",
"source": "doc = nlp('Hello World!')",
"execution_count": 6,
"outputs": []
},
{
"metadata": {
"trusted": false
},
"cell_type": "code",
"source": "doc",
"execution_count": 7,
"outputs": [
{
"data": {
"text/plain": "Hello World!"
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
]
},
{
"metadata": {
"trusted": false
},
"cell_type": "code",
"source": "for token in doc:\n print(token)",
"execution_count": 8,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": "Hello\nWorld\n!\n"
}
]
},
{
"metadata": {
"trusted": false
},
"cell_type": "code",
"source": "pattern = [{\"LOWER\": 'hello'}, {'IS_PUNCT': True, 'OP': '?'}, {'LOWER': 'world'}]",
"execution_count": 17,
"outputs": []
},
{
"metadata": {
"trusted": false
},
"cell_type": "code",
"source": "matcher = Matcher(nlp.vocab)\nmatcher.add('HelloWorld', None, pattern)",
"execution_count": 18,
"outputs": []
},
{
"metadata": {
"trusted": false
},
"cell_type": "code",
"source": "doc = nlp('Hello world!')",
"execution_count": 19,
"outputs": []
},
{
"metadata": {
"trusted": false
},
"cell_type": "code",
"source": "matches = matcher(doc)",
"execution_count": 20,
"outputs": []
},
{
"metadata": {
"trusted": false
},
"cell_type": "code",
"source": "matches",
"execution_count": 21,
"outputs": [
{
"data": {
"text/plain": "[(15578876784678163569, 0, 2)]"
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
]
},
{
"metadata": {
"trusted": false
},
"cell_type": "code",
"source": "for match_id, start, end in matches:\n string_id = nlp.vocab.strings[match_id]\n span = doc[start: end]\n print(match_id, string_id, start, end, span.text)",
"execution_count": 22,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": "15578876784678163569 HelloWorld 0 2 Hello world\n"
}
]
},
{
"metadata": {},
"cell_type": "markdown",
"source": "## Regular Expression"
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "text = 'Please reach me at 5554446666 for further discussion. Also my extention to the number is 456.'",
"execution_count": 8,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "import re",
"execution_count": 9,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "# identify a phone number: a run of 10 consecutive digits\nre.search(r'\\d{10}', text)",
"execution_count": 10,
"outputs": [
{
"output_type": "execute_result",
"execution_count": 10,
"data": {
"text/plain": "<re.Match object; span=(19, 29), match='5554446666'>"
},
"metadata": {}
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "## find all runs of digits that are 3 to 10 digits long\nre.findall(r'\\d{3,10}', text)",
"execution_count": 12,
"outputs": [
{
"output_type": "execute_result",
"execution_count": 12,
"data": {
"text/plain": "['5554446666', '456']"
},
"metadata": {}
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "## find all the words that have at least 4 characters\nre.findall(r'\\w{4,}', text)",
"execution_count": 14,
"outputs": [
{
"output_type": "execute_result",
"execution_count": 14,
"data": {
"text/plain": "['Please',\n 'reach',\n '5554446666',\n 'further',\n 'discussion',\n 'Also',\n 'extention',\n 'number']"
},
"metadata": {}
}
]
},
{
"metadata": {},
"cell_type": "markdown",
"source": "### Wildcard text"
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "## match an uppercase 'P' followed by any four characters (each '.' matches any single character except a newline)\nre.findall(r'P....', text)",
"execution_count": 18,
"outputs": [
{
"output_type": "execute_result",
"execution_count": 18,
"data": {
"text/plain": "['Pleas']"
},
"metadata": {}
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "## find every three-character sequence that has 'u' in the middle\nre.findall(r'.u.', text)",
"execution_count": 20,
"outputs": [
{
"output_type": "execute_result",
"execution_count": 20,
"data": {
"text/plain": "['fur', 'cus', 'num']"
},
"metadata": {}
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "text = 'Please notice my phone ends with 8'",
"execution_count": 21,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "## check whether the sentence ends with a digit\nre.findall(r'\\d$', text)",
"execution_count": 24,
"outputs": [
{
"output_type": "execute_result",
"execution_count": 24,
"data": {
"text/plain": "['8']"
},
"metadata": {}
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "text = '7 is the first digit of my phone number and last is 8'",
"execution_count": 25,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "## find a digit at the start of the sentence\nre.findall(r'^\\d', text)",
"execution_count": 26,
"outputs": [
{
"output_type": "execute_result",
"execution_count": 26,
"data": {
"text/plain": "['7']"
},
"metadata": {}
}
]
},
{
"metadata": {},
"cell_type": "markdown",
"source": "### Exclusion"
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "## get the parts of the sentence that contain no digits\nre.findall(r'[^\\d]+', text)",
"execution_count": 27,
"outputs": [
{
"output_type": "execute_result",
"execution_count": 27,
"data": {
"text/plain": "[' is the first digit of my phone number and last is ']"
},
"metadata": {}
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "## get all the digits from the sentence ([^\\D] means 'not a non-digit', i.e. the same as \\d)\nre.findall(r'[^\\D]', text)",
"execution_count": 30,
"outputs": [
{
"output_type": "execute_result",
"execution_count": 30,
"data": {
"text/plain": "['7', '8']"
},
"metadata": {}
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "text = 'This number has text-free plan as well as unlimited-call.'",
"execution_count": 31,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "## get the hyphenated words\nre.findall(r'[\\w]+-[\\w]+', text)",
"execution_count": 32,
"outputs": [
{
"output_type": "execute_result",
"execution_count": 32,
"data": {
"text/plain": "['text-free', 'unlimited-call']"
},
"metadata": {}
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "",
"execution_count": null,
"outputs": []
},
{
"metadata": {},
"cell_type": "markdown",
"source": "## Regular Expression in spaCy"
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "text = 'Other than gist-it, I’m using the following extensions: Code Font Size, Codefolding, Collapsible Headings, Hide Header, plus the ones that are ticked by default. If you want more, here’s one post on the top 5 extensions from Eliot Andres, a machine learning engineer. What are your favorite extensions?'",
"execution_count": 33,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "text",
"execution_count": 34,
"outputs": [
{
"output_type": "execute_result",
"execution_count": 34,
"data": {
"text/plain": "'Other than gist-it, I’m using the following extensions: Code Font Size, Codefolding, Collapsible Headings, Hide Header, plus the ones that are ticked by default. If you want more, here’s one post on the top 5 extensions from Eliot Andres, a machine learning engineer. What are your favorite extensions?'"
},
"metadata": {}
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "pattern = [{'TEXT':'Code'}]",
"execution_count": 73,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "def callback_custom_match(matcher, doc, i, matches):\n match_id, start, end = matches[i]\n entity = doc[start:end]\n print(entity.text)",
"execution_count": 74,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "matcher = Matcher(nlp.vocab)",
"execution_count": 75,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "matcher.add('Code', callback_custom_match, pattern)",
"execution_count": 76,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "doc = nlp(text)",
"execution_count": 77,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "matcher(doc)",
"execution_count": 78,
"outputs": [
{
"output_type": "stream",
"text": "Code\n",
"name": "stdout"
},
{
"output_type": "execute_result",
"execution_count": 78,
"data": {
"text/plain": "[(12652780974462928065, 13, 14)]"
},
"metadata": {}
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "",
"execution_count": null,
"outputs": []
}
],
"metadata": {
"gist": {
"id": "46854e255673344a9c52b42328fd0065",
"data": {
"description": "Rule Based Text Phrase Extraction and Matching.ipynb",
"public": true
}
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3",
"language": "python"
},
"language_info": {
"name": "python",
"version": "3.7.6",
"mimetype": "text/x-python",
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"pygments_lexer": "ipython3",
"nbconvert_exporter": "python",
"file_extension": ".py"
},
"toc": {
"nav_menu": {},
"number_sections": true,
"sideBar": true,
"skip_h1_title": false,
"base_numbering": 1,
"title_cell": "Table of Contents",
"title_sidebar": "Contents",
"toc_cell": false,
"toc_position": {},
"toc_section_display": true,
"toc_window_display": false
},
"_draft": {
"nbviewer_url": "https://gist.github.com/46854e255673344a9c52b42328fd0065"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment