Skip to content

Instantly share code, notes, and snippets.

@eriknomitch
Created March 30, 2019 05:12
Show Gist options
  • Save eriknomitch/a4f7d0be0567e2526414b12de6142d01 to your computer and use it in GitHub Desktop.
Save eriknomitch/a4f7d0be0567e2526414b12de6142d01 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import string\n",
"import spacy\n",
"from spacy.tokens import Token\n",
"from spacy.tokenizer import Tokenizer\n",
"import inflect\n",
"import numpy as np\n",
"\n",
"from IPython.core.display import display, HTML"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Globals"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Library"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"nlp = spacy.load(\"en_core_web_lg\")\n",
"tokenizer = Tokenizer(nlp.vocab)\n",
"ie = inflect.engine()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Text"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"doc = nlp(u\"\"\"Apple’s special event this week was all about apples and oranges and software subscription services, signaling a new focus on business aside from its bread and butter of consumer electronics.\n",
"\n",
"Nestled among the software announcements, there was one announcement that piqued my interest, the physical version of Apple’s new credit card, Apple Card and it's new fruit line Apple Fruits.\"\"\")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# Negative matches\n",
"negative_matches = {u\"no\", u\"not\", u\"unavailable\"}\n",
"\n",
"# The raw matches from the filter\n",
"filter_matches = {u\"apple\", u\"event\"}"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Utility"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"def pluralize(word):\n",
" global ie\n",
"\n",
" # This is a little counterintuitive but `singular_noun(...)` will\n",
" # return false if the word is a singular noun. Only pluralize then.\n",
" if not ie.singular_noun(word):\n",
" return ie.plural(word)\n",
"\n",
" return word\n",
"\n",
"def tokenize(word):\n",
" return tokenizer(word)[0]\n",
"\n",
"def list_to_tokens(words):\n",
" return [tokenize(w) for w in words]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Match Setup"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# Matches + pluralized versions\n",
"matches = filter_matches.copy()\n",
"\n",
"for match in list(matches):\n",
" matches.add(pluralize(match))\n",
"\n",
"# Tokenized version for NLP\n",
"matches_tokens = list_to_tokens(matches)\n",
"negative_matches_tokens = list_to_tokens(negative_matches)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Main"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"def match_getter(token):\n",
" global matches_tokens\n",
" \n",
" # Save some time on exact matches\n",
" if token.text.lower() in matches:\n",
" return 1.0\n",
" \n",
" if not token.has_vector:\n",
" return 0.0\n",
"\n",
" similarities = list(map(lambda m: m.similarity(token), matches_tokens))\n",
" \n",
" most_similar = max(similarities)\n",
" \n",
" return most_similar\n",
"\n",
"Token.set_extension(\"is_match\", getter=match_getter, force=True)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"#print([(w.text, w.pos_) for w in doc])\n",
"#print(type(doc[0])"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"def output_highlighted(similarity_threshold):\n",
" \n",
" words_highlighted = []\n",
" \n",
" np_similarity_threshold = np.float32(similarity_threshold)\n",
"\n",
" for w in doc:\n",
" similarity = np.float32(w._.is_match)\n",
" \n",
" if np.less(similarity, np_similarity_threshold):\n",
" word = f\"<span>{w.text}</span>\"\n",
" else:\n",
" weight = \"bold\" if similarity == 1.0 else \"normal\"\n",
" word = f\"<span style='background: rgba(0, 255, 0, {similarity}); font-weight: {weight}'>{w.text}</span>\"\n",
"\n",
" words_highlighted.append(word)\n",
"\n",
" words_output = \" \".join(words_highlighted)\n",
"\n",
"\n",
" output = f\"\"\"\n",
" <div style='background: white; padding: 20px; width: 800px; color: black;'>\n",
" <div>\n",
" matches: <b>{\", \".join(filter_matches)}</b>\n",
" </br>\n",
" w/ plurals: <b>{\", \".join(matches)}</b>\n",
" <br/>\n",
" SIMILIARTY_THRESHOLD: <b>{float(similarity_threshold)}</b>\n",
" </div>\n",
" <hr/>\n",
" <p style=\"font-size: 16px;\">\n",
" {words_output}\n",
" </p>\n",
" </div>\"\"\"\n",
"\n",
" display(HTML(output))"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"\n",
"\n",
"FULL SIMILARITY SHOWN\n",
"\n"
]
},
{
"data": {
"text/html": [
"\n",
" <div style='background: white; padding: 20px; width: 800px; color: black;'>\n",
" <div>\n",
" matches: <b>event, apple</b>\n",
" </br>\n",
" w/ plurals: <b>event, events, apple, apples</b>\n",
" <br/>\n",
" SIMILIARTY_THRESHOLD: <b>0.0</b>\n",
" </div>\n",
" <hr/>\n",
" <p style=\"font-size: 16px;\">\n",
" <span style='background: rgba(0, 255, 0, 1.0); font-weight: bold'>Apple</span> <span style='background: rgba(0, 255, 0, 0.3808760643005371); font-weight: normal'>’s</span> <span style='background: rgba(0, 255, 0, 0.5234818458557129); font-weight: normal'>special</span> <span style='background: rgba(0, 255, 0, 1.0); font-weight: bold'>event</span> <span style='background: rgba(0, 255, 0, 0.4608474373817444); font-weight: normal'>this</span> <span style='background: rgba(0, 255, 0, 0.43837690353393555); font-weight: normal'>week</span> <span style='background: rgba(0, 255, 0, 0.35579434037208557); font-weight: normal'>was</span> <span style='background: rgba(0, 255, 0, 0.42519861459732056); font-weight: normal'>all</span> <span style='background: rgba(0, 255, 0, 0.3430214524269104); font-weight: normal'>about</span> <span style='background: rgba(0, 255, 0, 1.0); font-weight: bold'>apples</span> <span style='background: rgba(0, 255, 0, 0.3270261585712433); font-weight: normal'>and</span> <span style='background: rgba(0, 255, 0, 0.7780942320823669); font-weight: normal'>oranges</span> <span style='background: rgba(0, 255, 0, 0.3270261585712433); font-weight: normal'>and</span> <span style='background: rgba(0, 255, 0, 0.3079564571380615); font-weight: normal'>software</span> <span style='background: rgba(0, 255, 0, 0.22379352152347565); font-weight: normal'>subscription</span> <span style='background: rgba(0, 255, 0, 0.34125784039497375); font-weight: normal'>services</span> <span style='background: rgba(0, 255, 0, 0.22262457013130188); font-weight: normal'>,</span> <span style='background: rgba(0, 255, 0, 0.19052930176258087); font-weight: normal'>signaling</span> <span style='background: rgba(0, 255, 0, 0.33145979046821594); font-weight: normal'>a</span> <span style='background: rgba(0, 255, 0, 0.33666253089904785); font-weight: normal'>new</span> <span style='background: rgba(0, 255, 0, 0.3979387879371643); font-weight: normal'>focus</span> <span style='background: rgba(0, 255, 0, 0.25729233026504517); font-weight: normal'>on</span> <span style='background: rgba(0, 255, 0, 0.34057262539863586); font-weight: normal'>business</span> <span style='background: rgba(0, 255, 0, 0.3790111243724823); font-weight: normal'>aside</span> <span style='background: rgba(0, 255, 0, 0.2546783685684204); font-weight: normal'>from</span> <span style='background: rgba(0, 255, 0, 0.32460078597068787); font-weight: normal'>its</span> <span style='background: rgba(0, 255, 0, 0.5796013474464417); font-weight: normal'>bread</span> <span style='background: rgba(0, 255, 0, 0.3270261585712433); font-weight: normal'>and</span> <span style='background: rgba(0, 255, 0, 0.608683168888092); font-weight: normal'>butter</span> <span style='background: rgba(0, 255, 0, 0.3342975378036499); font-weight: normal'>of</span> <span style='background: rgba(0, 255, 0, 0.28704968094825745); font-weight: normal'>consumer</span> <span style='background: rgba(0, 255, 0, 0.28327369689941406); font-weight: normal'>electronics</span> <span style='background: rgba(0, 255, 0, 0.20869764685630798); font-weight: normal'>.</span> <span style='background: rgba(0, 255, 0, 0.0); font-weight: normal'>\n",
"\n",
"</span> <span style='background: rgba(0, 255, 0, 0.2163950502872467); font-weight: normal'>Nestled</span> <span style='background: rgba(0, 255, 0, 0.3320554494857788); font-weight: normal'>among</span> <span style='background: rgba(0, 255, 0, 0.42156127095222473); font-weight: normal'>the</span> <span style='background: rgba(0, 255, 0, 0.3079564571380615); font-weight: normal'>software</span> <span style='background: rgba(0, 255, 0, 0.5097285509109497); font-weight: normal'>announcements</span> <span style='background: rgba(0, 255, 0, 0.22262457013130188); font-weight: normal'>,</span> <span style='background: rgba(0, 255, 0, 0.4293268024921417); font-weight: normal'>there</span> <span style='background: rgba(0, 255, 0, 0.35579434037208557); font-weight: normal'>was</span> <span style='background: rgba(0, 255, 0, 0.41250690817832947); font-weight: normal'>one</span> <span style='background: rgba(0, 255, 0, 0.47261321544647217); font-weight: normal'>announcement</span> <span style='background: rgba(0, 255, 0, 0.4123636484146118); font-weight: normal'>that</span> <span style='background: rgba(0, 255, 0, 0.11201054602861404); font-weight: normal'>piqued</span> <span style='background: rgba(0, 255, 0, 0.3132108151912689); font-weight: normal'>my</span> <span style='background: rgba(0, 255, 0, 0.35797837376594543); font-weight: normal'>interest</span> <span style='background: rgba(0, 255, 0, 0.22262457013130188); font-weight: normal'>,</span> <span style='background: rgba(0, 255, 0, 0.42156127095222473); font-weight: normal'>the</span> <span style='background: rgba(0, 255, 0, 0.3398952782154083); font-weight: normal'>physical</span> <span style='background: rgba(0, 255, 0, 0.21415886282920837); font-weight: normal'>version</span> <span style='background: rgba(0, 255, 0, 0.3342975378036499); font-weight: normal'>of</span> <span style='background: rgba(0, 255, 0, 1.0); font-weight: bold'>Apple</span> <span style='background: rgba(0, 255, 0, 0.3808760643005371); font-weight: normal'>’s</span> <span style='background: rgba(0, 255, 0, 0.33666253089904785); font-weight: normal'>new</span> <span style='background: rgba(0, 255, 0, 0.2234654426574707); font-weight: normal'>credit</span> <span style='background: rgba(0, 255, 0, 0.2577524483203888); font-weight: normal'>card</span> <span style='background: rgba(0, 255, 0, 0.22262457013130188); font-weight: normal'>,</span> <span style='background: rgba(0, 255, 0, 1.0); font-weight: bold'>Apple</span> <span style='background: rgba(0, 255, 0, 0.2577524483203888); font-weight: normal'>Card</span> <span style='background: rgba(0, 255, 0, 0.3270261585712433); font-weight: normal'>and</span> <span style='background: rgba(0, 255, 0, 0.36499473452568054); font-weight: normal'>it</span> <span style='background: rgba(0, 255, 0, 0.27155137062072754); font-weight: normal'>'s</span> <span style='background: rgba(0, 255, 0, 0.33666253089904785); font-weight: normal'>new</span> <span style='background: rgba(0, 255, 0, 0.724179744720459); font-weight: normal'>fruit</span> <span style='background: rgba(0, 255, 0, 0.25101417303085327); font-weight: normal'>line</span> <span style='background: rgba(0, 255, 0, 1.0); font-weight: bold'>Apple</span> <span style='background: rgba(0, 255, 0, 0.7155129909515381); font-weight: normal'>Fruits</span> <span style='background: rgba(0, 255, 0, 0.20869764685630798); font-weight: normal'>.</span>\n",
" </p>\n",
" </div>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"\n",
"\n",
"W/ A MATCH THRESHOLD\n",
"\n"
]
},
{
"data": {
"text/html": [
"\n",
" <div style='background: white; padding: 20px; width: 800px; color: black;'>\n",
" <div>\n",
" matches: <b>event, apple</b>\n",
" </br>\n",
" w/ plurals: <b>event, events, apple, apples</b>\n",
" <br/>\n",
" SIMILIARTY_THRESHOLD: <b>0.6</b>\n",
" </div>\n",
" <hr/>\n",
" <p style=\"font-size: 16px;\">\n",
" <span style='background: rgba(0, 255, 0, 1.0); font-weight: bold'>Apple</span> <span>’s</span> <span>special</span> <span style='background: rgba(0, 255, 0, 1.0); font-weight: bold'>event</span> <span>this</span> <span>week</span> <span>was</span> <span>all</span> <span>about</span> <span style='background: rgba(0, 255, 0, 1.0); font-weight: bold'>apples</span> <span>and</span> <span style='background: rgba(0, 255, 0, 0.7780942320823669); font-weight: normal'>oranges</span> <span>and</span> <span>software</span> <span>subscription</span> <span>services</span> <span>,</span> <span>signaling</span> <span>a</span> <span>new</span> <span>focus</span> <span>on</span> <span>business</span> <span>aside</span> <span>from</span> <span>its</span> <span>bread</span> <span>and</span> <span style='background: rgba(0, 255, 0, 0.608683168888092); font-weight: normal'>butter</span> <span>of</span> <span>consumer</span> <span>electronics</span> <span>.</span> <span>\n",
"\n",
"</span> <span>Nestled</span> <span>among</span> <span>the</span> <span>software</span> <span>announcements</span> <span>,</span> <span>there</span> <span>was</span> <span>one</span> <span>announcement</span> <span>that</span> <span>piqued</span> <span>my</span> <span>interest</span> <span>,</span> <span>the</span> <span>physical</span> <span>version</span> <span>of</span> <span style='background: rgba(0, 255, 0, 1.0); font-weight: bold'>Apple</span> <span>’s</span> <span>new</span> <span>credit</span> <span>card</span> <span>,</span> <span style='background: rgba(0, 255, 0, 1.0); font-weight: bold'>Apple</span> <span>Card</span> <span>and</span> <span>it</span> <span>'s</span> <span>new</span> <span style='background: rgba(0, 255, 0, 0.724179744720459); font-weight: normal'>fruit</span> <span>line</span> <span style='background: rgba(0, 255, 0, 1.0); font-weight: bold'>Apple</span> <span style='background: rgba(0, 255, 0, 0.7155129909515381); font-weight: normal'>Fruits</span> <span>.</span>\n",
" </p>\n",
" </div>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"\n",
"\n",
"EXACT MATCHES ONLY\n",
"\n"
]
},
{
"data": {
"text/html": [
"\n",
" <div style='background: white; padding: 20px; width: 800px; color: black;'>\n",
" <div>\n",
" matches: <b>event, apple</b>\n",
" </br>\n",
" w/ plurals: <b>event, events, apple, apples</b>\n",
" <br/>\n",
" SIMILIARTY_THRESHOLD: <b>1.0</b>\n",
" </div>\n",
" <hr/>\n",
" <p style=\"font-size: 16px;\">\n",
" <span style='background: rgba(0, 255, 0, 1.0); font-weight: bold'>Apple</span> <span>’s</span> <span>special</span> <span style='background: rgba(0, 255, 0, 1.0); font-weight: bold'>event</span> <span>this</span> <span>week</span> <span>was</span> <span>all</span> <span>about</span> <span style='background: rgba(0, 255, 0, 1.0); font-weight: bold'>apples</span> <span>and</span> <span>oranges</span> <span>and</span> <span>software</span> <span>subscription</span> <span>services</span> <span>,</span> <span>signaling</span> <span>a</span> <span>new</span> <span>focus</span> <span>on</span> <span>business</span> <span>aside</span> <span>from</span> <span>its</span> <span>bread</span> <span>and</span> <span>butter</span> <span>of</span> <span>consumer</span> <span>electronics</span> <span>.</span> <span>\n",
"\n",
"</span> <span>Nestled</span> <span>among</span> <span>the</span> <span>software</span> <span>announcements</span> <span>,</span> <span>there</span> <span>was</span> <span>one</span> <span>announcement</span> <span>that</span> <span>piqued</span> <span>my</span> <span>interest</span> <span>,</span> <span>the</span> <span>physical</span> <span>version</span> <span>of</span> <span style='background: rgba(0, 255, 0, 1.0); font-weight: bold'>Apple</span> <span>’s</span> <span>new</span> <span>credit</span> <span>card</span> <span>,</span> <span style='background: rgba(0, 255, 0, 1.0); font-weight: bold'>Apple</span> <span>Card</span> <span>and</span> <span>it</span> <span>'s</span> <span>new</span> <span>fruit</span> <span>line</span> <span style='background: rgba(0, 255, 0, 1.0); font-weight: bold'>Apple</span> <span>Fruits</span> <span>.</span>\n",
" </p>\n",
" </div>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"print(\"\\n\\n\\nFULL SIMILARITY SHOWN\\n\")\n",
"\n",
"output_highlighted(0.0)\n",
"\n",
"print(\"\\n\\n\\nW/ A MATCH THRESHOLD\\n\")\n",
"output_highlighted(0.6)\n",
"\n",
"print(\"\\n\\n\\nEXACT MATCHES ONLY\\n\")\n",
"output_highlighted(1.0)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment