Created
December 27, 2017 17:59
-
-
Save dlemphers/cc16e994f985d9144298154a1824a139 to your computer and use it in GitHub Desktop.
Char2vec - Character embeddings for word similarity
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Char2Vec\n", | |
"\n", | |
"Dave Lemphers\n", | |
"\n", | |
"https://davelemphers.com" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"The intuition behind this is based on the work done on [Word2Vec](https://en.wikipedia.org/wiki/Word2vec) and is an experiment to see if using character embedding is useful for situations where character counts don't achieve the goal." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 55, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"import numpy as np\n", | |
"import pandas as pd\n", | |
"\n", | |
"from sklearn.feature_extraction.text import TfidfVectorizer\n", | |
"from sklearn.metrics.pairwise import cosine_similarity\n", | |
"\n", | |
"from IPython.display import display, Markdown" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
    "## `embedded_word` slides windows of every size in [start, end) over a word to collect its character n-gram context"
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 16, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"def embedded_word(word, sliding_window_start, sliding_window_end):\n", | |
" context = {}\n", | |
" \n", | |
" for sliding_window_size in range(sliding_window_start, sliding_window_end):\n", | |
" for window in range(0, len(word)):\n", | |
" current_window = word[window:window + sliding_window_size]\n", | |
" if(len(current_window) == sliding_window_size):\n", | |
" context[current_window] = 0\n", | |
" \n", | |
" return context\n", | |
" " | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
    "## We can vary the sliding-window hyperparameters to see their effect on certain words"
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 17, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"{'a': 0, 'c': 0, 't': 0}" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"data": { | |
"text/plain": [ | |
"{'at': 0, 'ca': 0}" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"data": { | |
"text/plain": [ | |
"{'a': 0, 'at': 0, 'c': 0, 'ca': 0, 't': 0}" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"for configs in [(1,2), (2,3), (1,3)]:\n", | |
" display(embedded_word(\"cat\", *configs))" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## The idea here is to simply create an embedding matrix for the words. You can provide a richer vocabulary of words or simply provide the two words and it will construct a vocab from these" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 20, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"def embedding_matrix(word1, word2, vocab=None, sliding_window_params=(1,3)):\n", | |
" \n", | |
" if not vocab:\n", | |
" vocab = [word1, word2]\n", | |
" \n", | |
" word1_emb = embedded_word(word1, *sliding_window_params)\n", | |
" word2_emb = embedded_word(word2, *sliding_window_params)\n", | |
" \n", | |
" vocab_emb = {}\n", | |
" for vocab_word in vocab:\n", | |
" vocab_emb.update(embedded_word(vocab_word, *sliding_window_params))\n", | |
" \n", | |
" vocab = list(set(list(vocab_emb.keys()))) + list(set(list(word1_emb.keys()) + list(word2_emb.keys())))\n", | |
"# vocab = list(set(list(vocab_emb.keys())))\n", | |
" \n", | |
" embedding = np.zeros((2, len(vocab)))\n", | |
" \n", | |
" for idx, word_emb in enumerate([word1_emb, word2_emb]):\n", | |
" for tok in word_emb.keys():\n", | |
" embedding[idx][vocab.index(tok)] += 1\n", | |
" \n", | |
" return embedding" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 21, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"array([[ 0., 1., 0., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0.,\n", | |
" 0.],\n", | |
" [ 1., 0., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0.,\n", | |
" 0.]])" | |
] | |
}, | |
"execution_count": 21, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
    "# Example: rows are \"cat\" and \"hat\"; columns are their character n-grams.\n",
    "embedding_matrix(\"cat\", \"hat\")"
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Cosine similarity on the embeddings" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 64, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"def vec_similarity(word1, word2, vocab=None, config=(1,3)):\n", | |
" \n", | |
" matrix = embedding_matrix(\n", | |
" word1, \n", | |
" word2, \n", | |
" vocab=vocab, \n", | |
" sliding_window_params=config\n", | |
" )\n", | |
" \n", | |
" return cosine_similarity([matrix[0]], [matrix[1]])" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Tf-idf cosine similarity" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 65, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def tf_similarity(word1, word2, vocab=None, config=(1,3)):\n", | |
" \n", | |
" if not vocab:\n", | |
" vocab = [word1, word2]\n", | |
"\n", | |
" tfidf_vectorizer = TfidfVectorizer(\n", | |
" analyzer=\"char\", \n", | |
" lowercase=True, \n", | |
" ngram_range=config\n", | |
" )\n", | |
"\n", | |
" tfidf_vectorizer.fit_transform(vocab)\n", | |
"\n", | |
" return cosine_similarity(\n", | |
" tfidf_vectorizer.transform([word1]),\n", | |
" tfidf_vectorizer.transform([word2]),\n", | |
" )" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Try out some canonical pairs and play with sliding window" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 66, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"results = []\n", | |
"\n", | |
"for test_pair in [\n", | |
" (\"bag\", \"bags\"),\n", | |
" (\"bag\", \"bad\"),\n", | |
" (\"bag\", \"gab\")\n", | |
"]:\n", | |
" for test_method in [vec_similarity, tf_similarity]:\n", | |
" for test_window in [(1,2), (2,3), (1,3)]:\n", | |
" result = test_method(*test_pair, config=test_window)\n", | |
" results.append([test_method.__name__, test_pair, test_window, result[0][0]])\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 67, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>Method</th>\n", | |
" <th>Test Pair</th>\n", | |
" <th>Options</th>\n", | |
" <th>Result</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>vec_similarity</td>\n", | |
" <td>(bag, bags)</td>\n", | |
" <td>(1, 2)</td>\n", | |
" <td>0.866025</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>vec_similarity</td>\n", | |
" <td>(bag, bags)</td>\n", | |
" <td>(2, 3)</td>\n", | |
" <td>0.816497</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>vec_similarity</td>\n", | |
" <td>(bag, bags)</td>\n", | |
" <td>(1, 3)</td>\n", | |
" <td>0.845154</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>tf_similarity</td>\n", | |
" <td>(bag, bags)</td>\n", | |
" <td>(1, 2)</td>\n", | |
" <td>0.747407</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>tf_similarity</td>\n", | |
" <td>(bag, bags)</td>\n", | |
" <td>(2, 3)</td>\n", | |
" <td>0.656973</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>5</th>\n", | |
" <td>tf_similarity</td>\n", | |
" <td>(bag, bags)</td>\n", | |
" <td>(1, 3)</td>\n", | |
" <td>0.709297</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>6</th>\n", | |
" <td>vec_similarity</td>\n", | |
" <td>(bag, bad)</td>\n", | |
" <td>(1, 2)</td>\n", | |
" <td>0.666667</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>7</th>\n", | |
" <td>vec_similarity</td>\n", | |
" <td>(bag, bad)</td>\n", | |
" <td>(2, 3)</td>\n", | |
" <td>0.500000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>8</th>\n", | |
" <td>vec_similarity</td>\n", | |
" <td>(bag, bad)</td>\n", | |
" <td>(1, 3)</td>\n", | |
" <td>0.600000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>9</th>\n", | |
" <td>tf_similarity</td>\n", | |
" <td>(bag, bad)</td>\n", | |
" <td>(1, 2)</td>\n", | |
" <td>0.431613</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>10</th>\n", | |
" <td>tf_similarity</td>\n", | |
" <td>(bag, bad)</td>\n", | |
" <td>(2, 3)</td>\n", | |
" <td>0.201993</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>11</th>\n", | |
" <td>tf_similarity</td>\n", | |
" <td>(bag, bad)</td>\n", | |
" <td>(1, 3)</td>\n", | |
" <td>0.336097</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>12</th>\n", | |
" <td>vec_similarity</td>\n", | |
" <td>(bag, gab)</td>\n", | |
" <td>(1, 2)</td>\n", | |
" <td>1.000000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>13</th>\n", | |
" <td>vec_similarity</td>\n", | |
" <td>(bag, gab)</td>\n", | |
" <td>(2, 3)</td>\n", | |
" <td>0.000000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>14</th>\n", | |
" <td>vec_similarity</td>\n", | |
" <td>(bag, gab)</td>\n", | |
" <td>(1, 3)</td>\n", | |
" <td>0.600000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>15</th>\n", | |
" <td>tf_similarity</td>\n", | |
" <td>(bag, gab)</td>\n", | |
" <td>(1, 2)</td>\n", | |
" <td>0.431613</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>16</th>\n", | |
" <td>tf_similarity</td>\n", | |
" <td>(bag, gab)</td>\n", | |
" <td>(2, 3)</td>\n", | |
" <td>0.000000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>17</th>\n", | |
" <td>tf_similarity</td>\n", | |
" <td>(bag, gab)</td>\n", | |
" <td>(1, 3)</td>\n", | |
" <td>0.336097</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" Method Test Pair Options Result\n", | |
"0 vec_similarity (bag, bags) (1, 2) 0.866025\n", | |
"1 vec_similarity (bag, bags) (2, 3) 0.816497\n", | |
"2 vec_similarity (bag, bags) (1, 3) 0.845154\n", | |
"3 tf_similarity (bag, bags) (1, 2) 0.747407\n", | |
"4 tf_similarity (bag, bags) (2, 3) 0.656973\n", | |
"5 tf_similarity (bag, bags) (1, 3) 0.709297\n", | |
"6 vec_similarity (bag, bad) (1, 2) 0.666667\n", | |
"7 vec_similarity (bag, bad) (2, 3) 0.500000\n", | |
"8 vec_similarity (bag, bad) (1, 3) 0.600000\n", | |
"9 tf_similarity (bag, bad) (1, 2) 0.431613\n", | |
"10 tf_similarity (bag, bad) (2, 3) 0.201993\n", | |
"11 tf_similarity (bag, bad) (1, 3) 0.336097\n", | |
"12 vec_similarity (bag, gab) (1, 2) 1.000000\n", | |
"13 vec_similarity (bag, gab) (2, 3) 0.000000\n", | |
"14 vec_similarity (bag, gab) (1, 3) 0.600000\n", | |
"15 tf_similarity (bag, gab) (1, 2) 0.431613\n", | |
"16 tf_similarity (bag, gab) (2, 3) 0.000000\n", | |
"17 tf_similarity (bag, gab) (1, 3) 0.336097" | |
] | |
}, | |
"execution_count": 67, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df = pd.DataFrame(results, columns=[\"Method\", \"Test Pair\", \"Options\", \"Result\"])\n", | |
"df.style.set_properties(**{'font-size':'10pt'})\n", | |
"df" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.2" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment