Skip to content

Instantly share code, notes, and snippets.

@jobergum
Created January 31, 2020 18:53
Show Gist options
  • Save jobergum/432c416b0a34974316af4d8ee01dfd9a to your computer and use it in GitHub Desktop.
Save jobergum/432c416b0a34974316af4d8ee01dfd9a to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
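"## Explore features\n",
"\n",
"Display the combined feature frame, then rank the features by their correlation with the `relevance` label over the first `training_examples` rows."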
]
},
{
"cell_type": "code",
"execution_count": 334,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>product_uid</th>\n",
" <th>relevance</th>\n",
" <th>search_term</th>\n",
" <th>bm25(attributes)</th>\n",
" <th>bm25(brand)</th>\n",
" <th>bm25(description)</th>\n",
" <th>bm25(queries)</th>\n",
" <th>bm25(title)</th>\n",
" <th>elementCompleteness(attributes).completeness</th>\n",
" <th>...</th>\n",
" <th>textSimilarity(description).fieldCoverage</th>\n",
" <th>textSimilarity(description).order</th>\n",
" <th>textSimilarity(description).proximity</th>\n",
" <th>textSimilarity(description).queryCoverage</th>\n",
" <th>textSimilarity(description).score</th>\n",
" <th>textSimilarity(title).fieldCoverage</th>\n",
" <th>textSimilarity(title).order</th>\n",
" <th>textSimilarity(title).proximity</th>\n",
" <th>textSimilarity(title).queryCoverage</th>\n",
" <th>textSimilarity(title).score</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>140466</td>\n",
" <td>150836</td>\n",
" <td>1.67</td>\n",
" <td>security pc</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>13.170395</td>\n",
" <td>3.827236</td>\n",
" <td>0.000000</td>\n",
" <td>...</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.100000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.500000</td>\n",
" <td>0.170000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>60498</td>\n",
" <td>116263</td>\n",
" <td>1.67</td>\n",
" <td>acrylic window panel</td>\n",
" <td>6.146889</td>\n",
" <td>0.000000</td>\n",
" <td>8.738634</td>\n",
" <td>11.081958</td>\n",
" <td>5.679297</td>\n",
" <td>0.250000</td>\n",
" <td>...</td>\n",
" <td>0.019048</td>\n",
" <td>1.0</td>\n",
" <td>0.859375</td>\n",
" <td>0.666667</td>\n",
" <td>0.654591</td>\n",
" <td>0.166667</td>\n",
" <td>1.0</td>\n",
" <td>0.859375</td>\n",
" <td>0.666667</td>\n",
" <td>0.684115</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>90139</td>\n",
" <td>127345</td>\n",
" <td>1.67</td>\n",
" <td>grappler tool hook</td>\n",
" <td>5.998243</td>\n",
" <td>0.000000</td>\n",
" <td>8.653977</td>\n",
" <td>19.471086</td>\n",
" <td>5.551304</td>\n",
" <td>0.333333</td>\n",
" <td>...</td>\n",
" <td>0.022472</td>\n",
" <td>1.0</td>\n",
" <td>1.000000</td>\n",
" <td>0.666667</td>\n",
" <td>0.704494</td>\n",
" <td>0.153846</td>\n",
" <td>1.0</td>\n",
" <td>1.000000</td>\n",
" <td>0.666667</td>\n",
" <td>0.730769</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>99118</td>\n",
" <td>131063</td>\n",
" <td>2.33</td>\n",
" <td>1 gallon paint behr paint</td>\n",
" <td>13.687360</td>\n",
" <td>2.520351</td>\n",
" <td>12.924841</td>\n",
" <td>16.519156</td>\n",
" <td>10.264424</td>\n",
" <td>0.700000</td>\n",
" <td>...</td>\n",
" <td>0.017751</td>\n",
" <td>0.5</td>\n",
" <td>0.375000</td>\n",
" <td>0.600000</td>\n",
" <td>0.389800</td>\n",
" <td>0.222222</td>\n",
" <td>0.0</td>\n",
" <td>0.234375</td>\n",
" <td>0.400000</td>\n",
" <td>0.246476</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>109368</td>\n",
" <td>135557</td>\n",
" <td>2.00</td>\n",
" <td>dog poop</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>7.939734</td>\n",
" <td>21.473545</td>\n",
" <td>4.905250</td>\n",
" <td>0.000000</td>\n",
" <td>...</td>\n",
" <td>0.004651</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.500000</td>\n",
" <td>0.150930</td>\n",
" <td>0.058824</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.500000</td>\n",
" <td>0.161765</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2095</th>\n",
" <td>196723</td>\n",
" <td>186504</td>\n",
" <td>NaN</td>\n",
" <td>everbuilt lock nut m6-1.0mm</td>\n",
" <td>8.344204</td>\n",
" <td>0.000000</td>\n",
" <td>4.453281</td>\n",
" <td>16.225835</td>\n",
" <td>2.644281</td>\n",
" <td>0.750000</td>\n",
" <td>...</td>\n",
" <td>0.017857</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.250000</td>\n",
" <td>0.078571</td>\n",
" <td>0.076923</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.250000</td>\n",
" <td>0.090385</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2096</th>\n",
" <td>45801</td>\n",
" <td>111304</td>\n",
" <td>NaN</td>\n",
" <td>rice paper</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>15.984498</td>\n",
" <td>12.946460</td>\n",
" <td>0.000000</td>\n",
" <td>...</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.200000</td>\n",
" <td>1.0</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" <td>0.840000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2097</th>\n",
" <td>224119</td>\n",
" <td>208892</td>\n",
" <td>NaN</td>\n",
" <td>20 homelite bar</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>16.462403</td>\n",
" <td>6.943115</td>\n",
" <td>0.000000</td>\n",
" <td>...</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.222222</td>\n",
" <td>1.0</td>\n",
" <td>0.859375</td>\n",
" <td>0.666667</td>\n",
" <td>0.695226</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2098</th>\n",
" <td>77697</td>\n",
" <td>122439</td>\n",
" <td>NaN</td>\n",
" <td>add</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>1.528416</td>\n",
" <td>2.517452</td>\n",
" <td>1.550136</td>\n",
" <td>0.000000</td>\n",
" <td>...</td>\n",
" <td>0.005780</td>\n",
" <td>1.0</td>\n",
" <td>0.000000</td>\n",
" <td>1.000000</td>\n",
" <td>0.451156</td>\n",
" <td>0.066667</td>\n",
" <td>1.0</td>\n",
" <td>0.000000</td>\n",
" <td>1.000000</td>\n",
" <td>0.463333</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2099</th>\n",
" <td>147770</td>\n",
" <td>154800</td>\n",
" <td>NaN</td>\n",
" <td>bathroom shelf</td>\n",
" <td>3.322686</td>\n",
" <td>0.000000</td>\n",
" <td>6.099678</td>\n",
" <td>8.342897</td>\n",
" <td>3.765415</td>\n",
" <td>0.321429</td>\n",
" <td>...</td>\n",
" <td>0.009259</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.500000</td>\n",
" <td>0.151852</td>\n",
" <td>0.100000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.500000</td>\n",
" <td>0.170000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>2100 rows × 163 columns</p>\n",
"</div>"
],
"text/plain": [
" id product_uid relevance search_term \\\n",
"0 140466 150836 1.67 security pc \n",
"1 60498 116263 1.67 acrylic window panel \n",
"2 90139 127345 1.67 grappler tool hook \n",
"3 99118 131063 2.33 1 gallon paint behr paint \n",
"4 109368 135557 2.00 dog poop \n",
"... ... ... ... ... \n",
"2095 196723 186504 NaN everbuilt lock nut m6-1.0mm \n",
"2096 45801 111304 NaN rice paper \n",
"2097 224119 208892 NaN 20 homelite bar \n",
"2098 77697 122439 NaN add \n",
"2099 147770 154800 NaN bathroom shelf \n",
"\n",
" bm25(attributes) bm25(brand) bm25(description) bm25(queries) \\\n",
"0 0.000000 0.000000 0.000000 13.170395 \n",
"1 6.146889 0.000000 8.738634 11.081958 \n",
"2 5.998243 0.000000 8.653977 19.471086 \n",
"3 13.687360 2.520351 12.924841 16.519156 \n",
"4 0.000000 0.000000 7.939734 21.473545 \n",
"... ... ... ... ... \n",
"2095 8.344204 0.000000 4.453281 16.225835 \n",
"2096 0.000000 0.000000 0.000000 15.984498 \n",
"2097 0.000000 0.000000 0.000000 16.462403 \n",
"2098 0.000000 0.000000 1.528416 2.517452 \n",
"2099 3.322686 0.000000 6.099678 8.342897 \n",
"\n",
" bm25(title) elementCompleteness(attributes).completeness ... \\\n",
"0 3.827236 0.000000 ... \n",
"1 5.679297 0.250000 ... \n",
"2 5.551304 0.333333 ... \n",
"3 10.264424 0.700000 ... \n",
"4 4.905250 0.000000 ... \n",
"... ... ... ... \n",
"2095 2.644281 0.750000 ... \n",
"2096 12.946460 0.000000 ... \n",
"2097 6.943115 0.000000 ... \n",
"2098 1.550136 0.000000 ... \n",
"2099 3.765415 0.321429 ... \n",
"\n",
" textSimilarity(description).fieldCoverage \\\n",
"0 0.000000 \n",
"1 0.019048 \n",
"2 0.022472 \n",
"3 0.017751 \n",
"4 0.004651 \n",
"... ... \n",
"2095 0.017857 \n",
"2096 0.000000 \n",
"2097 0.000000 \n",
"2098 0.005780 \n",
"2099 0.009259 \n",
"\n",
" textSimilarity(description).order \\\n",
"0 0.0 \n",
"1 1.0 \n",
"2 1.0 \n",
"3 0.5 \n",
"4 0.0 \n",
"... ... \n",
"2095 0.0 \n",
"2096 0.0 \n",
"2097 0.0 \n",
"2098 1.0 \n",
"2099 0.0 \n",
"\n",
" textSimilarity(description).proximity \\\n",
"0 0.000000 \n",
"1 0.859375 \n",
"2 1.000000 \n",
"3 0.375000 \n",
"4 0.000000 \n",
"... ... \n",
"2095 0.000000 \n",
"2096 0.000000 \n",
"2097 0.000000 \n",
"2098 0.000000 \n",
"2099 0.000000 \n",
"\n",
" textSimilarity(description).queryCoverage \\\n",
"0 0.000000 \n",
"1 0.666667 \n",
"2 0.666667 \n",
"3 0.600000 \n",
"4 0.500000 \n",
"... ... \n",
"2095 0.250000 \n",
"2096 0.000000 \n",
"2097 0.000000 \n",
"2098 1.000000 \n",
"2099 0.500000 \n",
"\n",
" textSimilarity(description).score textSimilarity(title).fieldCoverage \\\n",
"0 0.000000 0.100000 \n",
"1 0.654591 0.166667 \n",
"2 0.704494 0.153846 \n",
"3 0.389800 0.222222 \n",
"4 0.150930 0.058824 \n",
"... ... ... \n",
"2095 0.078571 0.076923 \n",
"2096 0.000000 0.200000 \n",
"2097 0.000000 0.222222 \n",
"2098 0.451156 0.066667 \n",
"2099 0.151852 0.100000 \n",
"\n",
" textSimilarity(title).order textSimilarity(title).proximity \\\n",
"0 0.0 0.000000 \n",
"1 1.0 0.859375 \n",
"2 1.0 1.000000 \n",
"3 0.0 0.234375 \n",
"4 0.0 0.000000 \n",
"... ... ... \n",
"2095 0.0 0.000000 \n",
"2096 1.0 1.000000 \n",
"2097 1.0 0.859375 \n",
"2098 1.0 0.000000 \n",
"2099 0.0 0.000000 \n",
"\n",
" textSimilarity(title).queryCoverage textSimilarity(title).score \n",
"0 0.500000 0.170000 \n",
"1 0.666667 0.684115 \n",
"2 0.666667 0.730769 \n",
"3 0.400000 0.246476 \n",
"4 0.500000 0.161765 \n",
"... ... ... \n",
"2095 0.250000 0.090385 \n",
"2096 1.000000 0.840000 \n",
"2097 0.666667 0.695226 \n",
"2098 1.000000 0.463333 \n",
"2099 0.500000 0.170000 \n",
"\n",
"[2100 rows x 163 columns]"
]
},
"execution_count": 334,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"all"
]
},
{
"cell_type": "code",
"execution_count": 314,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"term(3).significance -0.154106\n",
"fieldLength(title) -0.144911\n",
"fieldLength(description) -0.127688\n",
"term(4).significance -0.126086\n",
"product_uid -0.122270\n",
" ... \n",
"fieldMatch(title).queryCompleteness 0.331386\n",
"fieldMatch(title).completeness 0.331573\n",
"textSimilarity(title).queryCoverage 0.333115\n",
"nativeRank(title) 0.341930\n",
"relevance 1.000000\n",
"Name: relevance, Length: 158, dtype: float64"
]
},
"execution_count": 314,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Rank features by their correlation with the `relevance` label, computed over\n",
"# the first `training_examples` rows of the frame.\n",
"# Non-numeric columns (e.g. the `search_term` strings) are dropped explicitly:\n",
"# DataFrame.corr() no longer skips them silently and raises in pandas >= 2.0.\n",
"# NaN correlations are removed by the trailing dropna().\n",
"all[0:training_examples].select_dtypes(include=['number', 'bool']).corr()['relevance'].sort_values().dropna()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment