Created
January 31, 2020 18:53
-
-
Save jobergum/432c416b0a34974316af4d8ee01dfd9a to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Explore features " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 334, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>id</th>\n", | |
" <th>product_uid</th>\n", | |
" <th>relevance</th>\n", | |
" <th>search_term</th>\n", | |
" <th>bm25(attributes)</th>\n", | |
" <th>bm25(brand)</th>\n", | |
" <th>bm25(description)</th>\n", | |
" <th>bm25(queries)</th>\n", | |
" <th>bm25(title)</th>\n", | |
" <th>elementCompleteness(attributes).completeness</th>\n", | |
" <th>...</th>\n", | |
" <th>textSimilarity(description).fieldCoverage</th>\n", | |
" <th>textSimilarity(description).order</th>\n", | |
" <th>textSimilarity(description).proximity</th>\n", | |
" <th>textSimilarity(description).queryCoverage</th>\n", | |
" <th>textSimilarity(description).score</th>\n", | |
" <th>textSimilarity(title).fieldCoverage</th>\n", | |
" <th>textSimilarity(title).order</th>\n", | |
" <th>textSimilarity(title).proximity</th>\n", | |
" <th>textSimilarity(title).queryCoverage</th>\n", | |
" <th>textSimilarity(title).score</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>140466</td>\n", | |
" <td>150836</td>\n", | |
" <td>1.67</td>\n", | |
" <td>security pc</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>13.170395</td>\n", | |
" <td>3.827236</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>...</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>0.100000</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>0.500000</td>\n", | |
" <td>0.170000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>60498</td>\n", | |
" <td>116263</td>\n", | |
" <td>1.67</td>\n", | |
" <td>acrylic window panel</td>\n", | |
" <td>6.146889</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>8.738634</td>\n", | |
" <td>11.081958</td>\n", | |
" <td>5.679297</td>\n", | |
" <td>0.250000</td>\n", | |
" <td>...</td>\n", | |
" <td>0.019048</td>\n", | |
" <td>1.0</td>\n", | |
" <td>0.859375</td>\n", | |
" <td>0.666667</td>\n", | |
" <td>0.654591</td>\n", | |
" <td>0.166667</td>\n", | |
" <td>1.0</td>\n", | |
" <td>0.859375</td>\n", | |
" <td>0.666667</td>\n", | |
" <td>0.684115</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>90139</td>\n", | |
" <td>127345</td>\n", | |
" <td>1.67</td>\n", | |
" <td>grappler tool hook</td>\n", | |
" <td>5.998243</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>8.653977</td>\n", | |
" <td>19.471086</td>\n", | |
" <td>5.551304</td>\n", | |
" <td>0.333333</td>\n", | |
" <td>...</td>\n", | |
" <td>0.022472</td>\n", | |
" <td>1.0</td>\n", | |
" <td>1.000000</td>\n", | |
" <td>0.666667</td>\n", | |
" <td>0.704494</td>\n", | |
" <td>0.153846</td>\n", | |
" <td>1.0</td>\n", | |
" <td>1.000000</td>\n", | |
" <td>0.666667</td>\n", | |
" <td>0.730769</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>99118</td>\n", | |
" <td>131063</td>\n", | |
" <td>2.33</td>\n", | |
" <td>1 gallon paint behr paint</td>\n", | |
" <td>13.687360</td>\n", | |
" <td>2.520351</td>\n", | |
" <td>12.924841</td>\n", | |
" <td>16.519156</td>\n", | |
" <td>10.264424</td>\n", | |
" <td>0.700000</td>\n", | |
" <td>...</td>\n", | |
" <td>0.017751</td>\n", | |
" <td>0.5</td>\n", | |
" <td>0.375000</td>\n", | |
" <td>0.600000</td>\n", | |
" <td>0.389800</td>\n", | |
" <td>0.222222</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.234375</td>\n", | |
" <td>0.400000</td>\n", | |
" <td>0.246476</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>109368</td>\n", | |
" <td>135557</td>\n", | |
" <td>2.00</td>\n", | |
" <td>dog poop</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>7.939734</td>\n", | |
" <td>21.473545</td>\n", | |
" <td>4.905250</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>...</td>\n", | |
" <td>0.004651</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>0.500000</td>\n", | |
" <td>0.150930</td>\n", | |
" <td>0.058824</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>0.500000</td>\n", | |
" <td>0.161765</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>...</th>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2095</th>\n", | |
" <td>196723</td>\n", | |
" <td>186504</td>\n", | |
" <td>NaN</td>\n", | |
" <td>everbuilt lock nut m6-1.0mm</td>\n", | |
" <td>8.344204</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>4.453281</td>\n", | |
" <td>16.225835</td>\n", | |
" <td>2.644281</td>\n", | |
" <td>0.750000</td>\n", | |
" <td>...</td>\n", | |
" <td>0.017857</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>0.250000</td>\n", | |
" <td>0.078571</td>\n", | |
" <td>0.076923</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>0.250000</td>\n", | |
" <td>0.090385</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2096</th>\n", | |
" <td>45801</td>\n", | |
" <td>111304</td>\n", | |
" <td>NaN</td>\n", | |
" <td>rice paper</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>15.984498</td>\n", | |
" <td>12.946460</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>...</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>0.200000</td>\n", | |
" <td>1.0</td>\n", | |
" <td>1.000000</td>\n", | |
" <td>1.000000</td>\n", | |
" <td>0.840000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2097</th>\n", | |
" <td>224119</td>\n", | |
" <td>208892</td>\n", | |
" <td>NaN</td>\n", | |
" <td>20 homelite bar</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>16.462403</td>\n", | |
" <td>6.943115</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>...</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>0.222222</td>\n", | |
" <td>1.0</td>\n", | |
" <td>0.859375</td>\n", | |
" <td>0.666667</td>\n", | |
" <td>0.695226</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2098</th>\n", | |
" <td>77697</td>\n", | |
" <td>122439</td>\n", | |
" <td>NaN</td>\n", | |
" <td>add</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>1.528416</td>\n", | |
" <td>2.517452</td>\n", | |
" <td>1.550136</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>...</td>\n", | |
" <td>0.005780</td>\n", | |
" <td>1.0</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>1.000000</td>\n", | |
" <td>0.451156</td>\n", | |
" <td>0.066667</td>\n", | |
" <td>1.0</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>1.000000</td>\n", | |
" <td>0.463333</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2099</th>\n", | |
" <td>147770</td>\n", | |
" <td>154800</td>\n", | |
" <td>NaN</td>\n", | |
" <td>bathroom shelf</td>\n", | |
" <td>3.322686</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>6.099678</td>\n", | |
" <td>8.342897</td>\n", | |
" <td>3.765415</td>\n", | |
" <td>0.321429</td>\n", | |
" <td>...</td>\n", | |
" <td>0.009259</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>0.500000</td>\n", | |
" <td>0.151852</td>\n", | |
" <td>0.100000</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>0.500000</td>\n", | |
" <td>0.170000</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>2100 rows × 163 columns</p>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" id product_uid relevance search_term \\\n", | |
"0 140466 150836 1.67 security pc \n", | |
"1 60498 116263 1.67 acrylic window panel \n", | |
"2 90139 127345 1.67 grappler tool hook \n", | |
"3 99118 131063 2.33 1 gallon paint behr paint \n", | |
"4 109368 135557 2.00 dog poop \n", | |
"... ... ... ... ... \n", | |
"2095 196723 186504 NaN everbuilt lock nut m6-1.0mm \n", | |
"2096 45801 111304 NaN rice paper \n", | |
"2097 224119 208892 NaN 20 homelite bar \n", | |
"2098 77697 122439 NaN add \n", | |
"2099 147770 154800 NaN bathroom shelf \n", | |
"\n", | |
" bm25(attributes) bm25(brand) bm25(description) bm25(queries) \\\n", | |
"0 0.000000 0.000000 0.000000 13.170395 \n", | |
"1 6.146889 0.000000 8.738634 11.081958 \n", | |
"2 5.998243 0.000000 8.653977 19.471086 \n", | |
"3 13.687360 2.520351 12.924841 16.519156 \n", | |
"4 0.000000 0.000000 7.939734 21.473545 \n", | |
"... ... ... ... ... \n", | |
"2095 8.344204 0.000000 4.453281 16.225835 \n", | |
"2096 0.000000 0.000000 0.000000 15.984498 \n", | |
"2097 0.000000 0.000000 0.000000 16.462403 \n", | |
"2098 0.000000 0.000000 1.528416 2.517452 \n", | |
"2099 3.322686 0.000000 6.099678 8.342897 \n", | |
"\n", | |
" bm25(title) elementCompleteness(attributes).completeness ... \\\n", | |
"0 3.827236 0.000000 ... \n", | |
"1 5.679297 0.250000 ... \n", | |
"2 5.551304 0.333333 ... \n", | |
"3 10.264424 0.700000 ... \n", | |
"4 4.905250 0.000000 ... \n", | |
"... ... ... ... \n", | |
"2095 2.644281 0.750000 ... \n", | |
"2096 12.946460 0.000000 ... \n", | |
"2097 6.943115 0.000000 ... \n", | |
"2098 1.550136 0.000000 ... \n", | |
"2099 3.765415 0.321429 ... \n", | |
"\n", | |
" textSimilarity(description).fieldCoverage \\\n", | |
"0 0.000000 \n", | |
"1 0.019048 \n", | |
"2 0.022472 \n", | |
"3 0.017751 \n", | |
"4 0.004651 \n", | |
"... ... \n", | |
"2095 0.017857 \n", | |
"2096 0.000000 \n", | |
"2097 0.000000 \n", | |
"2098 0.005780 \n", | |
"2099 0.009259 \n", | |
"\n", | |
" textSimilarity(description).order \\\n", | |
"0 0.0 \n", | |
"1 1.0 \n", | |
"2 1.0 \n", | |
"3 0.5 \n", | |
"4 0.0 \n", | |
"... ... \n", | |
"2095 0.0 \n", | |
"2096 0.0 \n", | |
"2097 0.0 \n", | |
"2098 1.0 \n", | |
"2099 0.0 \n", | |
"\n", | |
" textSimilarity(description).proximity \\\n", | |
"0 0.000000 \n", | |
"1 0.859375 \n", | |
"2 1.000000 \n", | |
"3 0.375000 \n", | |
"4 0.000000 \n", | |
"... ... \n", | |
"2095 0.000000 \n", | |
"2096 0.000000 \n", | |
"2097 0.000000 \n", | |
"2098 0.000000 \n", | |
"2099 0.000000 \n", | |
"\n", | |
" textSimilarity(description).queryCoverage \\\n", | |
"0 0.000000 \n", | |
"1 0.666667 \n", | |
"2 0.666667 \n", | |
"3 0.600000 \n", | |
"4 0.500000 \n", | |
"... ... \n", | |
"2095 0.250000 \n", | |
"2096 0.000000 \n", | |
"2097 0.000000 \n", | |
"2098 1.000000 \n", | |
"2099 0.500000 \n", | |
"\n", | |
" textSimilarity(description).score textSimilarity(title).fieldCoverage \\\n", | |
"0 0.000000 0.100000 \n", | |
"1 0.654591 0.166667 \n", | |
"2 0.704494 0.153846 \n", | |
"3 0.389800 0.222222 \n", | |
"4 0.150930 0.058824 \n", | |
"... ... ... \n", | |
"2095 0.078571 0.076923 \n", | |
"2096 0.000000 0.200000 \n", | |
"2097 0.000000 0.222222 \n", | |
"2098 0.451156 0.066667 \n", | |
"2099 0.151852 0.100000 \n", | |
"\n", | |
" textSimilarity(title).order textSimilarity(title).proximity \\\n", | |
"0 0.0 0.000000 \n", | |
"1 1.0 0.859375 \n", | |
"2 1.0 1.000000 \n", | |
"3 0.0 0.234375 \n", | |
"4 0.0 0.000000 \n", | |
"... ... ... \n", | |
"2095 0.0 0.000000 \n", | |
"2096 1.0 1.000000 \n", | |
"2097 1.0 0.859375 \n", | |
"2098 1.0 0.000000 \n", | |
"2099 0.0 0.000000 \n", | |
"\n", | |
" textSimilarity(title).queryCoverage textSimilarity(title).score \n", | |
"0 0.500000 0.170000 \n", | |
"1 0.666667 0.684115 \n", | |
"2 0.666667 0.730769 \n", | |
"3 0.400000 0.246476 \n", | |
"4 0.500000 0.161765 \n", | |
"... ... ... \n", | |
"2095 0.250000 0.090385 \n", | |
"2096 1.000000 0.840000 \n", | |
"2097 0.666667 0.695226 \n", | |
"2098 1.000000 0.463333 \n", | |
"2099 0.500000 0.170000 \n", | |
"\n", | |
"[2100 rows x 163 columns]" | |
] | |
}, | |
"execution_count": 334, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"all" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 314, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"term(3).significance -0.154106\n", | |
"fieldLength(title) -0.144911\n", | |
"fieldLength(description) -0.127688\n", | |
"term(4).significance -0.126086\n", | |
"product_uid -0.122270\n", | |
" ... \n", | |
"fieldMatch(title).queryCompleteness 0.331386\n", | |
"fieldMatch(title).completeness 0.331573\n", | |
"textSimilarity(title).queryCoverage 0.333115\n", | |
"nativeRank(title) 0.341930\n", | |
"relevance 1.000000\n", | |
"Name: relevance, Length: 158, dtype: float64" | |
] | |
}, | |
"execution_count": 314, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Correlation (pandas default method) of every numeric column with the\n", | |
"# `relevance` label, sorted ascending; undefined (NaN) correlations are dropped.\n", | |
"# Non-numeric columns such as `search_term` are excluded explicitly because\n", | |
"# pandas >= 2.0 no longer drops them silently in DataFrame.corr() and raises.\n", | |
"# NOTE(review): assumes the first `training_examples` rows are the labelled ones\n", | |
"# (the tail rows displayed above have relevance = NaN) -- confirm where `all` is built.\n", | |
"(\n", | |
"    all[0:training_examples]\n", | |
"    .select_dtypes(include=['number', 'bool'])\n", | |
"    .corr()['relevance']\n", | |
"    .sort_values()\n", | |
"    .dropna()\n", | |
")" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.7.4" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 4 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment