Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save analyticsindiamagazine/d07038c1e42c2269ddaaf868eb8cd58a to your computer and use it in GitHub Desktop.
Save analyticsindiamagazine/d07038c1e42c2269ddaaf868eb8cd58a to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"#1 Importing necessary libraries\n",
"import numpy as np\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
" ---------------------------------------- \n",
" ABV Ratings Cellar Temperature Score\n",
"0 7.5 1 40-45 4.08\n",
"1 5.3 22 40-45 3.82\n",
"2 9.0 1 45-50 4.03\n",
"3 4.6 1 35-40 4.00\n",
"4 6.9 1 45-50 3.75\n",
"5 7.9 32 40-45 4.26\n",
"6 4.7 141 35-40 3.47\n",
"7 5.6 1 40-45 3.70\n",
"8 5.0 1 40-45 3.90\n",
"9 5.4 12 40-45 3.79\n"
]
}
],
"source": [
"#2 Load the beer data set from the CSV file\n",
"dataset = pd.read_csv('beer_data.csv')\n",
"\n",
"#Show the first 10 rows beneath a 40-dash rule\n",
"print(\n",
"    \"\\n\",\n",
"    '-'*40,\n",
"    \"\\n\",\n",
"    dataset.head(10),\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"----------------------------\n",
" ABV Ratings Minimum_Cellar_Temp Maximum_Cellar_Temp Score\n",
"0 7.5 1 40 45 4.08\n",
"1 5.3 22 40 45 3.82\n",
"2 9.0 1 45 50 4.03\n",
"3 4.6 1 35 40 4.00\n",
"4 6.9 1 45 50 3.75\n",
"5 7.9 32 40 45 4.26\n",
"6 4.7 141 35 40 3.47\n",
"7 5.6 1 40 45 3.70\n",
"8 5.0 1 40 45 3.90\n",
"9 5.4 12 40 45 3.79\n",
"\n",
"----------------------------\n",
"\n",
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 1631 entries, 0 to 1630\n",
"Data columns (total 5 columns):\n",
"ABV 1631 non-null float64\n",
"Ratings 1631 non-null int64\n",
"Minimum_Cellar_Temp 1631 non-null int64\n",
"Maximum_Cellar_Temp 1631 non-null int64\n",
"Score 1631 non-null float64\n",
"dtypes: float64(2), int64(3)\n",
"memory usage: 63.8 KB\n",
"None\n"
]
}
],
"source": [
"#3 Dealing with the categorical data\n",
"\n",
"#'Cellar Temperature' holds 'min-max' strings such as '40-45'. Split the column once with the\n",
"#vectorised .str accessor (instead of a row-by-row apply) and cast each bound from str to int\n",
"cellar_bounds = dataset['Cellar Temperature'].str.split('-', expand=True)\n",
"dataset['Minimum_Cellar_Temp'] = cellar_bounds[0].str.strip().astype('int64')\n",
"dataset['Maximum_Cellar_Temp'] = cellar_bounds[1].str.strip().astype('int64')\n",
"\n",
"#New dataset with selected features\n",
"dataset = dataset[['ABV', 'Ratings','Minimum_Cellar_Temp','Maximum_Cellar_Temp', 'Score']]\n",
"\n",
"#Printing first 10 rows of the dataset\n",
"print(\"\\n----------------------------\\n\",dataset.head(10))\n",
"\n",
"#Printing the summary of the dataset\n",
"print(\"\\n----------------------------\\n\")\n",
"#info() writes the summary to stdout itself and returns None, so it is called directly;\n",
"#wrapping it in print() would append a stray 'None' line to the output\n",
"dataset.info()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[Text(0.5, 0, 'ABV'),\n",
" Text(1.5, 0, 'Ratings'),\n",
" Text(2.5, 0, 'Minimum_Cellar_Temp'),\n",
" Text(3.5, 0, 'Maximum_Cellar_Temp'),\n",
" Text(4.5, 0, 'Score')]"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 720x720 with 2 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"#A simple correlation plot using seaborn. The plot below shows how the different variables correlate with each other\n",
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
"\n",
"corr = dataset.corr()\n",
"fig, ax = plt.subplots(figsize=(10,10))\n",
"\n",
"#Fixed -1..1 colour scale centred on 0, with the coefficient annotated inside every square cell\n",
"ax = sns.heatmap(corr, cmap=\"YlGnBu\", vmin=-1, vmax=1, center=0,\n",
"                 annot=True, square=True, linewidths=.5)\n",
"\n",
"#Rotating labels on x axis; as the last expression, the returned label list is what the cell displays\n",
"ax.set_xticklabels(ax.get_xticklabels(), rotation=35, horizontalalignment='right')"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"#4 Classifying dependent and independent variables\n",
"\n",
"#Dependent feature / target variable: the last column only (Score)\n",
"y = dataset.iloc[:, -1].values\n",
"\n",
"#Independent features: every column except the last one (everything but Score)\n",
"X = dataset.iloc[:, :-1].values"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"#5 Creating training and test sets (20% of the rows held out for testing, fixed random_state)\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"X_train, X_test, y_train, y_test = train_test_split(\n",
"    X,\n",
"    y,\n",
"    test_size=0.2,\n",
"    random_state=0,\n",
")\n",
"\n",
"#################Data Preprocessing Ends #################################"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"\n",
"Training Set :\n",
"----------------\n",
"\n",
"X = \n",
" [[ 5.2 5. 40. 45. ]\n",
" [ 5.6 1. 35. 40. ]\n",
" [ 4.8 2. 40. 45. ]\n",
" ...\n",
" [ 6. 1. 40. 45. ]\n",
" [ 4.5 2. 40. 45. ]\n",
" [ 7.4 4. 40. 45. ]]\n",
"y = \n",
" [3.79 3.9 3.44 ... 4.35 3.29 3.74]\n",
"\n",
"\n",
"Test Set :\n",
"----------------\n",
"\n",
"X = \n",
" [[ 4.8 2. 35. 40. ]\n",
" [ 5.1 0. 35. 40. ]\n",
" [ 4. 3. 40. 45. ]\n",
" ...\n",
" [ 5.1 0. 35. 40. ]\n",
" [ 5. 17. 35. 40. ]\n",
" [ 8. 24. 45. 50. ]]\n",
"y = \n",
" [3.13 0. 2.82 3.91 4.38 4.25 3.82 0. 3.49 2. 3.4 3.53 3.89 3.64\n",
" 3.84 4.17 3.71 4.31 3.9 3. 3.83 3.43 0. 3.98 3.81 0. 3.84 3.72\n",
" 2.97 4.12 3.22 0. 3.77 0. 4.27 4.07 3.8 4. 3.1 3.46 3.75 3.86\n",
" 3.72 3.55 4.16 4.29 4.18 3.86 4.23 4.11 3.72 1.95 3.87 1.99 3.84 3.75\n",
" 0. 3.77 3.25 3.87 4.25 3.73 3.84 0. 3.5 3.59 4.29 3.17 3.37 4.29\n",
" 3.67 3.4 3.9 4.21 4.09 3.56 3.25 3.73 3.79 3.98 3.63 4.04 4.31 3.96\n",
" 4.24 3.71 2.73 0. 3.65 3.8 3.58 4.5 0. 4. 0. 3.71 3.61 3.63\n",
" 3.06 3.58 4.09 3.12 3.59 3.73 4.06 3.5 3.81 3.43 3.7 3.97 3.58 3.99\n",
" 4.14 3.5 0. 4.25 3.41 4.26 4.4 3.89 3.75 0. 3.8 3.89 3.92 2.72\n",
" 4.72 4.08 3.9 3.8 3.88 0. 3.27 3.59 3.98 3.79 3.75 4.03 2. 3.94\n",
" 3.37 3.98 3.32 0. 3.11 3.15 4.33 3. 4.02 2.92 3.73 0. 0. 3.91\n",
" 0. 4.43 0. 0. 4.28 3.8 3.58 4.01 3.56 4. 2.79 3.88 4.16 4.25\n",
" 3.69 4.29 4.14 4.24 3.86 3.65 3.92 4.34 3.76 3.13 3.26 4.05 3.86 3.75\n",
" 3.76 3.87 0. 4.02 3.5 3.12 3.83 3.43 0. 3.88 3.75 3.96 0. 4.06\n",
" 4.25 2.75 3.65 3.3 3.88 3.78 4.03 3.75 4.08 4.08 4.04 0. 3.25 0.\n",
" 4. 3.38 3.63 0. 3.72 3.79 0. 0. 4.09 3.65 4.13 3.63 4. 0.\n",
" 5. 0. 3.76 3.72 3.85 3.5 0. 2.1 3. 0. 0. 3.66 3.76 3.46\n",
" 3.82 4.04 3.72 1.95 3.89 3.91 3.73 4.85 0. 3.79 4.49 4.14 3.87 3.68\n",
" 3.49 0. 3.65 4.35 4.59 4.25 3.86 4.07 3.73 0. 4.3 3.87 3.51 3.79\n",
" 3.62 3.68 3.63 3.42 0. 4.05 3.61 4.18 3.84 3.73 4.05 3.66 3.69 3.6\n",
" 3.56 4.25 4.38 3.89 3.69 3.79 0. 3.56 3.96 3.29 3.69 0. 0. 4.14\n",
" 3.13 1.31 3.72 3.79 4.1 0. 4.28 3.67 0. 3.94 4.35 3.94 3.43 4.08\n",
" 3.61 4.47 3.83 3.15 3.96 3.71 4.25 3.5 4.04 4. 3.52 4.39 3.77 4.15\n",
" 2.96 4.13 0. 4.05 4.14]\n"
]
}
],
"source": [
"#Print the features and targets of each split under its own header\n",
"for header, features, target in (\n",
"    (\"\\n\\nTraining Set :\\n----------------\\n\", X_train, y_train),\n",
"    (\"\\n\\nTest Set :\\n----------------\\n\", X_test, y_test),\n",
"):\n",
"    print(header)\n",
"    print(\"X = \\n\", features)\n",
"    print(\"y = \\n\", target)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"\"\"\" Multiple Linear regression \"\"\"\n",
"\n",
"#6 Creating the Regressor (this cell only constructs it; no data is fitted here)\n",
"\n",
"from sklearn.linear_model import LinearRegression\n",
"#The `normalize` argument was deprecated in scikit-learn 1.0 and removed in 1.2, where passing\n",
"#it raises a TypeError. Rescaling the features does not change ordinary-least-squares\n",
"#predictions, so the default estimator is used.\n",
"regressor = LinearRegression()"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=True)"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#7 Feeding the data and training the model: fit the regressor on the training features and targets\n",
"regressor.fit(X_train,y_train)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"#8 Predicting the Score for test set observations; the result is kept in y_pred\n",
"y_pred = regressor.predict(X_test)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"----------------------------\n",
"Predictions = \n",
" [2.89362768 2.90755783 3.07058711 3.48413975 3.65433446 2.9310254\n",
" 3.12667115 2.90202462 3.11905124 3.1177165 3.13259288 3.13790974\n",
" 2.84802728 2.95505068 3.58824928 3.15091575 2.85489522 3.2262689\n",
" 3.13565087 3.184115 3.23524861 3.64880125 3.11084856 5.82687381\n",
" 3.37539607 3.53546761 3.54748025 3.08718673 3.12458445 3.27929496\n",
" 3.23944708 3.54100082 3.16387461 3.19384668 3.56883606 3.08031878\n",
" 3.38869332 3.2505135 2.98405145 3.05685121 3.14538254 3.14972514\n",
" 3.13259288 3.26844786 3.23391387 3.2241822 4.24094226 3.7083318\n",
" 3.08451725 3.67646729 3.57475778 3.1177165 3.21864899 2.91576051\n",
" 3.23381064 3.14538254 3.15511422 3.17858179 3.20071463 3.12189289\n",
" 3.30031237 3.25031924 3.25604671 3.16618064 2.88675973 3.4108733\n",
" 3.61980047 3.15758944 2.9666748 6.49031035 3.39274468 2.96822886\n",
" 3.06791763 3.54080656 2.96002618 3.16331691 3.1121833 3.01936247\n",
" 3.22799215 3.13984934 3.10186884 3.26364155 3.44832413 3.73313411\n",
" 3.41640651 2.91097927 3.23391387 3.07764931 3.22092995 3.49154033\n",
" 3.22284746 3.05685121 3.53546761 3.2241822 3.17171385 3.02884976\n",
" 2.93216588 2.95082714 3.32225095 3.10665009 2.9046941 3.14652302\n",
" 3.14727499 3.38740573 3.21178104 3.10133918 3.1299234 2.93545196\n",
" 3.31652348 3.17858179 3.05675409 3.23391387 2.91442577 3.28924596\n",
" 3.11084856 3.51180579 2.91231401 3.32508962 3.12324971 3.14671728\n",
" 3.23828154 2.99055595 3.21311578 3.21178104 3.05248653 2.85069675\n",
" 3.79113567 3.53583106 3.20624783 3.45513897 3.17438332 3.23257914\n",
" 2.9502945 3.29931602 3.10931956 3.16006466 3.20758257 3.04122585\n",
" 3.19518142 3.21711999 3.53813709 3.39293894 3.1562547 2.90202462\n",
" 3.28596864 2.85889943 8.46101308 3.40953856 3.32906876 2.88122652\n",
" 3.15902741 3.13298139 3.22151272 3.01402352 3.09978214 3.28074578\n",
" 3.14404781 3.19937989 3.29303085 3.56713787 3.0029571 3.12458445\n",
" 3.87218465 3.27264633 2.9680346 3.25738145 3.1177165 3.15225049\n",
" 3.16189115 3.28885744 3.23658335 3.1035921 3.46809344 2.92566138\n",
" 2.92549219 3.22073569 4.22552699 3.56446839 2.90850405 3.6228334\n",
" 2.92262846 2.9310254 3.18659022 2.90889257 3.17724706 2.9974239\n",
" 3.13431613 2.91136778 2.98577471 3.55194489 3.39713741 3.61275708\n",
" 3.18525548 3.51581 2.90202462 3.29058069 3.13431613 2.9102273\n",
" 3.56446839 3.14805202 3.34314007 3.37080611 3.52020273 3.44827102\n",
" 3.86996336 3.49234243 3.18659022 3.12191497 3.26157992 3.15511422\n",
" 3.07309036 3.28591852 2.9310254 3.23811235 3.56560887 3.04158633\n",
" 3.23811235 3.14404781 3.15091575 3.13210284 3.21178104 2.94417852\n",
" 3.28924596 3.53546761 3.08595523 3.15511422 3.31947824 3.53527336\n",
" 3.76919709 3.70413333 3.26024518 2.88809447 3.19518142 3.12191497\n",
" 3.60186611 3.1476635 2.89362768 3.33816457 3.34942822 3.16198217\n",
" 3.15491997 3.09558367 3.23658335 3.14193604 3.37080611 3.53469058\n",
" 2.99055595 2.87569332 3.18792496 3.51180579 3.57626171 2.89229294\n",
" 3.07345084 3.36947137 3.05682615 3.57514629 3.28924596 3.78127867\n",
" 2.96555938 3.67646729 2.93655861 2.90202462 4.98814472 3.26900557\n",
" 3.16445739 3.75813068 3.17991653 3.55893518 3.10092262 3.22245894\n",
" 3.2049131 3.48154844 2.92376894 3.12324971 3.11198904 3.54748025\n",
" 3.10665009 3.26424939 3.34591278 2.91986185 3.71663772 3.41507177\n",
" 3.33484636 3.73046464 2.90335936 3.57876199 3.12191497 3.16634983\n",
" 3.26473191 3.16288124 3.43194382 2.90755783 3.26024518 3.62113521\n",
" 3.05952069 2.82036124 2.91270252 3.23658335 3.01258555 2.85775895\n",
" 3.51847947 3.17991653 3.13298139 3.59194018 3.28235295 3.56980734\n",
" 3.12189289 3.26711312 2.93636435 3.22952115 3.66406614 2.90335936\n",
" 3.7027986 3.16487974 3.23391387 3.17991653 3.39788938 3.02508994\n",
" 2.9046941 3.65853293 3.26558413 3.18964821 3.15644896 3.215591\n",
" 2.90755783 2.92471516 3.53983528]\n"
]
}
],
"source": [
"#Printing the full array of predictions beneath a dashed rule\n",
"print(\"\\n----------------------------\\nPredictions = \\n\",y_pred)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
" ----------------------------------------\n",
"RMLSE : 0.23371046561588427\n",
"Score : 0.7662895343841157\n"
]
}
],
"source": [
"#9 Calculating score from Root Mean Log Squared Error\n",
"\n",
"def rmlse(y_test, y_pred):\n",
"    \"\"\"Return the root mean log squared error and the score derived from it.\n",
"\n",
"    error = sqrt(mean((log10(y_pred + 1) - log10(y_test + 1)) ** 2)), using base-10\n",
"    logarithms, and score = 1 - error.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    y_test : array-like\n",
"        Actual target values (ndarray, list, tuple or pandas object).\n",
"    y_pred : array-like\n",
"        Predicted target values, compared with y_test position by position.\n",
"\n",
"    Returns\n",
"    -------\n",
"    tuple\n",
"        (error, score)\n",
"\n",
"    Notes\n",
"    -----\n",
"    The logarithm is undefined for values <= -1, so such inputs propagate nan/-inf\n",
"    into the result.\n",
"    \"\"\"\n",
"    #Coerce to float ndarrays so the arithmetic is purely positional: lists and tuples\n",
"    #gain element-wise '+', and pandas objects are not re-aligned on their index labels\n",
"    #(label alignment would silently turn mismatched indexes into NaN)\n",
"    actual = np.asarray(y_test, dtype=float)\n",
"    predicted = np.asarray(y_pred, dtype=float)\n",
"\n",
"    error = np.square(np.log10(predicted + 1) - np.log10(actual + 1)).mean() ** 0.5\n",
"    score = 1 - error\n",
"    return error, score\n",
"\n",
"error, score = rmlse(y_test, y_pred)\n",
"\n",
"#Report both metrics beneath a 40-dash rule\n",
"print(\"\\n\",'-'*40)\n",
"for label, value in ((\"RMLSE : \", error), (\"Score : \", score)):\n",
"    print(label, value)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment