"name": "MultiVar-Linear-Regression.ipynb",
"# **Multiple Variable Linear Regression**\n",
"Multiple variable linear regression คือ โมเดลทางสถิติที่สามารถบอกถึงความสัมพันธ์ ของตัวแปรต้น \"หลายตัว\" และตัวแปรตามได้ \n",
"ซึ่งสามารถเขียน **hypothesis function** ได้ว่า <br>\n",
"\\begin{equation}\n",
"h_\\theta (x) = \\theta_0 + \\theta_1 x_1 + \\theta_2 x_2 + \\theta_3 x_3 + \\dots + \\theta_n x_n\n",
"\\end{equation} <br>\n",
"ซึ่งถ้าเรามองว่า $\\theta$ คือ vector <$\\theta _0 , \\theta _1,\\theta _2 , \\theta_3,...,\\theta_n$><br>\n",
"$\\;\\;\\;\\;\\;\\;\\;\\;\\;\\;\\;\\;\\;\\;\\;\\;\\;\\;\\;$ x คือ vector <$x_0 , x_1,x_2 , x_3,...,x_n$> โดยกำหนดให้ $x_0 = 1$<br><br>\n",
"\\begin{equation}\n",
"h_\\theta (x) = \\theta^Tx \n",
"\\end{equation} <br>\n",
" สามารถเขียน **cost function** หรือ loss function ได้ว่า <br><br> \n",
"\\begin{equation}\n",
"J(\\theta) = \\frac{1}{2m}\\sum^m_{i=1}(h_\\theta(x^{(i)}) - y^{(i)})^2\n",
"\\end{equation}\n",
"import numpy as np\n",
"import pandas as pd\n",
"from sklearn.datasets import load_boston\n",
"pd.set_option('display.max_rows', df.shape[0]+1) "
"โดยขั้นแรกจะทำการโหลด boston housing datasets ด้วย library sklearn "
"dat = load_boston(return_X_y=False) # boston housing datasets"
"df = pd.DataFrame(dat.data,columns=dat.feature_names) # create dataframe\n",
"outputs": [
" <tbody>
" <tr>\n",
" <th>0</th>\n",
" <td>0.00632</td>\n",
" <td>18.0</td>\n",
" <td>2.31</td>\n",
" <td>0.0</td>\n",
" <td>0.538</td>\n",
" <td>6.575</td>\n",
" <td>65.2</td>\n",
" <td>4.0900</td>\n",
" <td>1.0</td>\n",
" <td>296.0</td>\n",
" <td>15.3</td>\n",
" <td>396.90</td>\n",
" <td>4.98</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0.02731</td>\n",
" <td>0.0</td>\n",
" <td>7.07</td>\n",
" <td>0.0</td>\n",
" <td>0.469</td>\n",
" <td>6.421</td>\n",
" <td>78.9</td>\n",
" <td>4.9671</td>\n",
" <td>2.0</td>\n",
" <td>242.0</td>\n",
" <td>17.8</td>\n",
" <td>396.90</td>\n",
" <td>9.14</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0.02729</td>\n",
" <td>0.0</td>\n",
" <td>7.07</td>\n",
" <td>0.0</td>\n",
" <td>0.469</td>\n",
" <td>7.185</td>\n",
" <td>61.1</td>\n",
" <td>4.9671</td>\n",
" <td>2.0</td>\n",
" <td>242.0</td>\n",
" <td>17.8</td>\n",
" <td>392.83</td>\n",
" <td>4.03</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0.03237</td>\n",
" <td>0.0</td>\n",
" <td>2.18</td>\n",
" <td>0.0</td>\n",
" <td>0.458</td>\n",
" <td>6.998</td>\n",
" <td>45.8</td>\n",
" <td>6.0622</td>\n",
" <td>3.0</td>\n",
" <td>222.0</td>\n",
" <td>18.7</td>\n",
" <td>394.63</td>\n",
" <td>2.94</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0.06905</td>\n",
" <td>0.0</td>\n",
" <td>2.18</td>\n",
" <td>0.0</td>\n",
" <td>0.458</td>\n",
" <td>7.147</td>\n",
" <td>54.2</td>\n",
" <td>6.0622</td>\n",
" <td>3.0</td>\n",
" <td>222.0</td>\n",
" <td>18.7</td>\n",
" <td>396.90</td>\n",
" <td>5.33</td>\n",
" </tr>\n",
" </tbody>
"text/plain":
"0 0.00632 18.0 2.31 0.0 0.538 ... 1.0 296.0 15.3 396.90 4.98\n",
"1 0.02731 0.0 7.07 0.0 0.469 ... 2.0 242.0 17.8 396.90 9.14\n",
"2 0.02729 0.0 7.07 0.0 0.469 ... 2.0 242.0 17.8 392.83 4.03\n",
"3 0.03237 0.0 2.18 0.0 0.458 ... 3.0 222.0 18.7 394.63 2.94\n",
"4 0.06905 0.0 2.18 0.0 0.458 ... 3.0 222.0 18.7 396.90 5.33\n",
"[5 rows x 13 columns]"
"เป้าหมายของเราคือการทำนายราคาของบ้าน จากข้อมูล"
"target = pd.DataFrame(dat.target,columns=['PRICES']) # target prices\n",
"outputs": [
" <tbody>
" <th>0</th>\n",
" <td>24.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>21.6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>34.7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>33.4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>36.2</td>\n",
" </tr>\n",
" </tbody>\n",
"text/plain":
" PRICES\n",
"0 24.0\n",
"1 21.6\n",
"2 34.7\n",
"3 33.4\n",
"4 36.2"
"โดยจากที่เห็นตัวข้อมูลจะพบว่า ข้อมูลในแต่ละ feature มี scale ที่ต่างกันมาก เลยมาทำการ scaling ด้วย **Z-score Normalization**<br><br>\n",
"\\begin{equation}\n",
"x_j := \\frac{x_j - \\mu_j}{\\sigma_j}\n",
"\\end{equation} <br>\n",
"$\\mu_j$ คือ ค่าเฉลี่ยของ feature นั้นๆ<br>\n",
"$\\sigma_j$ คือ ส่วนเบี่ยงเบนมาตรฐานของ feature นั้นๆ"
"def scaling(x): #scaling input \n",
" std = np.std(x)\n",
" std[0] = 1\n",
" mean = np.mean(x)\n",
" mean[0] = 0\n",
" return (x-mean)/std\n",
"df = scaling(df)\n",
"outputs": [
" <tbody>
" <th>0</th>\n",
" <td>0.00632</td>\n",
" <td>0.284830</td>\n",
" <td>-1.287909</td>\n",
" <td>-0.272599</td>\n",
" <td>-0.144217</td>\n",
" <td>0.413672</td>\n",
" <td>-0.120013</td>\n",
" <td>0.140214</td>\n",
" <td>-0.982843</td>\n",
" <td>-0.666608</td>\n",
" <td>-1.459000</td>\n",
" <td>0.441052</td>\n",
" <td>-1.075562</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0.02731</td>\n",
" <td>-0.487722</td>\n",
" <td>-0.593381</td>\n",
" <td>-0.272599</td>\n",
" <td>-0.740262</td>\n",
" <td>0.194274</td>\n",
" <td>0.367166</td>\n",
" <td>0.557160</td>\n",
" <td>-0.867883</td>\n",
" <td>-0.987329</td>\n",
" <td>-0.303094</td>\n",
" <td>0.441052</td>\n",
" <td>-0.492439</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0.02729</td>\n",
" <td>-0.487722</td>\n",
" <td>-0.593381</td>\n",
" <td>-0.272599</td>\n",
" <td>-0.740262</td>\n",
" <td>1.282714</td>\n",
" <td>-0.265812</td>\n",
" <td>0.557160</td>\n",
" <td>-0.867883</td>\n",
" <td>-0.987329</td>\n",
" <td>-0.303094</td>\n",
" <td>0.396427</td>\n",
" <td>-1.208727</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0.03237</td>\n",
" <td>-0.487722</td>\n",
" <td>-1.306878</td>\n",
" <td>-0.272599</td>\n",
" <td>-0.835284</td>\n",
" <td>1.016303</td>\n",
" <td>-0.809889</td>\n",
" <td>1.077737</td>\n",
" <td>-0.752922</td>\n",
" <td>-1.106115</td>\n",
" <td>0.113032</td>\n",
" <td>0.416163</td>\n",
" <td>-1.361517</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0.06905</td>\n",
" <td>-0.487722</td>\n",
" <td>-1.306878</td>\n",
" <td>-0.272599</td>\n",
" <td>-0.835284</td>\n",
" <td>1.228577</td>\n",
" <td>-0.511180</td>\n",
" <td>1.077737</td>\n",
" <td>-0.752922</td>\n",
" <td>-1.106115</td>\n",
" <td>0.113032</td>\n",
" <td>0.441052</td>\n",
" <td>-1.026501</td>\n",
" </tr>\n",
" </tbody>\n",
"text/plain":
"0 0.00632 0.284830 -1.287909 ... -1.459000 0.441052 -1.075562\n",
"1 0.02731 -0.487722 -0.593381 ... -0.303094 0.441052 -0.492439\n",
"2 0.02729 -0.487722 -0.593381 ... -0.303094 0.396427 -1.208727\n",
"3 0.03237 -0.487722 -1.306878 ... 0.113032 0.416163 -1.361517\n",
"4 0.06905 -0.487722 -1.306878 ... 0.113032 0.441052 -1.026501\n",
"[5 rows x 13 columns]"
"ต่อจากนั้นก็มาทำสมการ **hypothesis function**, **cost function** และ **gradient descent** <br><br>\n",
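"\\begin{equation}\n",
"Hypothesis\\;Function :\\;\\;\n",
"h_\\theta (x) = \\theta^Tx \n",
"\\end{equation} <br>\n",
"\\begin{equation}\n",
"Cost\\;Function :\\;\\;\n",
"J(\\theta) = \\frac{1}{2m}\\sum^m_{i=1}(h_\\theta(x^{(i)}) - y^{(i)})^2\n",
"\\end{equation} <br>\n",
"\\begin{equation}\n",
"Gradient\\;Descent :\\;\\;\n",
"\\theta_j := \\theta_j-\\alpha\\frac{1}{m}\\sum^m_{i=1}(h_\\theta(x^{(i)}) - y^{(i)})x^{(i)}_j\n",
"\\end{equation}\n",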
"ones = pd.DataFrame({'ones':np.ones(df.shape[0]).T}) \n",
"df = ones.join(df)\n",
"m = df.shape[0]\n",
"n = df.shape[1]\n",
"s = np.zeros(n)\n",
"sT = np.array([s]).T #all parameters\n",
"def hypo(x,sT): #hypothesis function\n",
"    return np.dot(x,sT)\n",
"def costFunction(x,sT): #cost function\n",
" h = hypo(x,sT)\n",
" return (1/(2*m))*sum(np.array((h-target))**2)\n",
" \n",
"def gradientCost(x,sT): #gradient descent\n",
"    h = hypo(x,sT)\n",
"    return (1/(m))*np.dot(x.T,(h-target))"
"กำหนดให้ learning rate = 0.01 <br>\n",
"และ เริ่มทำการ train"
"alpha = 0.01 # learning rate\n",
"last = costFunction(df,sT)+1\n",
"while last - costFunction(df,sT)>0.001: #loop until change < 0.001\n",
" last = costFunction(df,sT)\n",
" sT = sT - alpha*(gradientCost(df,sT))\n",
"print('result :',sT.T[0])"
"result : [22.65365393 -0.06590317 0.64831977 -0.18460845 0.76100326 -1.12507124\n",
" 3.03972749 -0.11594876 -2.25233247 0.81150778 -0.56628214 -1.82116539\n",
" 0.88059889 -3.70432229]\n"
"สรุป เปรียบเทียบผลการทำนาย"
"compare = pd.DataFrame({'Target Prices':np.array(target).T[0],'Predicted Prices':np.dot(df,sT).T[0]}) # comparing result with target \n",
" <tbody>
" <th>0</th>\n",
" <td>24.0</td>\n",
" <td>30.595625</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>21.6</td>\n",
" <td>24.982998</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>34.7</td>\n",
" <td>30.979016</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>33.4</td>\n",
" <td>29.284144</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>36.2</td>\n",
" <td>28.673260</td>\n",
" </tr>\n",
" </tbody>\n",
"text/plain":
" Target Prices Predicted Prices\n",
"0 24.0 30.595625\n",
"1 21.6 24.982998\n",
"2 34.7 30.979016\n",
"3 33.4 29.284144\n",
"4 36.2 28.673260"
