Created
May 29, 2022 13:21
-
-
Save JIElite/f021e43d5775647bdf60a6e309682187 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"id": "d2478df4", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"%matplotlib inline\n", | |
"import pandas as pd\n", | |
"from pandas.plotting import scatter_matrix\n", | |
"import matplotlib.pyplot as plt" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"id": "2039b0c7", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Show up to 120 columns and 120 rows when a DataFrame is displayed.\n", | |
"pd.set_option('display.max_columns', 120, 'display.max_rows', 120)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"id": "392e6460", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def clean_and_drop(df):\n", | |
"    \"\"\"Filter the transaction table to residential sales and drop unused columns.\n", | |
"\n", | |
"    Rows are kept only when they pass every filter below. Columns that are\n", | |
"    constant after filtering, or that are not used as features, are removed,\n", | |
"    and the listed categorical columns are returned with ``category`` dtype.\n", | |
"\n", | |
"    Parameters\n", | |
"    ----------\n", | |
"    df : pandas.DataFrame\n", | |
"        Input table. It must contain every column referenced in this function;\n", | |
"        a missing column raises ``KeyError``.\n", | |
"\n", | |
"    Returns\n", | |
"    -------\n", | |
"    pandas.DataFrame\n", | |
"        A new, filtered frame. The frame passed in is not modified.\n", | |
"    \"\"\"\n", | |
"    # Keep only transactions whose main usage includes residential use.\n", | |
"    df = df.loc[df['Main_Usage_Living'] == 1]\n", | |
"    df = df.drop(columns=['Main_Usage_Living'])\n", | |
"\n", | |
"    # Dropped because these columns are all 0.\n", | |
"    df = df.drop(columns=['Non_City_Land_Usage', 'Main_Usage_Walk',\n", | |
"                          'Main_Usage_Selling',\n", | |
"                          'Main_Usage_SnE'])\n", | |
"\n", | |
"    # Only 344 records include factory usage and none of them are residential,\n", | |
"    # so they are removed.\n", | |
"    df = df.loc[df['Main_Usage_Manufacturing'] == 0]\n", | |
"    df = df.drop(columns=['Main_Usage_Manufacturing'])\n", | |
"\n", | |
"    # Only 76 records include parking usage and none of them are residential,\n", | |
"    # so they are removed.\n", | |
"    df = df.loc[df['Main_Usage_Parking'] == 0]\n", | |
"    df = df.drop(columns=['Main_Usage_Parking'])\n", | |
"\n", | |
"    # Only 78 records include farm usage and none of them are residential,\n", | |
"    # so they are removed.\n", | |
"    df = df.loc[df['Main_Usage_Farm'] == 0]\n", | |
"    df = df.drop(columns=['Main_Usage_Farm'])\n", | |
"\n", | |
"    # NOTICE: limited budget, so for now only homes with fewer than 6 rooms\n", | |
"    # are considered.\n", | |
"    df = df.loc[df['room'] < 6]\n", | |
"\n", | |
"    df = df.loc[df['trading_floors_count'] == 1]\n", | |
"\n", | |
"    # 95 samples include a basement, but that is too few to generalize from,\n", | |
"    # so they are removed. The column is then all 0 and is dropped.\n", | |
"    df = df.loc[df['including_basement'] == 0]\n", | |
"    df = df.drop(columns=['including_basement'])\n", | |
"\n", | |
"    # Per the original note, no sample has this flag set, so the feature is\n", | |
"    # removed outright.\n", | |
"    df = df.drop(columns=['including_arcade'])\n", | |
"\n", | |
"    # Remove rows whose traded floor height is -1 (originally one sample).\n", | |
"    df = df.loc[df['min_floors_height'] != -1]\n", | |
"\n", | |
"    # Remove rows where the building is recorded as having 0 floors.\n", | |
"    df = df.loc[df['building_total_floors'] != 0]\n", | |
"\n", | |
"    # Only 22 records have a parking area of 50 ping or more, so they are\n", | |
"    # removed. Floating-point values are not stored exactly, so the cut-off is\n", | |
"    # written as < 49.5 rather than comparing against 50.0 directly.\n", | |
"    df = df.loc[df['Parking_Area'] < 49.5]\n", | |
"\n", | |
"    # Drop farmhouses and factory/office buildings.\n", | |
"    df = df.loc[df['Building_Types'] < 8]\n", | |
"\n", | |
"    # Drop transactions with a very large transferred area.\n", | |
"    df = df.loc[df['Transfer_Total_Ping'] < 150]\n", | |
"\n", | |
"    # area_m2 is dropped because it appears to carry the same meaning as\n", | |
"    # area_ping (it is not certain whether the two differ slightly).\n", | |
"    # manager is all 0 in the future data, so it is dropped as well.\n", | |
"    # Original note: trading_floor_count can be 0, which is probably not a\n", | |
"    # house transaction.\n", | |
"    df = df.drop(columns=['address', 'area_m2', 'manager', 'Building_Material_stone',\n", | |
"                          'TDATE', 'Total_price', '編號'])\n", | |
"\n", | |
"    # Convert the categorical features' dtype to 'category'.\n", | |
"    category_columns = ['Type', 'Month', 'Month_raw',\n", | |
"                        'room', 'City_Land_Usage', 'Main_Usage_Business',\n", | |
"                        'Building_Material_S', 'Building_Material_R', 'Building_Material_C',\n", | |
"                        'Building_Material_steel', 'Building_Material_B',\n", | |
"                        'Building_Material_W', 'Building_Material_iron',\n", | |
"                        'Building_Material_tile', 'Building_Material_clay',\n", | |
"                        'Building_Material_RC_reinforce',\n", | |
"                        'Parking_Space_Types', 'Building_Types']\n", | |
"    # `df.loc[:, cols] = ...` writes into the existing columns on pandas >= 2.0\n", | |
"    # and leaves their dtype unchanged, so the conversion is done with `astype`,\n", | |
"    # which returns a new frame carrying the requested dtypes.\n", | |
"    return df.astype({column: 'category' for column in category_columns})" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"id": "23fefe16", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def split_features_target(df):\n", | |
"    \"\"\"Separate the ``Unit_Price_Ping`` target from the remaining columns.\n", | |
"\n", | |
"    Returns a ``(X, y)`` tuple: ``X`` is ``df`` without the target column and\n", | |
"    ``y`` is the target column itself.\n", | |
"    \"\"\"\n", | |
"    target_name = 'Unit_Price_Ping'\n", | |
"    features = df.drop(columns=[target_name])\n", | |
"    target = df[target_name]\n", | |
"    return features, target\n", | |
"\n", | |
"def train(model, X_train, y_train):\n", | |
"    \"\"\"Fit ``model`` on ``(X_train, y_train)`` and return that same object.\"\"\"\n", | |
"    model.fit(X_train, y_train)\n", | |
"    return model\n", | |
"\n", | |
"def eval(model, X_test, y_test):\n", | |
"    \"\"\"Print the R2, MAE and MSE of ``model``'s predictions for ``X_test``.\n", | |
"\n", | |
"    The scores compare ``model.predict(X_test)`` against ``y_test``. Nothing is\n", | |
"    returned. Note that this name shadows Python's built-in ``eval``.\n", | |
"    \"\"\"\n", | |
"    from sklearn import metrics\n", | |
"\n", | |
"    predictions = model.predict(X_test)\n", | |
"\n", | |
"    r2 = metrics.r2_score(y_test, predictions)\n", | |
"    print(f'R2 score: {r2}')\n", | |
"\n", | |
"    mae = metrics.mean_absolute_error(y_test, predictions)\n", | |
"    print(f'MAE score: {mae}')\n", | |
"\n", | |
"    mse = metrics.mean_squared_error(y_test, predictions)\n", | |
"    print(f'MSE score: {mse}')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"id": "06f18869", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Load the training and test CSV files into DataFrames.\n", | |
"df_future = pd.read_csv('../temp_future/output_feature/clean_data_future_train.csv')\n", | |
"df_future_test = pd.read_csv('../temp_future/output_feature/clean_data_future_test.csv')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"id": "d42ed451", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Clean into a new frame instead of overwriting `df_future`, so re-running this\n", | |
"# cell does not fail on columns that `clean_and_drop` has already removed.\n", | |
"df_future_clean = clean_and_drop(df_future)\n", | |
"X_train, y_train = split_features_target(df_future_clean)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"id": "b9654b0c", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from sklearn.tree import DecisionTreeRegressor\n", | |
"\n", | |
"# Fix `random_state` so repeated fits build the same tree: scikit-learn permutes\n", | |
"# the features at every split, so ties between equally good splits can otherwise\n", | |
"# be broken differently from run to run. 1207 matches the seed used for LinearSVR.\n", | |
"model = train(DecisionTreeRegressor(max_depth=16, random_state=1207), X_train, y_train)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"id": "ec64058b", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Clean into a new frame instead of overwriting `df_future_test`, so re-running\n", | |
"# this cell does not fail on columns that `clean_and_drop` has already removed.\n", | |
"df_future_test_clean = clean_and_drop(df_future_test)\n", | |
"X_test, y_test = split_features_target(df_future_test_clean)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"id": "8887a76d", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Training performance: \n", | |
"R2 score: 0.9718462454562103\n", | |
"MAE score: 17557.26483457317\n", | |
"MSE score: 874372717.5443403\n", | |
"\n", | |
"Evaluation performance: \n", | |
"R2 score: 0.8604217189337562\n", | |
"MAE score: 45734.755834867094\n", | |
"MSE score: 5186478273.357563\n" | |
] | |
} | |
], | |
"source": [ | |
"# Report the metrics on the training data and on the test data for the\n", | |
"# estimator currently bound to `model`.\n", | |
"print('Training performance: ')\n", | |
"eval(model, X_train, y_train)\n", | |
"print()\n", | |
"print('Evaluation performance: ')\n", | |
"eval(model, X_test, y_test)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"id": "5a3a1e25", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"[LibLinear]..............................\n", | |
"optimization finished, #iter = 300\n", | |
"\n", | |
"WARNING: reaching max number of iterations\n", | |
"Using -s 11 may be faster\n", | |
"\n", | |
"Objective value = -1.268062\n", | |
"nSV = 180434\n", | |
"Training performance: \n", | |
"R2 score: -0.6423872133386352\n", | |
"MAE score: 160428.64156413326\n", | |
"MSE score: 51007710845.57724\n", | |
"\n", | |
"Evaluation performance: \n", | |
"R2 score: -0.39104894557182335\n", | |
"MAE score: 159721.1200107199\n", | |
"MSE score: 51688880807.76079\n" | |
] | |
}, | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"/home/elichen/anaconda3/envs/py310/lib/python3.10/site-packages/sklearn/svm/_base.py:1206: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.\n", | |
" warnings.warn(\n" | |
] | |
} | |
], | |
"source": [ | |
"from sklearn.pipeline import make_pipeline\n", | |
"from sklearn.preprocessing import StandardScaler\n", | |
"from sklearn.svm import LinearSVR\n", | |
"\n", | |
"# Support vector machines are not scale invariant, so the features are\n", | |
"# standardized before fitting. `max_iter` is also raised from 300 to 1000\n", | |
"# (scikit-learn's default) to give liblinear more iterations to converge.\n", | |
"model = make_pipeline(\n", | |
"    StandardScaler(),\n", | |
"    LinearSVR(C=1.0, epsilon=0.0, verbose=True, max_iter=1000, random_state=1207),\n", | |
")\n", | |
"train(model, X_train, y_train)\n", | |
"\n", | |
"print('Training performance: ')\n", | |
"eval(model, X_train, y_train)\n", | |
"print()\n", | |
"print('Evaluation performance: ')\n", | |
"eval(model, X_test, y_test)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "9f399719", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"[LibSVM]......." | |
] | |
} | |
], | |
"source": [ | |
"# Fitting SVR on the full training set is too slow: per the scikit-learn\n", | |
"# documentation, the fit time complexity is more than quadratic with the number\n", | |
"# of samples, which makes it hard to scale to datasets with more than a couple\n", | |
"# of 10000 samples. For large datasets consider using LinearSVR or SGDRegressor\n", | |
"# instead, possibly after a Nystroem transformer.\n", | |
"#\n", | |
"# To let this cell finish, SVR is fitted on a seeded random subsample capped at\n", | |
"# SVR_MAX_TRAIN_ROWS rows; set the cap higher to trade run time for data.\n", | |
"from sklearn.svm import SVR\n", | |
"\n", | |
"SVR_MAX_TRAIN_ROWS = 10_000\n", | |
"\n", | |
"X_train_svr = X_train.sample(n=min(len(X_train), SVR_MAX_TRAIN_ROWS), random_state=1207)\n", | |
"# X_train and y_train come from the same frame, so they share an index and the\n", | |
"# sampled labels can be looked up by label.\n", | |
"y_train_svr = y_train.loc[X_train_svr.index]\n", | |
"\n", | |
"model = SVR(C=1.0, epsilon=0.0, verbose=True)\n", | |
"train(model, X_train_svr, y_train_svr)\n", | |
"\n", | |
"# Training metrics are reported on the rows the model was actually fitted on.\n", | |
"print('Training performance: ')\n", | |
"eval(model, X_train_svr, y_train_svr)\n", | |
"print()\n", | |
"print('Evaluation performance: ')\n", | |
"eval(model, X_test, y_test)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "717e1eb3", | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"interpreter": { | |
"hash": "6bb21fecf6dccdf6d7bdcdc2e427eb532b41f9984a3ac143d01e9b178188a26a" | |
}, | |
"kernelspec": { | |
"display_name": "Python 3 (ipykernel)", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.10.4" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 5 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment