Skip to content

Instantly share code, notes, and snippets.

@JIElite
Created May 29, 2022 13:21
Show Gist options
  • Save JIElite/f021e43d5775647bdf60a6e309682187 to your computer and use it in GitHub Desktop.
Save JIElite/f021e43d5775647bdf60a6e309682187 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "d2478df4",
"metadata": {},
"outputs": [],
"source": [
"%matplotlib inline\n",
"import pandas as pd\n",
"from pandas.plotting import scatter_matrix\n",
"import matplotlib.pyplot as plt"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "2039b0c7",
"metadata": {},
"outputs": [],
"source": [
"# Show up to 120 columns and 120 rows when a DataFrame is displayed.\n",
"pd.options.display.max_columns = 120\n",
"pd.options.display.max_rows = 120"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "392e6460",
"metadata": {},
"outputs": [],
"source": [
"def clean_and_drop(df):\n",
"    \"\"\"Filter the transaction frame and drop columns that are not kept.\n",
"\n",
"    Rows must pass every filter below to be kept. Filter-only, constant and\n",
"    otherwise unwanted columns are dropped, and the categorical feature\n",
"    columns are converted to the pandas 'category' dtype.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    df : pandas.DataFrame\n",
"        Must contain every column referenced below; a missing column\n",
"        raises ``KeyError``.\n",
"\n",
"    Returns\n",
"    -------\n",
"    pandas.DataFrame\n",
"        A new frame; ``df`` itself is left unmodified.\n",
"    \"\"\"\n",
"    # Keep only transactions whose main usage includes residential ('住') use.\n",
"    df = df.loc[df['Main_Usage_Living'] == 1]\n",
"    df = df.drop(columns=['Main_Usage_Living'])\n",
"\n",
"    # Per the original notes, these columns are all 0.\n",
"    df = df.drop(columns=['Non_City_Land_Usage', 'Main_Usage_Walk',\n",
"                          'Main_Usage_Selling',\n",
"                          'Main_Usage_SnE'])\n",
"\n",
"    # Per the original notes, only 344 rows include factory usage and none\n",
"    # of them are residential, so they are removed.\n",
"    df = df.loc[df['Main_Usage_Manufacturing'] == 0]\n",
"    df = df.drop(columns=['Main_Usage_Manufacturing'])\n",
"\n",
"    # Per the original notes, only 76 rows include parking usage and none\n",
"    # of them are residential, so they are removed.\n",
"    df = df.loc[df['Main_Usage_Parking'] == 0]\n",
"    df = df.drop(columns=['Main_Usage_Parking'])\n",
"\n",
"    # Per the original notes, only 78 rows include farm usage and none of\n",
"    # them are residential, so they are removed.\n",
"    df = df.loc[df['Main_Usage_Farm'] == 0]\n",
"    df = df.drop(columns=['Main_Usage_Farm'])\n",
"\n",
"    # NOTE: budget constraint - only homes with fewer than 6 rooms are kept.\n",
"    # NOTE(review): the original comment read '6 rooms or fewer', but the\n",
"    # filter excludes exactly 6 rooms; confirm which one is intended.\n",
"    df = df.loc[df['room'] < 6]\n",
"\n",
"    # Keep transactions covering exactly one floor. The original notes say\n",
"    # a count of 0 is probably not a housing transaction.\n",
"    df = df.loc[df['trading_floors_count'] == 1]\n",
"\n",
"    # Per the original notes, 95 samples include a basement, which is too\n",
"    # few to generalize from. They are removed; the column is then all 0,\n",
"    # so it is dropped as well.\n",
"    df = df.loc[df['including_basement'] == 0]\n",
"    df = df.drop(columns=['including_basement'])\n",
"\n",
"    # Per the original notes, no sample includes this feature, so the\n",
"    # column is dropped. NOTE(review): the original comment said 'sidewalk'\n",
"    # although the column is named 'including_arcade'.\n",
"    df = df.drop(columns=['including_arcade'])\n",
"\n",
"    # Remove rows whose floor height is -1 (originally one such sample).\n",
"    df = df.loc[df['min_floors_height'] != -1]\n",
"\n",
"    # Remove rows whose building has 0 floors in total.\n",
"    df = df.loc[df['building_total_floors'] != 0]\n",
"\n",
"    # Per the original notes, only 22 rows have a parking area of 50 ping\n",
"    # or more, so they are removed. Floating-point values are stored\n",
"    # inexactly, so the filter avoids an equality test against 50.0.\n",
"    df = df.loc[df['Parking_Area'] < 49.5]\n",
"\n",
"    # Remove farmhouses and factory/office buildings (per the original\n",
"    # notes; the meaning of the type codes is not visible here).\n",
"    df = df.loc[df['Building_Types'] < 8]\n",
"\n",
"    # Remove transactions with a very large transferred area.\n",
"    df = df.loc[df['Transfer_Total_Ping'] < 150]\n",
"\n",
"    # area_m2 is dropped because it seems to mean the same as area_ping\n",
"    # (the original author was unsure whether they differ slightly).\n",
"    # manager is dropped because it is always 0 in the future data.\n",
"    df = df.drop(columns=['address', 'area_m2', 'manager', 'Building_Material_stone',\n",
"                          'TDATE', 'Total_price', '編號'])\n",
"\n",
"    # Convert the categorical features' dtype to 'category'.\n",
"    category_columns = ['Type', 'Month', 'Month_raw',\n",
"                        'room', 'City_Land_Usage', 'Main_Usage_Business',\n",
"                        'Building_Material_S', 'Building_Material_R', 'Building_Material_C',\n",
"                        'Building_Material_steel', 'Building_Material_B',\n",
"                        'Building_Material_W', 'Building_Material_iron',\n",
"                        'Building_Material_tile', 'Building_Material_clay',\n",
"                        'Building_Material_RC_reinforce',\n",
"                        'Parking_Space_Types', 'Building_Types']\n",
"    # Use DataFrame.astype, which returns a new frame with the requested\n",
"    # dtypes. Assigning the converted columns back through\n",
"    # df.loc[:, category_columns] writes the values into the existing\n",
"    # columns and leaves their dtype unchanged.\n",
"    return df.astype(dict.fromkeys(category_columns, 'category'))"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "23fefe16",
"metadata": {},
"outputs": [],
"source": [
"def split_features_target(df):\n",
"    \"\"\"Separate the regression target from the feature columns.\n",
"\n",
"    Returns an ``(X, y)`` tuple: ``X`` is ``df`` without the\n",
"    'Unit_Price_Ping' column and ``y`` is that column.\n",
"    \"\"\"\n",
"    target_column = 'Unit_Price_Ping'\n",
"    features = df.drop(columns=[target_column])\n",
"    target = df[target_column]\n",
"    return features, target\n",
"\n",
"def train(model, X_train, y_train):\n",
"    \"\"\"Fit ``model`` on ``(X_train, y_train)`` and hand the same object back.\n",
"\n",
"    The return value of ``model.fit`` is ignored; ``model`` itself is returned.\n",
"    \"\"\"\n",
"    model.fit(X_train, y_train)\n",
"    return model\n",
"\n",
"def eval(model, X_test, y_test):\n",
"    \"\"\"Print the R2, MAE and MSE of ``model``'s predictions on ``X_test``.\n",
"\n",
"    Nothing is returned; the three scores are written to stdout.\n",
"\n",
"    NOTE(review): the name shadows the builtin ``eval``; it is kept because\n",
"    other cells call the helper by this name.\n",
"    \"\"\"\n",
"    # Imported lazily so scikit-learn is only needed once scoring happens.\n",
"    from sklearn import metrics\n",
"\n",
"    y_pred = model.predict(X_test)\n",
"\n",
"    r2 = metrics.r2_score(y_test, y_pred)\n",
"    print(f'R2 score: {r2}')\n",
"\n",
"    mae = metrics.mean_absolute_error(y_test, y_pred)\n",
"    print(f'MAE score: {mae}')\n",
"\n",
"    mse = metrics.mean_squared_error(y_test, y_pred)\n",
"    print(f'MSE score: {mse}')"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "06f18869",
"metadata": {},
"outputs": [],
"source": [
"# Load the pre-cleaned training and test splits from CSV.\n",
"df_future = pd.read_csv(\n",
"    filepath_or_buffer='../temp_future/output_feature/clean_data_future_train.csv'\n",
")\n",
"df_future_test = pd.read_csv(\n",
"    filepath_or_buffer='../temp_future/output_feature/clean_data_future_test.csv'\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "d42ed451",
"metadata": {},
"outputs": [],
"source": [
"# Clean the training frame (rebinding df_future), then split off the target.\n",
"df_future = df_future.pipe(clean_and_drop)\n",
"X_train, y_train = df_future.pipe(split_features_target)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "b9654b0c",
"metadata": {},
"outputs": [],
"source": [
"from sklearn.tree import DecisionTreeRegressor\n",
"\n",
"# Fit a regression tree capped at depth 16 on the cleaned training data.\n",
"model = train(\n",
"    model=DecisionTreeRegressor(max_depth=16),\n",
"    X_train=X_train,\n",
"    y_train=y_train,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "ec64058b",
"metadata": {},
"outputs": [],
"source": [
"# Apply the same cleaning to the test frame (rebinding df_future_test),\n",
"# then split off the target.\n",
"df_future_test = df_future_test.pipe(clean_and_drop)\n",
"X_test, y_test = df_future_test.pipe(split_features_target)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "8887a76d",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training performance: \n",
"R2 score: 0.9718462454562103\n",
"MAE score: 17557.26483457317\n",
"MSE score: 874372717.5443403\n",
"\n",
"Evaluation performance: \n",
"R2 score: 0.8604217189337562\n",
"MAE score: 45734.755834867094\n",
"MSE score: 5186478273.357563\n"
]
}
],
"source": [
"# eval here is this notebook's scoring helper, not the builtin.\n",
"print('Training performance: ')\n",
"eval(model=model, X_test=X_train, y_test=y_train)\n",
"print()\n",
"print('Evaluation performance: ')\n",
"eval(model=model, X_test=X_test, y_test=y_test)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "5a3a1e25",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[LibLinear]..............................\n",
"optimization finished, #iter = 300\n",
"\n",
"WARNING: reaching max number of iterations\n",
"Using -s 11 may be faster\n",
"\n",
"Objective value = -1.268062\n",
"nSV = 180434\n",
"Training performance: \n",
"R2 score: -0.6423872133386352\n",
"MAE score: 160428.64156413326\n",
"MSE score: 51007710845.57724\n",
"\n",
"Evaluation performance: \n",
"R2 score: -0.39104894557182335\n",
"MAE score: 159721.1200107199\n",
"MSE score: 51688880807.76079\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/elichen/anaconda3/envs/py310/lib/python3.10/site-packages/sklearn/svm/_base.py:1206: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.\n",
" warnings.warn(\n"
]
}
],
"source": [
"from sklearn.svm import LinearSVR\n",
"\n",
"# NOTE(review): the recorded run stopped at max_iter=300 with a\n",
"# ConvergenceWarning and a negative R2 on both splits.\n",
"model = LinearSVR(\n",
"    C=1.0,\n",
"    epsilon=0.0,\n",
"    verbose=True,\n",
"    max_iter=300,\n",
"    random_state=1207,\n",
")\n",
"train(model=model, X_train=X_train, y_train=y_train)\n",
"\n",
"# eval here is this notebook's scoring helper, not the builtin.\n",
"print('Training performance: ')\n",
"eval(model=model, X_test=X_train, y_test=y_train)\n",
"print()\n",
"print('Evaluation performance: ')\n",
"eval(model=model, X_test=X_test, y_test=y_test)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9f399719",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[LibSVM]......."
]
}
],
"source": [
"# Too slow on this dataset. Per the scikit-learn SVR documentation, the\n",
"# fit time complexity is more than quadratic in the number of samples,\n",
"# which makes it hard to scale beyond a few tens of thousands of samples.\n",
"# For large datasets the documentation suggests LinearSVR or SGDRegressor\n",
"# instead, possibly after a Nystroem transformer.\n",
"from sklearn.svm import SVR\n",
"\n",
"model = SVR(\n",
"    C=1.0,\n",
"    epsilon=0.0,\n",
"    verbose=True,\n",
")\n",
"train(model=model, X_train=X_train, y_train=y_train)\n",
"\n",
"# eval here is this notebook's scoring helper, not the builtin.\n",
"print('Training performance: ')\n",
"eval(model=model, X_test=X_train, y_test=y_train)\n",
"print()\n",
"print('Evaluation performance: ')\n",
"eval(model=model, X_test=X_test, y_test=y_test)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "717e1eb3",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"interpreter": {
"hash": "6bb21fecf6dccdf6d7bdcdc2e427eb532b41f9984a3ac143d01e9b178188a26a"
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.4"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment