JIElite/EDA_futuer_price_model.ipynb

## EDA_futuer_price_model.ipynb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "d2478df4",
   "metadata": {},
   "outputs": [],
   "source": [
    "%matplotlib inline\n",
    "import pandas as pd\n",
    "from pandas.plotting import scatter_matrix\n",
    "import matplotlib.pyplot as plt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "2039b0c7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Show up to 120 columns and 120 rows when a DataFrame is displayed.\n",
    "for display_option in ('display.max_columns', 'display.max_rows'):\n",
    "    pd.set_option(display_option, 120)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "392e6460",
   "metadata": {},
   "outputs": [],
   "source": [
    "def clean_and_drop(df):\n",
    "    \"\"\"Filter the transaction table to residential records and drop unused columns.\n",
    "\n",
    "    The row filters are applied in order, columns that are constant after\n",
    "    filtering (or otherwise unused) are dropped, and the categorical\n",
    "    features are converted to the pandas 'category' dtype.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    df : pandas.DataFrame\n",
    "        Raw feature table; it is left unmodified.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    pandas.DataFrame\n",
    "        New frame with the filtered rows and the remaining columns.\n",
    "\n",
    "    Raises\n",
    "    ------\n",
    "    KeyError\n",
    "        If an expected column is missing, e.g. when an already cleaned\n",
    "        frame is passed in again.\n",
    "    \"\"\"\n",
    "    # Keep only transactions whose main usage includes residential living.\n",
    "    df = df.loc[df['Main_Usage_Living'] == 1]\n",
    "    df = df.drop(columns=['Main_Usage_Living'])\n",
    "\n",
    "    # These columns are all 0.\n",
    "    df = df.drop(columns=['Non_City_Land_Usage', 'Main_Usage_Walk',\n",
    "                          'Main_Usage_Selling',\n",
    "                          'Main_Usage_SnE'])\n",
    "\n",
    "    # Only 344 records include manufacturing usage and none of them is\n",
    "    # residential, so remove them.\n",
    "    df = df.loc[df['Main_Usage_Manufacturing'] == 0]\n",
    "    df = df.drop(columns=['Main_Usage_Manufacturing'])\n",
    "\n",
    "    # Only 76 records include parking usage and none of them is\n",
    "    # residential, so remove them.\n",
    "    df = df.loc[df['Main_Usage_Parking'] == 0]\n",
    "    df = df.drop(columns=['Main_Usage_Parking'])\n",
    "\n",
    "    # Only 78 records include farm usage and none of them is\n",
    "    # residential, so remove them.\n",
    "    df = df.loc[df['Main_Usage_Farm'] == 0]\n",
    "    df = df.drop(columns=['Main_Usage_Farm'])\n",
    "\n",
    "    # NOTICE: limited budget, so only consider homes with fewer than 6 rooms.\n",
    "    df = df.loc[df['room'] < 6]\n",
    "\n",
    "    # trading_floors_count can be 0, which is probably not a house\n",
    "    # transaction; keep only records where exactly one floor is traded.\n",
    "    df = df.loc[df['trading_floors_count'] == 1]\n",
    "\n",
    "    # 95 samples include a basement, which is too few to generalize from,\n",
    "    # so remove them; afterwards the column is all 0 and can be dropped.\n",
    "    df = df.loc[df['including_basement'] == 0]\n",
    "    df = df.drop(columns=['including_basement'])\n",
    "\n",
    "    # None of the samples includes an arcade/sidewalk, so drop this feature.\n",
    "    df = df.drop(columns=['including_arcade'])\n",
    "\n",
    "    # Remove records whose traded floor height is -1 (one sample originally).\n",
    "    df = df.loc[df['min_floors_height'] != -1]\n",
    "\n",
    "    # Remove records whose building has 0 total floors.\n",
    "    df = df.loc[df['building_total_floors'] != 0]\n",
    "\n",
    "    # Only 22 records have a parking area of 50 ping or more, so remove them.\n",
    "    # Floats are stored inexactly, so compare against a threshold instead\n",
    "    # of testing == 50.0.\n",
    "    df = df.loc[df['Parking_Area'] < 49.5]\n",
    "\n",
    "    # Remove farmhouses and factory offices.\n",
    "    df = df.loc[df['Building_Types'] < 8]\n",
    "\n",
    "    # Remove records with a very large transferred area.\n",
    "    df = df.loc[df['Transfer_Total_Ping'] < 150]\n",
    "\n",
    "    # area_m2 is dropped because it seems to carry the same meaning as\n",
    "    # area_ping (not verified that the two never differ). manager is always 0\n",
    "    # in the future data, so it is dropped as well.\n",
    "    df = df.drop(columns=['address', 'area_m2', 'manager', 'Building_Material_stone',\n",
    "                          'TDATE', 'Total_price', '編號'])\n",
    "\n",
    "    # Convert the categorical features' dtype to 'category'.\n",
    "    category_columns = ['Type', 'Month', 'Month_raw',\n",
    "                        'room', 'City_Land_Usage', 'Main_Usage_Business',\n",
    "                        'Building_Material_S', 'Building_Material_R', 'Building_Material_C',\n",
    "                        'Building_Material_steel', 'Building_Material_B',\n",
    "                        'Building_Material_W', 'Building_Material_iron',\n",
    "                        'Building_Material_tile', 'Building_Material_clay',\n",
    "                        'Building_Material_RC_reinforce',\n",
    "                        'Parking_Space_Types', 'Building_Types']\n",
    "    # astype with a column -> dtype mapping returns a new frame with the\n",
    "    # requested dtypes. Assigning through df.loc[:, cols] can instead write\n",
    "    # the values into the existing columns and keep their old dtype,\n",
    "    # depending on the pandas version.\n",
    "    return df.astype({column: 'category' for column in category_columns})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "23fefe16",
   "metadata": {},
   "outputs": [],
   "source": [
    "def split_features_target(df):\n",
    "    \"\"\"Return (features, target), where target is the 'Unit_Price_Ping' column.\n",
    "\n",
    "    The features are every other column of `df`; `df` itself is not changed.\n",
    "    \"\"\"\n",
    "    features = df.drop(columns=['Unit_Price_Ping'])\n",
    "    target = df['Unit_Price_Ping']\n",
    "    return features, target\n",
    "\n",
    "\n",
    "def train(model, X_train, y_train):\n",
    "    \"\"\"Call `model.fit(X_train, y_train)` and return the same model object.\"\"\"\n",
    "    model.fit(X_train, y_train)\n",
    "    return model\n",
    "\n",
    "\n",
    "def eval(model, X_test, y_test):\n",
    "    \"\"\"Print the R2, MAE and MSE of `model`'s predictions on the given data.\n",
    "\n",
    "    Nothing is returned. Note that this name shadows the builtin `eval`\n",
    "    inside the notebook.\n",
    "    \"\"\"\n",
    "    from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error\n",
    "\n",
    "    predicted = model.predict(X_test)\n",
    "\n",
    "    r2 = r2_score(y_test, predicted)\n",
    "    print(f'R2 score: {r2}')\n",
    "\n",
    "    mae = mean_absolute_error(y_test, predicted)\n",
    "    print(f'MAE score: {mae}')\n",
    "\n",
    "    mse = mean_squared_error(y_test, predicted)\n",
    "    print(f'MSE score: {mse}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "06f18869",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the train and test CSVs into DataFrames (paths are relative to the working directory).\n",
    "df_future, df_future_test = (\n",
    "    pd.read_csv(csv_path)\n",
    "    for csv_path in (\n",
    "        '../temp_future/output_feature/clean_data_future_train.csv',\n",
    "        '../temp_future/output_feature/clean_data_future_test.csv',\n",
    "    )\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "d42ed451",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep the raw frame in `df_future` and store the cleaned result under a new\n",
    "# name so this cell can be re-run: feeding an already cleaned frame back into\n",
    "# clean_and_drop raises KeyError because the filter columns are gone.\n",
    "df_future_clean = clean_and_drop(df_future)\n",
    "X_train, y_train = split_features_target(df_future_clean)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "b9654b0c",
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.tree import DecisionTreeRegressor\n",
    "\n",
    "# Fix random_state so the fitted tree is reproducible from run to run\n",
    "# (same seed as the LinearSVR model in this notebook).\n",
    "model = train(DecisionTreeRegressor(max_depth=16, random_state=1207), X_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "ec64058b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep the raw frame in `df_future_test` and store the cleaned result under a\n",
    "# new name so this cell can be re-run: feeding an already cleaned frame back\n",
    "# into clean_and_drop raises KeyError because the filter columns are gone.\n",
    "df_future_test_clean = clean_and_drop(df_future_test)\n",
    "X_test, y_test = split_features_target(df_future_test_clean)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "8887a76d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Training performance: \n",
      "R2 score: 0.9718462454562103\n",
      "MAE score: 17557.26483457317\n",
      "MSE score: 874372717.5443403\n",
      "\n",
      "Evaluation performance: \n",
      "R2 score: 0.8604217189337562\n",
      "MAE score: 45734.755834867094\n",
      "MSE score: 5186478273.357563\n"
     ]
    }
   ],
   "source": [
    "# Print the metrics for the training data first, then for the test data,\n",
    "# separated by one blank line.\n",
    "report_sections = (\n",
    "    ('Training performance: ', X_train, y_train),\n",
    "    ('Evaluation performance: ', X_test, y_test),\n",
    ")\n",
    "for section_no, (heading, features, target) in enumerate(report_sections):\n",
    "    if section_no > 0:\n",
    "        print()\n",
    "    print(heading)\n",
    "    eval(model, features, target)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5a3a1e25",
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.pipeline import make_pipeline\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "from sklearn.svm import LinearSVR\n",
    "\n",
    "# The previous run on the raw (unscaled) features stopped at max_iter with a\n",
    "# ConvergenceWarning and scored R2 = -0.64 (train) / -0.39 (test).\n",
    "# SVMs are not scale invariant, so standardize the features first. The scaler\n",
    "# is part of the pipeline, so it is fitted on the training data only.\n",
    "# If the ConvergenceWarning persists, raise max_iter.\n",
    "model = make_pipeline(\n",
    "    StandardScaler(),\n",
    "    LinearSVR(C=1.0, epsilon=0.0, verbose=True, max_iter=300, random_state=1207),\n",
    ")\n",
    "train(model, X_train, y_train)\n",
    "\n",
    "print('Training performance: ')\n",
    "eval(model, X_train, y_train)\n",
    "print()\n",
    "print('Evaluation performance: ')\n",
    "eval(model, X_test, y_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9f399719",
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.pipeline import make_pipeline\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "from sklearn.svm import SVR\n",
    "\n",
    "# Kernel SVR is too slow on the full training set: per the scikit-learn docs\n",
    "# the fit time complexity is more than quadratic in the number of samples,\n",
    "# which makes it hard to scale beyond a couple of 10000 samples, and the\n",
    "# saved run of this cell was left unfinished. Fit on a fixed-seed subsample\n",
    "# so that Restart & Run All completes. For the full data consider LinearSVR\n",
    "# or SGDRegressor instead, possibly after a Nystroem transformer.\n",
    "SVR_MAX_SAMPLES = 10_000\n",
    "X_svr = X_train.sample(n=min(SVR_MAX_SAMPLES, len(X_train)), random_state=1207)\n",
    "y_svr = y_train.loc[X_svr.index]\n",
    "\n",
    "# SVMs are not scale invariant, so standardize the features first.\n",
    "model = make_pipeline(StandardScaler(), SVR(C=1.0, epsilon=0.0, verbose=True))\n",
    "train(model, X_svr, y_svr)\n",
    "\n",
    "# Training metrics are computed on the subsample the model was fitted on.\n",
    "print('Training performance: ')\n",
    "eval(model, X_svr, y_svr)\n",
    "print()\n",
    "print('Evaluation performance: ')\n",
    "eval(model, X_test, y_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "717e1eb3",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "interpreter": {
   "hash": "6bb21fecf6dccdf6d7bdcdc2e427eb532b41f9984a3ac143d01e9b178188a26a"
  },
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 1,
	"id": "d2478df4",
	"metadata": {},
	"outputs": [],
	"source": [
	"%matplotlib inline\n",
	"import pandas as pd\n",
	"from pandas.plotting import scatter_matrix\n",
	"import matplotlib.pyplot as plt"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"id": "2039b0c7",
	"metadata": {},
	"outputs": [],
	"source": [
	"# Show up to 120 columns and 120 rows when a DataFrame is displayed.\n",
	"for display_option in ('display.max_columns', 'display.max_rows'):\n",
	"    pd.set_option(display_option, 120)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"id": "392e6460",
	"metadata": {},
	"outputs": [],
	"source": [
	"def clean_and_drop(df):\n",
	"    \"\"\"Filter the transaction table to residential records and drop unused columns.\n",
	"\n",
	"    The row filters are applied in order, columns that are constant after\n",
	"    filtering (or otherwise unused) are dropped, and the categorical\n",
	"    features are converted to the pandas 'category' dtype.\n",
	"\n",
	"    Parameters\n",
	"    ----------\n",
	"    df : pandas.DataFrame\n",
	"        Raw feature table; it is left unmodified.\n",
	"\n",
	"    Returns\n",
	"    -------\n",
	"    pandas.DataFrame\n",
	"        New frame with the filtered rows and the remaining columns.\n",
	"\n",
	"    Raises\n",
	"    ------\n",
	"    KeyError\n",
	"        If an expected column is missing, e.g. when an already cleaned\n",
	"        frame is passed in again.\n",
	"    \"\"\"\n",
	"    # Keep only transactions whose main usage includes residential living.\n",
	"    df = df.loc[df['Main_Usage_Living'] == 1]\n",
	"    df = df.drop(columns=['Main_Usage_Living'])\n",
	"\n",
	"    # These columns are all 0.\n",
	"    df = df.drop(columns=['Non_City_Land_Usage', 'Main_Usage_Walk',\n",
	"                          'Main_Usage_Selling',\n",
	"                          'Main_Usage_SnE'])\n",
	"\n",
	"    # Only 344 records include manufacturing usage and none of them is\n",
	"    # residential, so remove them.\n",
	"    df = df.loc[df['Main_Usage_Manufacturing'] == 0]\n",
	"    df = df.drop(columns=['Main_Usage_Manufacturing'])\n",
	"\n",
	"    # Only 76 records include parking usage and none of them is\n",
	"    # residential, so remove them.\n",
	"    df = df.loc[df['Main_Usage_Parking'] == 0]\n",
	"    df = df.drop(columns=['Main_Usage_Parking'])\n",
	"\n",
	"    # Only 78 records include farm usage and none of them is\n",
	"    # residential, so remove them.\n",
	"    df = df.loc[df['Main_Usage_Farm'] == 0]\n",
	"    df = df.drop(columns=['Main_Usage_Farm'])\n",
	"\n",
	"    # NOTICE: limited budget, so only consider homes with fewer than 6 rooms.\n",
	"    df = df.loc[df['room'] < 6]\n",
	"\n",
	"    # trading_floors_count can be 0, which is probably not a house\n",
	"    # transaction; keep only records where exactly one floor is traded.\n",
	"    df = df.loc[df['trading_floors_count'] == 1]\n",
	"\n",
	"    # 95 samples include a basement, which is too few to generalize from,\n",
	"    # so remove them; afterwards the column is all 0 and can be dropped.\n",
	"    df = df.loc[df['including_basement'] == 0]\n",
	"    df = df.drop(columns=['including_basement'])\n",
	"\n",
	"    # None of the samples includes an arcade/sidewalk, so drop this feature.\n",
	"    df = df.drop(columns=['including_arcade'])\n",
	"\n",
	"    # Remove records whose traded floor height is -1 (one sample originally).\n",
	"    df = df.loc[df['min_floors_height'] != -1]\n",
	"\n",
	"    # Remove records whose building has 0 total floors.\n",
	"    df = df.loc[df['building_total_floors'] != 0]\n",
	"\n",
	"    # Only 22 records have a parking area of 50 ping or more, so remove them.\n",
	"    # Floats are stored inexactly, so compare against a threshold instead\n",
	"    # of testing == 50.0.\n",
	"    df = df.loc[df['Parking_Area'] < 49.5]\n",
	"\n",
	"    # Remove farmhouses and factory offices.\n",
	"    df = df.loc[df['Building_Types'] < 8]\n",
	"\n",
	"    # Remove records with a very large transferred area.\n",
	"    df = df.loc[df['Transfer_Total_Ping'] < 150]\n",
	"\n",
	"    # area_m2 is dropped because it seems to carry the same meaning as\n",
	"    # area_ping (not verified that the two never differ). manager is always 0\n",
	"    # in the future data, so it is dropped as well.\n",
	"    df = df.drop(columns=['address', 'area_m2', 'manager', 'Building_Material_stone',\n",
	"                          'TDATE', 'Total_price', '編號'])\n",
	"\n",
	"    # Convert the categorical features' dtype to 'category'.\n",
	"    category_columns = ['Type', 'Month', 'Month_raw',\n",
	"                        'room', 'City_Land_Usage', 'Main_Usage_Business',\n",
	"                        'Building_Material_S', 'Building_Material_R', 'Building_Material_C',\n",
	"                        'Building_Material_steel', 'Building_Material_B',\n",
	"                        'Building_Material_W', 'Building_Material_iron',\n",
	"                        'Building_Material_tile', 'Building_Material_clay',\n",
	"                        'Building_Material_RC_reinforce',\n",
	"                        'Parking_Space_Types', 'Building_Types']\n",
	"    # astype with a column -> dtype mapping returns a new frame with the\n",
	"    # requested dtypes. Assigning through df.loc[:, cols] can instead write\n",
	"    # the values into the existing columns and keep their old dtype,\n",
	"    # depending on the pandas version.\n",
	"    return df.astype({column: 'category' for column in category_columns})"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"id": "23fefe16",
	"metadata": {},
	"outputs": [],
	"source": [
	"def split_features_target(df):\n",
	"    \"\"\"Return (features, target), where target is the 'Unit_Price_Ping' column.\n",
	"\n",
	"    The features are every other column of `df`; `df` itself is not changed.\n",
	"    \"\"\"\n",
	"    features = df.drop(columns=['Unit_Price_Ping'])\n",
	"    target = df['Unit_Price_Ping']\n",
	"    return features, target\n",
	"\n",
	"\n",
	"def train(model, X_train, y_train):\n",
	"    \"\"\"Call `model.fit(X_train, y_train)` and return the same model object.\"\"\"\n",
	"    model.fit(X_train, y_train)\n",
	"    return model\n",
	"\n",
	"\n",
	"def eval(model, X_test, y_test):\n",
	"    \"\"\"Print the R2, MAE and MSE of `model`'s predictions on the given data.\n",
	"\n",
	"    Nothing is returned. Note that this name shadows the builtin `eval`\n",
	"    inside the notebook.\n",
	"    \"\"\"\n",
	"    from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error\n",
	"\n",
	"    predicted = model.predict(X_test)\n",
	"\n",
	"    r2 = r2_score(y_test, predicted)\n",
	"    print(f'R2 score: {r2}')\n",
	"\n",
	"    mae = mean_absolute_error(y_test, predicted)\n",
	"    print(f'MAE score: {mae}')\n",
	"\n",
	"    mse = mean_squared_error(y_test, predicted)\n",
	"    print(f'MSE score: {mse}')"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"id": "06f18869",
	"metadata": {},
	"outputs": [],
	"source": [
	"# Load the train and test CSVs into DataFrames (paths are relative to the working directory).\n",
	"df_future, df_future_test = (\n",
	"    pd.read_csv(csv_path)\n",
	"    for csv_path in (\n",
	"        '../temp_future/output_feature/clean_data_future_train.csv',\n",
	"        '../temp_future/output_feature/clean_data_future_test.csv',\n",
	"    )\n",
	")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"id": "d42ed451",
	"metadata": {},
	"outputs": [],
	"source": [
	"# Keep the raw frame in `df_future` and store the cleaned result under a new\n",
	"# name so this cell can be re-run: feeding an already cleaned frame back into\n",
	"# clean_and_drop raises KeyError because the filter columns are gone.\n",
	"df_future_clean = clean_and_drop(df_future)\n",
	"X_train, y_train = split_features_target(df_future_clean)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 7,
	"id": "b9654b0c",
	"metadata": {},
	"outputs": [],
	"source": [
	"from sklearn.tree import DecisionTreeRegressor\n",
	"\n",
	"# Fix random_state so the fitted tree is reproducible from run to run\n",
	"# (same seed as the LinearSVR model in this notebook).\n",
	"model = train(DecisionTreeRegressor(max_depth=16, random_state=1207), X_train, y_train)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 8,
	"id": "ec64058b",
	"metadata": {},
	"outputs": [],
	"source": [
	"# Keep the raw frame in `df_future_test` and store the cleaned result under a\n",
	"# new name so this cell can be re-run: feeding an already cleaned frame back\n",
	"# into clean_and_drop raises KeyError because the filter columns are gone.\n",
	"df_future_test_clean = clean_and_drop(df_future_test)\n",
	"X_test, y_test = split_features_target(df_future_test_clean)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 9,
	"id": "8887a76d",
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Training performance: \n",
	"R2 score: 0.9718462454562103\n",
	"MAE score: 17557.26483457317\n",
	"MSE score: 874372717.5443403\n",
	"\n",
	"Evaluation performance: \n",
	"R2 score: 0.8604217189337562\n",
	"MAE score: 45734.755834867094\n",
	"MSE score: 5186478273.357563\n"
	]
	}
	],
	"source": [
	"# Print the metrics for the training data first, then for the test data,\n",
	"# separated by one blank line.\n",
	"report_sections = (\n",
	"    ('Training performance: ', X_train, y_train),\n",
	"    ('Evaluation performance: ', X_test, y_test),\n",
	")\n",
	"for section_no, (heading, features, target) in enumerate(report_sections):\n",
	"    if section_no > 0:\n",
	"        print()\n",
	"    print(heading)\n",
	"    eval(model, features, target)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "5a3a1e25",
	"metadata": {},
	"outputs": [],
	"source": [
	"from sklearn.pipeline import make_pipeline\n",
	"from sklearn.preprocessing import StandardScaler\n",
	"from sklearn.svm import LinearSVR\n",
	"\n",
	"# The previous run on the raw (unscaled) features stopped at max_iter with a\n",
	"# ConvergenceWarning and scored R2 = -0.64 (train) / -0.39 (test).\n",
	"# SVMs are not scale invariant, so standardize the features first. The scaler\n",
	"# is part of the pipeline, so it is fitted on the training data only.\n",
	"# If the ConvergenceWarning persists, raise max_iter.\n",
	"model = make_pipeline(\n",
	"    StandardScaler(),\n",
	"    LinearSVR(C=1.0, epsilon=0.0, verbose=True, max_iter=300, random_state=1207),\n",
	")\n",
	"train(model, X_train, y_train)\n",
	"\n",
	"print('Training performance: ')\n",
	"eval(model, X_train, y_train)\n",
	"print()\n",
	"print('Evaluation performance: ')\n",
	"eval(model, X_test, y_test)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "9f399719",
	"metadata": {},
	"outputs": [],
	"source": [
	"from sklearn.pipeline import make_pipeline\n",
	"from sklearn.preprocessing import StandardScaler\n",
	"from sklearn.svm import SVR\n",
	"\n",
	"# Kernel SVR is too slow on the full training set: per the scikit-learn docs\n",
	"# the fit time complexity is more than quadratic in the number of samples,\n",
	"# which makes it hard to scale beyond a couple of 10000 samples, and the\n",
	"# saved run of this cell was left unfinished. Fit on a fixed-seed subsample\n",
	"# so that Restart & Run All completes. For the full data consider LinearSVR\n",
	"# or SGDRegressor instead, possibly after a Nystroem transformer.\n",
	"SVR_MAX_SAMPLES = 10_000\n",
	"X_svr = X_train.sample(n=min(SVR_MAX_SAMPLES, len(X_train)), random_state=1207)\n",
	"y_svr = y_train.loc[X_svr.index]\n",
	"\n",
	"# SVMs are not scale invariant, so standardize the features first.\n",
	"model = make_pipeline(StandardScaler(), SVR(C=1.0, epsilon=0.0, verbose=True))\n",
	"train(model, X_svr, y_svr)\n",
	"\n",
	"# Training metrics are computed on the subsample the model was fitted on.\n",
	"print('Training performance: ')\n",
	"eval(model, X_svr, y_svr)\n",
	"print()\n",
	"print('Evaluation performance: ')\n",
	"eval(model, X_test, y_test)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "717e1eb3",
	"metadata": {},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"interpreter": {
	"hash": "6bb21fecf6dccdf6d7bdcdc2e427eb532b41f9984a3ac143d01e9b178188a26a"
	},
	"kernelspec": {
	"display_name": "Python 3 (ipykernel)",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.10.4"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 5
	}