Skip to content

Instantly share code, notes, and snippets.

@JIElite
Last active May 15, 2022 11:02
Show Gist options
  • Save JIElite/c8ac24bf3c16bb3d264e6b314ffe4745 to your computer and use it in GitHub Desktop.
Save JIElite/c8ac24bf3c16bb3d264e6b314ffe4745 to your computer and use it in GitHub Desktop.
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"id": "72f6f12b",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "948d300e",
"metadata": {},
"outputs": [],
"source": [
"def clean_invalid(dataframe):\n",
" \"\"\" Clean invalid data\n",
"\n",
" *** This is a prerequisite of the following code cells ***\n",
" We transform the features based on the clean data\n",
" -------------------------------------------------------\n",
" Question: 是不是因為清完資料,都市住宅分區的 unique 才會變少?Yes\n",
" \"\"\"\n",
" dataframe = dataframe[(dataframe['交易標的']!='土地') & (~dataframe['交易標的'].isna())]\n",
" dataframe['Month'] = dataframe['交易年月日'].str[:-2].astype('float')\n",
" dataframe = dataframe.query(\"Month>=10601 and Month<=11003 \")\n",
" dataframe = dataframe[(dataframe.Month!=10600) & (dataframe.Month!=10700) & \\\n",
" (dataframe.Month!=10800) & (dataframe.Month!=10900) & (dataframe.Month!=11000)]\n",
"\n",
" return dataframe"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "cc52d1ea",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_866734/77206321.py:10: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" dataframe['Month'] = dataframe['交易年月日'].str[:-2].astype('float')\n"
]
}
],
"source": [
"sale_data = pd.read_csv('./sale_data.csv', dtype=str)\n",
"sale_data = clean_invalid(sale_data)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "26af0774",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['鄉鎮市區', '交易標的', '土地位置建物門牌', '土地移轉總面積平方公尺', '都市土地使用分區', '非都市土地使用分區',\n",
" '非都市土地使用編定', '交易年月日', '交易筆棟數', '移轉層次', '總樓層數', '建物型態', '主要用途', '主要建材',\n",
" '建築完成年月', '建物移轉總面積平方公尺', '建物現況格局-房', '建物現況格局-廳', '建物現況格局-衛',\n",
" '建物現況格局-隔間', '有無管理組織', '總價元', '單價元平方公尺', '車位類別', '車位移轉總面積(平方公尺)',\n",
" '車位總價元', '備註', '編號', '主建物面積', '附屬建物面積', '陽台面積', '電梯', '移轉編號', 'Month'],\n",
" dtype='object')"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sale_data.columns"
]
},
{
"cell_type": "markdown",
"id": "faf59b48",
"metadata": {},
"source": [
"## 交易標的中 nan 的值是否跟交易的年月有關?\n",
"無,因為各個年月都有"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "b63b37cd",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Q: 有多少交易標的中,電梯的欄位是 nan?\n",
"房地(土地+建物) 451245\n",
"車位 16429\n",
"房地(土地+建物)+車位 296317\n",
"建物 5393\n",
"--------------------\n",
"Q: 這些電梯欄位是 nan 的資料是分佈在哪些年月?\n",
"1 10601.0\n",
"2 10602.0\n",
"3 10603.0\n",
"4 10604.0\n",
"5 10605.0\n",
"6 10606.0\n",
"7 10607.0\n",
"8 10608.0\n",
"9 10609.0\n",
"10 10610.0\n",
"11 10611.0\n",
"12 10612.0\n",
"13 10701.0\n",
"14 10702.0\n",
"15 10707.0\n",
"16 10711.0\n",
"17 10703.0\n",
"18 10704.0\n",
"19 10705.0\n",
"20 10706.0\n",
"21 10708.0\n",
"22 10805.0\n",
"23 10709.0\n",
"24 10710.0\n",
"25 10712.0\n",
"26 10801.0\n",
"27 10804.0\n",
"28 10802.0\n",
"29 10803.0\n",
"30 10806.0\n",
"31 10807.0\n",
"32 10808.0\n",
"33 10811.0\n",
"34 10809.0\n",
"35 10810.0\n",
"36 10812.0\n",
"37 10901.0\n",
"38 10902.0\n",
"39 10905.0\n",
"40 10903.0\n",
"41 10904.0\n",
"42 10906.0\n",
"43 10907.0\n",
"44 10908.0\n",
"45 10911.0\n",
"46 10910.0\n",
"47 10909.0\n",
"48 10912.0\n",
"49 11001.0\n",
"50 11002.0\n",
"51 11003.0\n"
]
}
],
"source": [
"none_elevator_df = sale_data.loc[sale_data['電梯'].isna()]\n",
"# none_elevator_df.shape[0]\n",
"\n",
"print('Q: 有多少交易標的中,電梯的欄位是 nan?')\n",
"for val in none_elevator_df['交易標的'].unique():\n",
" n_samples = none_elevator_df.loc[none_elevator_df['交易標的'] == val].shape[0]\n",
" print(val, n_samples)\n",
"\n",
"print('-'*20)\n",
"print('Q: 這些電梯欄位是 nan 的資料是分佈在哪些年月?')\n",
"for i, trading_time in enumerate(none_elevator_df['Month'].unique(), start=1):\n",
" print(i, trading_time)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "7b6a1150",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array(['公寓(5樓含以下無電梯)', '華廈(10層含以下有電梯)', '其他', '住宅大樓(11層含以上有電梯)',\n",
" '套房(1房1廳1衛)', '透天厝', '辦公商業大樓', '店面(店鋪)', '工廠', '廠辦', '農舍', '倉庫'],\n",
" dtype=object)"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sale_data['建物型態'].unique()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "1fdebcbc",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"建物型態為: 公寓(5樓含以下無電梯), 且沒有電梯的筆數: 82445\n",
"建物型態為: 華廈(10層含以下有電梯), 且沒有電梯的筆數: 102549\n",
"建物型態為: 其他, 且沒有電梯的筆數: 19860\n",
"建物型態為: 住宅大樓(11層含以上有電梯), 且沒有電梯的筆數: 296877\n",
"建物型態為: 套房(1房1廳1衛), 且沒有電梯的筆數: 40405\n",
"建物型態為: 透天厝, 且沒有電梯的筆數: 196829\n",
"建物型態為: 辦公商業大樓, 且沒有電梯的筆數: 6934\n",
"建物型態為: 店面(店鋪), 且沒有電梯的筆數: 13619\n",
"建物型態為: 工廠, 且沒有電梯的筆數: 2638\n",
"建物型態為: 廠辦, 且沒有電梯的筆數: 3736\n",
"建物型態為: 農舍, 且沒有電梯的筆數: 3318\n",
"建物型態為: 倉庫, 且沒有電梯的筆數: 174\n"
]
}
],
"source": [
"for building in sale_data['建物型態'].unique():\n",
" n_samples = none_elevator_df.loc[none_elevator_df['建物型態'] == building].shape[0]\n",
" print(f'建物型態為: {building}, 且沒有電梯的筆數: {n_samples}')"
]
},
{
"cell_type": "markdown",
"id": "ed9d4411",
"metadata": {},
"source": [
"以結果來看,我們可以針對以下建物型態去添加有無電梯的資訊:\n",
"1. 建物型態為: 華廈(10層含以下有電梯)\n",
"2. 建物型態為: 住宅大樓(11層含以上有電梯)\n",
"\n",
"總共可以多增加 ~400,000 筆有標記的資料"
]
},
{
"cell_type": "code",
"execution_count": 31,
"id": "8883bd64",
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"公寓(5樓含以下無電梯)\n",
"華廈(10層含以下有電梯)\n",
"其他\n",
"其他\n",
"華廈(10層含以下有電梯)\n",
"其他\n",
"住宅大樓(11層含以上有電梯)\n",
"其他\n",
"套房(1房1廳1衛)\n",
"套房(1房1廳1衛)\n",
"透天厝\n",
"華廈(10層含以下有電梯)\n",
"其他\n",
"其他\n",
"華廈(10層含以下有電梯)\n",
"公寓(5樓含以下無電梯)\n",
"透天厝\n",
"其他\n",
"公寓(5樓含以下無電梯)\n",
"其他\n",
"其他\n",
"公寓(5樓含以下無電梯)\n",
"住宅大樓(11層含以上有電梯)\n",
"公寓(5樓含以下無電梯)\n",
"公寓(5樓含以下無電梯)\n",
"華廈(10層含以下有電梯)\n",
"住宅大樓(11層含以上有電梯)\n",
"住宅大樓(11層含以上有電梯)\n",
"其他\n",
"住宅大樓(11層含以上有電梯)\n",
"辦公商業大樓\n",
"住宅大樓(11層含以上有電梯)\n",
"公寓(5樓含以下無電梯)\n",
"華廈(10層含以下有電梯)\n",
"住宅大樓(11層含以上有電梯)\n",
"公寓(5樓含以下無電梯)\n",
"其他\n",
"套房(1房1廳1衛)\n",
"其他\n",
"華廈(10層含以下有電梯)\n",
"其他\n",
"套房(1房1廳1衛)\n",
"透天厝\n",
"其他\n",
"公寓(5樓含以下無電梯)\n",
"其他\n",
"套房(1房1廳1衛)\n",
"住宅大樓(11層含以上有電梯)\n",
"公寓(5樓含以下無電梯)\n",
"公寓(5樓含以下無電梯)\n"
]
},
{
"data": {
"text/plain": [
"3499 None\n",
"3656 None\n",
"3718 None\n",
"3723 None\n",
"3740 None\n",
"3764 None\n",
"3807 None\n",
"3816 None\n",
"3865 None\n",
"3866 None\n",
"3876 None\n",
"3948 None\n",
"3987 None\n",
"3988 None\n",
"4020 None\n",
"4025 None\n",
"4033 None\n",
"4047 None\n",
"4056 None\n",
"4060 None\n",
"4066 None\n",
"4078 None\n",
"4079 None\n",
"4093 None\n",
"4111 None\n",
"4138 None\n",
"4164 None\n",
"4165 None\n",
"4166 None\n",
"4168 None\n",
"4171 None\n",
"4178 None\n",
"4179 None\n",
"4191 None\n",
"4216 None\n",
"4225 None\n",
"4234 None\n",
"4235 None\n",
"4239 None\n",
"4245 None\n",
"4256 None\n",
"4264 None\n",
"4265 None\n",
"4267 None\n",
"4272 None\n",
"4283 None\n",
"4294 None\n",
"4305 None\n",
"4309 None\n",
"4323 None\n",
"dtype: object"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sale_data[['建物型態', '電梯']][:50].apply(test, axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 38,
"id": "70b4ad0a",
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"3499 0\n",
"3656 1\n",
"3718 0\n",
"3723 0\n",
"3740 1\n",
" ..\n",
"1840149 0\n",
"1840331 1\n",
"1840379 1\n",
"1840406 1\n",
"1840409 1\n",
"Length: 1074132, dtype: int64"
]
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sale_data[['建物型態', '電梯']].apply(transform_elevator, axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 39,
"id": "6f4c8a6f",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"3499 NaN\n",
"3656 NaN\n",
"3718 NaN\n",
"3723 NaN\n",
"3740 NaN\n",
" ... \n",
"1840149 無\n",
"1840331 有\n",
"1840379 有\n",
"1840406 有\n",
"1840409 有\n",
"Name: 電梯, Length: 1074132, dtype: object"
]
},
"execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sale_data['電梯']"
]
},
{
"cell_type": "code",
"execution_count": 37,
"id": "c33819ae",
"metadata": {},
"outputs": [],
"source": [
"def transform_elevator(x):\n",
" \"\"\"Transform the elevator field data to one-hot encoding\n",
" \n",
" If there is nan in the '電梯' field, we can use '建物型態' to\n",
" fix the missing value.\n",
" \"\"\"\n",
" if not isinstance(x['電梯'], str):\n",
" if x['建物型態'] in ['華廈(10層含以下有電梯)', '住宅大樓(11層含以上有電梯)']:\n",
" return 1\n",
" return 0\n",
" \n",
" map_ = {'有': 1, '無': 0}\n",
" return map_[x['電梯']]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1501ab4f",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.4"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment