Navigation Menu

Skip to content

Instantly share code, notes, and snippets.

@nishio-dens
Created February 29, 2020 16:18
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save nishio-dens/d66a52cb26a4fe0851ceb353c75ecdf7 to your computer and use it in GitHub Desktop.
Save nishio-dens/d66a52cb26a4fe0851ceb353c75ecdf7 to your computer and use it in GitHub Desktop.
import glob
import pandas as pd
pd.options.display.max_columns = None
JA_DIR = "./All_20053_20193_ja"
EN_DIR = "./All_20053_20193_en"
# JA file has PricePerUnit column
JA_FILE_HEADERS = [
"No", "Type", "Region", "MunicipalityCode", "Prefecture",
"Municipality", "DistrictName", "NearestStation", "TimeToNearestStation", "TradePrice",
"PricePerTsubo", "FloorPlan", "Area", "UnitPrice", "LandShape",
"Frontage", "TotalFloorArea", "BuildingYear", "Structure", "Use",
"Purpose", "Direction", "Classification", "Breadth", "CityPlanning",
"CoverageRatio", "FloorAreaRatio", "Period", "Renovation", "Remarks",
]
EN_FILE_HEADERS = [
"No", "Type", "Region", "MunicipalityCode", "Prefecture",
"Municipality", "DistrictName", "NearestStation", "TimeToNearestStation",
"TradePrice", "FloorPlan", "Area", "UnitPrice", "LandShape",
"Frontage", "TotalFloorArea", "BuildingYear", "Structure", "Use",
"Purpose", "Direction", "Classification", "Breadth", "CityPlanning",
"CoverageRatio", "FloorAreaRatio", "Period", "Renovation", "Remarks",
]
RESULT_HEADERS = [
"No", "Type", "Region", "MunicipalityCode", "Prefecture",
"Municipality", "DistrictName", "NearestStation", "TimeToNearestStation", "MinTimeToNearestStation", "MaxTimeToNearestStation", "TradePrice",
"FloorPlan", "Area", "AreaIsGreaterFlag", "UnitPrice", "PricePerTsubo", "LandShape",
"Frontage", "FrontageIsGreaterFlag", "TotalFloorArea", "TotalFloorAreaIsGreaterFlag", "BuildingYear", "PrewarBuilding", "Structure", "Use",
"Purpose", "Direction", "Classification", "Breadth", "CityPlanning",
"CoverageRatio", "FloorAreaRatio", "Period", "Year", "Quarter", "Renovation", "Remarks",
]
def pref_code_all():
"""
there are 47 prefectures in japan
"""
return [("%02d" % i) for i in range(1, 48)]
def to_pref(path):
"""
Filename example:
./All_20053_20193_en/13_Tokyo_20053_20193.csv
13 = prefecture_code
20053 = year:2005, quarter:3
"""
filename = path.split("/")[-1]
pref_code = filename.split("_")[0]
return pref_code
def to_file_info(path, lang):
info = { "lang": lang, "pref_code": to_pref(path), "path": path }
return info
def pref_files(dir_path, lang):
files = glob.glob("%s/*" % dir_path)
d = {}
for path in files:
info = to_file_info(path, lang)
d[info["pref_code"]] = info
return d
def is_same_ja_en_file(ja_df, en_df):
if len(ja_df) != len(en_df):
return False
if ja_df["TradePrice"].sum() != en_df["TradePrice"].sum():
return False
if ja_df.head(10)["TradePrice"].sum() != en_df.head(10)["TradePrice"].sum():
return False
if ja_df.tail(10)["TradePrice"].sum() != en_df.tail(10)["TradePrice"].sum():
return False
return True
def period_to_year(str):
year = str.split()[-1]
return year
def period_to_quarter(str):
if "1st quarter" in str:
return 1
elif "2nd quarter" in str:
return 2
elif "3rd quarter" in str:
return 3
elif "4th quarter" in str:
return 4
else:
raise ValueError("Invalid Quarter")
def clean_landshape(s):
if pd.isnull(s):
return s
elif ""Flag-shaped" etc" in s:
return "Flag-shaped etc."
else:
return s
def area_is_greater_flag(s):
if pd.isnull(s):
return 0
elif "greater" in s:
return 1
else:
return 0
def clean_area(s):
if pd.isnull(s):
return s
else:
return s.replace("m^2 or greater.", "").replace(" ", "").replace(",","")
def year_is_before_the_war(s):
if pd.isnull(s):
return 0
elif "before the war" in s:
return 1
else:
return 0
def clean_year(s):
if pd.isnull(s):
return s
elif "before the war" in s:
return 1945
else:
return s
def min_max_time_to_nearest_station(s):
if pd.isnull(s):
return [s, s]
elif s.isnumeric():
return [s, s]
elif "-" in s:
min, max = s.split("-")
original = s
return [str_time_to_minute(min, original), str_time_to_minute(max, original)]
else:
return [s, s]
def min_time_to_nearest_station(s):
return min_max_time_to_nearest_station(s)[0]
def max_time_to_nearest_station(s):
return min_max_time_to_nearest_station(s)[1]
def str_time_to_minute(s, original):
if "minutes" in s:
return s.replace("minutes", "")
elif ("minutes" not in s) and ("H" not in s) and ("minutes" in original):
return s
elif "H" in s:
v = s.replace("H", " ").split(" ")
v0 = v[0]
v1 = v[1]
if v0 and v1:
return int(v0) * 60 + int(v1)
else:
return int(v0) * 60
elif "H" in original:
if not s:
return None
else:
return int(s) * 60
else:
return s
def frontage_is_greater_flag(s):
if pd.isnull(s):
return False
elif "longer" in s:
return True
else:
return False
def clean_frontage(s):
if pd.isnull(s):
return s
elif "m or longer." in s:
return s.replace("m or longer.", "")
else:
return s
def clean_total_floor_area(s):
if pd.isnull(s):
return s
elif "lessthan10m" in s:
return 10
else:
return s
def clean(df_ja, df_en):
if not is_same_ja_en_file(df_ja, df_en):
raise ValueError("Invalid Files")
df = df_en.copy()
df["PricePerTsubo"] = df_ja["PricePerTsubo"]
df["Year"] = df["Period"].transform(period_to_year)
df["Quarter"] = df["Period"].transform(period_to_quarter)
df["LandShape"] = df["LandShape"].transform(clean_landshape)
df["AreaIsGreaterFlag"] = df["Area"].transform(area_is_greater_flag)
df["Area"] = df["Area"].transform(clean_area)
df["TotalFloorAreaIsGreaterFlag"] = df["TotalFloorArea"].transform(area_is_greater_flag)
df["TotalFloorArea"] = df["TotalFloorArea"].transform(clean_area)
df["PrewarBuilding"] = df["BuildingYear"].transform(year_is_before_the_war)
df["BuildingYear"] = df["BuildingYear"].transform(clean_year)
df["MinTimeToNearestStation"] = df["TimeToNearestStation"].transform(min_time_to_nearest_station)
df["MaxTimeToNearestStation"] = df["TimeToNearestStation"].transform(max_time_to_nearest_station)
df["FrontageIsGreaterFlag"] = df["Frontage"].transform(frontage_is_greater_flag)
df["Frontage"] = df["Frontage"].transform(clean_frontage)
df["TotalFloorArea"] = df["TotalFloorArea"].transform(clean_total_floor_area)
return df
ja_files = pref_files(JA_DIR, "ja")
en_files = pref_files(EN_DIR, "en")
for code in pref_code_all():
df_ja = pd.read_csv(ja_files[code]["path"], encoding="cp932", low_memory=False)
df_en = pd.read_csv(en_files[code]["path"], encoding="cp932", low_memory=False)
df_ja.columns = JA_FILE_HEADERS
df_en.columns = EN_FILE_HEADERS
df = clean(df_ja, df_en)
df = df[RESULT_HEADERS]
df.to_csv("./trade_prices/%s.csv" % code, index=False)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment