nishio-dens/clean.py

## clean.py
import glob
import pandas as pd

pd.options.display.max_columns = None

JA_DIR = "./All_20053_20193_ja"
EN_DIR = "./All_20053_20193_en"

# JA file has PricePerUnit column
JA_FILE_HEADERS = [
        "No", "Type", "Region", "MunicipalityCode", "Prefecture",
        "Municipality", "DistrictName", "NearestStation", "TimeToNearestStation", "TradePrice",
        "PricePerTsubo", "FloorPlan", "Area", "UnitPrice", "LandShape",
        "Frontage", "TotalFloorArea", "BuildingYear", "Structure", "Use",
        "Purpose", "Direction", "Classification", "Breadth", "CityPlanning",
        "CoverageRatio", "FloorAreaRatio", "Period", "Renovation", "Remarks",
        ]

EN_FILE_HEADERS = [
        "No", "Type", "Region", "MunicipalityCode", "Prefecture",
        "Municipality", "DistrictName", "NearestStation", "TimeToNearestStation",
        "TradePrice", "FloorPlan", "Area", "UnitPrice", "LandShape",
        "Frontage", "TotalFloorArea", "BuildingYear", "Structure", "Use",
        "Purpose", "Direction", "Classification", "Breadth", "CityPlanning",
        "CoverageRatio", "FloorAreaRatio", "Period", "Renovation", "Remarks",
        ]

RESULT_HEADERS = [
        "No", "Type", "Region", "MunicipalityCode", "Prefecture",
        "Municipality", "DistrictName", "NearestStation", "TimeToNearestStation", "MinTimeToNearestStation", "MaxTimeToNearestStation", "TradePrice",
        "FloorPlan", "Area", "AreaIsGreaterFlag", "UnitPrice", "PricePerTsubo", "LandShape",
        "Frontage", "FrontageIsGreaterFlag", "TotalFloorArea", "TotalFloorAreaIsGreaterFlag", "BuildingYear", "PrewarBuilding", "Structure", "Use",
        "Purpose", "Direction", "Classification", "Breadth", "CityPlanning",
        "CoverageRatio", "FloorAreaRatio", "Period", "Year", "Quarter", "Renovation", "Remarks",
        ]

def pref_code_all():
    """
    there are 47 prefectures in japan
    """
    return [("%02d" % i) for i in range(1, 48)]

def to_pref(path):
    """
    Filename example:
      ./All_20053_20193_en/13_Tokyo_20053_20193.csv

    13    = prefecture_code
    20053 = year:2005, quarter:3
    """
    filename = path.split("/")[-1]
    pref_code = filename.split("_")[0]

    return pref_code

def to_file_info(path, lang):
    info = { "lang": lang, "pref_code": to_pref(path), "path": path }
    return info

def pref_files(dir_path, lang):
    files = glob.glob("%s/*" % dir_path)
    d = {}
    for path in files:
        info = to_file_info(path, lang)
        d[info["pref_code"]] = info

    return d

def is_same_ja_en_file(ja_df, en_df):
    if len(ja_df) != len(en_df):
        return False
    if ja_df["TradePrice"].sum() != en_df["TradePrice"].sum():
        return False
    if ja_df.head(10)["TradePrice"].sum() != en_df.head(10)["TradePrice"].sum():
        return False
    if ja_df.tail(10)["TradePrice"].sum() != en_df.tail(10)["TradePrice"].sum():
        return False

    return True

def period_to_year(str):
    year = str.split()[-1]
    return year

def period_to_quarter(str):
    if "1st quarter" in str:
        return 1
    elif "2nd quarter" in str:
        return 2
    elif "3rd quarter" in str:
        return 3
    elif "4th quarter" in str:
        return 4
    else:
       raise ValueError("Invalid Quarter")

def clean_landshape(s):
    if  pd.isnull(s):
        return s
    elif "&quot;Flag-shaped&quot; etc" in s:
        return "Flag-shaped etc."
    else:
        return s

def area_is_greater_flag(s):
    if pd.isnull(s):
        return 0
    elif "greater" in s:
        return 1
    else:
        return 0

def clean_area(s):
    if pd.isnull(s):
        return s
    else:
        return s.replace("m^2 or greater.", "").replace(" ", "").replace(",","")

def year_is_before_the_war(s):
    if pd.isnull(s):
        return 0
    elif "before the war" in s:
        return 1
    else:
        return 0

def clean_year(s):
    if pd.isnull(s):
        return s
    elif "before the war" in s:
        return 1945
    else:
        return s

def  min_max_time_to_nearest_station(s):
    if pd.isnull(s):
        return [s, s]
    elif s.isnumeric():
        return [s, s]
    elif "-" in s:
        min, max = s.split("-")
        original = s
        return [str_time_to_minute(min, original), str_time_to_minute(max, original)]
    else:
        return [s, s]

def min_time_to_nearest_station(s):
    return min_max_time_to_nearest_station(s)[0]

def max_time_to_nearest_station(s):
    return min_max_time_to_nearest_station(s)[1]

def  str_time_to_minute(s, original):
    if "minutes" in s:
        return s.replace("minutes", "")
    elif ("minutes" not in s) and  ("H" not in s) and ("minutes" in original):
        return s
    elif "H" in s:
        v = s.replace("H", " ").split(" ")
        v0 = v[0]
        v1 = v[1]
        if v0 and v1:
            return int(v0) * 60 + int(v1)
        else:
            return int(v0) * 60
    elif "H" in original:
        if not s:
            return None
        else:
            return int(s) * 60
    else:
        return s

def frontage_is_greater_flag(s):
    if pd.isnull(s):
        return False
    elif "longer" in s:
        return True
    else:
        return False

def clean_frontage(s):
    if pd.isnull(s):
        return s
    elif "m or longer." in s:
        return s.replace("m or longer.", "")
    else:
        return s

def clean_total_floor_area(s):
    if pd.isnull(s):
        return s
    elif "lessthan10m" in s:
        return 10
    else:
        return s

def clean(df_ja, df_en):
    if not is_same_ja_en_file(df_ja, df_en):
        raise ValueError("Invalid Files")

    df = df_en.copy()
    df["PricePerTsubo"] = df_ja["PricePerTsubo"]
    df["Year"] = df["Period"].transform(period_to_year)
    df["Quarter"] = df["Period"].transform(period_to_quarter)
    df["LandShape"] = df["LandShape"].transform(clean_landshape)
    df["AreaIsGreaterFlag"] = df["Area"].transform(area_is_greater_flag)
    df["Area"] = df["Area"].transform(clean_area)
    df["TotalFloorAreaIsGreaterFlag"] = df["TotalFloorArea"].transform(area_is_greater_flag)
    df["TotalFloorArea"] = df["TotalFloorArea"].transform(clean_area)
    df["PrewarBuilding"] = df["BuildingYear"].transform(year_is_before_the_war)
    df["BuildingYear"] = df["BuildingYear"].transform(clean_year)
    df["MinTimeToNearestStation"] = df["TimeToNearestStation"].transform(min_time_to_nearest_station)
    df["MaxTimeToNearestStation"] = df["TimeToNearestStation"].transform(max_time_to_nearest_station)
    df["FrontageIsGreaterFlag"] = df["Frontage"].transform(frontage_is_greater_flag)
    df["Frontage"] = df["Frontage"].transform(clean_frontage)
    df["TotalFloorArea"] = df["TotalFloorArea"].transform(clean_total_floor_area)

    return df

ja_files = pref_files(JA_DIR, "ja")
en_files = pref_files(EN_DIR, "en")

for code in pref_code_all():
    df_ja = pd.read_csv(ja_files[code]["path"], encoding="cp932", low_memory=False)
    df_en = pd.read_csv(en_files[code]["path"], encoding="cp932", low_memory=False)
    df_ja.columns = JA_FILE_HEADERS
    df_en.columns = EN_FILE_HEADERS

    df = clean(df_ja, df_en)
    df = df[RESULT_HEADERS]

    df.to_csv("./trade_prices/%s.csv" % code, index=False)
	import glob
	import pandas as pd

	pd.options.display.max_columns = None

	JA_DIR = "./All_20053_20193_ja"
	EN_DIR = "./All_20053_20193_en"

	# JA file has PricePerUnit column
	JA_FILE_HEADERS = [
	"No", "Type", "Region", "MunicipalityCode", "Prefecture",
	"Municipality", "DistrictName", "NearestStation", "TimeToNearestStation", "TradePrice",
	"PricePerTsubo", "FloorPlan", "Area", "UnitPrice", "LandShape",
	"Frontage", "TotalFloorArea", "BuildingYear", "Structure", "Use",
	"Purpose", "Direction", "Classification", "Breadth", "CityPlanning",
	"CoverageRatio", "FloorAreaRatio", "Period", "Renovation", "Remarks",
	]

	EN_FILE_HEADERS = [
	"No", "Type", "Region", "MunicipalityCode", "Prefecture",
	"Municipality", "DistrictName", "NearestStation", "TimeToNearestStation",
	"TradePrice", "FloorPlan", "Area", "UnitPrice", "LandShape",
	"Frontage", "TotalFloorArea", "BuildingYear", "Structure", "Use",
	"Purpose", "Direction", "Classification", "Breadth", "CityPlanning",
	"CoverageRatio", "FloorAreaRatio", "Period", "Renovation", "Remarks",
	]

	RESULT_HEADERS = [
	"No", "Type", "Region", "MunicipalityCode", "Prefecture",
	"Municipality", "DistrictName", "NearestStation", "TimeToNearestStation", "MinTimeToNearestStation", "MaxTimeToNearestStation", "TradePrice",
	"FloorPlan", "Area", "AreaIsGreaterFlag", "UnitPrice", "PricePerTsubo", "LandShape",
	"Frontage", "FrontageIsGreaterFlag", "TotalFloorArea", "TotalFloorAreaIsGreaterFlag", "BuildingYear", "PrewarBuilding", "Structure", "Use",
	"Purpose", "Direction", "Classification", "Breadth", "CityPlanning",
	"CoverageRatio", "FloorAreaRatio", "Period", "Year", "Quarter", "Renovation", "Remarks",
	]

	def pref_code_all():
	"""
	there are 47 prefectures in japan
	"""
	return [("%02d" % i) for i in range(1, 48)]

	def to_pref(path):
	"""
	Filename example:
	./All_20053_20193_en/13_Tokyo_20053_20193.csv

	13 = prefecture_code
	20053 = year:2005, quarter:3
	"""
	filename = path.split("/")[-1]
	pref_code = filename.split("_")[0]

	return pref_code

	def to_file_info(path, lang):
	info = { "lang": lang, "pref_code": to_pref(path), "path": path }
	return info

	def pref_files(dir_path, lang):
	files = glob.glob("%s/*" % dir_path)
	d = {}
	for path in files:
	info = to_file_info(path, lang)
	d[info["pref_code"]] = info

	return d

	def is_same_ja_en_file(ja_df, en_df):
	if len(ja_df) != len(en_df):
	return False
	if ja_df["TradePrice"].sum() != en_df["TradePrice"].sum():
	return False
	if ja_df.head(10)["TradePrice"].sum() != en_df.head(10)["TradePrice"].sum():
	return False
	if ja_df.tail(10)["TradePrice"].sum() != en_df.tail(10)["TradePrice"].sum():
	return False

	return True

	def period_to_year(str):
	year = str.split()[-1]
	return year

	def period_to_quarter(str):
	if "1st quarter" in str:
	return 1
	elif "2nd quarter" in str:
	return 2
	elif "3rd quarter" in str:
	return 3
	elif "4th quarter" in str:
	return 4
	else:
	raise ValueError("Invalid Quarter")

	def clean_landshape(s):
	if pd.isnull(s):
	return s
	elif ""Flag-shaped" etc" in s:
	return "Flag-shaped etc."
	else:
	return s

	def area_is_greater_flag(s):
	if pd.isnull(s):
	return 0
	elif "greater" in s:
	return 1
	else:
	return 0

	def clean_area(s):
	if pd.isnull(s):
	return s
	else:
	return s.replace("m^2 or greater.", "").replace(" ", "").replace(",","")

	def year_is_before_the_war(s):
	if pd.isnull(s):
	return 0
	elif "before the war" in s:
	return 1
	else:
	return 0

	def clean_year(s):
	if pd.isnull(s):
	return s
	elif "before the war" in s:
	return 1945
	else:
	return s

	def min_max_time_to_nearest_station(s):
	if pd.isnull(s):
	return [s, s]
	elif s.isnumeric():
	return [s, s]
	elif "-" in s:
	min, max = s.split("-")
	original = s
	return [str_time_to_minute(min, original), str_time_to_minute(max, original)]
	else:
	return [s, s]

	def min_time_to_nearest_station(s):
	return min_max_time_to_nearest_station(s)[0]

	def max_time_to_nearest_station(s):
	return min_max_time_to_nearest_station(s)[1]

	def str_time_to_minute(s, original):
	if "minutes" in s:
	return s.replace("minutes", "")
	elif ("minutes" not in s) and ("H" not in s) and ("minutes" in original):
	return s
	elif "H" in s:
	v = s.replace("H", " ").split(" ")
	v0 = v[0]
	v1 = v[1]
	if v0 and v1:
	return int(v0) * 60 + int(v1)
	else:
	return int(v0) * 60
	elif "H" in original:
	if not s:
	return None
	else:
	return int(s) * 60
	else:
	return s

	def frontage_is_greater_flag(s):
	if pd.isnull(s):
	return False
	elif "longer" in s:
	return True
	else:
	return False

	def clean_frontage(s):
	if pd.isnull(s):
	return s
	elif "m or longer." in s:
	return s.replace("m or longer.", "")
	else:
	return s

	def clean_total_floor_area(s):
	if pd.isnull(s):
	return s
	elif "lessthan10m" in s:
	return 10
	else:
	return s

	def clean(df_ja, df_en):
	if not is_same_ja_en_file(df_ja, df_en):
	raise ValueError("Invalid Files")

	df = df_en.copy()
	df["PricePerTsubo"] = df_ja["PricePerTsubo"]
	df["Year"] = df["Period"].transform(period_to_year)
	df["Quarter"] = df["Period"].transform(period_to_quarter)
	df["LandShape"] = df["LandShape"].transform(clean_landshape)
	df["AreaIsGreaterFlag"] = df["Area"].transform(area_is_greater_flag)
	df["Area"] = df["Area"].transform(clean_area)
	df["TotalFloorAreaIsGreaterFlag"] = df["TotalFloorArea"].transform(area_is_greater_flag)
	df["TotalFloorArea"] = df["TotalFloorArea"].transform(clean_area)
	df["PrewarBuilding"] = df["BuildingYear"].transform(year_is_before_the_war)
	df["BuildingYear"] = df["BuildingYear"].transform(clean_year)
	df["MinTimeToNearestStation"] = df["TimeToNearestStation"].transform(min_time_to_nearest_station)
	df["MaxTimeToNearestStation"] = df["TimeToNearestStation"].transform(max_time_to_nearest_station)
	df["FrontageIsGreaterFlag"] = df["Frontage"].transform(frontage_is_greater_flag)
	df["Frontage"] = df["Frontage"].transform(clean_frontage)
	df["TotalFloorArea"] = df["TotalFloorArea"].transform(clean_total_floor_area)

	return df

	ja_files = pref_files(JA_DIR, "ja")
	en_files = pref_files(EN_DIR, "en")

	for code in pref_code_all():
	df_ja = pd.read_csv(ja_files[code]["path"], encoding="cp932", low_memory=False)
	df_en = pd.read_csv(en_files[code]["path"], encoding="cp932", low_memory=False)
	df_ja.columns = JA_FILE_HEADERS
	df_en.columns = EN_FILE_HEADERS

	df = clean(df_ja, df_en)
	df = df[RESULT_HEADERS]

	df.to_csv("./trade_prices/%s.csv" % code, index=False)