# Train a Keras model that predicts a property's most recent sale price
# (sale_price_1) from assessor and listing fields stored in AnalysisRecord.
import datetime
import json
import math
import pickle
import time
import zlib

import numpy as np
import pandas as pd
from django.db.models import IntegerField
from django.db.models.functions import Cast
from tensorflow import keras

from real_estate.models import AnalysisRecord
from real_estate.scripts.utils import clean_dataset

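# Columns listed in embedded_columns() are treated as categorical, integer-encoded,
# and fed through Embedding layers; the columns in value_columns() are continuous
# and fed to Dense layers directly.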
def embedded_columns():
    return [
        "bedrooms", "units", "baths", "fireplaces", "nbhd", "year_blt",
        "water", "sewer", "prop_code", "townhouse", "sec_heat_type", "agency",
        "land_use", "tax_dist", "zip", "zoning",
        "qc", "occupancy", "stories", "building_ty", "heat_type",
        "sale_year_1", "sale_year_2", "sale_year_3", "sale_year_4", "sale_year_5",
        "sale_month_1", "sale_month_2", "sale_month_3", "sale_month_4", "sale_month_5",
        "list_year_1", "list_year_2", "list_year_3", "list_year_4", "list_year_5",
        "list_month_1", "list_month_2", "list_month_3", "list_month_4", "list_month_5",
    ]

def value_columns():
    return [
        "sq_feet",
        "acreage", "build_ass", "land_ass",
        "depreciation", "land_base",
        "sale_price_2", "sale_price_3", "sale_price_4", "sale_price_5",
        "list_price_1", "list_price_2", "list_price_3", "list_price_4", "list_price_5",
    ]


def all_columns():
    return embedded_columns() + value_columns()
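
# Fetch AnalysisRecords, encode features, build an embedding model, and fit it
# to predict sale_price_1; returns the trained Keras model.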
def train_model_for_records():
    start = time.time()
    past_date = datetime.datetime(2000, 1, 1)
    records = (
        AnalysisRecord.objects
        .filter(sale_timestamp_1__gt=past_date.timestamp())
        .filter(baths__lt=10).filter(bedrooms__lt=10).filter(sale_price_2__gt=0)
        .annotate(zip=Cast('sit_us_zip', IntegerField()))
        .filter(sale_price_1__lt=1000001).filter(sale_price_1__gt=99999)
        .values()
    )
    end = time.time()
    print(f"fetching AnalysisRecords took {end-start}")
    print(f"Fetched {records.count()} records for properties.")
    start = time.time()
    db_hash = zlib.adler32(json.dumps(list(records.values())).encode("utf8"))
    end = time.time()
    print(f"{db_hash} - Hashing AnalysisRecords took {end-start}")
    if db_hash != 2183530944:
        print("THE DB HASH IS UNEXPECTEDLY DIFFERENT")
    start = time.time()
    Data = pd.DataFrame(list(records))
    # derive year/month features from each of the five sale/list timestamps;
    # range(1, 6) because columns _1 through _5 are referenced in embedded_columns()
    for x in range(1, 6):
        Data[f'sale_year_{x}'] = Data[f'sale_timestamp_{x}'].apply(lambda ts: datetime.datetime.fromtimestamp(ts).year)
        Data[f'list_year_{x}'] = Data[f'list_timestamp_{x}'].apply(lambda ts: datetime.datetime.fromtimestamp(ts).year)
        Data[f'sale_month_{x}'] = Data[f'sale_timestamp_{x}'].apply(lambda ts: datetime.datetime.fromtimestamp(ts).month)
        Data[f'list_month_{x}'] = Data[f'list_timestamp_{x}'].apply(lambda ts: datetime.datetime.fromtimestamp(ts).month)
    end = time.time()
    print(f"annotating years and months took {end-start}")
    start = time.time()
    print(Data["qc"].unique())
    # transform embeddings data
    value_count_map = {}
    value_map = {}
    for c in embedded_columns():
        c_map = {}
        unique_values = Data[c].unique()
        for x in range(len(unique_values)):
            c_map[unique_values[x]] = x + 1
        for index, row in Data.iterrows():
            Data.at[index, c] = c_map[row[c]]
        value_count_map[c] = len(Data[c].unique()) + 1
        value_map[c] = c_map
        # print(c_map)
    for c in all_columns():
        Data[c] = Data[c].astype(np.float32)
    Data["sale_price_1"] = Data["sale_price_1"].astype(np.float32)
    print(Data["qc"].unique())
    with open('value_map.pkl', 'wb') as f:
        pickle.dump(value_map, f)
    end = time.time()
    print(f"making value map took {end-start}")
    start = time.time()
    clean_dataset(Data)
    print(f"Using {len(Data)} properties for analysis.")
    data_hash = zlib.adler32(json.dumps(Data.to_json()).encode("utf8"))
    print(f"{data_hash} - data_hash")
    if data_hash != 4202095867:
        print("THE DATA HASH IS UNEXPECTEDLY DIFFERENT")
    value_map_hash = zlib.adler32(json.dumps(list(value_map)).encode("utf8"))
    print(f"{value_map_hash} - value_map_hash")
    if value_map_hash != 4036931971:
        print("THE VALUE MAP HASH IS UNEXPECTEDLY DIFFERENT")
    inputs = {}
    for ec in embedded_columns():
        inputs[ec] = keras.Input(shape=(None,), name=ec)
    for vc in value_columns():
        inputs[vc] = keras.Input(shape=(None, 1), name=vc)
    features = {}
    for ec in embedded_columns():
        features[ec] = keras.layers.Embedding(value_count_map[ec], value_count_map[ec])(inputs[ec])
    for vc in value_columns():
        features[vc] = keras.layers.Dense(1, activation="relu")(inputs[vc])
    x = keras.layers.concatenate([features[key] for key in all_columns()])
    price_pred = keras.layers.Dense(1, name="price")(x)
    model = keras.Model(
        inputs=[inputs[key] for key in all_columns()],
        outputs=[price_pred],
    )
    model.compile(
        optimizer="adam",
        loss="mae",  # mse, msle, mae
        # steps_per_execution=32
    )
    end = time.time()
    print(f"making model took {end-start}")
    analysis_data = {}
    for key in all_columns():
        analysis_data[key] = Data[key]
    # flag any values that fall outside the expected range before training
    for key in all_columns():
        for value in Data[key]:
            if value > 100000000 or value < 0:
                print(f"{key} {value}")
    model.summary()
    model.fit(
        analysis_data,
        {"price": Data['sale_price_1']},
        epochs=10,
        batch_size=32,
        # verbose=0
    )
    data_to_predict = []
    for key in all_columns():
        data_to_predict.append(analysis_data[key])
    y_pred = model.predict(data_to_predict)
    y_preds = [obj[0][0] for obj in y_pred.round()]
    df = pd.DataFrame({'Actual': Data['sale_price_1'], 'Predicted': y_preds, 'APN': Data['apn']})
    df1 = df.head(25)
    print(df1)
    return model
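

# NOTE: predict_record() is called at the bottom of this script but is not defined
# (or imported) in this gist. The function below is only a sketch of what it might
# look like, assuming it takes an APN string plus the trained model, re-applies the
# pickled value_map encoding and the derived year/month features, and returns the
# model's prediction together with the raw record. Field handling mirrors the
# training code above; the real implementation may differ.
def predict_record(apn, model):
    record = (
        AnalysisRecord.objects
        .annotate(zip=Cast('sit_us_zip', IntegerField()))
        .filter(apn=apn)
        .values()[0]
    )
    with open('value_map.pkl', 'rb') as f:
        value_map = pickle.load(f)
    # derive the same year/month features used during training
    for x in range(1, 6):
        record[f'sale_year_{x}'] = datetime.datetime.fromtimestamp(record[f'sale_timestamp_{x}']).year
        record[f'list_year_{x}'] = datetime.datetime.fromtimestamp(record[f'list_timestamp_{x}']).year
        record[f'sale_month_{x}'] = datetime.datetime.fromtimestamp(record[f'sale_timestamp_{x}']).month
        record[f'list_month_{x}'] = datetime.datetime.fromtimestamp(record[f'list_timestamp_{x}']).month
    # encode categorical columns with the training-time mapping (0 for unseen values),
    # feeding every column as a length-1 array, mirroring how training data is passed
    model_input = []
    for key in all_columns():
        if key in embedded_columns():
            model_input.append(np.array([value_map[key].get(record[key], 0)], dtype=np.float32))
        else:
            model_input.append(np.array([record[key]], dtype=np.float32))
    prediction = model.predict(model_input)
    return prediction, record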

# reproducible Keras
seed_value = 0
import os
os.environ['PYTHONHASHSEED'] = str(seed_value)
import random
random.seed(seed_value)
import numpy as np
np.random.seed(seed_value)
import tensorflow as tf
tf.random.set_seed(seed_value)
# or, via the v1 compat API:
# tf.compat.v1.set_random_seed(seed_value)
# configure a new global tensorflow session with single-threaded ops
session_conf = tf.compat.v1.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
sess = tf.compat.v1.Session(graph=tf.compat.v1.get_default_graph(), config=session_conf)
tf.compat.v1.keras.backend.set_session(sess)
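
# train on the full record set, save the model, then predict one known property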
# 1067 University ter 00608320
apn_string = "00608320"
model = train_model_for_records()
model.save('./real_estate/data/reno-model-embedded')
prediction, record = predict_record(apn_string, model)
# print(f"\n\nFor target property, predicted {int(prediction[0][0])} vs previous sale {record['sale_price_2']}\n\n")
print(f"\n\nFor target property, predicted {int(prediction[0][0])} vs list {record['list_price_1']} vs previous sale {record['sale_price_2']}\n\n")