Skip to content

Instantly share code, notes, and snippets.

@andrewljohnson
Created March 21, 2022 19:41
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save andrewljohnson/2587b5a8e82b6830d15fc2d6b539076e to your computer and use it in GitHub Desktop.
Save andrewljohnson/2587b5a8e82b6830d15fc2d6b539076e to your computer and use it in GitHub Desktop.
import datetime
import math
import numpy as np
import pandas as pd
import pickle
import time
from django.db.models.functions import Cast
from django.db.models import IntegerField
from real_estate.models import AnalysisRecord
from tensorflow import keras
from real_estate.scripts.utils import clean_dataset
import json
import zlib
def embedded_columns():
    """Return the names of categorical columns fed through embedding layers.

    Order matters: downstream code zips these names against model inputs,
    so the static fields come first, followed by the per-sale/per-listing
    year and month columns numbered 1 through 5.
    """
    static_fields = [
        "bedrooms", "units", "baths", "fireplaces", "nbhd", "year_blt",
        "water", "sewer", "prop_code", "townhouse", "sec_heat_type", "agency",
        "land_use", "tax_dist", "zip", "zoning",
        "qc", "occupancy", "stories", "building_ty", "heat_type",
    ]
    dated_fields = [
        f"{prefix}_{n}"
        for prefix in ("sale_year", "sale_month", "list_year", "list_month")
        for n in range(1, 6)
    ]
    return static_fields + dated_fields
def value_columns():
    """Return the names of continuous numeric columns fed through dense layers.

    Note sale_price_1 is deliberately absent: it is the prediction target,
    so only prices 2-5 appear as features (list prices 1-5 are all features).
    """
    measured = [
        "sq_feet",
        "acreage", "build_ass", "land_ass",
        "depreciation", "land_base",
    ]
    prior_sales = [f"sale_price_{n}" for n in range(2, 6)]
    listings = [f"list_price_{n}" for n in range(1, 6)]
    return measured + prior_sales + listings
def all_columns():
    """Return every model feature name: categorical columns first, then numeric."""
    return [*embedded_columns(), *value_columns()]
def train_model_for_records():
    """Fetch AnalysisRecords from the database, build a Keras model mixing
    embedding inputs (categoricals) and dense inputs (numerics), fit it to
    predict sale_price_1, and return the trained model.

    Side effects: writes value_map.pkl to the working directory, prints timing
    and diagnostic output, and warns (without aborting) when data hashes drift
    from the values captured on the last validated run.

    Returns:
        The compiled and fitted keras.Model.
    """
    start = time.time()
    past_date = datetime.datetime(2000, 1, 1)
    records = (
        AnalysisRecord.objects
        .filter(sale_timestamp_1__gt=past_date.timestamp())
        .filter(baths__lt=10)
        .filter(bedrooms__lt=10)
        .filter(sale_price_2__gt=0)
        .annotate(zip=Cast('sit_us_zip', IntegerField()))
        .filter(sale_price_1__lt=1000001)
        .filter(sale_price_1__gt=99999)
        .values()
    )
    end = time.time()
    print(f"fetching AnalysisRecords took {end-start}")
    print(f"Fetched {records.count()} records for properties.")

    start = time.time()
    db_hash = zlib.adler32(json.dumps(list(records.values())).encode("utf8"))
    end = time.time()
    print(f"{db_hash} - Hashing AnalysisRecords took {end-start}")
    if db_hash != 2183530944:
        print("THE DB HASH IS UNEXPECTEDLY DIFFERENT")

    start = time.time()
    Data = pd.DataFrame(list(records))
    # BUG FIX: the original loop ran range(1, 5) and never derived the *_5
    # year/month columns, yet embedded_columns() lists sale_year_5,
    # sale_month_5, list_year_5 and list_month_5 — accessing them later
    # would raise KeyError. range(1, 6) covers all five timestamps.
    for x in range(1, 6):
        for kind in ("sale", "list"):
            stamps = Data[f'{kind}_timestamp_{x}']
            Data[f'{kind}_year_{x}'] = stamps.apply(
                lambda ts: datetime.datetime.fromtimestamp(ts).year)
            Data[f'{kind}_month_{x}'] = stamps.apply(
                lambda ts: datetime.datetime.fromtimestamp(ts).month)
    end = time.time()
    print(f"annotating years and months took {end-start}")

    start = time.time()
    print(Data["qc"].unique())
    # Re-map each categorical value to a small 1-based integer id suitable
    # for keras.layers.Embedding (id 0 is reserved/unused, hence the +1 on
    # the vocabulary size below).
    value_count_map = {}
    value_map = {}
    for c in embedded_columns():
        # Compute uniques once per column; the original recomputed
        # Data[c].unique() inside the index loop (O(n^2) per column).
        uniques = Data[c].unique()
        c_map = {val: idx + 1 for idx, val in enumerate(uniques)}
        # Vectorized remap replaces the per-row iterrows()/Data.at loop.
        Data[c] = Data[c].map(c_map)
        value_count_map[c] = len(uniques) + 1
        value_map[c] = c_map
    for c in all_columns():
        Data[c] = Data[c].astype(np.float32)
    Data["sale_price_1"] = Data["sale_price_1"].astype(np.float32)
    print(Data["qc"].unique())
    with open('value_map.pkl', 'wb') as f:
        pickle.dump(value_map, f)
    end = time.time()
    print(f"making value map took {end-start}")

    start = time.time()
    clean_dataset(Data)
    print(f"Using {len(Data)} properties for analysis.")
    data_hash = zlib.adler32(json.dumps(Data.to_json()).encode("utf8"))
    print(f"{data_hash} - data_hash")
    if data_hash != 4202095867:
        print("THE DATA HASH IS UNEXPECTEDLY DIFFERENT")
    value_map_hash = zlib.adler32(json.dumps(list(value_map)).encode("utf8"))
    print(f"{value_map_hash} - value_map_hash")
    if value_map_hash != 4036931971:
        print("THE VALUE MAP HASH IS UNEXPECTEDLY DIFFERENT")

    # One named input per feature: categoricals pass through an Embedding
    # sized to their vocabulary, numerics through a Dense(1, relu).
    inputs = {}
    features = {}
    for ec in embedded_columns():
        inputs[ec] = keras.Input(shape=(None,), name=ec)
        features[ec] = keras.layers.Embedding(
            value_count_map[ec], value_count_map[ec])(inputs[ec])
    for vc in value_columns():
        inputs[vc] = keras.Input(shape=(None, 1), name=vc)
        features[vc] = keras.layers.Dense(1, activation="relu")(inputs[vc])
    x = keras.layers.concatenate([features[key] for key in all_columns()])
    price_pred = keras.layers.Dense(1, name="price")(x)
    model = keras.Model(
        inputs=[inputs[key] for key in all_columns()],
        outputs=[price_pred],
    )
    model.compile(
        optimizer="adam",
        loss="mae",  # alternatives tried: mse, msle
        # steps_per_execution=32
    )
    end = time.time()
    print(f"making model took {end-start}")

    analysis_data = {key: Data[key] for key in all_columns()}
    # Diagnostic: report outlier values only. BUG FIX: the original loop had
    # the condition inverted — `continue` skipped outliers and printed every
    # normal value, flooding stdout with the whole dataset.
    for key in all_columns():
        for value in Data[key]:
            if value > 100000000 or value < 0:
                print(f"{key} {value}")

    model.summary()
    model.fit(
        analysis_data,
        {"price": Data['sale_price_1']},
        epochs=10,
        batch_size=32,
        # verbose=0
    )

    data_to_predict = [analysis_data[key] for key in all_columns()]
    y_pred = model.predict(data_to_predict)
    y_preds = [obj[0][0] for obj in y_pred.round()]
    df = pd.DataFrame({
        'Actual': Data['sale_price_1'],
        'Predicted': y_preds,
        'APN': Data['apn'],
    })
    print(df.head(25))
    return model
# reproducible Keras: pin every RNG the stack consults (Python hash seed,
# random, NumPy, TensorFlow) BEFORE any model work happens, so repeated runs
# produce the same weights. The mid-script imports are deliberate — the
# seeds must be set as each library is brought in.
seed_value = 0
import os
os.environ['PYTHONHASHSEED']=str(seed_value)
import random
random.seed(seed_value)
import numpy as np
np.random.seed(seed_value)
import tensorflow as tf
tf.random.set_seed(seed_value)
# for later versions:
# tf.compat.v1.set_random_seed(seed_value)
# 5. Configure a new global `tensorflow` session
# Single-threaded intra/inter-op execution removes thread-scheduling
# nondeterminism (TF1 compat API).
session_conf = tf.compat.v1.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
sess = tf.compat.v1.Session(graph=tf.compat.v1.get_default_graph(), config=session_conf)
tf.compat.v1.keras.backend.set_session(sess)
# 1067 University ter 00608320
apn_string = "00608320"
# Train on all qualifying records, persist the model, then sanity-check one
# known property.
model = train_model_for_records()
model.save('./real_estate/data/reno-model-embedded')
# NOTE(review): predict_record is neither defined nor imported in this file —
# presumably it lives alongside clean_dataset in real_estate.scripts.utils;
# confirm before running.
prediction, record = predict_record(apn_string, model)
# print(f"\n\nFor target property, predicted {int(prediction[0][0])} vs previous sale {record['sale_price_2']}\n\n")
print(f"\n\nFor target property, predicted {int(prediction[0][0])} vs list {record['list_price_1']} vs previous sale {record['sale_price_2']}\n\n")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment