Last active
May 17, 2019 01:16
-
-
Save abdel1979/62dea9c80931210c59a86222864395df to your computer and use it in GitHub Desktop.
Real-estate prediction model that predicts home price based on the number of rooms, area, and location.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd

# Load the San Francisco housing dataset and take a first look at it.
sf = pd.read_csv('final_data.csv')
sf.head()

# Drop columns that are not needed for the analysis (selected by position).
unneeded = sf.columns[[0, 2, 3, 15, 17, 18]]
sf.drop(unneeded, axis=1, inplace=True)

# Show the dataset structure: column dtypes and non-null counts.
sf.info()

# Earliest/latest sold dates, plus summary statistics
# (min, max, mean, std, quartiles) for the numeric columns.
print(sf.lastsolddate.min())
print(sf.lastsolddate.max())
print(sf.describe())
# Histograms of every numeric attribute, saved to disk.
import matplotlib.pyplot as plt

sf.hist(bins=50, figsize=(20, 15))
plt.savefig("attribute_histogram_plots")
#plt.show()

# Geographic scatter plot: each sale positioned by longitude/latitude
# and coloured by its last sold price.
sf.plot(
    kind="scatter", x="longitude", y="latitude", alpha=0.4,
    figsize=(10, 7), c="lastsoldprice", cmap=plt.get_cmap("jet"),
    colorbar=True, sharex=False,
)
plt.savefig('map1.png')
# Correlation of every numeric attribute with the sale price.
# numeric_only=True keeps the non-numeric columns (sold dates, addresses,
# neighborhood names) out of the computation; pandas >= 2.0 would otherwise
# raise instead of silently dropping them as older versions did.
corr_matrix = sf.corr(numeric_only=True)
print(corr_matrix["lastsoldprice"].sort_values(ascending=False))

# Scatter matrix of the attributes most correlated with the price.
from pandas.plotting import scatter_matrix
attributes = ["lastsoldprice", "finishedsqft", "bathrooms", "zindexvalue"]
scatter_matrix(sf[attributes], figsize=(12, 8))
plt.savefig('matrix.png')

# Price vs. finished square footage on its own.
sf.plot(kind="scatter", x="finishedsqft", y="lastsoldprice", alpha=0.5)
plt.savefig('scatter.png')

# Derived feature: price per square foot of finished area.
sf['price_per_sqft'] = sf['lastsoldprice'] / sf['finishedsqft']

# Recompute the correlations with the new feature included.
corr_matrix = sf.corr(numeric_only=True)
print(corr_matrix["lastsoldprice"].sort_values(ascending=False))
# Number of distinct neighborhoods in the dataset.
print(len(sf['neighborhood'].value_counts()))

# Per-neighborhood sale count and average price per square foot.
# Selecting the column before aggregating avoids running count()/mean()
# over every column (mean() on non-numeric columns raises on pandas >= 2.0)
# and is also cheaper.
freq = sf.groupby('neighborhood')['address'].count()
mean = sf.groupby('neighborhood')['price_per_sqft'].mean()
cluster = pd.concat([freq, mean], axis=1)
cluster['neighborhood'] = cluster.index
cluster.columns = ['freq', 'price_per_sqft', 'neighborhood']
print(cluster.describe())

# Split neighborhoods into three price/volume groups:
#   cluster1 - low price per sqft
#   cluster2 - high price per sqft, few recorded sales
#   cluster3 - high price per sqft, many recorded sales
# NOTE(review): the 756 and 123 thresholds look hand-picked from the
# cluster.describe() output above — confirm against the actual quartiles.
cluster1 = cluster[cluster.price_per_sqft < 756]
print(cluster1.index)
cluster_temp = cluster[cluster.price_per_sqft >= 756]
cluster2 = cluster_temp[cluster_temp.freq < 123]
# Originally a bare expression (a no-op in a script); print it so the
# output is consistent with cluster1 above.
print(cluster2.index)
cluster3 = cluster_temp[cluster_temp.freq >= 123]
print(cluster3.index)
def get_group(x):
    """Return the cluster label for neighborhood *x*.

    Labels are 'low_price', 'high_price_low_freq', or
    'high_price_high_freq', based on membership in the cluster1/cluster2
    index sets built above; anything not in either set falls into the
    high-price/high-frequency group.
    """
    lookup = (
        (cluster1.index, 'low_price'),
        (cluster2.index, 'high_price_low_freq'),
    )
    for members, label in lookup:
        if x in members:
            return label
    return 'high_price_high_freq'
# Attach the cluster label to every row, then keep only the modelling columns.
sf['group'] = sf['neighborhood'].apply(get_group)
sf.drop(sf.columns[[0, 4, 6, 7, 8, 13]], axis=1, inplace=True)
sf = sf[['bathrooms', 'bedrooms', 'finishedsqft', 'totalrooms', 'usecode',
         'yearbuilt', 'zindexvalue', 'group', 'lastsoldprice']]
print(sf.head())

# Feature matrix and target vector.
X = sf[['bathrooms', 'bedrooms', 'finishedsqft', 'totalrooms', 'usecode',
        'yearbuilt', 'zindexvalue', 'group']]
Y = sf['lastsoldprice']

# One-hot encode the two categorical columns, then drop the originals.
X = pd.concat([X, pd.get_dummies(sf.group)], axis=1)
X = pd.concat([X, pd.get_dummies(sf.usecode)], axis=1)
X.drop(['group', 'usecode'], inplace=True, axis=1)
print(X.head())

# Linear regression: 70/30 train/test split with a fixed seed for
# reproducibility.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.3, random_state=0)
# to be continued ....
Sign up for free to join this conversation on GitHub.
Already have an account? Sign in to comment.