Skip to content

Instantly share code, notes, and snippets.

@abdel1979
Last active May 17, 2019 01:16
Show Gist options
  • Save abdel1979/62dea9c80931210c59a86222864395df to your computer and use it in GitHub Desktop.
Save abdel1979/62dea9c80931210c59a86222864395df to your computer and use it in GitHub Desktop.
Real estate prediction model that predicts a home's price from its number of rooms, area, and location.
import pandas as pd

# Load the raw listings from the CSV in the current working directory.
sf = pd.read_csv('final_data.csv')
# A bare `sf.head()` only renders inside a notebook; print it so the preview
# is also visible when this file runs as a script (same form as used later).
print(sf.head())

# Drop the columns that are not needed, selected by position.
# NOTE(review): these indices depend on the exact column order of the CSV --
# confirm against the file before feeding in a differently shaped export.
sf.drop(sf.columns[[0, 2, 3, 15, 17, 18]], axis=1, inplace=True)

# Display the dataset structure.
sf.info()

# Smallest and largest lastsolddate, then the describe() summary statistics
# (count, mean, std, min, quartiles, max).
# NOTE(review): lastsolddate is not parsed as a date here; if it is stored as
# text, min()/max() compare strings rather than calendar dates -- confirm.
print(sf.lastsolddate.min())
print(sf.lastsolddate.max())
print(sf.describe())
# Histograms of the DataFrame's columns, written to disk instead of shown.
import matplotlib.pyplot as plt

sf.hist(figsize=(20, 15), bins=50)
plt.savefig("attribute_histogram_plots")

# "Map" view: longitude against latitude, coloured by last sold price.
sf.plot.scatter(
    x="longitude",
    y="latitude",
    c="lastsoldprice",
    cmap=plt.get_cmap("jet"),
    colorbar=True,
    alpha=0.4,
    figsize=(10, 7),
    # NOTE(review): presumably kept so the x-axis labels stay visible next to
    # the colorbar -- confirm before removing.
    sharex=False,
)
plt.savefig('map1.png')
# Correlation of the numeric attributes with the sale price.
# On pandas >= 2.0, DataFrame.corr() raises on non-numeric columns instead of
# silently skipping them, and this frame still carries columns such as
# address / neighborhood (presumably text), so restrict to numeric and boolean
# columns explicitly. This is the same column set older pandas used by default.
corr_matrix = sf.select_dtypes(include=["number", "bool"]).corr()
print(corr_matrix["lastsoldprice"].sort_values(ascending=False))

# Pairwise scatter plots for a hand-picked subset of attributes.
from pandas.plotting import scatter_matrix
attributes = ["lastsoldprice", "finishedsqft", "bathrooms", "zindexvalue"]
scatter_matrix(sf[attributes], figsize=(12, 8))
plt.savefig('matrix.png')

# Finished area against sale price on its own figure.
sf.plot(kind="scatter", x="finishedsqft", y="lastsoldprice", alpha=0.5)
plt.savefig('scatter.png')
# Derived feature: sale price per finished square foot.
# NOTE(review): a row with finishedsqft == 0 would produce inf here -- confirm
# the data contains none.
sf['price_per_sqft'] = sf['lastsoldprice'] / sf['finishedsqft']

# Recompute the correlations now that the new column exists. Non-numeric
# columns are excluded explicitly because DataFrame.corr() raises on them on
# pandas >= 2.0 instead of silently dropping them.
corr_matrix = sf.select_dtypes(include=["number", "bool"]).corr()
print(corr_matrix["lastsoldprice"].sort_values(ascending=False))

# Number of distinct neighborhoods (missing values are not counted).
print(sf['neighborhood'].nunique())
# Per-neighborhood statistics: number of listings (non-null addresses) and
# mean price per square foot.
# Select the column before aggregating: calling .mean() on the whole grouped
# frame raises TypeError on pandas >= 2.0 when text columns are present, and
# aggregates every column only to throw all but one away.
by_neighborhood = sf.groupby('neighborhood')
freq = by_neighborhood['address'].count()
mean = by_neighborhood['price_per_sqft'].mean()
cluster = pd.concat([freq, mean], axis=1)
cluster['neighborhood'] = cluster.index
cluster.columns = ['freq', 'price_per_sqft', 'neighborhood']
print(cluster.describe())

# Cut-offs used to split the neighborhoods into three clusters.
# NOTE(review): presumably read off the describe() output above for this
# particular dataset -- confirm before reusing with other data.
PRICE_PER_SQFT_CUTOFF = 756
FREQ_CUTOFF = 123

# Cluster 1: low price per square foot.
cluster1 = cluster[cluster.price_per_sqft < PRICE_PER_SQFT_CUTOFF]
print(cluster1.index)

# Remaining (high price) neighborhoods, split by how many listings they have.
cluster_temp = cluster[cluster.price_per_sqft >= PRICE_PER_SQFT_CUTOFF]
# Cluster 2: high price, few listings.
cluster2 = cluster_temp[cluster_temp.freq < FREQ_CUTOFF]
# Printed (like cluster1 above) because a bare expression shows nothing when
# this runs as a script.
print(cluster2.index)
# Cluster 3: high price, many listings.
cluster3 = cluster_temp[cluster_temp.freq >= FREQ_CUTOFF]
print(cluster3.index)
def get_group(x):
    """Return the price/frequency group label for neighborhood *x*.

    Looks *x* up in the indexes of the module-level ``cluster1`` and
    ``cluster2`` frames.

    Returns:
        ``'low_price'`` if *x* is in ``cluster1.index``,
        ``'high_price_low_freq'`` if it is in ``cluster2.index``,
        ``'high_price_high_freq'`` for anything else (including values that
        appear in neither cluster).
    """
    if x in cluster1.index:
        return 'low_price'
    if x in cluster2.index:
        return 'high_price_low_freq'
    return 'high_price_high_freq'
# Tag every row with the group label of its neighborhood.
sf['group'] = sf['neighborhood'].apply(get_group)

# Positional drop of columns that are no longer needed.
# NOTE(review): the indices depend on the current column order, and the
# explicit column selection just below looks like it makes this step
# redundant -- confirm before relying on either.
sf = sf.drop(columns=sf.columns[[0, 4, 6, 7, 8, 13]])

# Keep only the model inputs plus the target, in a fixed order.
feature_columns = [
    'bathrooms',
    'bedrooms',
    'finishedsqft',
    'totalrooms',
    'usecode',
    'yearbuilt',
    'zindexvalue',
    'group',
]
sf = sf[feature_columns + ['lastsoldprice']]
print(sf.head())

# Inputs and target.
X = sf[feature_columns]
Y = sf['lastsoldprice']

# One-hot encode the two categorical columns, append the indicator columns
# (group first, then usecode) and remove the raw categorical columns.
n = pd.get_dummies(sf.group)
m = pd.get_dummies(sf.usecode)
drops = ['group', 'usecode']
X = pd.concat([X, n, m], axis=1).drop(columns=drops)
print(X.head())
# Linear regression: split the data, holding out 30% of the rows for testing;
# the fixed random_state makes the split reproducible.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=0,
)
# to be continued ....
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment