Last active
May 17, 2019 01:16
-
-
Save abdel1979/62dea9c80931210c59a86222864395df to your computer and use it in GitHub Desktop.
Real-estate prediction model that predicts home price based on the number of rooms, area, and location.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd

# Load the San Francisco housing dataset and take a first look at it.
sf = pd.read_csv('final_data.csv')
sf.head()

# Drop columns that are not needed for the analysis (selected by position).
unneeded = sf.columns[[0, 2, 3, 15, 17, 18]]
sf.drop(unneeded, axis=1, inplace=True)

# Show the dataset structure: column dtypes and non-null counts.
sf.info()

# Earliest/latest sold dates, plus summary statistics
# (min, max, mean, std, quartiles) for the numeric columns.
print(sf.lastsolddate.min())
print(sf.lastsolddate.max())
print(sf.describe())
# Histograms of every numeric attribute, saved to disk.
import matplotlib.pyplot as plt

sf.hist(bins=50, figsize=(20, 15))
plt.savefig("attribute_histogram_plots")
#plt.show()

# Geographic scatter plot: each sale positioned by longitude/latitude
# and coloured by its last sold price.
sf.plot(
    kind="scatter", x="longitude", y="latitude", alpha=0.4,
    figsize=(10, 7), c="lastsoldprice", cmap=plt.get_cmap("jet"),
    colorbar=True, sharex=False,
)
plt.savefig('map1.png')
# Correlation of every numeric attribute with the sale price.
# numeric_only=True keeps the non-numeric columns (sold dates, addresses,
# neighborhood names) out of the computation; pandas >= 2.0 would otherwise
# raise instead of silently dropping them as older versions did.
corr_matrix = sf.corr(numeric_only=True)
print(corr_matrix["lastsoldprice"].sort_values(ascending=False))

# Scatter matrix of the attributes most correlated with the price.
from pandas.plotting import scatter_matrix
attributes = ["lastsoldprice", "finishedsqft", "bathrooms", "zindexvalue"]
scatter_matrix(sf[attributes], figsize=(12, 8))
plt.savefig('matrix.png')

# Price vs. finished square footage on its own.
sf.plot(kind="scatter", x="finishedsqft", y="lastsoldprice", alpha=0.5)
plt.savefig('scatter.png')

# Derived feature: price per square foot of finished area.
sf['price_per_sqft'] = sf['lastsoldprice'] / sf['finishedsqft']

# Recompute the correlations with the new feature included.
corr_matrix = sf.corr(numeric_only=True)
print(corr_matrix["lastsoldprice"].sort_values(ascending=False))
# Number of distinct neighborhoods in the dataset.
print(len(sf['neighborhood'].value_counts()))

# Per-neighborhood sale count and average price per square foot.
# Selecting the column before aggregating avoids running count()/mean()
# over every column (mean() on non-numeric columns raises on pandas >= 2.0)
# and is also cheaper.
freq = sf.groupby('neighborhood')['address'].count()
mean = sf.groupby('neighborhood')['price_per_sqft'].mean()
cluster = pd.concat([freq, mean], axis=1)
cluster['neighborhood'] = cluster.index
cluster.columns = ['freq', 'price_per_sqft', 'neighborhood']
print(cluster.describe())

# Split neighborhoods into three price/volume groups:
#   cluster1 - low price per sqft
#   cluster2 - high price per sqft, few recorded sales
#   cluster3 - high price per sqft, many recorded sales
# NOTE(review): the 756 and 123 thresholds look hand-picked from the
# cluster.describe() output above — confirm against the actual quartiles.
cluster1 = cluster[cluster.price_per_sqft < 756]
print(cluster1.index)
cluster_temp = cluster[cluster.price_per_sqft >= 756]
cluster2 = cluster_temp[cluster_temp.freq < 123]
# Originally a bare expression (a no-op in a script); print it so the
# output is consistent with cluster1 above.
print(cluster2.index)
cluster3 = cluster_temp[cluster_temp.freq >= 123]
print(cluster3.index)
def get_group(x):
    """Return the cluster label for neighborhood *x*.

    Labels are 'low_price', 'high_price_low_freq', or
    'high_price_high_freq', based on membership in the cluster1/cluster2
    index sets built above; anything not in either set falls into the
    high-price/high-frequency group.
    """
    lookup = (
        (cluster1.index, 'low_price'),
        (cluster2.index, 'high_price_low_freq'),
    )
    for members, label in lookup:
        if x in members:
            return label
    return 'high_price_high_freq'
# Attach the cluster label to every row, then keep only the modelling columns.
sf['group'] = sf['neighborhood'].apply(get_group)
sf.drop(sf.columns[[0, 4, 6, 7, 8, 13]], axis=1, inplace=True)
sf = sf[['bathrooms', 'bedrooms', 'finishedsqft', 'totalrooms', 'usecode',
         'yearbuilt', 'zindexvalue', 'group', 'lastsoldprice']]
print(sf.head())

# Feature matrix and target vector.
X = sf[['bathrooms', 'bedrooms', 'finishedsqft', 'totalrooms', 'usecode',
        'yearbuilt', 'zindexvalue', 'group']]
Y = sf['lastsoldprice']

# One-hot encode the two categorical columns, then drop the originals.
X = pd.concat([X, pd.get_dummies(sf.group)], axis=1)
X = pd.concat([X, pd.get_dummies(sf.usecode)], axis=1)
X.drop(['group', 'usecode'], inplace=True, axis=1)
print(X.head())

# Linear regression: 70/30 train/test split with a fixed seed for
# reproducibility.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.3, random_state=0)
# to be continued ....
Sign up for free to join this conversation on GitHub.
Already have an account? Sign in to comment.