# francoisstamant/house_prediction.py

## house_prediction.py
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from statsmodels.api import OLS

# Row split used below: the first rows are used for modelling, the last rows
# are set aside in `potential_houses` (not used further in this script).
# NOTE(review): 196 + 4 suggests the CSV holds 200 rows -- confirm.
N_MODELLING_ROWS = 196
N_POTENTIAL_HOUSES = 4

# The CSV is semicolon-separated and must provide at least the columns
# 'Neighborhood', 'House_Id' and 'Price'.
df = pd.read_csv('house_prices.csv', sep=';')

# One-hot encoding of the neighborhood.
# dtype=int keeps the dummies numeric: recent pandas versions emit boolean
# dummies by default, and a frame mixing bool and numeric columns is turned
# into an object array, which statsmodels' OLS refuses to fit.
neighborhoods = pd.get_dummies(df.Neighborhood, prefix='In_', dtype=int)
houses = pd.concat([df, neighborhoods], axis=1)
houses = houses.drop(['Neighborhood', 'House_Id'], axis=1)

data_to_use = houses.head(N_MODELLING_ROWS)
potential_houses = houses.tail(N_POTENTIAL_HOUSES)

# Train-test split of the data: every column except 'Price' is a feature.
x = data_to_use.loc[:, data_to_use.columns != 'Price']
y = data_to_use['Price']

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=123)

### Build models
# Three simple models: a linear regression, a random forest and a gradient
# boosting regressor.

# ----------------------- Linear Regression --------------------------------------
# statsmodels' OLS does not add an intercept by itself and none is added here.
# NOTE(review): the complete set of neighborhood dummies presumably plays that
# role (they sum to 1 per row when no neighborhood is missing) -- confirm.
reg = OLS(y_train, X_train).fit()

# --------------------- Random forest ------------------------------------------------------
regr = RandomForestRegressor(max_depth=5, random_state=1234)
regr.fit(X_train, y_train)

# --------------------- Gradient Boosting ------------------------------------------------------
# Seeded like the random forest so the reported error is reproducible.
reg2 = GradientBoostingRegressor(random_state=1234)
reg2.fit(X_train, y_train)

# --------------------- Evaluation ------------------------------------------------------
# Each fitted model exposes predict(); score all of them the same way on the
# held-out test data. After the loop, `predictions` and `errors` hold the
# gradient boosting values.
for label, model in (
    ('Regression', reg),
    ('Random Forest', regr),
    ('Gradient Boosting', reg2),
):
    # Use the model's predict method on the test data
    predictions = model.predict(X_test)

    # Calculate the absolute errors
    errors = abs(predictions - y_test)

    # Print out the mean absolute error (mae)
    print('Mean Absolute Error of ' + label + ':', round(np.mean(errors), 2), '$.')