## martingaido/house-prices.py

## house-prices.py
## Predict House prices using linear regression
## Dataset: https://www.kaggle.com/rubenssjr/brasilian-houses-to-rent

import pandas as pd
from sklearn import preprocessing, linear_model
import numpy as np
import sklearn

### Configuration ###
CSV_PATH = 'houses_to_rent.csv'
TARGET_COLUMN = 'rent amount'
FEATURE_COLUMNS = ['city', 'rooms', 'bathroom', 'parking spaces', 'fire insurance', 'furniture']
# Columns stored in the CSV as price strings (e.g. 'R$1,200') rather than numbers.
CURRENCY_COLUMNS = ['rent amount', 'fire insurance']


def banner(title):
    """Print *title* framed above and below by a 30-character dashed rule."""
    rule = '-' * 30
    print(rule)
    print(title)
    print(rule)


def parse_currency(text):
    """Convert a price string such as 'R$1,200' into the integer 1200.

    The 'R$' prefix is stripped only when it is actually present, so a value
    without the prefix is not silently truncated by a blind two-character slice.
    Thousands separators (commas) are removed before conversion.

    Raises:
        ValueError: if what remains is not an integer literal.
    """
    digits = str(text).strip()
    if digits.startswith('R$'):
        digits = digits[2:]
    return int(digits.replace(',', ''))


### Loading Data ###
def load_data(path=CSV_PATH):
    """Read the CSV at *path* and keep only the feature and target columns."""
    frame = pd.read_csv(path, sep=',')
    # .copy() detaches the selection from the full frame so the column
    # assignments done later do not raise pandas' SettingWithCopyWarning.
    return frame[FEATURE_COLUMNS + [TARGET_COLUMN]].copy()


### Process Data ###
def encode_columns(data):
    """Turn the price and furniture columns of *data* into numbers.

    *data* is modified in place and also returned.
    """
    for column in CURRENCY_COLUMNS:
        # na_action='ignore' passes missing cells through as NaN instead of
        # handing them to parse_currency; those rows are dropped later by
        # drop_null_rows.
        data[column] = data[column].map(parse_currency, na_action='ignore')
    # LabelEncoder assigns one integer code per distinct label.
    # NOTE(review): this runs before null rows are dropped, so missing
    # furniture labels reach the encoder -- confirm that is acceptable.
    encoder = preprocessing.LabelEncoder()
    data['furniture'] = encoder.fit_transform(data['furniture'])
    return data


def drop_null_rows(data):
    """Print null counts, drop every row containing a null, print counts again.

    Returns the filtered frame; the price columns are cast back to int64
    because a column that held NaN is stored as float.
    """
    print(data.isnull().sum())  # null count per column before dropping
    data = data.dropna()  # drop (not replace) rows with any null value
    print(data.isnull().sum())  # null count per column after dropping
    return data.astype({column: 'int64' for column in CURRENCY_COLUMNS})


### Split Data ###
def split_features_target(data):
    """Return ``(x, y)`` arrays: every column except the target, and the target."""
    # `columns=` replaces the positional axis argument (`drop([...], 1)`),
    # which pandas 2.0 no longer accepts.
    x = np.array(data.drop(columns=[TARGET_COLUMN]))
    y = np.array(data[TARGET_COLUMN])
    return x, y


### Evaluation ###
def report_predictions(y_test, predictions):
    """Print one line per test sample and return the errors (actual - predicted).

    Returns:
        numpy array of ``y_test[i] - predictions[i]`` for every sample.
    """
    errors = np.asarray(y_test) - np.asarray(predictions)
    for actual, predicted, error in zip(y_test, predictions, errors):
        print(f'Original Value: {actual} - Prediction Value: {int(predicted)} - Error: {int(error)}')
    return errors


def main():
    """Run the pipeline: load, clean, split, train, and report on the test set."""
    # Imported explicitly: `import sklearn` alone does not guarantee that the
    # model_selection submodule has been loaded.
    from sklearn.model_selection import train_test_split

    banner(' Importing Data ...')
    data = load_data()
    print(data.head())

    banner(' Processing Data ...')
    data = encode_columns(data)
    print(data.head())

    banner(' Checking Null Data...')
    data = drop_null_rows(data)

    banner(' Header Information ')
    print(data.head())

    banner(' Split Data ')
    x, y = split_features_target(data)
    print('X', x.shape)  # instances, features
    print('Y', y.shape)

    # 20% testing, 80% training; fixed seed keeps the split reproducible.
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=100)
    print('XTrain ', x_train.shape)
    print('XTest  ', x_test.shape)

    ### Training ###
    banner(' Training Model ... ')
    model = linear_model.LinearRegression()
    model.fit(x_train, y_train)
    # For a regressor, score() is the coefficient of determination (R^2),
    # not a classification accuracy; the printed label is kept as-is.
    r_squared = model.score(x_test, y_test)
    print('Coefficient: ', model.coef_)
    print('Intercept: ', model.intercept_)
    print('Accuracy: ', round(r_squared * 100, 3), '%')

    banner(' Manual Testing ... ')
    predictions = model.predict(x_test)
    print(f'Values: {predictions.shape}')
    report_predictions(y_test, predictions)


if __name__ == '__main__':
    main()