Created
August 25, 2020 12:58
-
-
Save martingaido/fe74fd2f0951733d132a852f2293f3f3 to your computer and use it in GitHub Desktop.
Predict House prices using linear regression
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## Predict House prices using linear regression
## Dataset: https://www.kaggle.com/rubenssjr/brasilian-houses-to-rent
import pandas as pd
from sklearn import preprocessing, linear_model
import numpy as np
# Import the submodule explicitly: a bare `import sklearn` does not guarantee
# that `sklearn.model_selection` has been loaded.
import sklearn.model_selection

# Columns whose values arrive as currency strings and must be parsed to ints.
CURRENCY_COLUMNS = ['rent amount', 'fire insurance']


def _banner(title):
    """Print a section title framed by two 30-character dashed rules."""
    rule = '-' * 30
    print(rule)
    print(title)
    print(rule)


def _parse_brl(value):
    """Convert a currency string such as 'R$1,200' to the int 1200."""
    # Drop the two-character 'R$' prefix, then the thousands separators.
    return int(value[2:].replace(',', ''))


### Loading Data ###
_banner(' Importing Data ...')
data = pd.read_csv('houses_to_rent.csv', sep=',')
data = data[['city', 'rooms', 'bathroom', 'parking spaces', 'fire insurance', 'furniture', 'rent amount']]
print(data.head())

### Process Data ###
_banner(' Processing Data ...')
# A missing currency value (NaN) cannot be sliced like a string, so rows
# lacking one are discarded before parsing instead of crashing the map below.
data = data.dropna(subset=CURRENCY_COLUMNS)
for column in CURRENCY_COLUMNS:
    data[column] = data[column].map(_parse_brl)  # take out R$ and commas
le = preprocessing.LabelEncoder()
# Encode the category labels as integers (0/1 when there are two categories).
data['furniture'] = le.fit_transform(data['furniture'])
print(data.head())

_banner(' Checking Null Data...')
print(data.isnull().sum())  # null count per column before cleaning
data = data.dropna()  # drop every row that still contains a null value
print(data.isnull().sum())  # null count per column after cleaning

_banner(' Header Information ')
print(data.head())

### Split Data ###
_banner(' Split Data ')
# `columns=` replaces the positional axis argument, which pandas 2.0 removed.
x = np.array(data.drop(columns=['rent amount']))
y = np.array(data['rent amount'])
print('X', x.shape)  # instances, features
print('Y', y.shape)
xTrain, xTest, yTrain, yTest = sklearn.model_selection.train_test_split(
    x, y, test_size=0.2, random_state=100
)  # 20% testing, 80% training
print('XTrain ', xTrain.shape)
print('XTest ', xTest.shape)

### Training ###
_banner(' Training Model ... ')
model = linear_model.LinearRegression()
model.fit(xTrain, yTrain)
# NOTE: for a regressor, score() is the coefficient of determination (R^2),
# not a classification accuracy; the printed label is kept as-is.
accuracy = model.score(xTest, yTest)
print('Coefficient: ', model.coef_)
print('Intercept: ', model.intercept_)
print('Accuracy: ', round(accuracy * 100, 3), '%')

### Evaluation ###
_banner(' Manual Testing ... ')
testVals = model.predict(xTest)
print(f'Values: {testVals.shape}')
# Residual (actual - predicted) for each test sample, in test-set order.
error = [actual - predicted for actual, predicted in zip(yTest, testVals)]
for actual, predicted, residual in zip(yTest, testVals, error):
    print(f'Original Value: {actual} - Prediction Value: {int(predicted)} - Error: {int(residual)}')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment