Created
June 21, 2020 14:23
-
-
Save ting11222001/743232fe5393830dab0145976a555922 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# import libraries | |
import numpy as np | |
import matplotlib.pyplot as plt | |
import pandas as pd | |
import seaborn as sns | |
%matplotlib inline | |
# Boston housing data, documented at:
# https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_boston.html
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2 -- confirm the installed scikit-learn version before running this cell.
from sklearn.datasets import load_boston

# Load the dataset object that ships with scikit-learn.
boston_dataset = load_boston()

# Show which fields the dataset object exposes, then its description text.
print(boston_dataset.keys())
print(boston_dataset.DESCR)

# Wrap the feature matrix in a DataFrame, using the feature names as column labels.
boston = pd.DataFrame(
    data=boston_dataset.data,
    columns=boston_dataset.feature_names,
)
# Bare expression: previews the first rows when evaluated in a notebook cell.
boston.head()

# Add the target values to the frame as the 'MEDV' column.
boston['MEDV'] = boston_dataset.target

# Bare expression: per-column count of null entries.
boston.isnull().sum()
# Distribution of the target variable MEDV.
# The figure size is set through seaborn's rc settings before plotting.
# NOTE(review): seaborn deprecated distplot in v0.11 (histplot/displot are the
# suggested replacements) -- confirm the installed seaborn version.
sns.set(rc={'figure.figsize': (10, 10)})
sns.distplot(boston['MEDV'])
plt.show()

# Correlation matrix over every column, rounded to two decimals, to look for
# linear relationships between variables.
correlation_matrix = boston.corr().round(2)

# annot=True writes each value inside its heatmap cell.
sns.heatmap(data=correlation_matrix, annot=True)
# Scatter LSTAT and RM against MEDV, one subplot per feature in a single row.
plt.figure(figsize=(20, 5))
features = ['LSTAT', 'RM']
target = boston['MEDV']
for position, col in enumerate(features, start=1):
    # Grid is 1 row x len(features) columns; `position` selects the nth subplot.
    plt.subplot(1, len(features), position)
    # Feature values on the x axis, MEDV on the y axis.
    plt.scatter(boston[col], target, marker='o')
    plt.title(col)
    plt.xlabel(col)
    plt.ylabel('MEDV')
# Features: LSTAT and RM stacked as two columns. Target: MEDV.
X = pd.DataFrame(
    np.column_stack((boston['LSTAT'], boston['RM'])),
    columns=['LSTAT', 'RM'],
)
Y = boston['MEDV']

# Split into train and test sets: test_size=0.2, with random_state fixed at 5.
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=5
)

# Print the shape of each split, in the order train/test features then targets.
for split in (X_train, X_test, Y_train, Y_test):
    print(split.shape)
# Modeling with scikit-learn's LinearRegression.
from sklearn.linear_model import LinearRegression

reg = LinearRegression()

# Fit the linear model on the training split.
reg.fit(X_train, Y_train)

# Predict on the held-out features (bare expression; the result is not stored here).
reg.predict(X_test)

# Score the predictions against the real targets in Y_test.
# score() returns the coefficient of determination R^2; the best possible value is 1.0.
r2_on_test = reg.score(X_test, Y_test)
print('R2: ', r2_on_test)
# Predicted vs. actual target values on the test split.
# Ideally the points would fall on a straight line.
Y_pred = reg.predict(X_test)

# Overall figure size for this plot.
plt.figure(figsize=(8, 5))
plt.scatter(Y_pred, Y_test)
plt.xlabel('Y_pred')
plt.ylabel('Y_test')
plt.show()
# Inspect the fitted model: intercept first, then one coefficient per feature.
reg.intercept_

# One-column table of coefficients, indexed by the training feature names.
coeff_df = pd.DataFrame({'Coefficient': reg.coef_}, index=X_train.columns)
coeff_df
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment