Created
October 11, 2022 15:53
-
-
Save Pratik-Shukla-22/91768ad5c6ca5352da75c1bc95cd6769 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#Downloading the data from GitHub: | |
!wget https://raw.githubusercontent.com/Pratik-Shukla-22/Simple-Linear-Regression/main/Fuel_Consumption.csv | |
#Import the required libraries: | |
import numpy as np | |
import pandas as pd | |
import matplotlib.pyplot as plt | |
# Load the fuel-consumption dataset:
data = pd.read_csv("Fuel_Consumption.csv")
data.head()

# Inspect the available columns:
data.columns

# Check column dtypes and confirm there are no null entries:
data.info()

# Keep only the feature and target used for prediction:
data = data[["ENGINESIZE", "CO2EMISSIONS"]]
data.head()

# Scatter plot of engine size against CO2 emissions:
plt.scatter(data["ENGINESIZE"], data["CO2EMISSIONS"])
plt.title("ENGINESIZE VS CO2EMISSIONS")
plt.xlabel("Enginesize")
plt.ylabel("Emission")
plt.show()

# Split the rows 80% / 20% into train and test partitions:
num = int(len(data) * 0.8)
train = data[:num]
test = data[num:]
print("Data: ", len(data))
print("Train: ", len(train))
print("Test: ", len(test))
#Main function to find the coefficients of line:
def simple_linear_regression(input_feature, output):
    """Fit a least-squares line to (input_feature, output) pairs.

    Uses the closed-form normal equations:
        slope     = (n*Sum(xy) - Sum(x)*Sum(y)) / (n*Sum(x^2) - Sum(x)^2)
        intercept = mean(y) - slope * mean(x)

    Returns the pair (slope, intercept).
    """
    x = input_feature
    y = output
    n = len(x)
    sum_x = x.sum()
    sum_y = y.sum()
    # Numerator and denominator of the slope formula (same operation
    # order as the textbook form, so results are float-identical):
    numerator = ((x * y).sum()) * n - sum_x * sum_y
    denominator = ((x * x).sum()) * n - sum_x * sum_x
    slope = numerator / denominator
    # The fitted line always passes through the mean point (x-bar, y-bar):
    intercept = y.mean() - slope * x.mean()
    return slope, intercept
# Sanity-check the fit on a perfectly linear dummy series (y == x, so the
# expected result is slope 1 and intercept 0):
dummy_input = np.array((1, 2, 3, 4, 5))
dummy_output = np.array((1, 2, 3, 4, 5))
dummy_slope, dummy_intercept = simple_linear_regression(dummy_input, dummy_output)
print("Slope : ", dummy_slope)
print("Intercept :", dummy_intercept)

# Visualize the dummy points together with the identity line:
plt.scatter([1, 2, 3, 4, 5], [1, 2, 3, 4, 5])
plt.plot([1, 2, 3, 4, 5], [1, 2, 3, 4, 5], color="red")
plt.show()

# Fit the model on the training split to get the real coefficients:
actual_slope, actual_intercept = simple_linear_regression(train["ENGINESIZE"], train["CO2EMISSIONS"])
print("Slope: ", actual_slope)
print("Intercept: ", actual_intercept)
def get_regression_prediction(input_features, slope, intercept):
    """Predict output values on the line y = slope * x + intercept.

    input_features may be a scalar or an array-like (numpy array / pandas
    Series); the arithmetic broadcasts element-wise in either case.

    Bug fix: the original body ignored the ``slope`` and ``intercept``
    parameters and read the module-level globals ``actual_slope`` /
    ``actual_intercept`` instead, so calls with different coefficients
    silently returned wrong values.
    """
    predicted_value = slope * input_features + intercept
    return predicted_value
# Predict the emission for a single engine size:
my_engine_size = 5
estimated_emission = get_regression_prediction(my_engine_size, actual_slope, actual_intercept)
print("Estimated Emission: ", estimated_emission)

# Predict for every row of the dataset:
y_pred = get_regression_prediction(data["ENGINESIZE"], actual_slope, actual_intercept)
y_pred

# Regression line over the test split:
plt.scatter(test["ENGINESIZE"], test["CO2EMISSIONS"])
plt.plot(test["ENGINESIZE"], actual_slope * test["ENGINESIZE"] + actual_intercept, color="red")
plt.xlabel("Engine_Size")
plt.ylabel("Emission")
plt.show()

# Regression line over the training split:
plt.scatter(train["ENGINESIZE"], train["CO2EMISSIONS"])
plt.plot(train["ENGINESIZE"], actual_slope * train["ENGINESIZE"] + actual_intercept, color="red")
plt.xlabel("Engine_Size")
plt.ylabel("Emission")
plt.show()

# Regression line over the complete dataset:
plt.scatter(data["ENGINESIZE"], data["CO2EMISSIONS"])
plt.plot(data["ENGINESIZE"], actual_slope * data["ENGINESIZE"] + actual_intercept, color="red")
plt.xlabel("Engine_Size")
plt.ylabel("Emission")
plt.show()

# Side-by-side table of actual vs. predicted emission values:
A_P_data = pd.DataFrame({"Actual": data["CO2EMISSIONS"], "Predicted": y_pred})
print(A_P_data.head())

# Bar chart comparing the first ten actual/predicted pairs:
A_P_data.head(10).plot(kind='bar', figsize=(12, 6))
plt.show()
#Error calculation using Residual Sum of Squares:
def residual_sum_of_squares(input_feature, output, slope, intercept):
    """Return the residual sum of squares (RSS) of the given line.

    RSS = sum((y_i - (slope * x_i + intercept))^2) — lower is a better fit.
    """
    fitted = slope * input_feature + intercept
    errors = output - fitted
    return (errors * errors).sum()
# Report the prediction error (RSS) on the held-out test split:
print("RSS : ", residual_sum_of_squares(test["ENGINESIZE"], test["CO2EMISSIONS"], actual_slope, actual_intercept))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment