Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save Pratik-Shukla-22/91768ad5c6ca5352da75c1bc95cd6769 to your computer and use it in GitHub Desktop.
Save Pratik-Shukla-22/91768ad5c6ca5352da75c1bc95cd6769 to your computer and use it in GitHub Desktop.
#Downloading the data from GitHub:
#Plain-Python replacement for the notebook-only "!wget" shell escape, so the
#script also runs outside IPython and on machines without a wget binary.
#urlretrieve writes to the given file name, replacing any previous copy.
import urllib.request
urllib.request.urlretrieve(
    "https://raw.githubusercontent.com/Pratik-Shukla-22/Simple-Linear-Regression/main/Fuel_Consumption.csv",
    "Fuel_Consumption.csv",
)
#Import the required libraries:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
#Load the downloaded csv into a DataFrame and preview the first rows:
data = pd.read_csv("Fuel_Consumption.csv")
data.head()
#Column labels present in the dataset:
data.columns
#Per-column dtype and non-null summary:
data.info()
#Keep only the two columns used below: the predictor and the target.
data = data.loc[:, ["ENGINESIZE", "CO2EMISSIONS"]]
data.head()
#Scatter plot of engine size (x) against CO2 emissions (y):
plt.scatter(data["ENGINESIZE"], data["CO2EMISSIONS"])
plt.title("ENGINESIZE VS CO2EMISSIONS")
plt.xlabel("Enginesize")
plt.ylabel("Emission")
plt.show()
#Row position where the data is cut: first 80% for training, the rest for testing.
#NOTE(review): the split is sequential (no shuffling); confirm the csv row
#order is not correlated with the target.
num = int(0.8 * len(data))
#Training rows (positions 0 .. num-1):
train = data.iloc[:num]
#Testing rows (positions num .. end):
test = data.iloc[num:]
print ("Data: ", len(data))
print ("Train: ", len(train))
print ("Test: ", len(test))
#Main function to find the coefficients of line:
def simple_linear_regression(input_feature,output):
    """Fit the least-squares line ``output = slope * input_feature + intercept``.

    Uses the closed-form formulas

        slope     = (n*sum(x*y) - sum(x)*sum(y)) / (n*sum(x*x) - sum(x)**2)
        intercept = mean(y) - slope * mean(x)

    Args:
        input_feature: Array-like of x values (list, tuple, NumPy array,
            pandas Series, ...). Flattened to one dimension.
        output: Array-like of y values with the same number of elements.
            Values are paired with ``input_feature`` by position.

    Returns:
        Tuple ``(slope, intercept)``.

    Raises:
        ValueError: If the two inputs do not hold the same number of values.

    Note:
        When every x value is identical the denominator is (numerically)
        zero and the slope is undefined; no special handling is done here.
    """
    # Convert once so plain Python sequences work too, not only objects
    # that provide .mean()/.sum() themselves.
    Xi = np.asarray(input_feature, dtype=float).ravel()
    Yi = np.asarray(output, dtype=float).ravel()
    if Xi.size != Yi.size:
        raise ValueError(
            "input_feature and output must contain the same number of values"
        )
    #Total number of data points:
    n = Xi.size
    #X bar and Y bar:
    Xi_mean = Xi.mean()
    Yi_mean = Yi.mean()
    #Sum of X and sum of Y:
    S_Xi = Xi.sum()
    S_Yi = Yi.sum()
    #Sum of (X*Y) multiplied by n:
    S_XiYi = (Xi*Yi).sum()*n
    #Sum of X*Sum of Y:
    S_Xi_S_Yi = S_Xi*S_Yi
    #Sum of (X*X) multiplied by n:
    S_XiXi = (Xi*Xi).sum()*n
    #Square of sum of X:
    S_Xi_Square = S_Xi*S_Xi
    #Slope:
    slope = (S_XiYi - S_Xi_S_Yi) / (S_XiXi - S_Xi_Square)
    #Intercept:
    intercept = Yi_mean - slope * Xi_mean
    return slope,intercept
#Check the function with dummy data:
#The points lie exactly on y = x, so the fit should give slope 1 and intercept 0.
dummy_input = np.array((1,2,3,4,5))
dummy_output = np.array((1,2,3,4,5))
dummy_slope,dummy_intercept = simple_linear_regression(dummy_input,dummy_output)
print ("Slope : " , dummy_slope)
print ("Intercept :",dummy_intercept)
#Graph for dummy data:
#The red line is drawn from the computed coefficients (not hard-coded points),
#so the plot actually reflects what the function returned.
plt.scatter(dummy_input,dummy_output)
plt.plot(dummy_input,dummy_slope*dummy_input+dummy_intercept,color="red")
plt.show()
#Training the model with train data:
#Finding the coefficients of best fit line:
actual_slope,actual_intercept = simple_linear_regression(train["ENGINESIZE"],train["CO2EMISSIONS"])
print ("Slope: " ,actual_slope)
print ("Intercept: " ,actual_intercept)
#Define the prediction function:
def get_regression_prediction(input_features,slope,intercept):
    """Evaluate the line ``slope * input_features + intercept``.

    Args:
        input_features: A single x value or any object supporting
            multiplication by ``slope`` and addition of ``intercept``
            (e.g. a NumPy array or pandas Series), evaluated element-wise.
        slope: Slope of the fitted line.
        intercept: Intercept of the fitted line.

    Returns:
        The predicted value(s), in the same form as ``input_features``.
    """
    # Use the coefficients passed by the caller rather than the module-level
    # fit results, so the function works for any fitted line.
    predicted_value = slope*input_features + intercept
    return predicted_value
#Predict the emission for one example engine size:
my_engine_size = 5
estimated_emission = get_regression_prediction(my_engine_size, actual_slope, actual_intercept)
print ("Estimated Emission: ", estimated_emission)
#Predict emissions for every row of the dataset (training and testing rows alike):
y_pred = get_regression_prediction(data["ENGINESIZE"], actual_slope, actual_intercept)
y_pred
#Draw the regression line over the test, training and complete data, in that order:
for subset in (test, train, data):
    engine_sizes = subset["ENGINESIZE"]
    plt.scatter(engine_sizes, subset["CO2EMISSIONS"])
    plt.plot(engine_sizes, actual_slope*engine_sizes + actual_intercept, color="red")
    plt.xlabel("Engine_Size")
    plt.ylabel("Emission")
    plt.show()
#Put the actual and the predicted emissions side by side:
A_P_data = pd.DataFrame({"Actual": data["CO2EMISSIONS"], "Predicted": y_pred})
print(A_P_data.head())
#Bar graph comparing actual and predicted values for the first 10 rows:
first_rows = A_P_data.head(10)
first_rows.plot.bar(figsize=(12, 6))
plt.show()
#Error calculation using Residual Sum of Squares:
def residual_sum_of_squares(input_feature,output,slope,intercept):
    """Return the residual sum of squares of a line on the given data.

    Computes ``sum((output - (slope * input_feature + intercept)) ** 2)``.

    Args:
        input_feature: x value(s); a scalar or any array-like (list, tuple,
            NumPy array, pandas Series, ...).
        output: Observed y value(s), paired with ``input_feature`` by position.
        slope: Slope of the line being evaluated.
        intercept: Intercept of the line being evaluated.

    Returns:
        The sum of squared residuals as a float scalar.
    """
    # Convert once so plain sequences and scalars work too, not only
    # objects whose product has a .sum() method.
    x = np.asarray(input_feature, dtype=float)
    y = np.asarray(output, dtype=float)
    prediction = slope*x + intercept
    residual = y - prediction
    RSS = (residual*residual).sum()
    return RSS
#Residual sum of squares of the fitted line on the testing rows:
test_rss = residual_sum_of_squares(test["ENGINESIZE"], test["CO2EMISSIONS"], actual_slope, actual_intercept)
print ("RSS : ", test_rss)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment