Created
October 11, 2022 15:53
-
-
Save Pratik-Shukla-22/91768ad5c6ca5352da75c1bc95cd6769 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#Downloading the data from GitHub: | |
!wget https://raw.githubusercontent.com/Pratik-Shukla-22/Simple-Linear-Regression/main/Fuel_Consumption.csv | |
#Import the required libraries: | |
import numpy as np | |
import pandas as pd | |
import matplotlib.pyplot as plt | |
# Load the fuel-consumption dataset:
data = pd.read_csv("Fuel_Consumption.csv")
data.head()

# Inspect the available columns:
data.columns

# Check column dtypes and confirm there are no null entries:
data.info()

# Keep only the feature and target used for prediction:
data = data[["ENGINESIZE", "CO2EMISSIONS"]]
data.head()

# Scatter plot of engine size against CO2 emissions:
plt.scatter(data["ENGINESIZE"], data["CO2EMISSIONS"])
plt.title("ENGINESIZE VS CO2EMISSIONS")
plt.xlabel("Enginesize")
plt.ylabel("Emission")
plt.show()

# Split the rows 80% / 20% into train and test partitions:
num = int(len(data) * 0.8)
train = data[:num]
test = data[num:]
print("Data: ", len(data))
print("Train: ", len(train))
print("Test: ", len(test))
#Main function to find the coefficients of line:
def simple_linear_regression(input_feature, output):
    """Fit a least-squares line to (input_feature, output) pairs.

    Uses the closed-form normal equations:
        slope     = (n*Sum(xy) - Sum(x)*Sum(y)) / (n*Sum(x^2) - Sum(x)^2)
        intercept = mean(y) - slope * mean(x)

    Returns the pair (slope, intercept).
    """
    x = input_feature
    y = output
    n = len(x)
    sum_x = x.sum()
    sum_y = y.sum()
    # Numerator and denominator of the slope formula (same operation
    # order as the textbook form, so results are float-identical):
    numerator = ((x * y).sum()) * n - sum_x * sum_y
    denominator = ((x * x).sum()) * n - sum_x * sum_x
    slope = numerator / denominator
    # The fitted line always passes through the mean point (x-bar, y-bar):
    intercept = y.mean() - slope * x.mean()
    return slope, intercept
# Sanity-check the fit on a perfectly linear dummy series (y == x, so the
# expected result is slope 1 and intercept 0):
dummy_input = np.array((1, 2, 3, 4, 5))
dummy_output = np.array((1, 2, 3, 4, 5))
dummy_slope, dummy_intercept = simple_linear_regression(dummy_input, dummy_output)
print("Slope : ", dummy_slope)
print("Intercept :", dummy_intercept)

# Visualize the dummy points together with the identity line:
plt.scatter([1, 2, 3, 4, 5], [1, 2, 3, 4, 5])
plt.plot([1, 2, 3, 4, 5], [1, 2, 3, 4, 5], color="red")
plt.show()

# Fit the model on the training split to get the real coefficients:
actual_slope, actual_intercept = simple_linear_regression(train["ENGINESIZE"], train["CO2EMISSIONS"])
print("Slope: ", actual_slope)
print("Intercept: ", actual_intercept)
def get_regression_prediction(input_features, slope, intercept):
    """Predict output values on the line y = slope * x + intercept.

    input_features may be a scalar or an array-like (numpy array / pandas
    Series); the arithmetic broadcasts element-wise in either case.

    Bug fix: the original body ignored the ``slope`` and ``intercept``
    parameters and read the module-level globals ``actual_slope`` /
    ``actual_intercept`` instead, so calls with different coefficients
    silently returned wrong values.
    """
    predicted_value = slope * input_features + intercept
    return predicted_value
# Predict the emission for a single engine size:
my_engine_size = 5
estimated_emission = get_regression_prediction(my_engine_size, actual_slope, actual_intercept)
print("Estimated Emission: ", estimated_emission)

# Predict for every row of the dataset:
y_pred = get_regression_prediction(data["ENGINESIZE"], actual_slope, actual_intercept)
y_pred

# Regression line over the test split:
plt.scatter(test["ENGINESIZE"], test["CO2EMISSIONS"])
plt.plot(test["ENGINESIZE"], actual_slope * test["ENGINESIZE"] + actual_intercept, color="red")
plt.xlabel("Engine_Size")
plt.ylabel("Emission")
plt.show()

# Regression line over the training split:
plt.scatter(train["ENGINESIZE"], train["CO2EMISSIONS"])
plt.plot(train["ENGINESIZE"], actual_slope * train["ENGINESIZE"] + actual_intercept, color="red")
plt.xlabel("Engine_Size")
plt.ylabel("Emission")
plt.show()

# Regression line over the complete dataset:
plt.scatter(data["ENGINESIZE"], data["CO2EMISSIONS"])
plt.plot(data["ENGINESIZE"], actual_slope * data["ENGINESIZE"] + actual_intercept, color="red")
plt.xlabel("Engine_Size")
plt.ylabel("Emission")
plt.show()

# Side-by-side table of actual vs. predicted emission values:
A_P_data = pd.DataFrame({"Actual": data["CO2EMISSIONS"], "Predicted": y_pred})
print(A_P_data.head())

# Bar chart comparing the first ten actual/predicted pairs:
A_P_data.head(10).plot(kind='bar', figsize=(12, 6))
plt.show()
#Error calculation using Residual Sum of Squares:
def residual_sum_of_squares(input_feature, output, slope, intercept):
    """Return the residual sum of squares (RSS) of the given line.

    RSS = sum((y_i - (slope * x_i + intercept))^2) — lower is a better fit.
    """
    fitted = slope * input_feature + intercept
    errors = output - fitted
    return (errors * errors).sum()
# Report the prediction error (RSS) on the held-out test split:
print("RSS : ", residual_sum_of_squares(test["ENGINESIZE"], test["CO2EMISSIONS"], actual_slope, actual_intercept))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment