tharunpeddisetty/LinearRegression.py

## LinearRegression.py

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import statsmodels.api as sm

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# NOTE: point this at your own copy of Salary_Data.csv. The author's local
# path is left in place for reference.
dataset = pd.read_csv('/Users/tharunpeddisetty/Desktop/Machine Learning/Python/Salary_Data.csv')

# Every column except the last is used as model input; the last column is
# the value being predicted.
X = dataset.iloc[:, :-1].values
Y = dataset.iloc[:, -1].values

# Hold out part of the rows (test_size=0.2) for testing; the fixed
# random_state keeps the split reproducible between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=1
)

# Fit the regression on the training rows only.
regressor = LinearRegression()
regressor.fit(X_train, Y_train)

# Predictions for the held-out rows.
y_pred = regressor.predict(X_test)


def _show_fit(points_x, points_y, title):
    """Scatter the given points in red over the fitted line in blue.

    The line is always drawn from the training inputs: the fitted equation
    is the same whichever inputs are used, so plotting (X_test, y_pred)
    would trace the same line.
    """
    plt.scatter(points_x, points_y, color='red')
    plt.plot(X_train, regressor.predict(X_train), color='blue')
    plt.title(title)
    plt.xlabel('Years of Experience')
    plt.ylabel('Salary')
    plt.show()


# Training-set plot first, then the test-set plot against the same line.
_show_fit(X_train, Y_train, 'Salary Vs Experience (Training Set)')
_show_fit(X_test, Y_test, 'Salary Vs Experience (Testing Set)')

# Slope coefficient(s) and intercept of the fitted line.
print(regressor.coef_)
print(regressor.intercept_)

# Prediction for 12 years of experience. predict() expects a 2D array,
# hence the nested list.
print(regressor.predict([[12]]))

## Salary_Data.csv

          
            YearsExperience
            Salary

            
              1.1
              39343.00

            
              1.3
              46205.00

            
              1.5
              37731.00

            
              2.0
              43525.00

            
              2.2
              39891.00

            
              2.9
              56642.00

            
              3.0
              60150.00

            
              3.2
              54445.00

            
              3.2
              64445.00

            
              3.7
              57189.00

            
              3.9
              63218.00

            
              4.0
              55794.00

            
              4.0
              56957.00

            
              4.1
              57081.00

            
              4.5
              61111.00

            
              4.9
              67938.00

            
              5.1
              66029.00

            
              5.3
              83088.00

            
              5.9
              81363.00

            
              6.0
              93940.00

            
              6.8
              91738.00

            
              7.1
              98273.00

            
              7.9
              101302.00

            
              8.2
              113812.00

            
              8.7
              109431.00

            
              9.0
              105582.00

            
              9.5
              116969.00

            
              9.6
              112635.00

            
              10.3
              122391.00

            
              10.5
              121872.00

	import numpy as np
	import matplotlib.pyplot as plt
	import pandas as pd
	import statsmodels.api as sm

	# NOTE: replace this path with the location of your own Salary_Data.csv.
	# The author's local path is left in place for reference.
	dataset = pd.read_csv('/Users/tharunpeddisetty/Desktop/Machine Learning/Python/Salary_Data.csv')

	# All columns but the last feed the model; the last column is the value
	# to predict.
	X = dataset.iloc[:, :-1].values
	Y = dataset.iloc[:, -1].values

	# Split into training and testing rows (test_size=0.2) with a fixed
	# random_state so the split is repeatable.
	from sklearn.model_selection import train_test_split
	X_train, X_test, Y_train, Y_test = train_test_split(
		X, Y, test_size=0.2, random_state=1
	)

	# Train the model on the training rows.
	from sklearn.linear_model import LinearRegression
	regressor = LinearRegression()
	regressor.fit(X_train, Y_train)

	# Predict on the test rows.
	y_pred = regressor.predict(X_test)

	# Draw the training-set plot, then the test-set plot. Both reuse the line
	# computed from the training inputs: the fitted equation does not change,
	# so plotting (X_test, y_pred) would give the same line.
	for sample_x, sample_y, plot_title in (
		(X_train, Y_train, 'Salary Vs Experience (Training Set)'),
		(X_test, Y_test, 'Salary Vs Experience (Testing Set)'),
	):
		plt.scatter(sample_x, sample_y, color='red')
		plt.plot(X_train, regressor.predict(X_train), color='blue')
		plt.title(plot_title)
		plt.xlabel('Years of Experience')
		plt.ylabel('Salary')
		plt.show()

	# Coefficient(s) and intercept of the fitted line.
	print(regressor.coef_)
	print(regressor.intercept_)

	# Prediction for 12 years of experience; predict() expects a 2D array,
	# hence the nested list.
	print(regressor.predict([[12]]))
	YearsExperience	Salary
	1.1	39343.00
	1.3	46205.00
	1.5	37731.00
	2.0	43525.00
	2.2	39891.00
	2.9	56642.00
	3.0	60150.00
	3.2	54445.00
	3.2	64445.00
	3.7	57189.00
	3.9	63218.00
	4.0	55794.00
	4.0	56957.00
	4.1	57081.00
	4.5	61111.00
	4.9	67938.00
	5.1	66029.00
	5.3	83088.00
	5.9	81363.00
	6.0	93940.00
	6.8	91738.00
	7.1	98273.00
	7.9	101302.00
	8.2	113812.00
	8.7	109431.00
	9.0	105582.00
	9.5	116969.00
	9.6	112635.00
	10.3	122391.00
	10.5	121872.00