import pandas as pd ## To load the data and create DataFrame
import numpy as np ## For creating arrays (used in the plotting section below)
import matplotlib.pyplot as plt ## For plotting of data
import seaborn as sns ## For plotting of data
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
#1. CLASSIFICATION USING KNN
from sklearn.neighbors import KNeighborsClassifier ## For KNN Classification method
# Load Dataset
df = pd.read_csv('/content/sample_data/Iris.csv')
#Dividing Data Into Features and Labels
feature_columns = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']
# Assign values to the X and y variables:
X = df[feature_columns].values
y = df['Species'].values
# Assign values to the X and y variables: Alternative method
# X = df.iloc[:, 1:5].values   # columns 1-4 are the four feature columns
# y = df.iloc[:, 5].values     # column 5 is Species
# Split dataset into random train and test subsets:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
# Standardize features by removing mean and scaling to unit variance:
# scaler = StandardScaler()
# scaler.fit(X_train)
# X_train = scaler.transform(X_train)
# X_test = scaler.transform(X_test)
# Use the KNN classifier to fit data:
classifier = KNeighborsClassifier(n_neighbors=5)
classifier.fit(X_train, y_train)
# Predict y data with classifier:
y_predict = classifier.predict(X_test)
# Print results:
print("CONFUSION MATRIX : ")
print(confusion_matrix(y_test, y_predict))
print(classification_report(y_test, y_predict))
#------------------ PLOTTING USING HEATMAP--------------------------
# cm = confusion_matrix(y_test, y_predict)
# # Transform to df for easier plotting
# cm_df = pd.DataFrame(cm,
# index = ['setosa','versicolor','virginica'],
# columns = ['setosa','versicolor','virginica'])
# plt.figure(figsize=(5.5,4))
# sns.heatmap(cm_df, annot=True)
# plt.title('KNN \nAccuracy:{0:.3f}'.format(accuracy_score(y_test, y_predict)))
# plt.ylabel('True label')
# plt.xlabel('Predicted label')
# plt.show()
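#------------------ OPTIONAL: CHOOSING k (sketch, not part of the original gist) ----------------
# n_neighbors=5 above is an assumption; comparing test accuracy over a small range of k
# on the same split is a quick way to sanity-check that choice.
for k in range(1, 16):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    print("k =", k, "accuracy =", accuracy_score(y_test, knn.predict(X_test)))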
#2. CLASSIFICATION USING SVM
#Import SVM model
from sklearn import svm
from sklearn.preprocessing import LabelEncoder
# Load Dataset
df = pd.read_csv('/content/sample_data/Iris.csv')
#Dividing Data Into Features and Labels
feature_columns = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']
# Assign values to the X and y variables:
X = df[feature_columns].values
y = df['Species'].values
# label_encoder_y= LabelEncoder()
# y= label_encoder_y.fit_transform(y)
# Split dataset into random train and test subsets:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
# Use the SVM classifier to fit data:
classifier1 = svm.SVC(kernel='sigmoid') # linear, sigmoid, rbf
classifier1.fit(X_train, y_train)
# Predict y data with classifier:
y_predict = classifier1.predict(X_test)
# Print results:
print("CONFUSION MATRIX : ")
print(confusion_matrix(y_test, y_predict))
print(classification_report(y_test, y_predict))
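#------------------ OPTIONAL: KERNEL COMPARISON (sketch, not part of the original gist) ---------
# The sigmoid kernel above often underperforms on Iris; comparing the three kernels listed
# in the comment on the same split shows how sensitive the SVM is to this choice.
for kernel in ['linear', 'rbf', 'sigmoid']:
    clf = svm.SVC(kernel=kernel)
    clf.fit(X_train, y_train)
    print(kernel, "accuracy:", accuracy_score(y_test, clf.predict(X_test)))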
#3. CLASSIFICATION USING DECISION TREE
#Import Decision Tree model
from sklearn.tree import DecisionTreeClassifier
# Load Dataset
df = pd.read_csv('/content/sample_data/Iris.csv')
#Dividing Data Into Features and Labels
feature_columns = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']
# Assign values to the X and y variables:
X = df[feature_columns].values
y = df['Species'].values
# Split dataset into random train and test subsets:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=44)
# Use the Decision Tree classifier to fit data:
classifier2 = DecisionTreeClassifier(criterion="gini")
# train the model
classifier2.fit(X_train, y_train)
# Predict y data with classifier:
y_predict = classifier2.predict(X_test)
# Print results:
print("CONFUSION MATRIX : ")
print(confusion_matrix(y_test, y_predict))
print(classification_report(y_test, y_predict))
print("Accuracy:",accuracy_score(y_test, y_predict))
#4. CLASSIFICATION USING GAUSSIAN NAIVE BAYES
#Import Gaussian Naive Bayes model
from sklearn.naive_bayes import GaussianNB
# Load Dataset
df = pd.read_csv('/content/sample_data/Iris.csv')
#Dividing Data Into Features and Labels
feature_columns = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']
# Assign values to the X and y variables:
X = df[feature_columns].values
y = df['Species'].values
# Split dataset into random train and test subsets:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
# Use the Gaussian Naive Bayes classifier to fit data:
classifier3 = GaussianNB()
# train the model
classifier3.fit(X_train, y_train)
# Predict y data with classifier:
y_predict = classifier3.predict(X_test)
# Print results:
print("CONFUSION MATRIX : ")
print(confusion_matrix(y_test, y_predict))
print(classification_report(y_test, y_predict))
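#------------------ OPTIONAL: CLASS PROBABILITIES (sketch, not part of the original gist) -------
# GaussianNB also exposes per-class probabilities, useful for inspecting borderline predictions.
proba = classifier3.predict_proba(X_test[:5])
print(pd.DataFrame(proba, columns=classifier3.classes_))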
#5. CLASSIFICATION USING RANDOM FOREST
#Import RANDOM FOREST CLASSIFIER
from sklearn.ensemble import RandomForestClassifier
# Load Dataset
df = pd.read_csv('/content/sample_data/Iris.csv')
#Dividing Data Into Features and Labels
feature_columns = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']
# Assign values to the X and y variables:
X = df[feature_columns].values
y = df['Species'].values
# Split dataset into random train and test subsets:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
# Use the Random Forest classifier to fit data:
classifier4 = RandomForestClassifier()
# train the model
classifier4.fit(X_train, y_train)
# Predict y data with classifier:
y_predict = classifier4.predict(X_test)
# Print results:
print("CONFUSION MATRIX : ")
print(confusion_matrix(y_test, y_predict))
print(classification_report(y_test, y_predict))
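#------------------ OPTIONAL: FEATURE IMPORTANCES (sketch, not part of the original gist) -------
# Random Forests expose feature_importances_, a quick view of which measurements drive
# the predictions.
importances = pd.Series(classifier4.feature_importances_, index=feature_columns)
print(importances.sort_values(ascending=False))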
#6. CLASSIFICATION USING LOGISTIC REGRESSION
#Import LOGISTIC REGRESSION CLASSIFIER
from sklearn.linear_model import LogisticRegression
# Load Dataset
df = pd.read_csv('/content/sample_data/Iris.csv')
#Dividing Data Into Features and Labels
feature_columns = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']
# Assign values to the X and y variables:
X = df[feature_columns].values
y = df['Species'].values
# Split dataset into random train and test subsets:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
# Use the Logistic Regression classifier to fit data:
classifier5 = LogisticRegression()
# train the model
classifier5.fit(X_train, y_train)
# Predict y data with classifier:
y_predict = classifier5.predict(X_test)
# Print results:
print("CONFUSION MATRIX : ")
print(confusion_matrix(y_test, y_predict))
print(classification_report(y_test, y_predict))
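#------------------ OPTIONAL: CROSS-VALIDATION (sketch, not part of the original gist) ----------
# A single 80/20 split on a 150-row dataset can be noisy; 5-fold cross-validation gives a
# steadier accuracy estimate. max_iter=200 is an assumption to avoid convergence warnings.
from sklearn.model_selection import cross_val_score
scores = cross_val_score(LogisticRegression(max_iter=200), X, y, cv=5)
print("CV accuracy: %.3f +/- %.3f" % (scores.mean(), scores.std()))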
df.head(10)
len(df)
df.shape
df.query('SepalLengthCm > 4.5')
df.iloc[0:3]
df['SepalLengthCm'].max()
df['SepalLengthCm'].min()
df['SepalLengthCm'].count()
df['Species'].replace(["Iris-setosa", "Iris-virginica"],["V","B"])
df.rename(columns={'Species':"Hello-World"})
df[df['Id'].isnull()]
df.drop('Id',axis=1)
df.drop(['Id','SepalLengthCm'],axis=1)
df.drop(df.columns[1],axis=1)
df.drop([0,1])
df['SepalLengthCm'].unique()
df['SepalLengthCm'].value_counts()
df["SepalLengthCm"].nunique()
df.groupby('SepalLengthCm').mean(numeric_only=True)
df.head(10)
df.groupby(['SepalLengthCm','SepalWidthCm']).mean(numeric_only=True)
df.head(10)
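# Optional sketch (not part of the original gist): groupby().agg() computes several
# summaries at once, e.g. per-species petal length statistics.
print(df.groupby('Species')['PetalLengthCm'].agg(['mean', 'max', 'count']))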
#matplotlib
x=df['SepalLengthCm']
y=df['SepalWidthCm']
plt.plot(x,y)
plt.title("Graph")
plt.xlabel("Lenght")
plt.ylabel("Weight")
df.plot(kind="scatter", x="Id", y="SepalWidthCm")
plt.show()
df['SepalWidthCm'].hist()
plt.scatter(df["Id"],df['SepalWidthCm'],linewidth=1,marker="o",edgecolor="black",s=200)
plt.show()
#numpy
x=np.array([0,1,2,3])
y=np.array([3,5,1,6])
plt.subplot(1,3,1)
plt.plot(x,y)
plt.subplot(1,3,2)
plt.plot(x,y)
plt.subplot(1,3,3)
plt.plot(x,y)
plt.show()
#seaborn
sns.histplot(df['SepalWidthCm'].head(10),kde=True, bins=10)
x=df["SepalLengthCm"]
plt.hist(x,bins=10,color="Red")
plt.title("Histogram")
plt.xlabel("Number")
plt.ylabel("Length")
plt.show()
x=df[["SepalLengthCm","SepalWidthCm","PetalLengthCm","PetalWidthCm"]]
print(x.describe())
x.boxplot()
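# Optional sketch (not part of the original gist): seaborn's pairplot shows all pairwise
# feature relationships coloured by species in a single call.
sns.pairplot(df, hue='Species', vars=feature_columns)
plt.show()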
#sklearn
feature_columns=["SepalLengthCm","SepalWidthCm","PetalLengthCm","PetalWidthCm"]
print(feature_columns)
x=df[feature_columns].values
y=df['Species'].values
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2)
scaler=StandardScaler()
scaler.fit(x_train)
x_train=scaler.transform(x_train)
x_test=scaler.transform(x_test)
classifier=KNeighborsClassifier(n_neighbors=5)
classifier.fit(x_train,y_train)
y_predict=classifier.predict(x_test)
print("Confusion Matrix:- ")
print(confusion_matrix(y_test,y_predict))
print(classification_report(y_test,y_predict))
cm=confusion_matrix(y_test,y_predict)
cm_df=pd.DataFrame(cm,index=['Setosa','Versicolor','Virginica'], columns=['Setosa','Versicolor','Virginica'])
plt.figure(figsize=(5.5,4))
sns.heatmap(cm_df,annot=True)
plt.title("HeatMap")
plt.xlabel("True Label")
plt.ylabel("Predict Label")
plt.show()