1. Find-S

import pandas as pd
import numpy as np

def train(X, y):
    # Start with the most specific hypothesis: every attribute is "null"
    hypothesis = ["null"] * len(X[0])
    # Iterate over the training examples
    for i in range(len(X)):
        # Only positive examples can generalize the hypothesis
        if y[i] == "yes":
            # Compare every attribute of the example against the hypothesis
            for j in range(len(X[i])):
                feature = X[i][j]
                if hypothesis[j] == "null":
                    hypothesis[j] = feature
                elif hypothesis[j] != feature:
                    hypothesis[j] = "general"
    return hypothesis

# Read the csv and split it into the feature matrix X and the label vector y
df = pd.read_csv('./a1.csv')
X = np.array(df.iloc[:, :-1])
y = np.array(df.iloc[:, -1])
hypothesis = train(X, y)
print(hypothesis)
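The contents of a1.csv are not shown in the gist; as a quick check, train can also be exercised directly on the classic EnjoySport examples from Mitchell (a hypothetical inline substitute for the CSV):

# Hypothetical inline data standing in for a1.csv (Mitchell's EnjoySport set)
X_demo = [
    ["sunny", "warm", "normal", "strong", "warm", "same"],
    ["sunny", "warm", "high", "strong", "warm", "same"],
    ["rainy", "cold", "high", "strong", "warm", "change"],
    ["sunny", "warm", "high", "strong", "cool", "change"],
]
y_demo = ["yes", "yes", "no", "yes"]
print(train(X_demo, y_demo))
# -> ['sunny', 'warm', 'general', 'strong', 'general', 'general']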
2. Candidate Elimination

import pandas as pd
import numpy as np

def train(X, y):
    # Start the general boundary fully general: one hypothesis of all "?" per attribute
    general_h = np.full((len(X[0]), len(X[0])), fill_value="?", dtype=object)
    # Start the specific boundary at the first training example
    specific_h = X[0].copy()
    for i in range(len(X)):
        # A positive example generalizes the specific hypothesis
        if y[i] == "yes":
            for j in range(len(X[i])):
                if X[i][j] != specific_h[j]:
                    specific_h[j] = '?'
                    general_h[j][j] = '?'
        # A negative example specializes the general hypotheses
        elif y[i] == "no":
            for j in range(len(X[i])):
                if X[i][j] != specific_h[j]:
                    general_h[j][j] = specific_h[j]
                else:
                    general_h[j][j] = '?'
    # Drop the hypotheses that are still maximally general
    indices = []
    ref = np.full(len(X[0]), fill_value="?", dtype=object)
    for i in range(len(general_h)):
        if np.array_equal(general_h[i], ref):
            indices.append(i)
    general_h = np.delete(general_h, indices, axis=0)
    return [general_h, specific_h]

df = pd.read_csv('./a1.csv')
X = np.array(df.iloc[:, :-1])
y = np.array(df.iloc[:, -1])
general_h, specific_h = train(X, y)
print(general_h)
print(specific_h)
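The same hypothetical EnjoySport rows used above show both boundaries this train returns:

# Reusing the hypothetical EnjoySport rows from the Find-S demo
X_demo = np.array([
    ["sunny", "warm", "normal", "strong", "warm", "same"],
    ["sunny", "warm", "high", "strong", "warm", "same"],
    ["rainy", "cold", "high", "strong", "warm", "change"],
    ["sunny", "warm", "high", "strong", "cool", "change"],
], dtype=object)
y_demo = np.array(["yes", "yes", "no", "yes"])
g, s = train(X_demo, y_demo)
print(g)  # two surviving general hypotheses: one keeping 'sunny', one keeping 'warm'
print(s)  # -> ['sunny' 'warm' '?' 'strong' '?' '?']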
3. KNN

from math import sqrt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score

k = 7

def find_distance(x, y):
    # Mixed distance: Hamming on the first (categorical) attribute,
    # squared difference on the remaining numeric attributes
    distance = 0
    for i in range(0, len(x)):
        if i == 0:
            if x[i] != y[i]:
                distance += 1.00
        else:
            distance += pow(x[i] - y[i], 2)
    return round(sqrt(distance), 2)

def k_nearest_neighbours(X, x):
    # Distance from x to every training point, keyed by training index
    distances = {}
    for i in range(0, len(X)):
        distances[i] = find_distance(X[i], x)
    nearest_neighbours = sorted(distances.items(), key=lambda item: item[1])
    return dict(nearest_neighbours[0:k])

def predict(X_train, y_train, x):
    # Majority vote among the k nearest neighbours
    neighbours = k_nearest_neighbours(X_train, x)
    classes = {}
    max_val = 0
    max_class = ""
    for key in neighbours.keys():
        item_class = y_train[key]
        classes[item_class] = classes.get(item_class, 0) + 1
        if classes[item_class] > max_val:
            max_val = classes[item_class]
            max_class = item_class
    return max_class

def main():
    df = pd.read_csv("./Climate_Data.csv")
    df = df.dropna(axis=0, subset=["AUST"])
    X = np.array(df.iloc[:, :-1])
    y = np.array(df.iloc[:, -1])
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=0)
    pred_df = {"Actual": [], "Predicted": []}
    for i in range(0, len(X_test)):
        pred_df["Actual"].append(y_test[i])
        pred_df["Predicted"].append(predict(X_train, y_train, X_test[i]))
    pred_df = pd.DataFrame.from_dict(pred_df)
    print(pred_df.head())
    cf_matrix = confusion_matrix(pred_df["Actual"], pred_df["Predicted"])
    print(cf_matrix)
    print("Accuracy:", accuracy_score(pred_df["Actual"], pred_df["Predicted"]))

main()
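As a sanity check, the same split can be scored with scikit-learn's built-in classifier by appending these lines at the end of main() (this assumes the feature columns are numeric; a categorical first column would need encoding first):

from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=k)
knn.fit(X_train, y_train)
print("sklearn accuracy:", knn.score(X_test, y_test))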
4. K-Means

import numpy as np
import matplotlib.pyplot as plt
from matplotlib import style
style.use('ggplot')

class K_Means:
    def __init__(self, k=3, tol=0.001, max_iter=300):
        self.k = k
        self.tol = tol
        self.max_iter = max_iter

    def fit(self, data):
        # Seed the centroids with the first k points
        self.centroids = {}
        for i in range(self.k):
            self.centroids[i] = data[i]
        for _ in range(self.max_iter):
            self.classifications = {}
            for i in range(self.k):
                self.classifications[i] = []
            # Assign each point to its nearest centroid
            for featureset in data:
                distances = [np.linalg.norm(
                    featureset - self.centroids[centroid]) for centroid in self.centroids]
                classification = distances.index(min(distances))
                self.classifications[classification].append(featureset)
            prev_centroids = dict(self.centroids)
            # Move each centroid to the mean of its assigned points
            for classification in self.classifications:
                self.centroids[classification] = np.average(
                    self.classifications[classification], axis=0)
            # Stop once every centroid moves less than the tolerance (percent change)
            optimized = True
            for c in self.centroids:
                original_centroid = prev_centroids[c]
                current_centroid = self.centroids[c]
                movement = np.sum(
                    np.abs((current_centroid - original_centroid) / original_centroid * 100.0))
                if movement > self.tol:
                    print(movement)
                    optimized = False
            if optimized:
                break

    def predict(self, data):
        distances = [np.linalg.norm(data - self.centroids[centroid])
                     for centroid in self.centroids]
        classification = distances.index(min(distances))
        return classification

# Three well-separated random blobs of 20 points each
X = 1 + np.random.rand(20, 2)
X1 = -1 + np.random.rand(20, 2)
X2 = 5 + np.random.rand(20, 2)
X = np.concatenate((X, X1, X2))
plt.scatter(X[:, 0], X[:, 1])
plt.show()

colors = 10 * ["g", "r", "c", "b", "k"]
clf = K_Means()
clf.fit(X)

for centroid in clf.centroids:
    plt.scatter(clf.centroids[centroid][0], clf.centroids[centroid][1],
                marker="o", color="k", s=150, linewidths=5)
for classification in clf.classifications:
    color = colors[classification]
    for featureset in clf.classifications[classification]:
        plt.scatter(featureset[0], featureset[1],
                    marker="x", color=color, s=150, linewidths=5)
plt.show()
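The resulting centroids can be cross-checked against scikit-learn's implementation on the same blobs (cluster numbering will generally differ):

from sklearn.cluster import KMeans

km = KMeans(n_clusters=3, n_init=10, random_state=0).fit(X)
print("from-scratch centroids:", list(clf.centroids.values()))
print("sklearn centroids:", km.cluster_centers_)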
5. K-Means (Using KNN)

import numpy as np
import pandas as pd
from math import sqrt

df = pd.read_csv('kmean.csv')
print(df)
X = df['x'].values
print(X)
Y = df['y'].values
print(Y)

n = len(X)
pairs = []
for i in range(0, n):
    pairs.append((X[i], Y[i]))

# Distance threshold for starting a new cluster
T = 4
clusters = []
clusters.append([])
clusters[0].append(pairs[0])

def euc_dist(C, X):
    x, y = C[0], C[1]
    x1, y1 = X[0], X[1]
    return sqrt(((x - x1) ** 2) + ((y - y1) ** 2))

# Threshold-based clustering: assign each remaining point to the cluster whose
# closest member is within T; otherwise start a new cluster
n = 0
for i in range(1, len(df)):
    temp_clus = clusters[0]
    min_d = float('inf')
    count = 0
    for j in clusters:
        for k in j:
            dist = euc_dist(pairs[i], k)
            if dist >= T:
                continue
            elif dist < min_d:
                count = 1
                min_d = dist
                temp_clus = j
    if count == 0:
        n = n + 1
        clusters.append([])
        clusters[n].append(pairs[i])
    else:
        temp_clus.append(pairs[i])

for cluster in clusters:
    print(cluster)
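kmean.csv is not included with the gist; a hypothetical stand-in DataFrame such as the one below, substituted for the pd.read_csv line, exercises the loop. With T = 4 the two groups (roughly 10 units apart) land in separate clusters:

# Hypothetical stand-in for kmean.csv: two tight groups far apart
df = pd.DataFrame({'x': [1, 2, 1.5, 8, 9, 8.5],
                   'y': [1, 1.5, 2, 8, 8.5, 9]})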
6. Gradient Descent

import numpy as np
import pandas as pd

data = pd.read_csv("Iris.csv")

def gradient_descent(x1, x2, x3, x4, y):
    # Fit y = m1*x1 + m2*x2 + m3*x3 + m4*x4 + b by batch gradient descent
    m1 = m2 = m3 = m4 = b = 0
    n = len(x1)
    alpha = 0.01
    for i in range(100):
        y_pred = m1 * x1 + m2 * x2 + m3 * x3 + m4 * x4 + b
        # Partial derivatives of the mean squared error
        md1 = -(2 / n) * sum(x1 * (y - y_pred))
        md2 = -(2 / n) * sum(x2 * (y - y_pred))
        md3 = -(2 / n) * sum(x3 * (y - y_pred))
        md4 = -(2 / n) * sum(x4 * (y - y_pred))
        bd = -(2 / n) * sum(y - y_pred)
        m1 = m1 - md1 * alpha
        m2 = m2 - md2 * alpha
        m3 = m3 - md3 * alpha
        m4 = m4 - md4 * alpha
        b = b - bd * alpha
    print(m1, 'x1 + ', m2, 'x2 + ', m3, 'x3 + ', m4, 'x4 + ', b)

x1 = data['SepalLengthCm']
x2 = data['SepalWidthCm']
x3 = data['PetalLengthCm']
x4 = data['PetalWidthCm']
# Encode the categorical species labels as integer codes so the arithmetic works
y = data['Species'].astype('category').cat.codes
gradient_descent(x1, x2, x3, x4, y)
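For reference, the learned coefficients can be compared against the closed-form least-squares fit from scikit-learn (100 iterations of gradient descent typically lands near, but not exactly on, these values):

from sklearn.linear_model import LinearRegression

features = data[['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']]
reg = LinearRegression().fit(features, y)
print("sklearn coefficients:", reg.coef_, "intercept:", reg.intercept_)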
7. Linear Regression

import pandas as pd
import numpy as np

def train(X, y):
    # Fit simple linear regression with the Ordinary Least Squares formulas
    X_mean = np.mean(X)
    y_mean = np.mean(y)
    # Running sums for the OLS numerator (covariance) and denominator (variance)
    sum_xy = 0
    sum_xx = 0
    for i in range(len(X)):
        x_diff = X[i] - X_mean
        y_diff = y[i] - y_mean
        sum_xy = sum_xy + (x_diff * y_diff)
        sum_xx = sum_xx + pow(X[i] - X_mean, 2)
    # Slope and intercept
    b1 = sum_xy / sum_xx
    b0 = y_mean - (X_mean * b1)
    return [b0[0], b1[0]]

def main():
    df = pd.read_csv('./LR.csv')
    print(df.head())
    X = np.array(df.iloc[:, :-1])
    y = np.array(df.iloc[:, -1])
    print(X)
    print(y)
    b0, b1 = train(X, y)
    b0 = round(b0, 2)
    b1 = round(b1, 2)
    print('The equation for linear regression is y =', b0, '+', b1, '* x')

main()
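The slope and intercept can be cross-checked inside main() with NumPy's least-squares polynomial fit, which computes the same OLS solution:

b1_np, b0_np = np.polyfit(X.ravel(), y, 1)  # degree-1 fit returns [slope, intercept]
print('numpy check: y =', round(b0_np, 2), '+', round(b1_np, 2), '* x')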
8. Logistic Regression
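A minimal from-scratch sketch of binary logistic regression trained with batch gradient descent, following the CSV-reading pattern of the other sections; the file name logistic.csv and the 0/1 label encoding in the last column are assumptions:

import numpy as np
import pandas as pd

def sigmoid(z):
    return 1 / (1 + np.exp(-z))

def train(X, y, alpha=0.1, iters=1000):
    # Batch gradient descent on the logistic (cross-entropy) loss
    m, n = X.shape
    w = np.zeros(n)
    b = 0.0
    for _ in range(iters):
        y_pred = sigmoid(X @ w + b)
        error = y_pred - y
        w -= alpha * (X.T @ error) / m
        b -= alpha * np.sum(error) / m
    return w, b

df = pd.read_csv('./logistic.csv')  # assumed file name; last column is a 0/1 label
X = np.array(df.iloc[:, :-1], dtype=float)
y = np.array(df.iloc[:, -1], dtype=float)
w, b = train(X, y)
preds = (sigmoid(X @ w + b) >= 0.5).astype(int)
print("weights:", w, "bias:", b)
print("training accuracy:", np.mean(preds == y))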
9. SVM

# importing some basic libraries
%matplotlib inline   # Jupyter-only magic; remove when running as a plain script
import matplotlib.pyplot as plt
from matplotlib import style
style.use('ggplot')
import numpy as np

class SVM(object):
    def __init__(self, visualization=True):
        self.visualization = visualization
        self.colors = {1: 'r', -1: 'b'}
        if self.visualization:
            self.fig = plt.figure()
            self.ax = self.fig.add_subplot(1, 1, 1)

    def fit(self, data):
        # train with data
        self.data = data
        # { ||w|| : [w, b] }
        opt_dict = {}
        transforms = [[1, 1], [-1, 1], [-1, -1], [1, -1]]
        all_data = np.array([])
        for yi in self.data:
            all_data = np.append(all_data, self.data[yi])
        self.max_feature_value = max(all_data)
        self.min_feature_value = min(all_data)
        all_data = None
        # with smaller steps the margins and decision boundary get more precise
        step_sizes = [self.max_feature_value * 0.1,
                      self.max_feature_value * 0.01,
                      # going further gets extremely expensive
                      self.max_feature_value * 0.001]
        # b does not need steps as small as w
        b_range_multiple = 5
        b_multiple = 5
        latest_optimum = self.max_feature_value * 10
        """
        The objective is to satisfy yi*(xi.w + b) >= 1 for every training example
        while minimizing ||w||. We start from a large w and shrink it step by step,
        scanning b over a range at each candidate w.
        """
        # make the step smaller and smaller to refine the value
        for step in step_sizes:
            w = np.array([latest_optimum, latest_optimum])
            # we can do this because the problem is convex
            optimized = False
            while not optimized:
                for b in np.arange(-1 * self.max_feature_value * b_range_multiple,
                                   self.max_feature_value * b_range_multiple,
                                   step * b_multiple):
                    for transformation in transforms:
                        w_t = w * transformation
                        found_option = True
                        # weakest link in this SVM: checks every point by brute force;
                        # SMO attempts to fix this a bit
                        # constraint: yi*(xi.w + b) >= 1
                        for i in self.data:
                            for xi in self.data[i]:
                                yi = i
                                if not yi * (np.dot(w_t, xi) + b) >= 1:
                                    found_option = False
                        if found_option:
                            # every point satisfies yi*(xi.w + b) >= 1 for this w_t, b,
                            # so store [w_t, b] keyed by ||w_t||
                            opt_dict[np.linalg.norm(w_t)] = [w_t, b]
                # once w[0] < 0, the transformations only repeat values already tried
                if w[0] < 0:
                    optimized = True
                    print("optimized a step")
                else:
                    w = w - step
            # sort the norms so the smallest ||w|| is at position 0
            norms = sorted([n for n in opt_dict])
            # optimal values of w, b
            opt_choice = opt_dict[norms[0]]
            self.w = opt_choice[0]
            self.b = opt_choice[1]
            # start the next, finer pass near the current optimum
            latest_optimum = opt_choice[0][0] + step * 2

    def predict(self, features):
        # sign(x.w + b)
        classification = np.sign(np.dot(np.array(features), self.w) + self.b)
        if classification != 0 and self.visualization:
            self.ax.scatter(features[0], features[1], s=200,
                            marker='*', c=self.colors[classification])
        return (classification, np.dot(np.array(features), self.w) + self.b)

    def visualize(self):
        [[self.ax.scatter(x[0], x[1], s=100, c=self.colors[i])
          for x in data_dict[i]] for i in data_dict]
        # hyperplane: x.w + b (in 2-D this is a line)
        # v = x0*w0 + x1*w1 + b  ->  x1 = (v - w0*x0 - b) / w1
        # positive support vector line: x.w + b = 1
        # negative support vector line: x.w + b = -1
        # decision boundary:            x.w + b = 0
        def hyperplane(x, w, b, v):
            # returns the x2 value on the line for a given x1
            return (-w[0] * x - b + v) / w[1]
        hyp_x_min = self.min_feature_value * 0.9
        hyp_x_max = self.max_feature_value * 1.1
        # (w.x + b) = 1: positive support vector hyperplane
        pav1 = hyperplane(hyp_x_min, self.w, self.b, 1)
        pav2 = hyperplane(hyp_x_max, self.w, self.b, 1)
        self.ax.plot([hyp_x_min, hyp_x_max], [pav1, pav2], 'k')
        # (w.x + b) = -1: negative support vector hyperplane
        nav1 = hyperplane(hyp_x_min, self.w, self.b, -1)
        nav2 = hyperplane(hyp_x_max, self.w, self.b, -1)
        self.ax.plot([hyp_x_min, hyp_x_max], [nav1, nav2], 'k')
        # (w.x + b) = 0: decision boundary
        db1 = hyperplane(hyp_x_min, self.w, self.b, 0)
        db2 = hyperplane(hyp_x_max, self.w, self.b, 0)
        self.ax.plot([hyp_x_min, hyp_x_max], [db1, db2], 'y--')

data_dict = {-1: np.array([[1, 7], [2, 8], [3, 8]]),
             1: np.array([[5, 1], [6, -1], [7, 3]])}
svm = SVM()  # Linear Kernel
svm.fit(data=data_dict)
svm.visualize()
svm.predict([3, 8])
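For comparison, the same toy data can be fit with scikit-learn's linear-kernel SVC; the stepping optimizer above only approximates the true maximum-margin solution:

from sklearn.svm import SVC

X_all = np.concatenate([data_dict[-1], data_dict[1]])
y_all = np.array([-1, -1, -1, 1, 1, 1])
svc = SVC(kernel='linear', C=1e6).fit(X_all, y_all)  # large C approximates a hard margin
print("sklearn w:", svc.coef_, "b:", svc.intercept_)
print("sklearn prediction for [3, 8]:", svc.predict([[3, 8]]))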