@Axhat
Last active December 4, 2022 14:17
1. Find-S
import pandas as pd
import numpy as np
def train(X, y):
    # Start from the most specific hypothesis: every attribute is "null"
    hypothesis = []
    for i in range(len(X[0])):
        hypothesis.append("null")
    # Iterate over the training examples
    for i in range(len(X)):
        # Only positive examples generalize the hypothesis
        if y[i] == "yes":
            # Compare every attribute of the example with the hypothesis
            for j in range(len(X[i])):
                feature = X[i][j]
                if hypothesis[j] == "null":
                    hypothesis[j] = feature
                elif hypothesis[j] != feature:
                    hypothesis[j] = "general"
    return hypothesis
# Read the csv and split into the feature matrix X and the target vector y
df = pd.read_csv('./a1.csv')
X = np.array(df.iloc[:, :-1])
y = np.array(df.iloc[:, -1])
hypothesis = train(X, y)
print(hypothesis)
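a1.csv is not included in the gist; Find-S is usually demonstrated on the EnjoySport data, so a file along these lines (column names and values purely illustrative), with the yes/no concept label in the last column, would work:
sky,airtemp,humidity,wind,water,forecast,enjoysport
sunny,warm,normal,strong,warm,same,yes
sunny,warm,high,strong,warm,same,yes
rainy,cold,high,strong,warm,change,no
sunny,warm,high,strong,cool,change,yes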
2. Candidate Elimination
import pandas as pd
import numpy as np
def train(X, y):
    # General boundary G: start with the most general hypotheses (all '?')
    general_h = np.full((len(X[0]), len(X[0])), fill_value="?", dtype=object)
    # Specific boundary S: start with the first training example
    specific_h = X[0].copy()
    for i in range(len(X)):
        # Positive example: generalize the specific hypothesis
        if y[i] == "yes":
            for j in range(len(X[i])):
                if X[i][j] != specific_h[j]:
                    specific_h[j] = '?'
                    general_h[j][j] = '?'
        # Negative example: specialize the general hypotheses
        elif y[i] == "no":
            for j in range(len(X[i])):
                if X[i][j] != specific_h[j]:
                    general_h[j][j] = specific_h[j]
                else:
                    general_h[j][j] = '?'
    # Drop rows of G that are still fully general (all '?')
    indices = []
    ref = np.full(len(X[0]), fill_value="?", dtype=object)
    for i in range(len(general_h)):
        if np.array_equal(general_h[i], ref):
            indices.append(i)
    general_h = np.delete(general_h, indices, axis=0)
    return [general_h, specific_h]
df = pd.read_csv('./a1.csv')
X = np.array(df.iloc[:, :-1])
y = np.array(df.iloc[:, -1])
general_h, specific_h = train(X, y)
print(general_h)
print(specific_h)
3. KNN
from math import sqrt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
k = 7
def find_distance(x, y):
    distance = 0
    for i in range(0, len(x)):
        if i == 0:
            # The first column is non-numeric, so treat it as categorical:
            # count a mismatch as a distance of 1
            if x[i] != y[i]:
                distance += 1.00
        else:
            distance += pow(x[i] - y[i], 2)
    return round(sqrt(distance), 2)
def k_nearest_neighbours(X, x):
    distances = {}
    for i in range(0, len(X)):
        distances[i] = find_distance(X[i], x)
    # Keep the indices of the k closest training points
    nearest_neighbours = sorted(distances.items(), key=lambda item: item[1])
    return dict(nearest_neighbours[0:k])
def predict(X_train, y_train, x):
    neighbours = k_nearest_neighbours(X_train, x)
    classes = {}
    max_val = 0
    max_class = ""
    # Majority vote among the k nearest neighbours
    for key in neighbours.keys():
        item_class = y_train[key]
        if item_class in classes:
            classes[item_class] += 1
        else:
            classes[item_class] = 1
        if classes[item_class] > max_val:
            max_val = classes[item_class]
            max_class = item_class
    return max_class
def main():
    df = pd.read_csv("./Climate_Data.csv")
    df = df.dropna(axis=0, subset=["AUST"])
    X = np.array(df.iloc[:, :-1])
    y = np.array(df.iloc[:, -1])
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=0)
    pred_df = {"Actual": [], "Predicted": []}
    for i in range(0, len(X_test)):
        pred_df["Actual"].append(y_test[i])
        pred_df["Predicted"].append(predict(X_train, y_train, X_test[i]))
    pred_df = pd.DataFrame.from_dict(pred_df)
    print(pred_df.head())
    cf_matrix = confusion_matrix(pred_df["Actual"], pred_df["Predicted"])
    print(cf_matrix)
main()
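accuracy_score is imported but never used; if an overall accuracy figure is wanted, one extra line inside main(), after the confusion-matrix print, would do it:
    print(accuracy_score(pred_df["Actual"], pred_df["Predicted"]))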
4. K-Means
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import style
style.use('ggplot')
class K_Means:
    def __init__(self, k=3, tol=0.001, max_iter=300):
        self.k = k
        self.tol = tol
        self.max_iter = max_iter
    def fit(self, data):
        # Seed the centroids with the first k points
        self.centroids = {}
        for i in range(self.k):
            self.centroids[i] = data[i]
        for i in range(self.max_iter):
            self.classifications = {}
            for j in range(self.k):
                self.classifications[j] = []
            # Assign every point to its nearest centroid
            for featureset in data:
                distances = [np.linalg.norm(featureset - self.centroids[centroid])
                             for centroid in self.centroids]
                classification = distances.index(min(distances))
                self.classifications[classification].append(featureset)
            prev_centroids = dict(self.centroids)
            # Move each centroid to the mean of its assigned points
            for classification in self.classifications:
                self.centroids[classification] = np.average(
                    self.classifications[classification], axis=0)
            # Stop once every centroid moves by less than tol percent
            optimized = True
            for c in self.centroids:
                original_centroid = prev_centroids[c]
                current_centroid = self.centroids[c]
                movement = np.sum(np.abs(
                    (current_centroid - original_centroid) / original_centroid * 100.0))
                if movement > self.tol:
                    print(movement)
                    optimized = False
            if optimized:
                break
    def predict(self, data):
        distances = [np.linalg.norm(data - self.centroids[centroid])
                     for centroid in self.centroids]
        classification = distances.index(min(distances))
        return classification
X = 1 + np.random.rand(20, 2)
X1 = -1 + np.random.rand(20, 2)
X2 = 5 + np.random.rand(20, 2)
X = np.concatenate((X, X1, X2))
plt.scatter(X[:, 0], X[:, 1])
plt.show()
colors = 10 * ["g", "r", "c", "b", "k"]
clf = K_Means()
clf.fit(X)
for centroid in clf.centroids:
    plt.scatter(clf.centroids[centroid][0], clf.centroids[centroid][1],
                marker="o", color="k", s=150, linewidths=5)
for classification in clf.classifications:
    color = colors[classification]
    for featureset in clf.classifications[classification]:
        plt.scatter(featureset[0], featureset[1],
                    marker="x", color=color, s=150, linewidths=5)
plt.show()
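predict is defined but never called; to classify a new point (coordinates arbitrary), something like:
print(clf.predict(np.array([5.5, 5.5])))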
5. K-Means (Using KNN)
import numpy as np
import pandas as pd
from math import sqrt
df = pd.read_csv('kmean.csv')
print(df)
X = df['x'].values
print(X)
Y = df['y'].values
print(Y)
n = len(X)
pairs = []
for i in range(0, n):
    pairs.append((X[i], Y[i]))
# print(pairs)
# Distance threshold for starting a new cluster
T = 4
clusters = []
clusters.append([])
clusters[0].append(pairs[0])
def euc_dist(C, X):
    x, y = C[0], C[1]
    x1, y1 = X[0], X[1]
    return sqrt(((x - x1) ** 2) + ((y - y1) ** 2))
n = 0
for i in range(1, len(df)):
    temp_clus = clusters[0]
    min_d = float('inf')
    count = 0
    # Find the closest existing point that lies within the threshold T
    for j in clusters:
        for k in j:
            dist = euc_dist(pairs[i], k)
            if dist >= T:
                continue
            else:
                if dist < min_d:
                    count = 1
                    min_d = dist
                    temp_clus = j
    if count == 0:
        # No cluster is close enough: start a new one
        n = n + 1
        clusters.append([])
        clusters[n].append(pairs[i])
    else:
        temp_clus.append(pairs[i])
# for cluster in clusters:
# print(cluster)
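To inspect the result, the commented-out loop above can simply be enabled:
for cluster in clusters:
    print(cluster)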
6. Gradient Descent
import numpy as np
import pandas as pd
data = pd.read_csv("Iris.csv")
def gradient_descent(x1, x2, x3, x4, y):
    m1 = m2 = m3 = m4 = b = 0
    n = len(x1)
    alpha = 0.01
    for i in range(100):
        y_pred = m1*x1 + m2*x2 + m3*x3 + m4*x4 + b
        # Partial derivatives of the mean squared error w.r.t. each coefficient
        md1 = -(2/n) * sum(x1 * (y - y_pred))
        md2 = -(2/n) * sum(x2 * (y - y_pred))
        md3 = -(2/n) * sum(x3 * (y - y_pred))
        md4 = -(2/n) * sum(x4 * (y - y_pred))
        bd = -(2/n) * sum(y - y_pred)
        m1 = m1 - md1*alpha
        m2 = m2 - md2*alpha
        m3 = m3 - md3*alpha
        m4 = m4 - md4*alpha
        b = b - bd*alpha
    print(m1, 'x1 + ', m2, 'x2 + ', m3, 'x3 + ', m4, 'x4 + ', b)
x1 = data['SepalLengthCm']
x2 = data['SepalWidthCm']
x3 = data['PetalLengthCm']
x4 = data['PetalWidthCm']
y = data['Species']
gradient_descent(x1, x2, x3, x4, y)
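As written, y holds the string labels from the Species column, so y - y_pred raises a TypeError. A minimal workaround, assuming an integer encoding of the three species is acceptable for this exercise:
y = pd.factorize(data['Species'])[0]  # encodes the three species as 0, 1, 2
gradient_descent(x1, x2, x3, x4, y)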
7. Linear Regression
import pandas as pd
import numpy as np
def train(X, y):
    # Fit simple linear regression using ordinary least squares
    X_mean = np.mean(X)
    y_mean = np.mean(y)
    # Sum of cross-deviations of x and y, and squared deviations of x
    sum_xy = 0
    sum_xx = 0
    for i in range(len(X)):
        x_diff = X[i] - X_mean
        y_diff = y[i] - y_mean
        sum_xy = sum_xy + (x_diff * y_diff)
        sum_xx = sum_xx + pow(X[i] - X_mean, 2)
    # Slope and intercept
    b1 = sum_xy / sum_xx
    b0 = y_mean - (X_mean * b1)
    return [b0[0], b1[0]]
def main():
    df = pd.read_csv('./LR.csv')
    print(df.head())
    X = np.array(df.iloc[:, :-1])
    y = np.array(df.iloc[:, -1])
    print(X)
    print(y)
    b0, b1 = train(X, y)
    b0 = round(b0, 2)
    b1 = round(b1, 2)
    print('The equation for linear regression is y =', b0, '+', b1, '* x')
main()
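LR.csv is not included in the gist; the code expects a single feature column followed by the target, so any two-column numeric file works, for example (values purely illustrative):
x,y
1,3
2,5
3,7
4,9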
8. Logistic Regression
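This section is empty in the gist; a minimal from-scratch sketch in the same style, assuming a numeric feature CSV with a 0/1 label in the last column (the file name and columns are placeholders):
import numpy as np
import pandas as pd
def sigmoid(z):
    return 1 / (1 + np.exp(-z))
def train(X, y, alpha=0.1, iters=1000):
    # Gradient descent on the logistic (cross-entropy) loss
    w = np.zeros(X.shape[1])
    b = 0
    n = len(y)
    for _ in range(iters):
        y_pred = sigmoid(X.dot(w) + b)
        error = y_pred - y
        w = w - alpha * X.T.dot(error) / n
        b = b - alpha * np.sum(error) / n
    return w, b
df = pd.read_csv('./logistic.csv')  # placeholder file name
X = np.array(df.iloc[:, :-1], dtype=float)
y = np.array(df.iloc[:, -1], dtype=float)
w, b = train(X, y)
print(w, b)
print((sigmoid(X.dot(w) + b) >= 0.5).astype(int))  # predicted classes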
9. SVM
# importing some basic libraries
%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib import style
style.use('ggplot')
import numpy as np
class SVM(object):
    def __init__(self, visualization=True):
        self.visualization = visualization
        self.colors = {1: 'r', -1: 'b'}
        if self.visualization:
            self.fig = plt.figure()
            self.ax = self.fig.add_subplot(1, 1, 1)
    def fit(self, data):
        # train with data
        self.data = data
        # { ||w|| : [w, b] }
        opt_dict = {}
        transforms = [[1, 1], [-1, 1], [-1, -1], [1, -1]]
        all_data = np.array([])
        for yi in self.data:
            all_data = np.append(all_data, self.data[yi])
        self.max_feature_value = max(all_data)
        self.min_feature_value = min(all_data)
        all_data = None
        # with smaller steps the margins and decision boundary get more precise,
        # but each extra level is much more expensive
        step_sizes = [self.max_feature_value * 0.1,
                      self.max_feature_value * 0.01,
                      self.max_feature_value * 0.001]
        b_range_multiple = 5
        # we don't need to take as small a step for b as for w
        b_multiple = 5
        latest_optimum = self.max_feature_value * 10
        """
        The objective is to satisfy yi*(xi.w + b) >= 1 for the whole training set
        while minimizing ||w||. Start from a large w and shrink it step by step,
        scanning b over a range at each candidate w.
        """
        # make the step smaller and smaller to get a more precise value
        for step in step_sizes:
            w = np.array([latest_optimum, latest_optimum])
            # we can do this because the problem is convex
            optimized = False
            while not optimized:
                for b in np.arange(-1 * self.max_feature_value * b_range_multiple,
                                   self.max_feature_value * b_range_multiple,
                                   step * b_multiple):
                    for transformation in transforms:
                        w_t = w * transformation
                        found_option = True
                        # weakest link in this SVM implementation; SMO fixes this a bit
                        # constraint: yi*(xi.w + b) >= 1
                        for i in self.data:
                            for xi in self.data[i]:
                                yi = i
                                if not yi * (np.dot(w_t, xi) + b) >= 1:
                                    found_option = False
                        if found_option:
                            # every point satisfies yi*(xi.w + b) >= 1 for this w_t, b,
                            # so store them keyed by ||w_t||
                            opt_dict[np.linalg.norm(w_t)] = [w_t, b]
                # once w[0] < 0 the transformed values of w start repeating,
                # so this pass is done
                if w[0] < 0:
                    optimized = True
                    print("optimized a step")
                else:
                    w = w - step
            # sort ||w|| so the smallest norm sits at position 0
            norms = sorted([n for n in opt_dict])
            # optimal values of w, b
            opt_choice = opt_dict[norms[0]]
            self.w = opt_choice[0]
            self.b = opt_choice[1]
            # start the next, finer pass near the current optimum
            latest_optimum = opt_choice[0][0] + step * 2
    def predict(self, features):
        # sign(x.w + b)
        classification = np.sign(np.dot(np.array(features), self.w) + self.b)
        if classification != 0 and self.visualization:
            self.ax.scatter(features[0], features[1], s=200,
                            marker='*', c=self.colors[classification])
        return (classification, np.dot(np.array(features), self.w) + self.b)
    def visualize(self):
        [[self.ax.scatter(x[0], x[1], s=100, c=self.colors[i])
          for x in data_dict[i]] for i in data_dict]
        # hyperplane = x.w + b (here it is just a line)
        # v = x0.w0 + x1.w1 + b  ->  x1 = (v - w[0]*x0 - b) / w[1]
        # psv:  x.w + b = 1   (positive support vector line)
        # nsv:  x.w + b = -1  (negative support vector line)
        # db:   x.w + b = 0   (decision boundary)
        def hyperplane(x, w, b, v):
            # returns the x2 value on the line for a given x1
            return (-w[0] * x - b + v) / w[1]
        hyp_x_min = self.min_feature_value * 0.9
        hyp_x_max = self.max_feature_value * 1.1
        # (w.x + b) = 1 : positive support vector hyperplane
        pav1 = hyperplane(hyp_x_min, self.w, self.b, 1)
        pav2 = hyperplane(hyp_x_max, self.w, self.b, 1)
        self.ax.plot([hyp_x_min, hyp_x_max], [pav1, pav2], 'k')
        # (w.x + b) = -1 : negative support vector hyperplane
        nav1 = hyperplane(hyp_x_min, self.w, self.b, -1)
        nav2 = hyperplane(hyp_x_max, self.w, self.b, -1)
        self.ax.plot([hyp_x_min, hyp_x_max], [nav1, nav2], 'k')
        # (w.x + b) = 0 : decision boundary
        db1 = hyperplane(hyp_x_min, self.w, self.b, 0)
        db2 = hyperplane(hyp_x_max, self.w, self.b, 0)
        self.ax.plot([hyp_x_min, hyp_x_max], [db1, db2], 'y--')
data_dict = {-1:np.array([[1,7],[2,8],[3,8]]),1:np.array([[5,1],[6,-1],[7,3]])}
svm = SVM() # Linear Kernel
svm.fit(data=data_dict)
svm.visualize()
svm.predict([3,8])
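predict returns both the predicted class and the raw value of w.x + b; to see them instead of discarding the result:
print(svm.predict([3, 8]))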