Skip to content

Instantly share code, notes, and snippets.

@cjue25
Last active February 4, 2018 16:30
Show Gist options
  • Save cjue25/2fa205de97fb56233fe69513bc28263c to your computer and use it in GitHub Desktop.
Save cjue25/2fa205de97fb56233fe69513bc28263c to your computer and use it in GitHub Desktop.
Use Python for Research_edX_Case_Study_3
numpy.power(p1 - p2, 2):將兩點座標相減後,逐元素取平方
max(dictionary) = max(dictionary.keys())
max(dictionary.values()) 是找值
scipy.stats.mstats.mode
直接丟入list( or array),可以轉換成(最多出現的值,次數)
但是呈現規則為「次數皆相同者,只會秀出最小的值」,沒有考慮重複的問題
np.argsort(np.array) #從給定的np.array中由小排到大的index位置
scipy.stats.norm(0,1).rvs((n,2))
建立一個n列、2欄的array,取值來自平均值為0,標準差為1的分布
numpy.repeat(0,n)
建立一個1維的array,重複n個0
xs=numpy.arange(x_min,x_max,xh)
ys=numpy.arange(y_min,y_max,yh)
xx,yy=numpy.meshgrid(xs,ys)
想成是一個網格
直接舉例
x=np.arange(0,4,2) → x=array([0, 2])
y=np.arange(2,10,3) → array([2, 5, 8])
xx,yy=np.meshgrid(x,y)
xx
array([[0, 2],
[0, 2],
[0, 2]])
yy
array([[2, 2],
[5, 5],
[8, 8]])
import random
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as ss
# Compute the distance between two points.
def distance(p1, p2):
    """Return the Euclidean distance between points ``p1`` and ``p2``.

    Both inputs are converted to float arrays before subtracting, so
    unsigned or narrow integer inputs (e.g. ``uint8``) cannot wrap around
    in ``p1 - p2`` or when the difference is squared.

    Args:
        p1: Array-like coordinates of the first point.
        p2: Array-like coordinates of the second point.

    Returns:
        The square root of the sum of squared coordinate differences.
    """
    first = np.asarray(p1, dtype=float)
    second = np.asarray(p2, dtype=float)
    return np.sqrt(np.sum(np.power(first - second, 2)))
def find_nearest_neibors(p, points, k):
    """Return the indices of the ``k`` entries of ``points`` nearest to ``p``.

    Distances are Euclidean and computed in one vectorized expression.
    Inputs are converted to float arrays, so plain lists are accepted and
    integer dtypes cannot overflow.

    Args:
        p: Array-like coordinates of the query point.
        points: Array-like whose first axis enumerates the candidate points.
        k: Number of neighbours to return. If ``k`` exceeds the number of
            points, the indices of all points are returned.

    Returns:
        A NumPy array of indices into ``points``, ordered from nearest to
        farthest. Equidistant points keep their original relative order.
    """
    diffs = np.asarray(points, dtype=float) - np.asarray(p, dtype=float)
    # Reduce over every axis except the first so each candidate point
    # collapses to a single distance (also works for 1-D scalar "points").
    coord_axes = tuple(range(1, diffs.ndim))
    distances = np.sqrt(np.sum(np.power(diffs, 2), axis=coord_axes))
    # A stable sort makes the choice among tied distances deterministic.
    order = np.argsort(distances, kind="stable")
    return order[:k]
##測試
#points=np.array([[1,1],[1,2],[1,3],[2,1],[2,2],[2,3],[3,1],[3,2],[3,3]])
#p=np.array([2.5,2])
#print (find_nearest_neibors(p,points,k=5))
# Count the votes; when several values tie for first place, pick one at random.
def majority_vote(votes):
    """Return the most common element of ``votes``.

    Args:
        votes: Iterable of hashable vote values.

    Returns:
        The value with the highest count. If several values share the
        highest count, one of them is chosen with ``random.choice``.

    Raises:
        ValueError: If ``votes`` is empty.
    """
    tally = Counter(votes)
    if not tally:
        raise ValueError("votes must not be empty")
    top_count = max(tally.values())
    # Collect every value that reaches the top count so ties are handled.
    winners = [vote for vote, count in tally.items() if count == top_count]
    return random.choice(winners)
#print (majority_vote([1,2,3,2,1,1,2,3]))
## Shortcut version; its drawback is that on a tie only the first value is picked.
def majority_vote_short(votes):
    """Return the mode of ``votes`` as computed by ``scipy.stats.mstats.mode``.

    ``scipy.stats.mstats.mode`` accepts a list directly and yields a
    (most frequent value, count) pair. When several values share the
    highest count, only the smallest one is reported, so ties are not
    broken at random as in ``majority_vote``.

    Args:
        votes: List or array of vote values.

    Returns:
        The ``mode`` part of SciPy's result, passed through unchanged.
    """
    # ``ss`` is the module-level alias for ``scipy.stats``.
    mode, _count = ss.mstats.mode(votes)
    return mode
#print (majority_vote_short([1,2,3,2,1,1,2,3]))
def knn_predict(p, points, outcomes, k):
    """Predict the class of ``p`` from its ``k`` nearest neighbours.

    Looks up the indices of the ``k`` points closest to ``p``, selects the
    matching entries of ``outcomes``, and returns their majority vote.
    """
    neighbour_indices = find_nearest_neibors(p, points, k)
    neighbour_labels = outcomes[neighbour_indices]
    return majority_vote(neighbour_labels)
#outcomes = np.array([0,0,0,0,1,1,1,1,1])
#print (knn_predict(np.array([2.5,2.7]),points,outcomes,k=2))
# Build a data set with a known structure to check the KNN prediction model.
def generate_synth_data(n=50, random_state=None):
    """Create two sets of points from bivariate normal distributions.

    The first ``n`` points are drawn from a normal distribution with mean 0
    and standard deviation 1 and are labelled 0. The next ``n`` points are
    drawn with mean 1 and standard deviation 1 and are labelled 1.

    Args:
        n: Number of points per class.
        random_state: Optional seed or random generator for reproducible
            draws. ``None`` keeps SciPy's default random source.

    Returns:
        A ``(points, outcomes)`` tuple: ``points`` has shape ``(2 * n, 2)``
        and ``outcomes`` holds the ``2 * n`` class labels.
    """
    # Turn an integer seed into one shared generator; passing the same seed
    # to both draws would make the second class a shifted copy of the first.
    if isinstance(random_state, (int, np.integer)):
        random_state = np.random.RandomState(random_state)
    # norm(mean, standard deviation); rvs takes the (rows, columns) shape.
    class_zero = ss.norm(0, 1).rvs(size=(n, 2), random_state=random_state)
    class_one = ss.norm(1, 1).rvs(size=(n, 2), random_state=random_state)
    points = np.concatenate((class_zero, class_one), axis=0)
    outcomes = np.concatenate((np.repeat(0, n), np.repeat(1, n)))
    return (points, outcomes)
#畫畫看
##n=20
##points,outcomes=generate_synth_data(n=20)
##plt.figure()
##plt.plot(points[:n,0],points[:n,1],"ro")
##plt.plot(points[n:,0],points[n:,1],"bo")
##plt.savefig("bivardata.pdf.pdf")
# Turn the predictions into a prediction grid.
def make_prediction_grid(predictors, outcomes, limits, h, k):
    """Classify every point of a regular grid with the KNN predictor.

    ``limits`` is ``(x_min, x_max, y_min, y_max)`` and ``h`` is the step
    used along both axes. Returns ``(xx, yy, prediction_grid)``, where
    ``xx`` and ``yy`` come from ``np.meshgrid`` and ``prediction_grid`` is
    an integer array of the same shape holding the predicted class.
    """
    x_lo, x_hi, y_lo, y_hi = limits
    x_values = np.arange(x_lo, x_hi, h)
    y_values = np.arange(y_lo, y_hi, h)
    xx, yy = np.meshgrid(x_values, y_values)
    prediction_grid = np.zeros(xx.shape, dtype=int)
    # Walk the grid one x-column at a time; rows index y and columns index x,
    # matching the layout produced by meshgrid.
    for col, x_coord in enumerate(x_values):
        for row, y_coord in enumerate(y_values):
            query = np.array([x_coord, y_coord])
            prediction_grid[row, col] = knn_predict(query, predictors, outcomes, k)
    return (xx, yy, prediction_grid)
# Plot the prediction results.
def plot_prediction_grid(xx, yy, prediction_grid, filename, predictors=None, outcomes=None):
    """Plot KNN predictions for every point on the grid and save the figure.

    Args:
        xx: Grid x-coordinates, as returned by ``make_prediction_grid``.
        yy: Grid y-coordinates, as returned by ``make_prediction_grid``.
        prediction_grid: Predicted class for each grid point.
        filename: Path the figure is saved to.
        predictors: Observations to overlay; columns 0 and 1 are plotted.
            Defaults to the module-level ``predictors``.
        outcomes: Class labels used to colour the observations.
            Defaults to the module-level ``outcomes``.
    """
    from matplotlib.colors import ListedColormap

    # Earlier callers relied on module-level data; keep that as the fallback
    # so existing four-argument calls behave exactly as before.
    if predictors is None:
        predictors = globals()["predictors"]
    if outcomes is None:
        outcomes = globals()["outcomes"]

    background_colormap = ListedColormap(["hotpink", "lightskyblue", "yellowgreen"])
    observation_colormap = ListedColormap(["red", "blue", "green"])
    plt.figure(figsize=(10, 10))
    plt.pcolormesh(xx, yy, prediction_grid, cmap=background_colormap, alpha=0.5)
    plt.scatter(predictors[:, 0], predictors[:, 1], c=outcomes, cmap=observation_colormap, s=50)
    plt.xlabel('Variable 1')
    plt.ylabel('Variable 2')
    # Hide the tick marks on both axes.
    plt.xticks(())
    plt.yticks(())
    plt.xlim(np.min(xx), np.max(xx))
    plt.ylim(np.min(yy), np.max(yy))
    plt.savefig(filename)
# Synthetic data: classify a grid over the generated points with k=5.
predictors, outcomes = generate_synth_data()
k = 5
filename = "knn_synth_5.pdf"
limits = (-3, 4, -3, 4)
h = 0.1
xx, yy, prediction_grid = make_prediction_grid(predictors, outcomes, limits, h, k)
#plot_prediction_grid(xx,yy,prediction_grid,filename)

# Same synthetic data and grid, now with k=50.
k = 50
filename = "knn_synth_50.pdf"
limits = (-3, 4, -3, 4)
h = 0.1
xx, yy, prediction_grid = make_prediction_grid(predictors, outcomes, limits, h, k)
#plot_prediction_grid(xx,yy,prediction_grid,filename)

# Use sklearn's iris data set to compare sklearn's KNN model with the
# hand-written predictor.
from sklearn import datasets

iris = datasets.load_iris()
predictors = iris.data[:, 0:2]  # the video only uses the first two features for now
outcomes = iris.target

# One plot call per class, each with its own marker colour.
plt.plot(predictors[outcomes == 0, 0], predictors[outcomes == 0, 1], 'ro')
plt.plot(predictors[outcomes == 1, 0], predictors[outcomes == 1, 1], 'go')
plt.plot(predictors[outcomes == 2, 0], predictors[outcomes == 2, 1], 'bo')

k = 5
filename = "iris_grid.pdf"
limits = (4, 8, 1.5, 4.5)
h = 0.1
xx, yy, prediction_grid = make_prediction_grid(predictors, outcomes, limits, h, k)
plot_prediction_grid(xx, yy, prediction_grid, filename)

from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(predictors, outcomes)
sk_predictions = knn.predict(predictors)
my_predictions = np.array([knn_predict(p, predictors, outcomes, 5) for p in predictors])

# Percent agreement: sklearn vs. ours, sklearn vs. truth, ours vs. truth.
print(100 * np.mean(sk_predictions == my_predictions))
print(100 * np.mean(sk_predictions == iris.target))
print(100 * np.mean(my_predictions == iris.target))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment