Last active
February 4, 2018 16:30
-
-
Save cjue25/2fa205de97fb56233fe69513bc28263c to your computer and use it in GitHub Desktop.
Use Python for Research_edX_Case_Study_3
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
numpy.power(兩點相減,相差的平方) | |
max(dictionary) = max(dictionary.keys()) | |
max(dictionary.values()) 是找值 | |
scipy.stats.mstats.mode | |
直接丟入list( or array),可以轉換成(最多出現的值,次數) | |
但是呈現規則為「次數皆相同者,只會秀出最小的值」,沒有考慮重複的問題 | |
np.argsort(np.array) #從給定的np.array中由小排到大的index位置 | |
scipy.stats.norm(0,1).rvs((n,2)) | |
建立一個n列、2欄的array,取值來自平均值為0,標準差為1的分布 | |
numpy.repeat(0,n) | |
建立一個1維的array,重複n個0 | |
xs=numpy.arange(x_min,x_max,xh) | |
ys=numpy.arange(y_min,y_max,yh) | |
xx,yy=numpy.meshgrid(xs,ys) | |
想成是一個網格 | |
直接舉例 | |
x=np.arange(0,4,2) → x=array([0, 2]) | |
y=np.arange(2,10,3) → array([2, 5, 8]) | |
xx,yy=np.meshgrid(x,y) | |
xx | |
array([[0, 2], | |
[0, 2], | |
[0, 2]]) | |
yy | |
array([[2, 2], | |
[5, 5], | |
[8, 8]]) | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import random
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as ss
def distance(p1, p2):
    """Return the Euclidean distance between the points ``p1`` and ``p2``.

    Both arguments are converted with ``np.array``, so lists, tuples and
    arrays of matching shape are accepted.
    """
    offset = np.array(p1) - np.array(p2)
    squared = np.power(offset, 2)
    return np.sqrt(np.sum(squared))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def find_nearest_neibors(p, points, k):
    """Return the indices of the ``k`` entries of ``points`` closest to ``p``.

    Parameters
    ----------
    p : array-like
        The query point.
    points : array-like
        Candidate points, one per row (first axis).
    k : int
        Number of neighbours wanted.  If ``k`` is larger than the number of
        points, the indices of all points are returned.

    Returns
    -------
    numpy.ndarray
        Indices into ``points`` ordered from nearest to farthest.  Points at
        exactly the same distance keep their original relative order.
    """
    points = np.asarray(points)
    squared = np.power(points - np.asarray(p), 2)
    # Sum over every axis except the first so each row collapses to a single
    # distance; this replaces the per-point Python loop with one array pass.
    distances = np.sqrt(np.sum(squared, axis=tuple(range(1, squared.ndim))))
    # A stable sort makes the ranking of equidistant points deterministic
    # (lowest index first) instead of depending on the sort implementation.
    order = np.argsort(distances, kind="stable")
    return order[:k]
##測試 | |
#points=np.array([[1,1],[1,2],[1,3],[2,1],[2,2],[2,3],[3,1],[3,2],[3,3]]) | |
#p=np.array([2.5,2]) | |
#print (find_nearest_neibors(p,points,k=5)) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def majority_vote(votes):
    """Return the most frequent element of ``votes``.

    When several elements share the highest count, one of them is picked
    with ``random.choice`` so that no value is systematically favoured.

    Parameters
    ----------
    votes : iterable of hashable values

    Raises
    ------
    ValueError
        If ``votes`` is empty.
    """
    vote_counts = Counter(votes)
    if not vote_counts:
        raise ValueError("majority_vote() requires at least one vote")
    top_count = max(vote_counts.values())
    winners = [vote for vote, count in vote_counts.items() if count == top_count]
    return random.choice(winners)  # random tie-break among equally common votes
#print (majority_vote([1,2,3,2,1,1,2,3])) | |
def majority_vote_short(votes):
    """Return the mode of ``votes`` as computed by ``scipy.stats.mstats.mode``.

    Shorter alternative to ``majority_vote`` without random tie-breaking:
    according to the original author's note, when several values share the
    highest count only the smallest one is reported.

    The first field (the mode) of SciPy's result is returned unchanged.
    NOTE(review): in SciPy this is presumably an array rather than a plain
    scalar -- confirm against the installed version before relying on it.
    """
    result = ss.mstats.mode(votes)
    return result[0]
#print (majority_vote_short([1,2,3,2,1,1,2,3])) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def knn_predict(p, points, outcomes, k):
    """Predict the class of ``p`` from its ``k`` nearest neighbours.

    The neighbours are located with ``find_nearest_neibors`` and their
    entries in ``outcomes`` are passed to ``majority_vote``.
    ``outcomes`` is indexed with an index array, so it is expected to
    support NumPy-style fancy indexing.
    """
    neighbor_idx = find_nearest_neibors(p, points, k)
    neighbor_labels = outcomes[neighbor_idx]
    return majority_vote(neighbor_labels)
#outcomes = np.array([0,0,0,0,1,1,1,1,1]) | |
#print (knn_predict(np.array([2.5,2.7]),points,outcomes,k=2)) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def generate_synth_data(n=50, random_state=None):
    """Create two sets of points from bivariate normal distributions.

    Produces a data set with a known structure for checking the KNN
    predictor.

    Parameters
    ----------
    n : int
        Number of points per class.
    random_state : None, int, or NumPy random generator, optional
        Forwarded to SciPy's ``rvs``.  ``None`` (the default) keeps the
        original unseeded behaviour; an integer gives reproducible data.

    Returns
    -------
    (points, outcomes)
        ``points`` has ``2 * n`` rows and 2 columns: the first ``n`` rows are
        drawn from ``norm(0, 1)`` and the last ``n`` from ``norm(1, 1)``.
        ``outcomes`` holds ``n`` zeros followed by ``n`` ones.
    """
    if isinstance(random_state, (int, np.integer)):
        # Build one generator shared by both draws: handing the same integer
        # seed to each rvs call would make class 1 a shifted copy of class 0.
        random_state = np.random.RandomState(random_state)
    # norm(mean, standard deviation); rvs takes the (rows, columns) shape.
    class_zero = ss.norm(0, 1).rvs((n, 2), random_state=random_state)
    class_one = ss.norm(1, 1).rvs((n, 2), random_state=random_state)
    points = np.concatenate((class_zero, class_one), axis=0)
    outcomes = np.concatenate((np.repeat(0, n), np.repeat(1, n)))
    return (points, outcomes)
#畫畫看 | |
##n=20 | |
##points,outcomes=generate_synth_data(n=20) | |
##plt.figure() | |
##plt.plot(points[:n,0],points[:n,1],"ro") | |
##plt.plot(points[n:,0],points[n:,1],"bo") | |
##plt.savefig("bivardata.pdf") |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def make_prediction_grid(predictors, outcomes, limits, h, k):
    """Classify each point on the prediction grid.

    Parameters
    ----------
    predictors, outcomes
        Training observations and their classes, passed to ``knn_predict``.
    limits : tuple
        ``(x_min, x_max, y_min, y_max)`` bounds of the grid.
    h : number
        Step used along both axes (``np.arange`` excludes the upper bound).
    k : int
        Number of neighbours used for every prediction.

    Returns
    -------
    (xx, yy, prediction_grid)
        The ``np.meshgrid`` coordinate arrays and an integer array of the
        same shape with the predicted class of every grid point.
    """
    x_lo, x_hi, y_lo, y_hi = limits
    x_values = np.arange(x_lo, x_hi, h)
    y_values = np.arange(y_lo, y_hi, h)
    xx, yy = np.meshgrid(x_values, y_values)

    prediction_grid = np.zeros(xx.shape, dtype=int)
    for col, x_val in enumerate(x_values):
        for row, y_val in enumerate(y_values):
            # meshgrid puts y along rows and x along columns, hence [row, col].
            grid_point = np.array([x_val, y_val])
            prediction_grid[row, col] = knn_predict(grid_point, predictors, outcomes, k)
    return (xx, yy, prediction_grid)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def plot_prediction_grid(xx, yy, prediction_grid, filename, predictors=None, outcomes=None):
    """Plot KNN predictions for every point on the grid and save the figure.

    Parameters
    ----------
    xx, yy, prediction_grid
        Grid coordinates and predicted classes, as returned by
        ``make_prediction_grid``.
    filename : str
        Path the figure is written to with ``plt.savefig``.
    predictors, outcomes : optional
        Observations (two columns are plotted) and their classes to overlay
        as a scatter plot.  When omitted, the module-level variables
        ``predictors`` and ``outcomes`` are used.

    Raises
    ------
    KeyError
        If an argument is omitted and no module-level variable of that name
        exists.
    """
    from matplotlib.colors import ListedColormap

    # This function used to read the module-level ``predictors``/``outcomes``
    # implicitly.  Explicit arguments are preferred; the fallback keeps the
    # existing four-argument calls working.
    if predictors is None:
        predictors = globals()["predictors"]
    if outcomes is None:
        outcomes = globals()["outcomes"]

    background_colormap = ListedColormap(["hotpink", "lightskyblue", "yellowgreen"])
    observation_colormap = ListedColormap(["red", "blue", "green"])
    plt.figure(figsize=(10, 10))
    # Translucent background shows the predicted class of every grid cell.
    plt.pcolormesh(xx, yy, prediction_grid, cmap=background_colormap, alpha=0.5)
    # Observations are drawn on top, coloured by their class.
    plt.scatter(predictors[:, 0], predictors[:, 1], c=outcomes, cmap=observation_colormap, s=50)
    plt.xlabel('Variable 1')
    plt.ylabel('Variable 2')
    plt.xticks(())
    plt.yticks(())
    plt.xlim(np.min(xx), np.max(xx))
    plt.ylim(np.min(yy), np.max(yy))
    plt.savefig(filename)
# Generate one synthetic data set and classify the same grid with a small
# and a large number of neighbours.
(predictors, outcomes) = generate_synth_data()
limits = (-3, 4, -3, 4)
h = 0.1
for k, filename in ((5, "knn_synth_5.pdf"), (50, "knn_synth_50.pdf")):
    (xx, yy, prediction_grid) = make_prediction_grid(predictors, outcomes, limits, h, k)
    # Plotting is left disabled, as in the original script:
    # plot_prediction_grid(xx, yy, prediction_grid, filename)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Use scikit-learn's iris data set to compare scikit-learn's KNN classifier
# with the home-made predictor defined in this file.
from sklearn import datasets
iris = datasets.load_iris()
predictors = iris.data[:, 0:2]  # the lecture only uses the first two features
outcomes = iris.target

# One colour per class: column 0 on the x axis, column 1 on the y axis.
plt.plot(predictors[outcomes == 0, 0], predictors[outcomes == 0, 1], 'ro')
plt.plot(predictors[outcomes == 1, 0], predictors[outcomes == 1, 1], 'go')
plt.plot(predictors[outcomes == 2, 0], predictors[outcomes == 2, 1], 'bo')

k = 5
filename = "iris_grid.pdf"
limits = (4, 8, 1.5, 4.5)
h = 0.1
(xx, yy, prediction_grid) = make_prediction_grid(predictors, outcomes, limits, h, k)
plot_prediction_grid(xx, yy, prediction_grid, filename)

from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(predictors, outcomes)
sk_predictions = knn.predict(predictors)
my_predictions = np.array([knn_predict(p, predictors, outcomes, 5) for p in predictors])

# Percentage of observations on which the two classifiers agree.
print(100 * np.mean(sk_predictions == my_predictions))
# Percentage of observations scikit-learn classifies as the recorded target.
print(100 * np.mean(sk_predictions == iris.target))
# Percentage of observations the home-made predictor classifies as the target.
print(100 * np.mean(my_predictions == iris.target))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment