
@tpgmartin (last active November 14, 2017 08:59)

Revisions

  1. tpgmartin revised this gist Nov 14, 2017. 1 changed file with 0 additions and 4 deletions.
    4 changes: 0 additions & 4 deletions simple_knn_classifier.py
    @@ -27,10 +27,6 @@ def __closest(self, row):
              return self.__vote(sorted_distances)

          def __vote(self, distances):
    -         # labels = []
    -         # for i in range(self.n_neighbors):
    -         #     labels.append(distances[i][0])
    -         # return Counter(labels).most_common(1)[0][0]
              return Counter(x[0] for x in distances[:self.n_neighbors]).most_common(1)[0][0]

      # test.py
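
    For context, the surviving one-liner keeps only the labels of the k nearest neighbours and returns the most frequent one. A minimal sketch of that majority vote on hypothetical, already-sorted (label, distance) pairs (not part of the gist):

    from collections import Counter

    # hypothetical (label, distance) pairs, already sorted by distance as __closest returns them
    sorted_distances = [(2, 0.1), (1, 0.2), (2, 0.3), (0, 0.9)]
    n_neighbors = 3

    # slice off the k nearest, count their labels, and take the most common label
    print(Counter(x[0] for x in sorted_distances[:n_neighbors]).most_common(1)[0][0])  # -> 2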
  2. tpgmartin revised this gist Nov 7, 2017. 1 changed file with 55 additions and 46 deletions.
    101 changes: 55 additions & 46 deletions simple_knn_classifier.py
    @@ -1,69 +1,78 @@
    - import math
    + # main.py
      from scipy.spatial import distance
      from collections import Counter

    - def euc(a, b):
    -     return distance.euclidean(a,b)
    -
    -
    - class ScrappyKNN():
    + class KNN():

    -     def fit(self, X_train, y_train, k):
    +     def __init__(self, n_neighbors=1):
    +         self.n_neighbors = n_neighbors
    +
    +     def fit(self, X_train, y_train):
              self.X_train = X_train
              self.y_train = y_train
    -         self.k = k

          def predict(self, X_test):
              predictions = []
              for row in X_test:
    -             # label = self.closest(row)
    -             predictions.append(label)
    +             prediction = self.__closest(row)
    +             predictions.append(prediction)
              return predictions

    -     # def closest(self, row):
    -     #     best_dist = euc(row, self.X_train[0])
    -     #     best_index = 0
    -     #     for i in range(1, len(self.X_train)):
    -     #         dist = euc(row, self.X_train[i])
    -     #         if dist < best_dist:
    -     #             best_dist = dist
    -     #             best_index = i
    -     #     return self.y_train[best_index]
    -
    -     def closest(self, row):
    -         # best_dist = euc(row, self.X_train[0])
    -         # best_index = 0
    +     def __closest(self, row):
              distances = []
    -         for i in range(1, len(self.X_train)):
    -             dist = euc(row, self.X_train[i])
    -             distances.append([self.y_train[i], dist])
    -         return sorted(distances, key=lambda x: x[1])
    +         for i in range(len(self.X_train)):
    +             dist = distance.euclidean(row, self.X_train[i])
    +             distances.append((self.y_train[i], dist))
    +         sorted_distances = sorted(distances, key=lambda x: x[1])
    +         return self.__vote(sorted_distances)
    +
    +     def __vote(self, distances):
    +         # labels = []
    +         # for i in range(self.n_neighbors):
    +         #     labels.append(distances[i][0])
    +         # return Counter(labels).most_common(1)[0][0]
    +         return Counter(x[0] for x in distances[:self.n_neighbors]).most_common(1)[0][0]

    -     def vote(self, row):
    -         distances = self.closest(row)
    -         labels = []
    -         for i in range(self.k):
    -             labels.append(distances[i][0])
    -         return Counter(labels).most_common(1)[0][0]
    -
    -
    - from sklearn import datasets
    - iris = datasets.load_iris()
    -
    - X = iris.data
    - y = iris.target
    -
    - from sklearn.cross_validation import train_test_split
    - X_train, X_test, y_train, y_test = train_test_split(
    -     X, y, test_size=0.5, random_state=0)
    -
    - # from sklearn.neighbors import KNeighborsClassifier
    - clf = ScrappyKNN()
    -
    - clf.fit(X_train, y_train, 3)
    -
    - predictions = clf.predict(X_test)
    -
    - from sklearn.metrics import accuracy_score
    - print(accuracy_score(y_test, predictions))
    + # test.py
    + import pytest
    + from main import KNN
    +
    + X_train = [
    +     [0, 0, 0, 0],
    +     [1, 1, 1, 1],
    +     [1, 1, 1, 1],
    +     [2, 2, 2, 2],
    +     [2, 2, 2, 2],
    +     [2, 2, 2, 2],
    +     [2, 2, 2, 2]
    + ]
    + y_train = [0, 1, 1, 2, 2, 2, 2]
    +
    +
    + @pytest.mark.parametrize(('n_neighbors'),[1,3,5])
    + def test_KNN_should_be_initialised_with_n_neighbors(n_neighbors):
    +     clf = KNN(n_neighbors)
    +
    +     clf.fit(X_train, y_train)
    +
    +     assert clf.n_neighbors == n_neighbors
    +
    +
    + @pytest.mark.parametrize(('n_neighbors'),[1,3,5])
    + def test_should_be_able_to_pass_training_data_to_classifier(n_neighbors):
    +     clf = KNN(n_neighbors)
    +
    +     clf.fit(X_train, y_train)
    +
    +     assert clf.X_train == X_train
    +     assert clf.y_train == y_train
    +
    +
    + X_test = [[0, 0, 0, 0]]
    +
    + @pytest.mark.parametrize(('n_neighbors', 'y_test'),[(1, [0]),(3, [1]), (7, [2])])
    + def test_predict_should_return_label_for_test_data(n_neighbors, y_test):
    +     clf = KNN(n_neighbors)
    +
    +     clf.fit(X_train, y_train)
    +
    +     predictions = clf.predict(X_test)
    +
    +     assert predictions == y_test
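
    As a quick usage sketch (not from the gist itself), the refactored classifier can also be exercised directly on the toy training data from test.py; this assumes the KNN class above has been saved as main.py:

    # assumes the KNN class from this revision lives in main.py
    from main import KNN

    X_train = [[0, 0, 0, 0], [1, 1, 1, 1], [1, 1, 1, 1],
               [2, 2, 2, 2], [2, 2, 2, 2], [2, 2, 2, 2], [2, 2, 2, 2]]
    y_train = [0, 1, 1, 2, 2, 2, 2]

    clf = KNN(n_neighbors=3)
    clf.fit(X_train, y_train)

    # two of the three nearest rows to [1, 1, 1, 1] are labelled 1, so the vote returns [1]
    print(clf.predict([[1, 1, 1, 1]]))  # -> [1]

    The parametrised tests themselves can be run with pytest once the file is split into main.py and test.py as the comments suggest.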
  3. tpgmartin created this gist Nov 5, 2017.
    69 changes: 69 additions & 0 deletions simple_knn_classifier.py
    @@ -0,0 +1,69 @@
    import math
    from scipy.spatial import distance
    from collections import Counter


    def euc(a, b):
        return distance.euclidean(a,b)


    class ScrappyKNN():

        def fit(self, X_train, y_train, k):
            self.X_train = X_train
            self.y_train = y_train
            self.k = k

        def predict(self, X_test):
            predictions = []
            for row in X_test:
                # label = self.closest(row)
                predictions.append(label)
            return predictions

        # def closest(self, row):
        #     best_dist = euc(row, self.X_train[0])
        #     best_index = 0
        #     for i in range(1, len(self.X_train)):
        #         dist = euc(row, self.X_train[i])
        #         if dist < best_dist:
        #             best_dist = dist
        #             best_index = i
        #     return self.y_train[best_index]

        def closest(self, row):
            # best_dist = euc(row, self.X_train[0])
            # best_index = 0
            distances = []
            for i in range(1, len(self.X_train)):
                dist = euc(row, self.X_train[i])
                distances.append([self.y_train[i], dist])
            return sorted(distances, key=lambda x: x[1])

        def vote(self, row):
            distances = self.closest(row)
            labels = []
            for i in range(self.k):
                labels.append(distances[i][0])
            return Counter(labels).most_common(1)[0][0]


    from sklearn import datasets
    iris = datasets.load_iris()

    X = iris.data
    y = iris.target

    from sklearn.cross_validation import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.5, random_state=0)

    # from sklearn.neighbors import KNeighborsClassifier
    clf = ScrappyKNN()

    clf.fit(X_train, y_train, 3)

    predictions = clf.predict(X_test)

    from sklearn.metrics import accuracy_score
    print(accuracy_score(y_test, predictions))
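
    Two caveats when running this first version as-is: predict references label while the line that assigns it is commented out, so it raises a NameError until the later revision fixes it, and sklearn.cross_validation was removed in scikit-learn 0.20, so on a modern install the import would need to change along these lines (a likely adjustment, not part of the gist):

    # on scikit-learn >= 0.20, train_test_split lives in model_selection
    from sklearn.model_selection import train_test_split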