
@tpgmartin (last active November 14, 2017 08:59)

Revisions

  1. tpgmartin revised this gist Nov 14, 2017. 1 changed file with 0 additions and 4 deletions.
    4 changes: 0 additions & 4 deletions simple_knn_classifier.py
    @@ -27,10 +27,6 @@ def __closest(self, row):
              return self.__vote(sorted_distances)

          def __vote(self, distances):
    -         # labels = []
    -         # for i in range(self.n_neighbors):
    -         #     labels.append(distances[i][0])
    -         # return Counter(labels).most_common(1)[0][0]
              return Counter(x[0] for x in distances[:self.n_neighbors]).most_common(1)[0][0]

      # test.py
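
    For context, the surviving one-liner keeps only the labels of the k nearest neighbours and returns the most frequent one. A minimal sketch of that majority vote on hypothetical, already-sorted (label, distance) pairs (not part of the gist):

    from collections import Counter

    # hypothetical (label, distance) pairs, already sorted by distance as __closest returns them
    sorted_distances = [(2, 0.1), (1, 0.2), (2, 0.3), (0, 0.9)]
    n_neighbors = 3

    # slice off the k nearest, count their labels, and take the most common label
    print(Counter(x[0] for x in sorted_distances[:n_neighbors]).most_common(1)[0][0])  # -> 2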
  2. tpgmartin revised this gist Nov 7, 2017. 1 changed file with 55 additions and 46 deletions.
    101 changes: 55 additions & 46 deletions simple_knn_classifier.py
    @@ -1,69 +1,78 @@
    - import math
    + # main.py
      from scipy.spatial import distance
      from collections import Counter

    - def euc(a, b):
    -     return distance.euclidean(a,b)
    -
    -
    - class ScrappyKNN():
    + class KNN():

    -     def fit(self, X_train, y_train, k):
    +     def __init__(self, n_neighbors=1):
    +         self.n_neighbors = n_neighbors
    +
    +     def fit(self, X_train, y_train):
              self.X_train = X_train
              self.y_train = y_train
    -         self.k = k

          def predict(self, X_test):
              predictions = []
              for row in X_test:
    -             # label = self.closest(row)
    -             predictions.append(label)
    +             prediction = self.__closest(row)
    +             predictions.append(prediction)
              return predictions

    -     # def closest(self, row):
    -     #     best_dist = euc(row, self.X_train[0])
    -     #     best_index = 0
    -     #     for i in range(1, len(self.X_train)):
    -     #         dist = euc(row, self.X_train[i])
    -     #         if dist < best_dist:
    -     #             best_dist = dist
    -     #             best_index = i
    -     #     return self.y_train[best_index]
    -
    -     def closest(self, row):
    -         # best_dist = euc(row, self.X_train[0])
    -         # best_index = 0
    +     def __closest(self, row):
              distances = []
    -         for i in range(1, len(self.X_train)):
    -             dist = euc(row, self.X_train[i])
    -             distances.append([self.y_train[i], dist])
    -         return sorted(distances, key=lambda x: x[1])
    +         for i in range(len(self.X_train)):
    +             dist = distance.euclidean(row, self.X_train[i])
    +             distances.append((self.y_train[i], dist))
    +         sorted_distances = sorted(distances, key=lambda x: x[1])
    +         return self.__vote(sorted_distances)
    +
    +     def __vote(self, distances):
    +         # labels = []
    +         # for i in range(self.n_neighbors):
    +         #     labels.append(distances[i][0])
    +         # return Counter(labels).most_common(1)[0][0]
    +         return Counter(x[0] for x in distances[:self.n_neighbors]).most_common(1)[0][0]

    -     def vote(self, row):
    -         distances = self.closest(row)
    -         labels = []
    -         for i in range(self.k):
    -             labels.append(distances[i][0])
    -         return Counter(labels).most_common(1)[0][0]
    -
    -
    - from sklearn import datasets
    - iris = datasets.load_iris()
    -
    - X = iris.data
    - y = iris.target
    -
    - from sklearn.cross_validation import train_test_split
    - X_train, X_test, y_train, y_test = train_test_split(
    -     X, y, test_size=0.5, random_state=0)
    -
    - # from sklearn.neighbors import KNeighborsClassifier
    - clf = ScrappyKNN()
    -
    - clf.fit(X_train, y_train, 3)
    -
    - predictions = clf.predict(X_test)
    -
    - from sklearn.metrics import accuracy_score
    - print(accuracy_score(y_test, predictions))
    + # test.py
    + import pytest
    + from main import KNN
    +
    + X_train = [
    +     [0, 0, 0, 0],
    +     [1, 1, 1, 1],
    +     [1, 1, 1, 1],
    +     [2, 2, 2, 2],
    +     [2, 2, 2, 2],
    +     [2, 2, 2, 2],
    +     [2, 2, 2, 2]
    + ]
    + y_train = [0, 1, 1, 2, 2, 2, 2]
    +
    +
    + @pytest.mark.parametrize(('n_neighbors'),[1,3,5])
    + def test_KNN_should_be_initialised_with_n_neighbors(n_neighbors):
    +     clf = KNN(n_neighbors)
    +
    +     clf.fit(X_train, y_train)
    +
    +     assert clf.n_neighbors == n_neighbors
    +
    +
    + @pytest.mark.parametrize(('n_neighbors'),[1,3,5])
    + def test_should_be_able_to_pass_training_data_to_classifier(n_neighbors):
    +     clf = KNN(n_neighbors)
    +
    +     clf.fit(X_train, y_train)
    +
    +     assert clf.X_train == X_train
    +     assert clf.y_train == y_train
    +
    +
    + X_test = [[0, 0, 0, 0]]
    +
    + @pytest.mark.parametrize(('n_neighbors', 'y_test'),[(1, [0]),(3, [1]), (7, [2])])
    + def test_predict_should_return_label_for_test_data(n_neighbors, y_test):
    +     clf = KNN(n_neighbors)
    +
    +     clf.fit(X_train, y_train)
    +
    +     predictions = clf.predict(X_test)
    +
    +     assert predictions == y_test
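
    As a quick usage sketch (not from the gist itself), the refactored classifier can also be exercised directly on the toy training data from test.py; this assumes the KNN class above has been saved as main.py:

    # assumes the KNN class from this revision lives in main.py
    from main import KNN

    X_train = [[0, 0, 0, 0], [1, 1, 1, 1], [1, 1, 1, 1],
               [2, 2, 2, 2], [2, 2, 2, 2], [2, 2, 2, 2], [2, 2, 2, 2]]
    y_train = [0, 1, 1, 2, 2, 2, 2]

    clf = KNN(n_neighbors=3)
    clf.fit(X_train, y_train)

    # two of the three nearest rows to [1, 1, 1, 1] are labelled 1, so the vote returns [1]
    print(clf.predict([[1, 1, 1, 1]]))  # -> [1]

    The parametrised tests themselves can be run with pytest once the file is split into main.py and test.py as the comments suggest.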
  3. tpgmartin created this gist Nov 5, 2017.
    69 changes: 69 additions & 0 deletions simple_knn_classifier.py
    @@ -0,0 +1,69 @@
    import math
    from scipy.spatial import distance
    from collections import Counter


    def euc(a, b):
        return distance.euclidean(a,b)


    class ScrappyKNN():

        def fit(self, X_train, y_train, k):
            self.X_train = X_train
            self.y_train = y_train
            self.k = k

        def predict(self, X_test):
            predictions = []
            for row in X_test:
                # label = self.closest(row)
                predictions.append(label)
            return predictions

        # def closest(self, row):
        #     best_dist = euc(row, self.X_train[0])
        #     best_index = 0
        #     for i in range(1, len(self.X_train)):
        #         dist = euc(row, self.X_train[i])
        #         if dist < best_dist:
        #             best_dist = dist
        #             best_index = i
        #     return self.y_train[best_index]

        def closest(self, row):
            # best_dist = euc(row, self.X_train[0])
            # best_index = 0
            distances = []
            for i in range(1, len(self.X_train)):
                dist = euc(row, self.X_train[i])
                distances.append([self.y_train[i], dist])
            return sorted(distances, key=lambda x: x[1])

        def vote(self, row):
            distances = self.closest(row)
            labels = []
            for i in range(self.k):
                labels.append(distances[i][0])
            return Counter(labels).most_common(1)[0][0]


    from sklearn import datasets
    iris = datasets.load_iris()

    X = iris.data
    y = iris.target

    from sklearn.cross_validation import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.5, random_state=0)

    # from sklearn.neighbors import KNeighborsClassifier
    clf = ScrappyKNN()

    clf.fit(X_train, y_train, 3)

    predictions = clf.predict(X_test)

    from sklearn.metrics import accuracy_score
    print(accuracy_score(y_test, predictions))
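
    Two caveats when running this first version as-is: predict references label while the line that assigns it is commented out, so it raises a NameError until the later revision fixes it, and sklearn.cross_validation was removed in scikit-learn 0.20, so on a modern install the import would need to change along these lines (a likely adjustment, not part of the gist):

    # on scikit-learn >= 0.20, train_test_split lives in model_selection
    from sklearn.model_selection import train_test_split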