Skip to content

Instantly share code, notes, and snippets.

@richard-to
Created November 4, 2014 06:44
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save richard-to/6730474dcec8f3985426 to your computer and use it in GitHub Desktop.
Save richard-to/6730474dcec8f3985426 to your computer and use it in GitHub Desktop.
kNN implementations with Pandas based on examples from Machine Learning in Action by Peter Harrington
import math
import numpy as np
def createDataSet():
    """
    Build the hardcoded toy data set used to demonstrate kNN.

    Each of the four 2-D points is paired with the classification
    assigned to it.

    Returns:
        A numpy array of (x, y) points and a list with the label of each
        point, in the same order
    """
    samples = [
        ([1.0, 1.1], 'A'),
        ([1.0, 1.0], 'A'),
        ([0, 0], 'B'),
        ([0, 0.1], 'B'),
    ]
    group = np.array([coords for coords, _ in samples])
    labels = [tag for _, tag in samples]
    return group, labels
def classify(point, training_set, labels, k=1):
    """
    Classify a given point by majority vote among its k nearest
    training points (Euclidean distance).

    Args:
        point: Sequence of coordinates, e.g. an (x, y) tuple
        training_set: Sequence or array of points whose coordinates are in
            the same order as those of point
        labels: Labels associated with training set, indexed like it
        k: Number of neighbors to take into account (integer); values
            below 1 behave like 1

    Returns:
        Classification for given point. When several labels share the
        highest count, the one that compares greatest is returned.

    Raises:
        ValueError: If training_set is empty
    """
    # Pair every distance with the index of its training point so the label
    # can be looked up after sorting; equal distances fall back to index order.
    distances = []
    for i, candidate in enumerate(training_set):
        # Each coordinate is compared with the matching coordinate of point
        squared = sum((a - b) ** 2 for a, b in zip(candidate, point))
        distances.append((math.sqrt(squared), i))
    distances.sort()

    # Get the top closest points; at least one neighbor is always used
    top_knn = distances[:max(k, 1)]

    # Count how often each label occurs among the neighbors
    label_count = {}
    for _, i in top_knn:
        label_count[labels[i]] = label_count.get(labels[i], 0) + 1

    # Return classification with most matches
    _, label = max((count, name) for name, count in label_count.items())
    return label
def main():
    """
    Classify the point (0, 0) against the toy data set using its two
    nearest neighbors and print the predicted label.
    """
    group, labels = createDataSet()
    # The call form of print gives the same output for a single argument
    # under Python 2 and Python 3; the statement form is Python 2 only.
    print(classify([0, 0], group, labels, 2))


if __name__ == '__main__':
    main()
import itertools
from ggplot import ggplot, aes, geom_point
import numpy as np
import pandas as pd
# Names of the three feature columns followed by the classification column,
# in the order in which the fields appear in each row of the data file.
column_names = [
    'Number of frequent flyer miles earned per year',
    'Percentage of time spent playing video games',
    'Liters of ice cream consumed per week',
    'Category',
]

# Location of the tab-separated dating data set, relative to the working
# directory the script is run from.
dating_test_set = '../sample/Ch02/datingTestSet.txt'
def load_file(filepath):
    """
    Loads data in tab-separated format.

    Every non-blank line is split on tabs: the last field is the label and
    the fields before it are the feature values. All values are returned
    as strings.

    Args:
        filepath: Location of data file

    Returns:
        Data (one list of feature strings per row) and labels extracted
        from text file
    """
    data = []
    labels = []
    with open(filepath) as infile:
        for line in infile:
            line = line.strip()
            # A blank line (e.g. a trailing newline at the end of the file)
            # holds no record; keeping it would add an empty feature row.
            if not line:
                continue
            fields = line.split('\t')
            data.append(fields[:-1])
            labels.append(fields[-1])
    return data, labels
def normalize(df):
    """
    Normalizes data to give equal weight to each feature.

    General formula:
        norm_value = (value - min_value) / (max_value - min_value)

    A column whose values are all equal has a range of zero; its
    normalized values are 0 instead of the NaN a division by zero
    would give.

    Args:
        df: Pandas data frame with unnormalized data

    Returns:
        Normalized dataframe, range of values, min values
    """
    min_values = df.min()
    max_values = df.max()
    range_values = max_values - min_values
    # Divide constant columns by 1 so that (value - min) / divisor is 0.
    # The unmodified range is still what gets returned to the caller.
    divisor = range_values.where(range_values != 0, 1)
    norm_df = (df - min_values) / divisor
    return norm_df, range_values, min_values
def classify(input_data, training_set, labels, k=1):
    """
    Uses kNN algorithm to classify input data given a set of
    known data.

    Args:
        input_data: Pandas Series of input data, indexed by the column
            names of training_set
        training_set: Pandas Data frame of training data
        labels: Pandas Series of classifications for training set, sharing
            its index
        k: Number of neighbors to use

    Returns:
        Predicted classification for given input data (the most frequent
        label among the k nearest training rows)
    """
    # Euclidean distance between the input and every training row
    distance_diff = training_set - input_data
    distance = (distance_diff ** 2).sum(axis=1) ** 0.5

    # Name both columns explicitly. Looking the label column up by position
    # fails when labels carries its own name, because that name becomes the
    # column name.
    neighbors = pd.DataFrame({'distance': distance, 'label': labels})

    # sort_values replaces the removed DataFrame.sort; mergesort is stable,
    # so rows at the same distance keep their training order.
    neighbors = neighbors.sort_values(by='distance', kind='mergesort')

    top_knn = neighbors.iloc[:k]
    return top_knn['label'].value_counts().index.values[0]
def plot(df, x, y, color):
    """
    Print a ggplot scatter plot of two features, with the points colored
    by classification.

    Args:
        df: Dataframe of data
        x: Feature to plot on x axis
        y: Feature to plot on y axis
        color: Column whose values select the color of each point
    """
    mapping = aes(x=x, y=y, color=color)
    chart = ggplot(df, mapping) + geom_point()
    print(chart)
def main():
    """
    Load the dating data set into Pandas structures and show a scatter plot
    of two of its features, colored by category.
    """
    # Load data
    raw_data, raw_labels = load_file(dating_test_set)

    # Convert data to Pandas data structures
    labels = pd.Series(raw_labels, name=column_names[3])
    df = pd.DataFrame.from_records(np.array(raw_data, np.float32), columns=column_names[:3])
    df[column_names[3]] = labels

    plot(df, column_names[1], column_names[2], column_names[3])

    # NOTE: The kNN evaluation below was disabled in the original by wrapping
    # it in a string literal. It is kept as comments for reference and is
    # not executed.
    #
    # # Normalize data since ranges of values are different
    # norm_df, range_values, min_values = normalize(df)
    #
    # # Use first 10% of data for testing
    # num_test_rows = int(norm_df.shape[0] * .1)
    #
    # # 90% training data
    # training_df = norm_df[num_test_rows:]
    # training_labels = labels[num_test_rows:]
    #
    # # 10% test data
    # test_df = norm_df[:num_test_rows]
    # test_labels = labels[:num_test_rows]
    #
    # # Apply kNN algorithm to all test data
    # result_df = test_df.apply(lambda row: classify(row, training_df, training_labels, k=3), axis=1)
    #
    # # Calculate the number of correct predictions
    # error_df = result_df == test_labels
    # print(error_df.value_counts())


if __name__ == '__main__':
    main()
import numpy as np
import pandas as pd
import os
def classify(input_data, training_set, labels, k=1):
    """
    Uses kNN algorithm to classify input data given a set of
    known data.

    Args:
        input_data: Pandas Series of input data, indexed by the column
            names of training_set
        training_set: Pandas Data frame of training data
        labels: Pandas Series of classifications for training set, sharing
            its index
        k: Number of neighbors to use

    Returns:
        Predicted classification for given input data (the most frequent
        label among the k nearest training rows)
    """
    # Euclidean distance between the input and every training row
    distance_diff = training_set - input_data
    distance = (distance_diff ** 2).sum(axis=1) ** 0.5

    # Name both columns explicitly so the label column can be selected
    # regardless of whether labels carries a name of its own.
    neighbors = pd.DataFrame({'distance': distance, 'label': labels})

    # sort_values replaces the removed DataFrame.sort; mergesort is stable,
    # so rows at the same distance keep their training order.
    neighbors = neighbors.sort_values(by='distance', kind='mergesort')

    top_knn = neighbors.iloc[:k]
    return top_knn['label'].value_counts().index.values[0]
def load_data(directory):
    """
    Loads text files of digits in directory as list of lists,
    where each row represents one digit as the characters of all lines of
    its file joined into a single flat list.

    The label of a digit is the first character of its file name. Files are
    read in sorted name order so the result does not depend on the order
    in which the operating system lists the directory. Entries that are not
    regular files, or whose name does not start with a digit (for example
    hidden system files), are skipped.

    Args:
        directory: Directory that contains text files of digits

    Returns:
        List of lists containing the characters of each digit, and a list
        with the integer label of each digit in the same order
    """
    dataset = []
    labels = []
    for filename in sorted(os.listdir(directory)):
        filepath = os.path.join(directory, filename)
        # Without this guard a stray entry would crash int() or open() below
        if not (filename[:1].isdigit() and os.path.isfile(filepath)):
            continue
        with open(filepath) as infile:
            vector = []
            for line in infile:
                # extend() with a string appends its characters one by one
                vector.extend(line.strip())
        dataset.append(vector)
        labels.append(int(filename[0]))
    return dataset, labels
def main():
    """
    Classify every test digit with kNN (k=3) against the training digits
    and print how many predictions match the true labels.
    """
    # Load data
    raw_training_data, raw_training_labels = load_data('../sample/Ch02/trainingDigits/')
    raw_test_data, raw_test_labels = load_data('../sample/Ch02/testDigits/')

    # Convert data into Pandas data structures
    training_labels = pd.Series(raw_training_labels)
    training_data = pd.DataFrame.from_records(np.array(raw_training_data, int))
    test_labels = pd.Series(raw_test_labels)
    test_data = pd.DataFrame.from_records(np.array(raw_test_data, int))

    # Apply kNN algorithm to all test data
    result_df = test_data.apply(lambda row: classify(row, training_data, training_labels, k=3), axis=1)

    # Calculate the number of correct predictions: True marks a match
    error_df = result_df == test_labels
    # The call form of print gives the same output for a single argument
    # under Python 2 and Python 3; the statement form is Python 2 only.
    print(error_df.value_counts())


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment