-
-
Save savinay/0c48e8dc7f7a39de37b3876814f8bcc5 to your computer and use it in GitHub Desktop.
k-nearest-neighbors (kNN) implementations using pandas, based on examples from "Machine Learning in Action" by Peter Harrington
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import math | |
import numpy as np | |
def createDataSet():
    """
    Build the hardcoded toy data set used to demonstrate kNN.

    Returns:
        A tuple (group, labels): a numpy array holding four (x, y) points
        and a list with the class label of each point, in the same order
    """
    labels = ['A'] * 2 + ['B'] * 2
    coordinates = [
        (1.0, 1.1),
        (1.0, 1.0),
        (0, 0),
        (0, 0.1),
    ]
    return np.array(coordinates), labels
def classify(point, training_set, labels, k=1):
    """
    Classify a point by majority vote among its k nearest training points.

    Args:
        point: Coordinates of the point to classify, e.g. an (x, y) tuple
        training_set: Sequence of training points laid out like point
        labels: Labels associated with training_set, in the same order
        k: Number of neighbors to take into account; values below 1 are
            treated as 1

    Returns:
        The most common label among the k nearest neighbors. If several
        labels are equally common, the one that compares greatest wins.

    Raises:
        ValueError: If training_set is empty
    """
    # Euclidean distance over every coordinate pair. The index travels with
    # the distance so the matching label can be looked up after sorting.
    distances = sorted(
        (math.sqrt(sum((a - b) ** 2 for a, b in zip(candidate, point))), i)
        for i, candidate in enumerate(training_set)
    )
    if not distances:
        raise ValueError('training_set must contain at least one point')

    # Tally the labels of the k closest points
    label_count = {}
    for _, i in distances[:max(k, 1)]:
        label_count[labels[i]] = label_count.get(labels[i], 0) + 1

    # Highest count wins; ties fall back to comparing the labels themselves
    _, winner = max((count, name) for name, count in label_count.items())
    return winner
def main():
    """Classify the point (0, 0) against the toy data set and print the result."""
    group, labels = createDataSet()
    # Call form of print works on both Python 2 and Python 3
    print(classify([0, 0], group, labels, 2))


if __name__ == '__main__':
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import itertools | |
from ggplot import ggplot, aes, geom_point | |
import numpy as np | |
import pandas as pd | |
# Tab-separated sample data file read by load_file()
dating_test_set = '../sample/Ch02/datingTestSet.txt'

# Column headings in file order: three feature columns, then the label column
column_names = [
    'Number of frequent flyer miles earned per year',
    'Percentage of time spent playing video games',
    'Liters of ice cream consumed per week',
] + ['Category']
def load_file(filepath):
    """
    Loads data in tab-separated format.

    Each non-blank line is split on tabs; the last field is taken as the
    label and the remaining fields as the data of that row. All fields are
    returned as strings.

    Args:
        filepath: Location of data file

    Returns:
        A tuple (data, labels): a list of rows (each a list of strings) and
        a list of label strings, both in file order
    """
    data = []
    labels = []
    with open(filepath) as infile:
        for line in infile:
            stripped = line.strip()
            # A blank line (e.g. an extra newline at the end of the file)
            # would otherwise add an empty row and an empty label.
            if not stripped:
                continue
            row = stripped.split('\t')
            data.append(row[:-1])
            labels.append(row[-1])
    return data, labels
def normalize(df):
    """
    Normalizes data to give equal weight to each feature.

    General formula:
        norm_value = (value - min_value) / (max_value - min_value)

    A column whose values are all equal has a range of zero; it is
    normalized to 0 instead of dividing by zero.

    Args:
        df: Pandas data frame with unnormalized numeric data

    Returns:
        Normalized dataframe, range of values, min values
    """
    min_values = df.min()
    range_values = df.max() - min_values
    # Divide constant columns by 1 so they come out as 0 rather than NaN
    # (0/0). The returned range_values still reports the true range of 0.
    safe_range = range_values.replace(0, 1)
    norm_df = (df - min_values) / safe_range
    return norm_df, range_values, min_values
def classify(input_data, training_set, labels, k=1):
    """
    Uses kNN algorithm to classify input data given a set of
    known data.

    Args:
        input_data: Pandas Series of input data, indexed by the column
            names of training_set
        training_set: Pandas Data frame of training data
        labels: Pandas Series of classifications for training set, sharing
            the index of training_set
        k: Number of neighbors to use

    Returns:
        Predicted classification for given input data
    """
    # Euclidean distance from input_data to every row of the training set
    distance = ((training_set - input_data) ** 2).sum(axis=1) ** 0.5

    # Name the columns explicitly: the positional names 0 and 1 are not
    # reliable when labels carries its own name.
    neighbors = pd.concat([distance, labels], axis=1, keys=['distance', 'label'])

    # DataFrame.sort() no longer exists in pandas; sort_values() replaces it.
    # mergesort is stable, so equally distant rows keep their original order.
    top_knn = neighbors.sort_values('distance', kind='mergesort')[:k]
    return top_knn['label'].value_counts().index.values[0]
def plot(df, x, y, color):
    """
    Draw a scatter plot of two features, colored by classification.

    Args:
        df: Dataframe of data
        x: Name of the feature shown on the x axis
        y: Name of the feature shown on the y axis
        color: Name of the column used to group (color) the points
    """
    mapping = aes(x=x, y=y, color=color)
    chart = ggplot(df, mapping) + geom_point()
    # Printing the chart object is what triggers rendering
    print(chart)
def main():
    """
    Load the dating data set and plot two of its features by category.

    The kNN hold-out evaluation at the end is disabled and kept only as
    comments for reference.
    """
    # Load data
    raw_data, raw_labels = load_file(dating_test_set)

    # Convert data to Pandas data structures
    labels = pd.Series(raw_labels, name=column_names[3])
    df = pd.DataFrame.from_records(np.array(raw_data, np.float32), columns=column_names[:3])
    df[column_names[3]] = labels

    plot(df, column_names[1], column_names[2], column_names[3])

    # Disabled evaluation (previously parked in a bare string literal):
    #
    # # Normalize the feature columns only, since their ranges differ and
    # # the Category column is not numeric
    # norm_df, range_values, min_values = normalize(df[column_names[:3]])
    #
    # # Use first 10% of data for testing
    # num_test_rows = int(norm_df.shape[0] * .1)
    #
    # # 90% training data
    # training_df = norm_df[num_test_rows:]
    # training_labels = labels[num_test_rows:]
    #
    # # 10% test data
    # test_df = norm_df[:num_test_rows]
    # test_labels = labels[:num_test_rows]
    #
    # # Apply kNN algorithm to all test data
    # result_df = test_df.apply(lambda row: classify(row, training_df, training_labels, k=3), axis=1)
    #
    # # Count correct (True) and incorrect (False) predictions
    # error_df = result_df == test_labels
    # print(error_df.value_counts())


if __name__ == '__main__':
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import pandas as pd | |
import os | |
def classify(input_data, training_set, labels, k=1):
    """
    Uses kNN algorithm to classify input data given a set of
    known data.

    Args:
        input_data: Pandas Series of input data, indexed by the column
            names of training_set
        training_set: Pandas Data frame of training data
        labels: Pandas Series of classifications for training set, sharing
            the index of training_set
        k: Number of neighbors to use

    Returns:
        Predicted classification for given input data
    """
    # Euclidean distance between input_data and each training row
    squared_diff = (training_set - input_data) ** 2
    distance = squared_diff.sum(axis=1) ** 0.5

    # Give the two columns fixed names instead of relying on the positional
    # names 0 and 1, which break when labels is a named Series.
    distance_df = pd.concat([distance, labels], axis=1, keys=['distance', 'label'])

    # sort_values() replaces the removed DataFrame.sort(); the stable
    # mergesort keeps equally distant rows in their original order.
    distance_df = distance_df.sort_values('distance', kind='mergesort')
    top_knn = distance_df[:k]
    return top_knn['label'].value_counts().index.values[0]
def load_data(directory):
    """
    Loads text files of digits in directory as list of lists,
    where each row represents one digit as a flat series of characters
    (expected to be 0's and 1's; digits are expected to be 32 x 32, but
    neither is checked here).

    The label of each digit is the first character of its file name.
    Entries that are not regular files, or whose name does not start with
    a digit, are skipped. Files are read in sorted name order.

    Args:
        directory: Directory that contains text files of digits

    Returns:
        A tuple (dataset, labels): a list with one flat list of characters
        per file, and a list of the matching integer labels
    """
    dataset = []
    labels = []
    # os.listdir() returns entries in arbitrary order; sort for
    # reproducible results
    for filename in sorted(os.listdir(directory)):
        filepath = os.path.join(directory, filename)
        # Skip sub-directories and stray files (e.g. .DS_Store) that would
        # make open() or int(filename[0]) fail.
        if filename[0] not in '0123456789' or not os.path.isfile(filepath):
            continue
        with open(filepath) as infile:
            vector = []
            for line in infile:
                vector.extend(line.strip())
        dataset.append(vector)
        labels.append(int(filename[0]))
    return dataset, labels
def main():
    """
    Classify every test digit with kNN (k=3) against the training digits
    and print how many predictions were right and wrong.
    """
    # Load data
    raw_training_data, raw_training_labels = load_data('../sample/Ch02/trainingDigits/')
    raw_test_data, raw_test_labels = load_data('../sample/Ch02/testDigits/')

    # Convert data into Pandas data structures
    training_labels = pd.Series(raw_training_labels)
    training_data = pd.DataFrame.from_records(np.array(raw_training_data, int))
    test_labels = pd.Series(raw_test_labels)
    test_data = pd.DataFrame.from_records(np.array(raw_test_data, int))

    # Apply kNN algorithm to all test data
    predictions = test_data.apply(lambda row: classify(row, training_data, training_labels, k=3), axis=1)

    # True where the prediction matches the known label, False otherwise
    is_correct = predictions == test_labels
    # Call form of print works on both Python 2 and Python 3
    print(is_correct.value_counts())


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment