savinay/knn1.py

## knn1.py
import math
import numpy as np


def createDataSet():
    """
    Build the hardcoded toy data set used to demonstrate kNN.

    Returns:
        A tuple ``(group, labels)``: a numpy array of four (x, y) points
        and a list with the class label ('A' or 'B') of each point, in
        the same order as the points.
    """
    points = [
        [1.0, 1.1],
        [1.0, 1.0],
        [0, 0],
        [0, 0.1],
    ]
    classes = list('AABB')
    return np.array(points), classes


def classify(point, training_set, labels, k=1):
    """
    Classify a point by majority vote among its k nearest training points.

    Distance is Euclidean over the first two coordinates of each point.

    Args:
        point: A tuple of (x,y) coordinates
        training_set: An array of (x,y) coordinates
        labels: Labels associated with training set; ``labels[i]`` is the
            class of ``training_set[i]``
        k: Number of neighbors to take into account. Values below 1 still
            use the single closest point.

    Returns:
        The label with the most votes among the nearest neighbors. When
        several labels tie, the one that compares greatest is returned.

    Raises:
        ValueError: If ``training_set`` is empty.
    """
    # Pair every distance with its index: sorting then orders by distance
    # and breaks exact distance ties by position in the training set.
    distances = sorted(
        (math.sqrt((c[0] - point[0]) ** 2 + (c[1] - point[1]) ** 2), i)
        for i, c in enumerate(training_set)
    )

    # Keep the k closest points, but never fewer than one.
    top_knn = distances[:max(k, 1)]

    # Tally the labels of the selected neighbors
    label_count = {}
    for _, i in top_knn:
        label_count[labels[i]] = label_count.get(labels[i], 0) + 1

    # items() exists on both Python 2 and 3 (iteritems() is Python 2 only).
    _, label = max((count, name) for name, count in label_count.items())
    return label


def main():
    """Classify the point (0, 0) against the toy data set with k=2 and print the label."""
    group, labels = createDataSet()
    # A parenthesised single-argument print is valid on Python 2 and 3.
    print(classify([0, 0], group, labels, 2))


if __name__ == '__main__':
    main()


## knn_2.py
import itertools
from ggplot import ggplot, aes, geom_point
import numpy as np
import pandas as pd

# Path of the tab-separated data file that main() hands to load_file().
dating_test_set = '../sample/Ch02/datingTestSet.txt'

# Column names for the loaded data: main() uses the first three as the
# feature columns of the DataFrame and the last one as the label column.
column_names = [
    'Number of frequent flyer miles earned per year',
    'Percentage of time spent playing video games',
    'Liters of ice cream consumed per week',
    'Category'
]


def load_file(filepath):
    """
    Loads data in tab-separated format

    Each non-blank line is stripped of surrounding whitespace and split on
    tabs: the last field is the label and the preceding fields are the
    features. Blank lines are skipped so that a trailing newline or a
    spacer line does not yield an empty row.

    Args:
        filepath: Location of data file

    Returns:
        A tuple ``(data, labels)``: ``data`` is a list with one list of
        feature strings per line, ``labels`` the matching list of label
        strings. Values are not converted to numbers here.
    """
    data = []
    labels = []
    with open(filepath) as infile:
        for line in infile:
            stripped = line.strip()
            if not stripped:
                continue
            row = stripped.split('\t')
            data.append(row[:-1])
            labels.append(row[-1])
    return data, labels


def normalize(df):
    """
    Min-max rescale every column so that features carry equal weight.

    Applied per column:

        norm_value = (value - min_value) / (max_value - min_value)

    Args:
        df: Pandas data frame with unnormalized data

    Returns:
        A tuple of (normalized data frame, per-column value range,
        per-column minimum)
    """
    lowest = df.min()
    span = df.max() - lowest
    return (df - lowest) / span, span, lowest


def classify(input_data, training_set, labels, k=1):
    """
    Uses kNN algorithm to classify input data given a set of
    known data.

    The Euclidean distance from ``input_data`` to every training row is
    computed, the k closest rows are selected and the most frequent label
    among them is returned.

    Args:
        input_data: Pandas Series of input data; its index must match the
            columns of ``training_set``
        training_set: Pandas Data frame of training data
        labels: Pandas Series of classifications for training set, sharing
            the index of ``training_set``. It may be named or unnamed.
        k: Number of neighbors to use

    Returns:
        Predicted classification for given input data
    """
    distance = ((training_set - input_data) ** 2).sum(axis=1) ** 0.5

    # Explicit column names keep the lookup independent of the name of the
    # labels Series (a named Series would not end up in a column called 1).
    neighbours = pd.DataFrame({'distance': distance, 'label': labels})

    # sort_values replaces the removed DataFrame.sort; mergesort is stable,
    # so rows at equal distance keep their original order.
    nearest = neighbours.sort_values(by='distance', kind='mergesort').iloc[:k]
    return nearest['label'].value_counts().index.values[0]


def plot(df, x, y, color):
    """
    Print a ggplot scatter plot of feature x against feature y, with the
    points coloured by the classification column.

    Args:
        df: Dataframe of data
        x: Feature to plot on x axis
        y: Feature to plot on y axis
        color: Group by this column
    """
    mapping = aes(x=x, y=y, color=color)
    chart = ggplot(df, mapping) + geom_point()
    print(chart)


def main():
    """
    Load the dating data set and show a scatter plot of two of its features.

    The kNN evaluation that follows the plot call is disabled: it sits
    inside a bare string literal, so none of it is executed.
    """

    # Load data
    raw_data, raw_labels = load_file(dating_test_set)

    # Convert data to Pandas data structures
    labels = pd.Series(raw_labels, name=column_names[3])
    df = pd.DataFrame.from_records(np.array(raw_data, np.float32), columns=column_names[:3])
    df[column_names[3]] = labels

    # Plot the second feature against the third, coloured by category
    plot(df, column_names[1], column_names[2], column_names[3])

    # NOTE: everything between the triple quotes below is a string
    # expression, not live code. It uses this function's local names.
    """
    # Normalize data since ranges of values are different
    norm_df, range_values, min_values = normalize(df)

    # Use first 10% of data for testing
    num_test_rows = int(norm_df.shape[0] * .1)

    # 90% training data
    training_df = norm_df[num_test_rows:]
    training_labels = labels[num_test_rows:]

    # 10% training data
    test_df = norm_df[:num_test_rows]
    test_labels = labels[:num_test_rows]

    # Apply kNN algorithm to all test data
    result_df = test_df.apply(lambda row: classify(row, training_df, training_labels, k=3), axis=1)

    # Calculate the number of correct predictions
    error_df = result_df == test_labels
    print error_df.value_counts()
    """

# Run the demo only when executed as a script
if __name__ == '__main__':
    main()

## knn_3.py
import numpy as np
import pandas as pd
import os


def classify(input_data, training_set, labels, k=1):
    """
    Uses kNN algorithm to classify input data given a set of
    known data.

    The Euclidean distance from ``input_data`` to every training row is
    computed, the k closest rows are selected and the most frequent label
    among them is returned.

    Args:
        input_data: Pandas Series of input data; its index must match the
            columns of ``training_set``
        training_set: Pandas Data frame of training data
        labels: Pandas Series of classifications for training set, sharing
            the index of ``training_set``. It may be named or unnamed.
        k: Number of neighbors to use

    Returns:
        Predicted classification for given input data
    """
    distance = ((training_set - input_data) ** 2).sum(axis=1) ** 0.5

    # Explicit column names keep the lookup independent of whether the
    # labels Series carries a name.
    neighbours = pd.DataFrame({'distance': distance, 'label': labels})

    # sort_values replaces the removed DataFrame.sort; mergesort is stable,
    # so rows at equal distance keep their original order.
    nearest = neighbours.sort_values(by='distance', kind='mergesort').iloc[:k]
    return nearest['label'].value_counts().index.values[0]


def load_data(directory):
    """
    Loads text files of digits in directory as list of lists,
    where each list holds the characters of one file with every line
    stripped and concatenated (the sample digits are described as
    32 x 32 grids of 0's and 1's).

    The label of a file is the first character of its name, which must be
    a decimal digit. Entries whose name does not start with a digit, or
    that are not regular files, are skipped. Files are read in sorted name
    order so the result is reproducible.

    Args:
        directory: Directory that contains text files of digits

    Returns:
        A tuple ``(dataset, labels)``: ``dataset`` is a list of lists of
        single-character strings, one list per file; ``labels`` is the
        matching list of integer labels.
    """
    dataset = []
    labels = []
    # os.listdir returns names in arbitrary order, so sort them.
    for filename in sorted(os.listdir(directory)):
        filepath = os.path.join(directory, filename)
        # Skip hidden files, sub-directories and anything else that cannot
        # provide a digit label instead of failing in int() or open().
        if filename[0] not in '0123456789' or not os.path.isfile(filepath):
            continue
        vector = []
        with open(filepath) as infile:
            for line in infile:
                vector.extend(line.strip())
        dataset.append(vector)
        labels.append(int(filename[0]))
    return dataset, labels


def main():
    """
    Run kNN (k=3) over the digit samples and print how many predictions
    matched the known labels.
    """

    # Load data
    raw_training_data, raw_training_labels = load_data('../sample/Ch02/trainingDigits/')
    raw_test_data, raw_test_labels = load_data('../sample/Ch02/testDigits/')

    # Convert data into Pandas data structures
    training_labels = pd.Series(raw_training_labels)
    training_data = pd.DataFrame.from_records(np.array(raw_training_data, int))

    test_labels = pd.Series(raw_test_labels)
    test_data = pd.DataFrame.from_records(np.array(raw_test_data, int))

    # Apply kNN algorithm to all test data
    predictions = test_data.apply(
        lambda row: classify(row, training_data, training_labels, k=3), axis=1)

    # True where the prediction equals the known label, False otherwise
    is_correct = predictions == test_labels
    # A parenthesised single-argument print is valid on Python 2 and 3.
    print(is_correct.value_counts())


if __name__ == '__main__':
    main()
	import math
	import numpy as np


	def createDataSet():
	    """
	    Creates a basic data set with labels.

	    The labels are the classification given to the points. The data
	    is hardcoded in this toy example.

	    Returns:
	        A numpy array of (x,y) points and a corresponding list of labels
	    """
	    group = np.array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]])
	    labels = ['A', 'A', 'B', 'B']
	    return group, labels


	def classify(point, training_set, labels, k=1):
	    """
	    Classify a given point using the training set and associated labels.

	    The k training points closest to ``point`` (Euclidean distance over
	    the first two coordinates) vote with their labels.

	    Args:
	        point: A tuple of (x,y) coordinates
	        training_set: An array of (x,y) coordinates
	        labels: Labels associated with training set
	        k: Number of neighbors to take into account. Values below 1
	            still use the single closest point.

	    Returns:
	        Classification for given point. When several labels tie, the
	        one that compares greatest is returned.

	    Raises:
	        ValueError: If ``training_set`` is empty.
	    """
	    # Pair each distance with its index so that sorting orders by
	    # distance and breaks exact ties by position in the training set.
	    distances = sorted(
	        (math.sqrt((c[0] - point[0]) ** 2 + (c[1] - point[1]) ** 2), i)
	        for i, c in enumerate(training_set)
	    )

	    # Keep the k closest points, but never fewer than one.
	    top_knn = distances[:max(k, 1)]

	    # Count labels in top_knn
	    label_count = {}
	    for _, i in top_knn:
	        label_count[labels[i]] = label_count.get(labels[i], 0) + 1

	    # items() exists on both Python 2 and 3 (iteritems() is Python 2 only).
	    _, label = max((count, name) for name, count in label_count.items())
	    return label


	def main():
	    """Classify the point (0, 0) against the toy data set with k=2 and print the label."""
	    group, labels = createDataSet()
	    # A parenthesised single-argument print is valid on Python 2 and 3.
	    print(classify([0, 0], group, labels, 2))


	if __name__ == '__main__':
	    main()
	import itertools
	from ggplot import ggplot, aes, geom_point
	import numpy as np
	import pandas as pd

	# Path of the tab-separated data file that main() hands to load_file().
	dating_test_set = '../sample/Ch02/datingTestSet.txt'

	# Column names for the loaded data: main() uses the first three as the
	# feature columns of the DataFrame and the last one as the label column.
	column_names = [
	'Number of frequent flyer miles earned per year',
	'Percentage of time spent playing video games',
	'Liters of ice cream consumed per week',
	'Category'
	]


	def load_file(filepath):
	    """
	    Loads data in tab-separated format

	    Each non-blank line is stripped of surrounding whitespace and split
	    on tabs: the last field is the label and the preceding fields are
	    the features. Blank lines are skipped.

	    Args:
	        filepath: Location of data file

	    Returns:
	        Data and labels extracted from text file, as a list of lists of
	        feature strings and a matching list of label strings
	    """
	    data = []
	    labels = []
	    with open(filepath) as infile:
	        for line in infile:
	            stripped = line.strip()
	            if not stripped:
	                continue
	            row = stripped.split('\t')
	            data.append(row[:-1])
	            labels.append(row[-1])
	    return data, labels


	def normalize(df):
	    """
	    Normalizes data to give equal weight to each feature.

	    General formula, applied per column:

	        norm_value = (value - min_value) / (max_value - min_value)

	    Args:
	        df: Pandas data frame with unnormalized data

	    Returns:
	        Normalized dataframe, range of values, min values
	    """
	    min_values = df.min()
	    max_values = df.max()
	    range_values = max_values - min_values
	    norm_df = (df - min_values) / range_values
	    return norm_df, range_values, min_values


	def classify(input_data, training_set, labels, k=1):
	    """
	    Uses kNN algorithm to classify input data given a set of
	    known data.

	    Args:
	        input_data: Pandas Series of input data; its index must match
	            the columns of ``training_set``
	        training_set: Pandas Data frame of training data
	        labels: Pandas Series of classifications for training set,
	            sharing the index of ``training_set``. It may be named or
	            unnamed.
	        k: Number of neighbors to use

	    Returns:
	        Predicted classification for given input data: the most
	        frequent label among the k closest training rows
	    """
	    distance = ((training_set - input_data) ** 2).sum(axis=1) ** 0.5

	    # Explicit column names keep the lookup independent of the name of
	    # the labels Series.
	    neighbours = pd.DataFrame({'distance': distance, 'label': labels})

	    # sort_values replaces the removed DataFrame.sort; mergesort is
	    # stable, so rows at equal distance keep their original order.
	    nearest = neighbours.sort_values(by='distance', kind='mergesort').iloc[:k]
	    return nearest['label'].value_counts().index.values[0]


	def plot(df, x, y, color):
	    """
	    Scatter plot with two of the features (x, y) grouped by classification (color)

	    Args:
	        df: Dataframe of data
	        x: Feature to plot on x axis
	        y: Feature to plot on y axis
	        color: Group by this column
	    """
	    print(ggplot(df, aes(x=x, y=y, color=color)) + geom_point())


	def main():
	    """
	    Load the dating data set and show a scatter plot of two of its features.

	    The kNN evaluation after the plot call is disabled: it sits inside a
	    bare string literal, so none of it is executed.
	    """

	    # Load data
	    raw_data, raw_labels = load_file(dating_test_set)

	    # Convert data to Pandas data structures
	    labels = pd.Series(raw_labels, name=column_names[3])
	    df = pd.DataFrame.from_records(np.array(raw_data, np.float32), columns=column_names[:3])
	    df[column_names[3]] = labels

	    plot(df, column_names[1], column_names[2], column_names[3])

	    # NOTE: everything between the triple quotes below is a string
	    # expression, not live code.
	    """
	    # Normalize data since ranges of values are different
	    norm_df, range_values, min_values = normalize(df)

	    # Use first 10% of data for testing
	    num_test_rows = int(norm_df.shape[0] * .1)

	    # 90% training data
	    training_df = norm_df[num_test_rows:]
	    training_labels = labels[num_test_rows:]

	    # 10% training data
	    test_df = norm_df[:num_test_rows]
	    test_labels = labels[:num_test_rows]

	    # Apply kNN algorithm to all test data
	    result_df = test_df.apply(lambda row: classify(row, training_df, training_labels, k=3), axis=1)

	    # Calculate the number of correct predictions
	    error_df = result_df == test_labels
	    print error_df.value_counts()
	    """

	if __name__ == '__main__':
	    main()
	import numpy as np
	import pandas as pd
	import os


	def classify(input_data, training_set, labels, k=1):
	    """
	    Uses kNN algorithm to classify input data given a set of
	    known data.

	    Args:
	        input_data: Pandas Series of input data; its index must match
	            the columns of ``training_set``
	        training_set: Pandas Data frame of training data
	        labels: Pandas Series of classifications for training set,
	            sharing the index of ``training_set``. It may be named or
	            unnamed.
	        k: Number of neighbors to use

	    Returns:
	        Predicted classification for given input data: the most
	        frequent label among the k closest training rows
	    """
	    distance = ((training_set - input_data) ** 2).sum(axis=1) ** 0.5

	    # Explicit column names keep the lookup independent of whether the
	    # labels Series carries a name.
	    neighbours = pd.DataFrame({'distance': distance, 'label': labels})

	    # sort_values replaces the removed DataFrame.sort; mergesort is
	    # stable, so rows at equal distance keep their original order.
	    nearest = neighbours.sort_values(by='distance', kind='mergesort').iloc[:k]
	    return nearest['label'].value_counts().index.values[0]


	def load_data(directory):
	    """
	    Loads text files of digits in directory as list of lists,
	    where each list holds the characters of one file with every line
	    stripped and concatenated (the sample digits are described as
	    32 x 32 grids of 0's and 1's).

	    The label of a file is the first character of its name, which must
	    be a decimal digit. Entries whose name does not start with a digit,
	    or that are not regular files, are skipped. Files are read in sorted
	    name order so the result is reproducible.

	    Args:
	        directory: Directory that contains text files of digits

	    Returns:
	        A tuple ``(dataset, labels)``: a list of lists of
	        single-character strings, one per file, and the matching list
	        of integer labels
	    """
	    dataset = []
	    labels = []
	    # os.listdir returns names in arbitrary order, so sort them.
	    for filename in sorted(os.listdir(directory)):
	        filepath = os.path.join(directory, filename)
	        # Skip hidden files, sub-directories and anything else that
	        # cannot provide a digit label.
	        if filename[0] not in '0123456789' or not os.path.isfile(filepath):
	            continue
	        vector = []
	        with open(filepath) as infile:
	            for line in infile:
	                vector.extend(line.strip())
	        dataset.append(vector)
	        labels.append(int(filename[0]))
	    return dataset, labels


	def main():
	    """
	    Run kNN (k=3) over the digit samples and print how many predictions
	    matched the known labels.
	    """

	    # Load data
	    raw_training_data, raw_training_labels = load_data('../sample/Ch02/trainingDigits/')
	    raw_test_data, raw_test_labels = load_data('../sample/Ch02/testDigits/')

	    # Convert data into Pandas data structures
	    training_labels = pd.Series(raw_training_labels)
	    training_data = pd.DataFrame.from_records(np.array(raw_training_data, int))

	    test_labels = pd.Series(raw_test_labels)
	    test_data = pd.DataFrame.from_records(np.array(raw_test_data, int))

	    # Apply kNN algorithm to all test data
	    result_df = test_data.apply(lambda row: classify(row, training_data, training_labels, k=3), axis=1)

	    # True where the prediction equals the known label, False otherwise
	    is_correct = result_df == test_labels
	    # A parenthesised single-argument print is valid on Python 2 and 3.
	    print(is_correct.value_counts())


	if __name__ == '__main__':
	    main()