@rnowling
Created August 27, 2016 06:25
Imbalanced Dataset Logistic Regression Model Comparison
"""
Script comparing logistic regression models, trained with and without class rebalancing, and their associated evaluation metrics on the imbalanced Media 6 Degrees dataset from the book Doing Data Science. You'll need to download a copy of the dataset from the book's GitHub repo: https://github.com/oreillymedia/doing_data_science .
Copyright 2016 Ronald J. Nowling
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
import csv

import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss, roc_curve, roc_auc_score, confusion_matrix, recall_score
from sklearn.model_selection import train_test_split
def import_data():
    filename = "data/dds_ch5_binary-class-dataset.tsv"
    header = None
    data = []
    labels = []
    with open(filename) as fl:
        reader = csv.reader(fl, delimiter="\t", quoting=csv.QUOTE_MINIMAL)
        header = next(reader)
        for row in reader:
            # All columns are floats; the class label is the last column.
            data.append([float(value) for value in row[:-1]])
            labels.append(float(row[-1]))
    return header, np.array(data), np.array(labels)
def split_by_label(data, labels, test_split_fraction):
    """
    Randomly split the data and labels into training and test sets, but do the
    split per label so that the relative class sizes are preserved.
    """
    from collections import defaultdict
    import random

    label_indices = defaultdict(list)
    for idx, label in enumerate(labels):
        label_indices[label].append(idx)

    test_indices = []
    train_indices = []
    for label, indices in label_indices.items():
        shuffled_indices = indices[:]
        random.shuffle(shuffled_indices)
        test_size = int(len(indices) * test_split_fraction)
        test_indices.extend(shuffled_indices[:test_size])
        train_indices.extend(shuffled_indices[test_size:])

    test_labels = labels[test_indices]
    test_data = data[test_indices, :]
    train_labels = labels[train_indices]
    train_data = data[train_indices, :]
    return train_data, test_data, train_labels, test_labels
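
# A quick usage sketch for split_by_label (the class ratio is hypothetical,
# not from the actual dataset): because the split is done per class, the
# train and test sets keep roughly the same class balance as the full
# dataset, which a plain random split does not guarantee for a small
# minority class.
#
#   train_X, test_X, train_y, test_y = split_by_label(data, labels, 0.333)
#   # np.mean(train_y) and np.mean(test_y) should both be close to
#   # np.mean(labels), i.e. the original class balance.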
def upsample(data, labels):
    """
    Use upsampling to balance the classes. Every data point is included at
    least once, and additional data points are added by sampling with
    replacement until each class matches the size of the largest class.
    """
    from collections import defaultdict
    import random

    label_indices = defaultdict(list)
    for idx, label in enumerate(labels):
        label_indices[label].append(idx)

    largest_class_size = max(len(indices) for indices in label_indices.values())

    upsampled_indices = []
    for label, indices in label_indices.items():
        sampled_indices = indices[:]
        while len(sampled_indices) < largest_class_size:
            sampled_indices.append(random.choice(indices))
        upsampled_indices.extend(sampled_indices)

    upsampled_labels = labels[upsampled_indices]
    upsampled_data = data[upsampled_indices, :]
    return upsampled_data, upsampled_labels
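
# Illustrative effect of upsample (counts hypothetical): given 900 examples
# of class 0 and 100 of class 1, the output keeps all 1000 original rows and
# adds 800 class-1 rows drawn with replacement, yielding 900 rows per class.
#
#   balanced_X, balanced_y = upsample(train_X, train_y)
#   np.bincount(balanced_y.astype(int))  # -> array([900, 900])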
if __name__ == "__main__":
    headers, data, labels = import_data()

    # Initial LR model: plain random splits of the imbalanced data
    roc_scores = []
    log_losses = []
    recall_scores = []
    plt.clf()
    plt.subplot(1, 2, 1)
    for i in range(20):
        train_data, test_data, train_labels, test_labels = train_test_split(
            data, labels, test_size=0.33)
        lr = LogisticRegression()
        lr.fit(train_data, train_labels)
        test_pred_proba = lr.predict_proba(test_data)
        test_pred_labels = lr.predict(test_data)
        cm = confusion_matrix(test_labels, test_pred_labels)
        recall_scores.append(recall_score(test_labels, test_pred_labels))
        log_losses.append(log_loss(test_labels, test_pred_proba))
        fpr, tpr, _ = roc_curve(test_labels, test_pred_proba[:, 1], pos_label=1)
        roc_scores.append(roc_auc_score(test_labels, test_pred_proba[:, 1]))
        plt.plot(fpr, tpr)
    plt.xlabel("False Positive Rate", fontsize=16)
    plt.ylabel("True Positive Rate", fontsize=16)
    plt.title("Initial", fontsize=18)

    print(cm)
    print()
    print("LR Average AUC:", np.mean(roc_scores))
    print("LR Std AUC:", np.std(roc_scores))
    print()
    print("LR Average log loss:", np.mean(log_losses))
    print("LR Std log loss:", np.std(log_losses))
    print()

    # Model based on data split by class and upsampled
    roc_scores = []
    log_losses = []
    recall_scores = []
    plt.subplot(1, 2, 2)
    for i in range(20):
        train_data, test_data, train_labels, test_labels = split_by_label(data, labels, 0.333)
        train_data, train_labels = upsample(train_data, train_labels)
        test_data, test_labels = upsample(test_data, test_labels)
        lr = LogisticRegression()
        lr.fit(train_data, train_labels)
        test_pred_proba = lr.predict_proba(test_data)
        test_pred_labels = lr.predict(test_data)
        cm = confusion_matrix(test_labels, test_pred_labels)
        recall_scores.append(recall_score(test_labels, test_pred_labels))
        log_losses.append(log_loss(test_labels, test_pred_proba))
        fpr, tpr, _ = roc_curve(test_labels, test_pred_proba[:, 1], pos_label=1)
        roc_scores.append(roc_auc_score(test_labels, test_pred_proba[:, 1]))
        plt.plot(fpr, tpr)
    plt.xlabel("False Positive Rate", fontsize=16)
    plt.title("Split By Class+Upsampling", fontsize=18)

    print(cm)
    print()
    print("LR Average AUC:", np.mean(roc_scores))
    print("LR Std AUC:", np.std(roc_scores))
    print()
    print("LR Average log loss:", np.mean(log_losses))
    print("LR Std log loss:", np.std(log_losses))

    plt.savefig("roc_curves.png", dpi=300)
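
# The script writes a single figure, roc_curves.png, with two panels of 20
# overlaid ROC curves each: the left panel for models trained and evaluated
# on plain random splits, the right panel for per-class splits with
# upsampling applied to both the training and test sets.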