@rnowling
Created August 27, 2016 06:25
Imbalanced Dataset Logistic Regression Model Comparison
"""
Script comparing logistic regression models, trained with and without class rebalancing, and their associated evaluation metrics on the imbalanced Media 6 Degrees dataset from the book Doing Data Science. You'll need to download a copy of the dataset from the book's GitHub repo: https://github.com/oreillymedia/doing_data_science .
Copyright 2016 Ronald J. Nowling
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
import csv

import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss, roc_curve, roc_auc_score, confusion_matrix, recall_score
from sklearn.model_selection import train_test_split
def import_data():
    filename = "data/dds_ch5_binary-class-dataset.tsv"
    header = None
    data = []
    labels = []
    with open(filename) as fl:
        reader = csv.reader(fl, delimiter="\t", quoting=csv.QUOTE_MINIMAL)
        header = next(reader)
        for row in reader:
            # All columns are floats; the class label is the last column.
            data.append([float(value) for value in row[:-1]])
            labels.append(float(row[-1]))
    return header, np.array(data), np.array(labels)
def split_by_label(data, labels, test_split_fraction):
    """
    Randomly split the data and labels into training and test sets, but do the
    split per label so that the relative class sizes are preserved.
    """
    from collections import defaultdict
    import random

    label_indices = defaultdict(list)
    for idx, label in enumerate(labels):
        label_indices[label].append(idx)

    test_indices = []
    train_indices = []
    for label, indices in label_indices.items():
        shuffled_indices = indices[:]
        random.shuffle(shuffled_indices)
        test_size = int(len(indices) * test_split_fraction)
        test_indices.extend(shuffled_indices[:test_size])
        train_indices.extend(shuffled_indices[test_size:])

    test_labels = labels[test_indices]
    test_data = data[test_indices, :]
    train_labels = labels[train_indices]
    train_data = data[train_indices, :]
    return train_data, test_data, train_labels, test_labels
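
# A quick usage sketch for split_by_label (the class ratio is hypothetical,
# not from the actual dataset): because the split is done per class, the
# train and test sets keep roughly the same class balance as the full
# dataset, which a plain random split does not guarantee for a small
# minority class.
#
#   train_X, test_X, train_y, test_y = split_by_label(data, labels, 0.333)
#   # np.mean(train_y) and np.mean(test_y) should both be close to
#   # np.mean(labels), i.e. the original class balance.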
def upsample(data, labels):
    """
    Use upsampling to balance the classes. Every data point is included at
    least once, and additional data points are added by sampling with
    replacement until each class matches the size of the largest class.
    """
    from collections import defaultdict
    import random

    label_indices = defaultdict(list)
    for idx, label in enumerate(labels):
        label_indices[label].append(idx)

    largest_class_size = max(len(indices) for indices in label_indices.values())

    upsampled_indices = []
    for label, indices in label_indices.items():
        sampled_indices = indices[:]
        while len(sampled_indices) < largest_class_size:
            sampled_indices.append(random.choice(indices))
        upsampled_indices.extend(sampled_indices)

    upsampled_labels = labels[upsampled_indices]
    upsampled_data = data[upsampled_indices, :]
    return upsampled_data, upsampled_labels
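
# Illustrative effect of upsample (counts hypothetical): given 900 examples
# of class 0 and 100 of class 1, the output keeps all 1000 original rows and
# adds 800 class-1 rows drawn with replacement, yielding 900 rows per class.
#
#   balanced_X, balanced_y = upsample(train_X, train_y)
#   np.bincount(balanced_y.astype(int))  # -> array([900, 900])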
if __name__ == "__main__":
    headers, data, labels = import_data()

    # Initial LR model: plain random splits of the imbalanced data
    roc_scores = []
    log_losses = []
    recall_scores = []
    plt.clf()
    plt.subplot(1, 2, 1)
    for i in range(20):
        train_data, test_data, train_labels, test_labels = train_test_split(
            data, labels, test_size=0.33)
        lr = LogisticRegression()
        lr.fit(train_data, train_labels)
        test_pred_proba = lr.predict_proba(test_data)
        test_pred_labels = lr.predict(test_data)
        cm = confusion_matrix(test_labels, test_pred_labels)
        recall_scores.append(recall_score(test_labels, test_pred_labels))
        log_losses.append(log_loss(test_labels, test_pred_proba))
        fpr, tpr, _ = roc_curve(test_labels, test_pred_proba[:, 1], pos_label=1)
        roc_scores.append(roc_auc_score(test_labels, test_pred_proba[:, 1]))
        plt.plot(fpr, tpr)
    plt.xlabel("False Positive Rate", fontsize=16)
    plt.ylabel("True Positive Rate", fontsize=16)
    plt.title("Initial", fontsize=18)

    print(cm)
    print()
    print("LR Average AUC:", np.mean(roc_scores))
    print("LR Std AUC:", np.std(roc_scores))
    print()
    print("LR Average log loss:", np.mean(log_losses))
    print("LR Std log loss:", np.std(log_losses))
    print()

    # Model based on data split by class and upsampled
    roc_scores = []
    log_losses = []
    recall_scores = []
    plt.subplot(1, 2, 2)
    for i in range(20):
        train_data, test_data, train_labels, test_labels = split_by_label(data, labels, 0.333)
        train_data, train_labels = upsample(train_data, train_labels)
        test_data, test_labels = upsample(test_data, test_labels)
        lr = LogisticRegression()
        lr.fit(train_data, train_labels)
        test_pred_proba = lr.predict_proba(test_data)
        test_pred_labels = lr.predict(test_data)
        cm = confusion_matrix(test_labels, test_pred_labels)
        recall_scores.append(recall_score(test_labels, test_pred_labels))
        log_losses.append(log_loss(test_labels, test_pred_proba))
        fpr, tpr, _ = roc_curve(test_labels, test_pred_proba[:, 1], pos_label=1)
        roc_scores.append(roc_auc_score(test_labels, test_pred_proba[:, 1]))
        plt.plot(fpr, tpr)
    plt.xlabel("False Positive Rate", fontsize=16)
    plt.title("Split By Class+Upsampling", fontsize=18)

    print(cm)
    print()
    print("LR Average AUC:", np.mean(roc_scores))
    print("LR Std AUC:", np.std(roc_scores))
    print()
    print("LR Average log loss:", np.mean(log_losses))
    print("LR Std log loss:", np.std(log_losses))

    plt.savefig("roc_curves.png", dpi=300)
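
# The script writes a single figure, roc_curves.png, with two panels of 20
# overlaid ROC curves each: the left panel for models trained and evaluated
# on plain random splits, the right panel for per-class splits with
# upsampling applied to both the training and test sets.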