Created
February 2, 2018 14:38
-
-
Save rajatpaliwal/229dbe75b4933254534bf00648aed40a to your computer and use it in GitHub Desktop.
Finding Donors for CharityML ("ValueError problem")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Import libraries necessary for this project | |
import numpy as np | |
import pandas as pd | |
from time import time | |
from IPython.display import display # Allows the use of display() for DataFrames | |
# Import supplementary visualization code visuals.py | |
import visuals as vs | |
# Pretty display for notebooks | |
%matplotlib inline | |
# Load the Census dataset | |
data = pd.read_csv("census.csv") | |
# Success - Display the first record | |
display(data.head(n=10)) | |
# Total number of records
n_records = len(data)

# Number of records where individual's income is more than $50,000.
# A boolean sum is robust: it returns 0 (instead of raising KeyError)
# if a class happens to be absent from the data.
n_greater_50k = int((data['income'] == '>50K').sum())

# Number of records where individual's income is at most $50,000
n_at_most_50k = int((data['income'] == '<=50K').sum())

# Percentage of individuals whose income is more than $50,000
greater_percent = 100.0 * n_greater_50k / n_records

# Print the results
print("Total number of records: {}".format(n_records))
print("Individuals making more than $50,000: {}".format(n_greater_50k))
print("Individuals making at most $50,000: {}".format(n_at_most_50k))
print("Percentage of individuals making more than $50,000: {}%".format(greater_percent))
# Split the data into features and target label
income_raw = data['income']
features_raw = data.drop('income', axis=1)

# Visualize skewed continuous features of original data
vs.distribution(data)

# Log-transform the skewed features. log1p(x) == log(x + 1) but is more
# numerically accurate for small x, and keeps zero values at zero.
skewed = ['capital-gain', 'capital-loss']
features_log_transformed = pd.DataFrame(data=features_raw)
features_log_transformed[skewed] = features_raw[skewed].apply(lambda x: np.log1p(x))

# Visualize the new log distributions
vs.distribution(features_log_transformed, transformed=True)
# Import sklearn.preprocessing.MinMaxScaler
from sklearn.preprocessing import MinMaxScaler

# Initialize a scaler, then apply it to the numerical features
scaler = MinMaxScaler()  # default feature_range=(0, 1)
numerical = ['age', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']

# Take an explicit copy so the scaled frame cannot alias (and mutate)
# the log-transformed frame it was built from.
features_log_minmax_transform = features_log_transformed.copy()
features_log_minmax_transform[numerical] = scaler.fit_transform(features_log_transformed[numerical])

# Show an example of a record with scaling applied
display(features_log_minmax_transform.head(n=5))
# One-hot encode the 'features_log_minmax_transform' data using pandas.get_dummies()
features_final = pd.get_dummies(features_log_minmax_transform)

# Encode the 'income_raw' labels to numerical values: 0 for '<=50K', 1 for '>50K'.
# NOTE: pd.get_dummies(income_raw) would produce a TWO-column DataFrame instead
# of a 1-D target, which later raises a ValueError in the sklearn metric calls.
income = income_raw.map({'<=50K': 0, '>50K': 1})

# Print the number of features after one-hot encoding
encoded = list(features_final.columns)
print("{} total features after one-hot encoding.".format(len(encoded)))

# Uncomment the following line to see the encoded feature names
# print(encoded[:10])
# Import train_test_split.
# sklearn.cross_validation was deprecated and removed in scikit-learn 0.20;
# the module is now sklearn.model_selection.
from sklearn.model_selection import train_test_split

# Split the 'features' and 'income' data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features_final,
                                                    income,
                                                    test_size=0.2,
                                                    random_state=0)

# Show the results of the split
print("Training set has {} samples.".format(X_train.shape[0]))
print("Testing set has {} samples.".format(X_test.shape[0]))
# Naive predictor: always predict that an individual makes more than $50,000.
# Note that 'income' is the 'income_raw' data encoded to numerical values
# (1 for '>50K', 0 for '<=50K') in the data preprocessing step.
TP = np.sum(income)       # every positive is predicted positive
FP = income.count() - TP  # every negative is a false positive in the naive case
TN = 0                    # no predicted negatives in the naive case
FN = 0                    # no predicted negatives in the naive case

# Calculate accuracy, precision and recall directly from the counts above
# (rather than from the unrelated exploration-step variable greater_percent).
accuracy = float(TP) / (TP + FP + TN + FN)
recall = 1.0              # TP / (TP + FN) with FN == 0
precision = accuracy      # TP / (TP + FP) coincides with accuracy here

# F-score with beta = 0.5 (weights precision higher than recall)
beta = 0.5
fscore = (1 + beta ** 2) * precision * recall / ((beta ** 2) * precision + recall)

# Print the results
print("Naive Predictor: [Accuracy score: {:.4f}, F-score: {:.4f}]".format(accuracy, fscore))
# Import two metrics from sklearn - fbeta_score and accuracy_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import fbeta_score

def train_predict(learner, sample_size, X_train, y_train, X_test, y_test):
    '''
    Train 'learner' on a slice of the training data and report timing and metrics.

    inputs:
       - learner: the learning algorithm to be trained and predicted on
       - sample_size: the size of samples (number) to be drawn from training set
       - X_train: features training set
       - y_train: income training set
       - X_test: features testing set
       - y_test: income testing set

    returns:
       - results: dict with 'train_time', 'pred_time', 'acc_train', 'acc_test',
         'f_train' and 'f_test' (all scalars)
    '''
    results = {}

    # Fit the learner to the first 'sample_size' training examples, timing the fit
    start = time()
    learner = learner.fit(X_train[:sample_size], y_train[:sample_size])
    end = time()
    results['train_time'] = end - start

    # Get predictions on the test set, then on the first 300 training samples,
    # timing the total prediction work
    start = time()
    predictions_test = learner.predict(X_test)
    predictions_train = learner.predict(X_train[:300])
    end = time()
    results['pred_time'] = end - start

    # Accuracy on the first 300 training samples and on the test set
    results['acc_train'] = accuracy_score(y_train[:300], predictions_train)
    results['acc_test'] = accuracy_score(y_test, predictions_test)

    # F-beta scores with beta = 0.5.
    # NOTE: average=None would return a per-class ARRAY instead of a single
    # number and breaks the downstream vs.evaluate() plotting; the default
    # binary average yields the scalar score for the positive class.
    results['f_train'] = fbeta_score(y_train[:300], predictions_train, beta=0.5)
    results['f_test'] = fbeta_score(y_test, predictions_test, beta=0.5)

    # Success
    print("{} trained on {} samples.".format(learner.__class__.__name__, sample_size))

    # Return the results
    return results
# Import the three supervised learning models from sklearn
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

# Initialize the three models, seeding every stochastic model so that
# repeated runs of the comparison are reproducible
clf_A = SVC(random_state=0)
clf_B = KNeighborsClassifier()
clf_C = tree.DecisionTreeClassifier(random_state=0)

# Calculate the number of samples for 1%, 10%, and 100% of the training data.
# int() truncation keeps the slice sizes valid index counts.
samples_100 = len(X_train)
samples_10 = int(samples_100 / 10)
samples_1 = int(samples_100 / 100)

# Collect results on the learners: results[model_name][i] holds the metrics
# for the i-th training-set size (0 -> 1%, 1 -> 10%, 2 -> 100%)
results = {}
for clf in [clf_A, clf_B, clf_C]:
    clf_name = clf.__class__.__name__
    results[clf_name] = {}
    for i, samples in enumerate([samples_1, samples_10, samples_100]):
        results[clf_name][i] = train_predict(clf, samples, X_train, y_train, X_test, y_test)

# Run metrics visualization for the three supervised learning models chosen
vs.evaluate(results, accuracy, fscore)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment