Created
November 18, 2014 11:21
-
-
Save petersen-poul/b460ac3a547f8cad2078 to your computer and use it in GitHub Desktop.
LendingClub
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# BigML workflow for scoring LendingClub loans.
# Requires the `bigml` Python bindings and BigML credentials available to
# the client (typically BIGML_USERNAME / BIGML_API_KEY in the environment).
import bigml, sys, csv, time
from bigml.api import BigML

# dev_mode targets BigML's development environment.
api = BigML(dev_mode=True)

# Create a source from the public LendingClub sample and wait for it to finish.
source = api.create_source("s3://bigml-public/csv/lc_sample.csv.gz", { "name": "LC Source"} )
api.ok(source)
# BUG FIX: the original assigned the refreshed source to a misspelled name
# (`soure`), silently discarding it; refresh `source` itself.
source = api.get_source(source)

# Raw dataset over every row of the source.
lc_dataset = api.create_dataset(source, { "name": "LC Dataset" })
api.ok(lc_dataset)
# Split the raw dataset into loans with a known outcome ("closed") and
# loans still in flight ("open"), via JSON filters on loan_status.
closed_dataset = api.create_dataset(lc_dataset, {
    "name": "LC Closed",
    "json_filter": [ "or",
        [ "=", [ "field", "loan_status" ], "Fully Paid" ],
        [ "=", [ "field", "loan_status" ], "Charged Off" ],
        [ "=", [ "field", "loan_status" ], "Default" ]
    ]
})
open_dataset = api.create_dataset(lc_dataset, {
    "name": "LC Open",
    "json_filter": [ "or",
        [ "=", [ "field", "loan_status" ], "Current" ],
        [ "=", [ "field", "loan_status" ], "In Grace Period" ],
        [ "=", [ "field", "loan_status" ], "Late (16-30 days)" ],
        [ "=", [ "field", "loan_status" ], "Late (31-120 days)" ]
    ]
})
api.ok(closed_dataset)
# FIX: the original never waited on open_dataset, which is consumed by a
# batch prediction later in the script; wait for it here as well.
api.ok(open_dataset)
# Label each closed loan "good" (fully paid) or "bad" (anything else) in a
# new "quality" field, then drop loan_status so a model can't trivially
# recover the label from it.
closed_dataset_quality = api.create_dataset(closed_dataset, {
    "name": "LC Closed Quality Pre",
    "new_fields": [
        {
            # Flatline expression computed per row by BigML.
            "field": '[ "if", [ "=", [ "field", "loan_status" ], "Fully Paid" ], "good", "bad" ]',
            "name": "quality"
        }
    ]
})
api.ok(closed_dataset_quality)
closed_dataset_quality = api.create_dataset(closed_dataset_quality, {
    "name": "LC Closed Quality",
    "excluded_fields": [ "loan_status" ]
})
api.ok(closed_dataset_quality)
# Create the single tree for exploring (typo "eploring" fixed).
cq_model = api.create_model(closed_dataset_quality, {
    "name": "LC Closed Quality Model"
})
# Deterministic 80/20 train/test split: the same sample_rate and seed with
# out_of_bag toggled yield complementary row subsets.
cq_train = api.create_dataset(closed_dataset_quality, {
    "sample_rate": 0.80,
    "out_of_bag": False,
    "seed": "BigML",
    "name": "LC Closed Quality Train"
})
cq_test = api.create_dataset(closed_dataset_quality, {
    "sample_rate": 0.80,
    "out_of_bag": True,
    "seed": "BigML",
    "name": "LC Closed Quality Test"
})
api.ok(cq_train)
# FIX: cq_test is used by the evaluations below but was never waited on.
api.ok(cq_test)
# Build a model on the training split and evaluate it on the test split.
cq_train_model = api.create_model(cq_train, {
    "name": "LC Closed Quality Train Model"
})
api.ok(cq_train_model)
cq_train_eval = api.create_evaluation(cq_train_model, cq_test, {
    "name": "LC Closed Quality Train Eval"
})
# Same, but with objective balancing (presumably to counter class
# imbalance in the quality label -- confirm against the dataset).
cq_train_balanced_model = api.create_model(cq_train, {
    "name": "LC Closed Quality Train Balanced Model",
    "balance_objective": True
})
api.ok(cq_train_balanced_model)
cq_train_balanced_eval = api.create_evaluation(cq_train_balanced_model, cq_test, {
    "name": "LC Closed Quality Train Balanced Model Eval"
})
# Now try a random decision forest (ensemble with randomize) with a
# balanced objective, evaluated two ways.
cq_train_balanced_rdf = api.create_ensemble(cq_train, {
    "name": "LC Closed Quality Train Balanced RDF",
    "balance_objective": True,
    "randomize": True
})
api.ok(cq_train_balanced_rdf)
cq_train_balanced_rdf_eval = api.create_evaluation(cq_train_balanced_rdf, cq_test, {
    "name": "LC Closed Quality Train Balanced RDF Eval"
})
# Second evaluation with the threshold combiner (combiner=3): predict the
# "good" class only when at least k=9 of the ensemble's trees agree.
cq_train_rdf_threshold_eval = api.create_evaluation(cq_train_balanced_rdf, cq_test, {
    "name": "LC Closed Quality Train Balanced RDF Threshold Eval",
    "combiner": 3,
    "threshold": { "k": 9, "class": "good" }
})
# Return to the full closed dataset, train a balanced RDF on all of it,
# and use a batch prediction to score the open dataset.
# NOTE: this rebinds cq_model, shadowing the single tree created earlier.
cq_model = api.create_ensemble(closed_dataset_quality, {
    "name": "LC Closed Ensemble",
    "balance_objective": True,
    "randomize": True
})
api.ok(cq_model)
results_batch_prediction = api.create_batch_prediction(cq_model, open_dataset, {
    "name": "LC Result Quality",
    "combiner": 3,
    "threshold": { "k": 9, "class": "good" },
    "output_dataset": True
})
# HACK: brief pause before polling; api.ok should normally suffice to
# wait for the batch prediction on its own.
time.sleep(5)
api.ok(results_batch_prediction)
# Fetch the dataset that the batch prediction wrote its output into.
results_quality = api.get_dataset(results_batch_prediction['object']['output_dataset_resource'])
# Train an anomaly detector on the closed loans (free-text fields
# excluded) and score the predicted open loans with it.
closed_anomaly = api.create_anomaly(closed_dataset, {
    "excluded_fields": [ "emp_title", "desc", "title" ]
})
# HACK: brief pause before polling; api.ok should normally suffice alone.
time.sleep(5)
api.ok(closed_anomaly)
results_batch_anomaly = api.create_batch_anomaly_score(closed_anomaly, results_quality, {
    "name": "LC Result Quality Score",
    "output_dataset": True
})
api.ok(results_batch_anomaly)
# Dataset of open-loan predictions annotated with anomaly scores.
results_quality_anomaly = api.get_dataset(results_batch_anomaly['object']['output_dataset_resource'])
# Transform the open set to label "trouble" loans: status is no longer
# Current, or any late fee has been recorded. Then drop loan_status.
open_dataset_trouble = api.create_dataset(open_dataset, {
    "name": "LC Open Trouble Pre",
    "new_fields": [
        {
            # Flatline expression; kept byte-for-byte from the original.
            "field": '''[ "or",
[ "!=", [ "field", "loan_status" ], "Current" ],
[ "!=", [ "field", "total_rec_late_fee" ], 0 ]
]''',
            "name": "trouble"
        }
    ]
})
api.ok(open_dataset_trouble)
open_dataset_trouble = api.create_dataset(open_dataset_trouble, {
    "name": "LC Open Trouble",
    "excluded_fields": [ "loan_status" ]
})
api.ok(open_dataset_trouble)
# Cluster the open/trouble dataset into k=10 groups. The ids are opaque
# BigML field ids; presumably "00001a" is the trouble label (kept as a
# summary field so it does not drive the clustering) and "000018" is
# excluded entirely -- TODO confirm against the dataset's field list.
open_trouble_cluster = api.create_cluster(open_dataset_trouble, {
    "name": "LC Open Trouble Cluster",
    "k": 10,
    "summary_fields": [ "00001a" ],
    "excluded_fields": [ "000018" ]
})
api.ok(open_trouble_cluster)

# Start hack to relabel each cluster with its trouble density: run a batch
# centroid over the cluster's own training dataset, tally trouble rows per
# centroid, and rename the clusters to the resulting percentage.
cluster = open_trouble_cluster
dataset = cluster['object']['dataset']
# NOTE(review): "10001a" does not match the "00001a"/"000018" id style
# used above -- possibly a typo for "00001a"; verify before relying on it.
centroids = api.create_batch_centroid(cluster, dataset, { "output_fields": [ "10001a" ]})
api.ok(centroids)
# Each CSV row is [trouble_flag, centroid_name] (assumed -- a header row,
# if present, would add one bogus key; confirm the download format).
data = csv.reader(api.download_batch_centroid(centroids))
counts = {}
for row in data:
    if row[1] not in counts:
        counts[row[1]] = [0, 0]
    if row[0] == 'true':
        counts[row[1]][0] += 1   # trouble rows
    counts[row[1]][1] += 1       # total rows
for group in cluster['object']['clusters']['clusters']:
    tally = counts.get(group['name'])
    if not tally:
        # FIX: skip clusters that received no rows instead of raising KeyError.
        continue
    perc = float(tally[0]) / float(tally[1]) * 100
    api.update_cluster(cluster, {"clusters": {group['id']: { "name": "%f Trouble" % perc }}})
# Finally, attach the (density-renamed) cluster assignments to the scored,
# anomaly-annotated open loans with one last batch centroid.
results_quality_anomaly_trouble = api.create_batch_centroid(open_trouble_cluster, results_quality_anomaly, {
    "name": "LC Result Quality Score Trouble"
})
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment