LendingClub
import bigml, sys, csv, time
from bigml.api import BigML
api = BigML(dev_mode=True)
source = api.create_source("s3://bigml-public/csv/lc_sample.csv.gz", { "name": "LC Source"} )
api.ok(source)
source = api.get_source(source)
lc_dataset = api.create_dataset(source, { "name": "LC Dataset" })
api.ok(lc_dataset)
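# Optional sanity check (not part of the original gist): list the dataset's
# field names and types once it is ready.
lc_dataset_info = api.get_dataset(lc_dataset)
for field_id, field in lc_dataset_info['object']['fields'].items():
    print("%s: %s (%s)" % (field_id, field['name'], field['optype']))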
closed_dataset = api.create_dataset(lc_dataset, {
    "name": "LC Closed",
    "json_filter": [ "or",
        [ "=", [ "field", "loan_status" ], "Fully Paid" ],
        [ "=", [ "field", "loan_status" ], "Charged Off" ],
        [ "=", [ "field", "loan_status" ], "Default" ]
    ]
})
open_dataset = api.create_dataset(lc_dataset, {
    "name": "LC Open",
    "json_filter": [ "or",
        [ "=", [ "field", "loan_status" ], "Current" ],
        [ "=", [ "field", "loan_status" ], "In Grace Period" ],
        [ "=", [ "field", "loan_status" ], "Late (16-30 days)" ],
        [ "=", [ "field", "loan_status" ], "Late (31-120 days)" ]
    ]
})
api.ok(closed_dataset)
closed_dataset_quality = api.create_dataset(closed_dataset, {
    "name": "LC Closed Quality Pre",
    "new_fields": [
        {
            "field": '[ "if", [ "=", [ "field", "loan_status" ], "Fully Paid" ], "good", "bad" ]',
            "name": "quality"
        }
    ]
})
api.ok(closed_dataset_quality)
closed_dataset_quality = api.create_dataset(closed_dataset_quality, {
    "name": "LC Closed Quality",
    "excluded_fields": [ "loan_status" ]
})
api.ok(closed_dataset_quality)
# Create the single tree for exploring
cq_model = api.create_model(closed_dataset_quality, {
    "name": "LC Closed Quality Model"
})
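# Optional (not in the original gist): the single tree can also be queried for
# an individual prediction. The input field names below are hypothetical; use
# the actual column names from the LendingClub CSV.
api.ok(cq_model)
sample_prediction = api.create_prediction(cq_model, { "annual_inc": 60000, "dti": 18.5 })
api.ok(sample_prediction)
print(sample_prediction['object']['output'])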
# Create a training/test split
cq_train = api.create_dataset(closed_dataset_quality, {
    "sample_rate": 0.80,
    "out_of_bag": False,
    "seed": "BigML",
    "name": "LC Closed Quality Train"
})
cq_test = api.create_dataset(closed_dataset_quality, {
    "sample_rate": 0.80,
    "out_of_bag": True,
    "seed": "BigML",
    "name": "LC Closed Quality Test"
})
api.ok(cq_train)
api.ok(cq_test)
# Build a model and evaluate it
cq_train_model = api.create_model(cq_train, {
    "name": "LC Closed Quality Train Model"
})
api.ok(cq_train_model)
cq_train_eval = api.create_evaluation(cq_train_model, cq_test, {
    "name": "LC Closed Quality Train Eval"
})
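# Optional (not in the original gist): once the evaluation finishes, its
# accuracy and phi coefficient can be read from the 'result' section of the
# resource, which is a quick way to compare the models built below.
api.ok(cq_train_eval)
print(cq_train_eval['object']['result']['model']['accuracy'])
print(cq_train_eval['object']['result']['model']['average_phi'])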
# Build a model with objective balancing and evaluate it
cq_train_balanced_model = api.create_model(cq_train, {
    "name": "LC Closed Quality Train Balanced Model",
    "balance_objective": True
})
api.ok(cq_train_balanced_model)
cq_train_balanced_eval = api.create_evaluation(cq_train_balanced_model, cq_test, {
    "name": "LC Closed Quality Train Balanced Model Eval"
})
# Now try an RDF (random decision forest) with a balanced objective
cq_train_balanced_rdf = api.create_ensemble(cq_train, {
    "name": "LC Closed Quality Train Balanced RDF",
    "balance_objective": True,
    "randomize": True
})
api.ok(cq_train_balanced_rdf)
cq_train_balanced_rdf_eval = api.create_evaluation(cq_train_balanced_rdf, cq_test, {
    "name": "LC Closed Quality Train Balanced RDF Eval"
})
# Run a second evaluation with the threshold combiner
cq_train_rdf_threshold_eval = api.create_evaluation(cq_train_balanced_rdf, cq_test, {
    "name": "LC Closed Quality Train Balanced RDF Threshold Eval",
    "combiner": 3,
    "threshold": { "k": 9, "class": "good" }
})
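# Optional (not in the original gist): comparing the plain-vote and threshold
# evaluations side by side shows the precision/recall trade-off introduced by
# requiring at least 9 trees to vote "good".
api.ok(cq_train_balanced_rdf_eval)
api.ok(cq_train_rdf_threshold_eval)
for ev in (cq_train_balanced_rdf_eval, cq_train_rdf_threshold_eval):
    result = ev['object']['result']['model']
    print("%s: accuracy=%s phi=%s" % (ev['object']['name'], result['accuracy'], result['average_phi']))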
# Return to the full dataset, and use a batch prediction to score the open dataset
cq_model = api.create_ensemble(closed_dataset_quality, {
    "name": "LC Closed Ensemble",
    "balance_objective": True,
    "randomize": True
})
api.ok(cq_model)
results_batch_prediction = api.create_batch_prediction(cq_model, open_dataset, {
    "name": "LC Result Quality",
    "combiner": 3,
    "threshold": { "k": 9, "class": "good" },
    "output_dataset": True
})
time.sleep(5)
api.ok(results_batch_prediction)
results_quality = api.get_dataset(results_batch_prediction['object']['output_dataset_resource'])
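# Optional (not in the original gist): the batch prediction can also be pulled
# down as a local CSV for inspection; the filename is arbitrary.
api.download_batch_prediction(results_batch_prediction, filename='lc_open_scored.csv')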
closed_anomaly = api.create_anomaly(closed_dataset, {
    "excluded_fields": [ "emp_title", "desc", "title" ]
})
time.sleep(5)
api.ok(closed_anomaly)
results_batch_anomaly = api.create_batch_anomaly_score(closed_anomaly, results_quality, {
    "name": "LC Result Quality Score",
    "output_dataset": True
})
api.ok(results_batch_anomaly)
results_quality_anomaly = api.get_dataset(results_batch_anomaly['object']['output_dataset_resource'])
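# Optional (not in the original gist): make sure the scored dataset is ready
# and check how many rows it contains before clustering it below.
api.ok(results_quality_anomaly)
print(results_quality_anomaly['object']['rows'])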
# Now we transform the open set to label trouble loans
open_dataset_trouble = api.create_dataset(open_dataset, {
    "name": "LC Open Trouble Pre",
    "new_fields": [
        {
            "field": '''[ "or",
                [ "!=", [ "field", "loan_status" ], "Current" ],
                [ "!=", [ "field", "total_rec_late_fee" ], 0 ]
            ]''',
            "name": "trouble"
        }
    ]
})
api.ok(open_dataset_trouble)
open_dataset_trouble = api.create_dataset(open_dataset_trouble, {
    "name": "LC Open Trouble",
    "excluded_fields": [ "loan_status" ]
})
api.ok(open_dataset_trouble)
api.ok(open_dataset_trouble)
open_trouble_cluster = api.create_cluster(open_dataset_trouble, {
    "name": "LC Open Trouble Cluster",
    "k": 10,
    "summary_fields": [ "00001a" ],
    "excluded_fields": [ "000018" ]
})
api.ok(open_trouble_cluster)
# Start of a hack to relabel the clusters with their trouble density
cluster = open_trouble_cluster
dataset = cluster['object']['dataset']
centroids = api.create_batch_centroid(cluster, dataset, { "output_fields": [ "10001a" ]})
api.ok(centroids)
data = csv.reader(api.download_batch_centroid(centroids))
# Count trouble loans and total loans per centroid
counts = {}
for row in data:
    if row[1] not in counts:
        counts[row[1]] = [0, 0]
    if row[0] == 'true':
        counts[row[1]][0] += 1
    counts[row[1]][1] += 1
# Rename each cluster to the percentage of its loans flagged as trouble
for group in cluster['object']['clusters']['clusters']:
    perc = float(counts[group['name']][0]) / float(counts[group['name']][1]) * 100
    api.update_cluster(cluster, {"clusters": {group['id']: { "name": "%f Trouble" % perc }}})
# Finally, add the trouble label with a batch centroid
results_quality_anomaly_trouble = api.create_batch_centroid(open_trouble_cluster, results_quality_anomaly, {
    "name": "LC Result Quality Score Trouble"
})
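# Optional (not in the original gist): download the final labeled output once
# the batch centroid finishes; the filename is arbitrary.
api.ok(results_quality_anomaly_trouble)
api.download_batch_centroid(results_quality_anomaly_trouble, filename='lc_open_quality_trouble.csv')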