Skip to content

Instantly share code, notes, and snippets.

@satomacoto
Last active March 20, 2017 23:34
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save satomacoto/1987885e6604ce4dcbb5 to your computer and use it in GitHub Desktop.
Save satomacoto/1987885e6604ce4dcbb5 to your computer and use it in GitHub Desktop.
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
from sklearn.datasets import load_digits
from sklearn import grid_search, metrics
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.linear_model import LogisticRegression, RidgeClassifier, PassiveAggressiveRegressor, ElasticNet
from sklearn.ensemble import GradientBoostingRegressor, ExtraTreesRegressor, RandomForestRegressor
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.cluster import KMeans
# Stage 1: two KMeans transformers (2 and 4 clusters) combined in one union.
features = FeatureUnion(transformer_list=[
    ('km_2', KMeans(n_clusters=2)),
    ('km_4', KMeans(n_clusters=4)),
])

# Stage 2: each model is wrapped in ModelTransformer so that its predictions
# are exposed through ``transform`` and can be combined in a union.
estimators = FeatureUnion(transformer_list=[
    ('knn', ModelTransformer(KNeighborsRegressor(n_neighbors=5))),
    ('gbr', ModelTransformer(GradientBoostingRegressor())),
    ('dtr', ModelTransformer(DecisionTreeRegressor())),
    ('etr', ModelTransformer(ExtraTreesRegressor())),
    ('rfr', ModelTransformer(RandomForestRegressor())),
    ('par', ModelTransformer(PassiveAggressiveRegressor())),
    ('en', ModelTransformer(ElasticNet())),
    ('cluster', ModelTransformer(KMeans(n_clusters=2))),
])

# Full pipeline: feature union -> model union -> final k-NN classifier.
clf = Pipeline(steps=[
    ('features', features),
    ('estimators', estimators),
    ('estimator', KNeighborsClassifier()),
])
# Load the scikit-learn digits dataset (feature matrix and target labels).
data = load_digits()
X = data.data
y = data.target

# Search space: settings of the random forest ('rfr') wrapped inside the
# 'estimators' step of the pipeline.
params = {
    'estimators__rfr__model__n_estimators': [1, 2, 100],
    'estimators__rfr__model__bootstrap': [False, True],
}

# Scorer built from quadratic weighted kappa; larger values are better.
kappa_scorer = metrics.make_scorer(quadratic_weighted_kappa,
                                   greater_is_better=True)

# Set up the randomized parameter search over the pipeline.
model = grid_search.RandomizedSearchCV(
    estimator=clf,
    param_distributions=params,
    scoring=kappa_scorer,
    verbose=10,
    n_jobs=1,
    iid=True,
    refit=True,
    cv=2,
    n_iter=2,
)

# Run the search.
model.fit(X, y)

# Best pipeline found by the search.
best_model = model.best_estimator_

# Report the best score and the selected value of every searched parameter.
print("Best score: %0.3f" % model.best_score_)
print("Best parameters set:")
best_parameters = best_model.get_params()
for param_name in sorted(params):
    print(param_name, best_parameters[param_name])

# Fit the best pipeline on the full data set and predict on the same data.
best_model.fit(X, y)
preds = best_model.predict(X)
# -*- coding: utf-8 -*-
"""
Beating the Benchmark
Search Results Relevance @ Kaggle
__author__ : Abhishek
"""
import pandas as pd
import numpy as np
# The following 3 functions have been taken from Ben Hamner's github repository
# https://github.com/benhamner/Metrics
def confusion_matrix(rater_a, rater_b, min_rating=None, max_rating=None):
    """
    Return the confusion matrix between two raters' ratings.

    Entry ``[i][j]`` counts the items rated ``min_rating + i`` by
    ``rater_a`` and ``min_rating + j`` by ``rater_b``.

    rater_a, rater_b: equal-length sequences of integer ratings (lists,
        tuples or numpy arrays).
    min_rating, max_rating: bounds of the rating scale; each defaults to
        the smallest / largest rating found across both raters.

    Returns a square list of lists of ints with
    ``max_rating - min_rating + 1`` rows and columns.
    """
    assert(len(rater_a) == len(rater_b))
    # Take the extremes of each rater separately: ``rater_a + rater_b``
    # concatenates lists but adds numpy arrays element-wise, which would
    # produce wrong bounds for array input.
    if min_rating is None:
        min_rating = min(min(rater_a), min(rater_b))
    if max_rating is None:
        max_rating = max(max(rater_a), max(rater_b))
    num_ratings = int(max_rating - min_rating + 1)
    conf_mat = [[0] * num_ratings for _ in range(num_ratings)]
    for a, b in zip(rater_a, rater_b):
        conf_mat[a - min_rating][b - min_rating] += 1
    return conf_mat
def histogram(ratings, min_rating=None, max_rating=None):
    """
    Count how many times each rating value occurs.

    Bucket ``k`` of the returned list holds the number of entries of
    ``ratings`` equal to ``min_rating + k``. A bound that is not given
    defaults to the corresponding extreme found in ``ratings``.
    """
    lowest = min(ratings) if min_rating is None else min_rating
    highest = max(ratings) if max_rating is None else max_rating
    counts = [0] * int(highest - lowest + 1)
    for rating in ratings:
        counts[rating - lowest] += 1
    return counts
def quadratic_weighted_kappa(y, y_pred):
    """
    Calculate the quadratic weighted kappa between two sets of ratings.

    Quadratic weighted kappa is a measure of inter-rater agreement between
    two raters that provide discrete numeric ratings. Potential values
    range from -1 (representing complete disagreement) to 1 (representing
    complete agreement). A kappa value of 0 is expected if all agreement
    is due to chance.

    y, y_pred: equal-length sequences of ratings, one per rater. Both are
        converted to integer numpy arrays before scoring.

    The rating scale is taken to span the smallest to the largest rating
    observed across both raters.

    Returns the kappa value as a float. When both raters give the same
    single rating to every item the statistic is 0/0; this case is
    reported as 1.0 (complete agreement) rather than raising
    ZeroDivisionError.

    Raises ValueError for empty input (no minimum/maximum exists).
    """
    rater_a = np.array(y, dtype=int)
    rater_b = np.array(y_pred, dtype=int)
    assert(len(rater_a) == len(rater_b))
    min_rating = int(min(rater_a.min(), rater_b.min()))
    max_rating = int(max(rater_a.max(), rater_b.max()))
    conf_mat = confusion_matrix(rater_a, rater_b,
                                min_rating, max_rating)
    num_ratings = len(conf_mat)
    if num_ratings == 1:
        # Only one distinct rating overall: every weight would be 0/0.
        # The raters agree on every item, so report complete agreement.
        return 1.0
    num_scored_items = float(len(rater_a))
    hist_rater_a = histogram(rater_a, min_rating, max_rating)
    hist_rater_b = histogram(rater_b, min_rating, max_rating)
    # Loop-invariant normaliser of the quadratic weights.
    max_sq_distance = pow(num_ratings - 1, 2.0)
    numerator = 0.0
    denominator = 0.0
    for i in range(num_ratings):
        for j in range(num_ratings):
            # Count expected in cell (i, j) if the raters were independent.
            expected_count = (hist_rater_a[i] * hist_rater_b[j]
                              / num_scored_items)
            # Quadratic penalty: 0 on the diagonal, 1 at maximum distance.
            d = pow(i - j, 2.0) / max_sq_distance
            numerator += d * conf_mat[i][j] / num_scored_items
            denominator += d * expected_count / num_scored_items
    return (1.0 - numerator / denominator)
# -*- coding: utf-8 -*-
from sklearn.base import TransformerMixin, BaseEstimator
from pandas import DataFrame
class ModelTransformer(TransformerMixin, BaseEstimator):
    """
    Wrap a model so that its predictions are exposed through ``transform``.

    ``fit`` delegates to the wrapped model's ``fit``; ``transform`` returns
    the wrapped model's ``predict`` output as a pandas DataFrame.
    """

    def __init__(self, model, **params):
        # NOTE: extra keyword arguments are accepted but not used.
        self.model = model

    def fit(self, *args, **kwargs):
        """Fit the wrapped model with the given arguments; return ``self``."""
        wrapped = self.model
        wrapped.fit(*args, **kwargs)
        return self

    def transform(self, X, **transform_params):
        """Return the wrapped model's predictions for ``X`` as a DataFrame."""
        predictions = self.model.predict(X)
        return DataFrame(predictions)
class ColumnExtractor(TransformerMixin):
    """
    Stateless transformer that selects ``columns`` from its input.

    ``transform`` returns ``X[self.columns]``, so ``X`` must support
    indexing by the configured columns.
    """

    def __init__(self, columns=None):
        # A ``None`` sentinel replaces the former mutable ``[]`` default,
        # which was one list object shared by every instance created
        # without arguments. Omitting ``columns`` still yields an empty list.
        self.columns = [] if columns is None else columns

    def fit_transform(self, X, y=None, **fit_params):
        """Fit (a no-op) and return the selected columns of ``X``."""
        self.fit(X, y, **fit_params)
        return self.transform(X)

    def transform(self, X, **transform_params):
        """Return ``X`` indexed by the configured columns."""
        return X[self.columns]

    def fit(self, X, y=None, **fit_params):
        """Nothing to learn; return ``self``."""
        return self
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment