#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""
This file that defines model factories.
It is quite messy
"""
import logging

import numpy as np
import sklearn
import sklearn.metrics
from scipy.stats import spearmanr
from sklearn.base import BaseEstimator
from sklearn.cross_validation import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.grid_search import GridSearchCV
from sklearn.grid_search import RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Ridge
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.svm import SVR

# Keras 1.x API (the nb_filter / border_mode / W_regularizer era), used by SentenceCNN
from keras import backend as K
from keras import optimizers
from keras.callbacks import EarlyStopping
from keras.layers import Convolution1D, Dense, Embedding, Lambda
from keras.models import Sequential
from keras.preprocessing import sequence
from keras.regularizers import activity_l2, l2

from sklearn_utils.metrics import wac_score
from src.experiments.transformers import SentenceEmbedder, DoubleListEmbedder, DoubleListDotProduct

logger = logging.getLogger(__name__)


class SentenceCNN(BaseEstimator):
    def __init__(self, embedding, filter_length=3, nb_filter=500, hidden_dims=500,
                 nb_epoch=10, batch_size=32,
                 dense_W_l2=0.0, dense_activity_l2=0.01, dropout=0.2, optimizer="adam",
                 lr=0.001,
                 maxlen=60,
                 verbose=0, validation_split=0.):
        self.embedding = embedding
        self.filter_length = filter_length
        self.nb_filter = nb_filter
        self.hidden_dims = hidden_dims
        self.nb_epoch = nb_epoch
        self.batch_size = batch_size
        self.dense_W_l2 = dense_W_l2
        self.dense_activity_l2 = dense_activity_l2
        self.dropout = dropout
        self.optimizer = optimizer
        self.maxlen = maxlen
        self.lr = lr
        self.verbose = verbose
        self.validation_split = validation_split

    def _build(self):
        self.model = Sequential()
        # TODO: why is the embedding kept frozen (trainable=False)?
        self.model.add(Embedding(
            dropout=self.dropout,
            trainable=False,
            mask_zero=False,
            output_dim=self.embedding_weights.shape[1],
            input_dim=self.embedding_weights.shape[0],
            weights=[self.embedding_weights]))
        self.model.add(Convolution1D(nb_filter=self.nb_filter,
                                     filter_length=self.filter_length,
                                     border_mode='valid',
                                     activation='relu',
                                     subsample_length=1))

        # Global max pooling over the time dimension
        def max_1d(X):
            return K.max(X, axis=1)

        self.model.add(Lambda(max_1d, output_shape=(self.nb_filter,)))
        if self.hidden_dims > 0:
            self.model.add(Dense(self.hidden_dims, activation="relu", W_regularizer=l2(self.dense_W_l2),
                                 activity_regularizer=activity_l2(self.dense_activity_l2)))
        self.model.add(Dense(1, activation="sigmoid"))
        self.model.compile(loss='binary_crossentropy',
                           optimizer=self._optimizer,
                           metrics=['accuracy'])

    def _initialize(self):
        if self.optimizer == "adam":
            self._optimizer = optimizers.Adam(lr=self.lr)
        else:
            # Fall back to passing the optimizer name through to Keras as-is
            self._optimizer = self.optimizer

        words = self.embedding.vocabulary.words
        vectors = self.embedding.vectors

        # Map each word to a positive index; index 0 is reserved for masking/padding
        self.index_dict = {}
        for i, word in enumerate(words):
            self.index_dict[word] = i + 1

        word_vectors = {}
        for word, vector in zip(words, vectors):
            word_vectors[word] = vector

        vocab_dim = vectors.shape[1]
        n_symbols = len(self.index_dict) + 1  # adding 1 to account for 0th index (for masking)
        self.embedding_weights = np.zeros((n_symbols, vocab_dim))
        for word, index in self.index_dict.items():
            self.embedding_weights[index, :] = word_vectors[word]

        self._build()

    def _prepare_X(self, X):
        # Replace each tokenized sentence with its word-index sequence, then pad
        X_tr = X.copy()
        X_tr = X_tr.ravel()
        for i, sentence in enumerate(X.ravel()):
            X_tr[i] = [self.index_dict[word] for word in sentence]
        X_tr = sequence.pad_sequences(X_tr, maxlen=self.maxlen)
        return X_tr

    def _prepare_y(self, y):
        if self._y_type == "-1,1":
            return (y + 1) / 2
        elif self._y_type == "0,1":
            return y
        else:
            raise RuntimeError("Unknown target type: " + str(self._y_type))

    def _transform_prediction(self, y_pred):
        if self._y_type == "-1,1":
            return 2 * (y_pred > 0.5).astype(np.int64).ravel() - 1
        elif self._y_type == "0,1":
            return (y_pred > 0.5).astype(np.int64).ravel()
        else:
            raise RuntimeError("Unknown target type: " + str(self._y_type))

    def fit(self, X, y):
        if set(y) == set([0, 1]):
            self._y_type = "0,1"
        elif set(y) == set([-1, 1]):
            self._y_type = "-1,1"
        else:
            raise RuntimeError("y must be binary: {0, 1} or {-1, 1}")
        self._initialize()
        X_tr = self._prepare_X(X)
        y_tr = self._prepare_y(y)
        if self.validation_split != 0.0:
            callbacks = [EarlyStopping(monitor='val_loss', patience=1, verbose=0, mode='auto')]
        else:
            callbacks = []
        self.model.fit(X_tr, y_tr, batch_size=self.batch_size, nb_epoch=self.nb_epoch,
                       verbose=self.verbose, callbacks=callbacks,
                       validation_split=self.validation_split)
        return self

    def predict(self, X, y=None):
        X_tr = self._prepare_X(X)
        return self._transform_prediction(self.model.predict(X_tr))
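

# Minimal sketch of using SentenceCNN directly (variable names are placeholders),
# assuming X is an array of token lists whose tokens all appear in E's vocabulary
# and y is binary in {0, 1} or {-1, 1}:
#
#   cnn = SentenceCNN(embedding=E, nb_epoch=5, validation_split=0.1, verbose=1)
#   cnn.fit(X_train, y_train)     # early stopping kicks in via validation_split
#   y_pred = cnn.predict(X_test)  # returned in the same label convention as y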


class DummyModel(BaseEstimator):
    """Identity model: returns its (flattened) input as the prediction."""

    def __init__(self):
        pass

    def fit(self, X, y):
        return self

    def predict(self, X):
        return X.ravel()


class SKFGridSearchCV(GridSearchCV):
    # Passing explicit parameters, otherwise get_params() won't work
    def __init__(self, scoring, param_grid, estimator, skf_rng=777, cv=5, n_jobs=1):
        assert (type(cv) is int)
        assert (cv > 1)
        GridSearchCV.__init__(self, cv=cv, scoring=scoring, param_grid=param_grid,
                              estimator=estimator, n_jobs=n_jobs)
        self.skf_rng = skf_rng

    def fit(self, X, y):
        # Replace the integer cv with a seeded, shuffled StratifiedKFold at fit time
        # (the old sklearn.cross_validation API takes the labels at construction)
        self.cv = StratifiedKFold(y, n_folds=self.cv, shuffle=True, random_state=self.skf_rng)
        return GridSearchCV.fit(self, X, y)
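

# Sketch: SKFGridSearchCV is used like GridSearchCV, but re-seeds a shuffled
# StratifiedKFold on every fit. Illustrative only (toy estimator and grid):
#
#   search = SKFGridSearchCV(estimator=LogisticRegression(), scoring="accuracy",
#                            param_grid={"C": [0.1, 1.0, 10.0]}, cv=3, skf_rng=777)
#   search.fit(X_train, y_train)
#   print(search.best_params_, search.best_score_)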


class FallBackGridSearchCV(BaseEstimator):
    """
    Simple class that fits a GridSearchCV, but also cross-validates the supplied
    fall_back_estimator and picks it if it scores better.

    Notes
    -----
    For now it is assumed that fall_back_estimator is constant.
    """

    def __init__(self, fall_back_estimator, gcv):
        self.gcv = None
        assert gcv.scoring is not None, "gcv has to define scoring explicitly"
        if isinstance(fall_back_estimator, GridSearchCV) or isinstance(fall_back_estimator, SKFGridSearchCV):
            if hasattr(gcv, "scoring") and not fall_back_estimator.scoring == gcv.scoring:
                raise RuntimeError("scorings must match")
            if hasattr(gcv, "cv") and not fall_back_estimator.cv == gcv.cv:
                raise RuntimeError("cv must match")
            self.fall_back_cv = fall_back_estimator
        else:
            assert isinstance(gcv, GridSearchCV) or isinstance(gcv, SKFGridSearchCV)
            self.fall_back_cv = sklearn.base.clone(gcv, safe=True)
            self.fall_back_cv.estimator = fall_back_estimator
            self.fall_back_cv.param_grid = {}  # Hack: GridSearchCV requires some param_grid
        self.original_gcv = gcv

    def fit(self, X, y):
        self.original_gcv.fit(X, y)
        self.fall_back_cv.fit(X, y)
        if self.original_gcv.best_score_ >= self.fall_back_cv.best_score_:
            # Scores are always larger-is-better in sklearn
            self.gcv = self.original_gcv
        else:
            self.gcv = self.fall_back_cv
        return self

    def predict(self, X):
        return self.gcv.predict(X)

    def predict_proba(self, X):
        return self.gcv.predict_proba(X)

    def __setattr__(self, k, v):
        self.__dict__[k] = v

    # Fun question: can you implement it any better?
    def __getattr__(self, name):
        try:
            return self.__dict__[name]
        except KeyError:
            if name != "gcv" and self.gcv:
                try:
                    return self.gcv.__dict__[name]
                except KeyError:
                    raise AttributeError(name)
            else:
                raise AttributeError(name)


def lambda_fallback(fall_back_estimator, gcv):
    def model(E):
        return FallBackGridSearchCV(fall_back_estimator=fall_back_estimator(E), gcv=gcv(E))

    return model
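

# Sketch of combining two factories via the fallback wrapper. Both arguments are
# callables taking an embedding E (AvgLR and AvgSVMRbf2 are defined further below);
# FallBackGridSearchCV then keeps whichever achieves the higher CV score:
#
#   factory = lambda_fallback(fall_back_estimator=AvgLR(), gcv=AvgSVMRbf2())
#   model = factory(E)
#   model.fit(X_train, y_train)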


###################
# One word models #
###################


def OneWordLR(scorer=sklearn.metrics.make_scorer(wac_score)):
    # NOTE: make_scorer returns a fresh object on each call and scorers don't define
    # __eq__, so `scorer == make_scorer(wac_score)` is always False; compare the
    # wrapped score function instead (relies on the private _score_func attribute).
    class_weight = "balanced" if getattr(scorer, "_score_func", None) is wac_score else None

    def lambda_model(E):
        estimator = Pipeline([("emb", SentenceEmbedder(embedding=E, method="concat")),
                              ("lgr", LogisticRegression(class_weight=class_weight))])
        model_lr = SKFGridSearchCV(skf_rng=777,
                                   estimator=estimator,
                                   scoring=scorer,
                                   param_grid={"lgr__C": [10. ** i for i in range(-4, 5)]},
                                   cv=3,
                                   n_jobs=1)
        return model_lr

    return lambda_model


def OneWordRFRegularized3(scorer=sklearn.metrics.make_scorer(wac_score)):
    # See the note in OneWordLR on comparing scorer objects
    class_weight = "balanced" if getattr(scorer, "_score_func", None) is wac_score else None

    def lambda_model(E):
        estimator = Pipeline([("emb", SentenceEmbedder(embedding=E, method="concat")),
                              ("rf", RandomForestClassifier(n_estimators=500, class_weight=class_weight))])
        model_RF = SKFGridSearchCV(skf_rng=777,
                                   estimator=estimator,
                                   scoring=scorer,
                                   param_grid={"rf__max_depth": [None, 10, 5, 3],
                                               "rf__min_samples_leaf": [1, 2, 5, 10],
                                               "rf__max_features": ["sqrt"]},
                                   cv=3,
                                   n_jobs=1)  # n_jobs=1, because joblib serializes to disk..
        return model_RF

    return lambda_model


# Old results were produced by calling OneWordRFRegularized2()


def OneWordKnn(scorer=sklearn.metrics.make_scorer(wac_score)):
    def lambda_model(E):
        estimator = Pipeline([("emb", SentenceEmbedder(embedding=E, method="concat")),
                              ("scaler", StandardScaler()),
                              ("knn", KNeighborsClassifier(n_neighbors=1))])
        model = SKFGridSearchCV(skf_rng=777,
                                estimator=estimator,
                                scoring=scorer,
                                param_grid={"knn__n_neighbors": [2, 3, 5]},
                                cv=3,  # TODO: check if this is important for fit quality
                                n_jobs=1)  # TODO: I think n_jobs > 1 doesn't work correctly..
        return model

    return lambda_model


def OneWordNB(scorer="accuracy"):
    assert scorer == "accuracy"

    def lambda_model(E):
        estimator = Pipeline([("emb", SentenceEmbedder(embedding=E, method="concat")),
                              ("scaler", StandardScaler()),
                              ("nb", GaussianNB())])
        return estimator

    return lambda_model


def OneWordSVMrbf(scorer=sklearn.metrics.make_scorer(wac_score)):
    # See the note in OneWordLR on comparing scorer objects
    class_weight = "balanced" if getattr(scorer, "_score_func", None) is wac_score else None

    # 60 tasks
    def lambda_model(E):
        estimator = Pipeline([("emb", SentenceEmbedder(embedding=E, method="concat")),
                              ("scaler", StandardScaler()),
                              ("svm", SVC(kernel="rbf", gamma=0.01, max_iter=1e7, C=0.1, class_weight=class_weight))])
        model_svm = SKFGridSearchCV(skf_rng=777,
                                    estimator=estimator,
                                    scoring=scorer,
                                    param_grid={"svm__gamma": [10. ** i for i in range(-6, 0)],
                                                "svm__C": [10. ** i for i in range(-6, 6)]},
                                    cv=3,
                                    n_jobs=3)
        return model_svm

    return lambda_model


############################################
##                                        ##
##            SENTENCE MODELS             ##
##                                        ##
############################################


def SentimentCNN(mode="extended"):
    if mode == "vanilla":
        def lambda_model(E, maxlen=200):
            return SentenceCNN(embedding=E, maxlen=maxlen)
    elif mode == "extended":
        param_distributions = {
            "nb_filter": [50, 100],
            "filter_length": [3],
            "hidden_dims": [0, 10, 50],
            "nb_epoch": [30],  # This is a slight trick - essentially we don't want to use a validation holdout
            "dropout": [0., 0.2, 0.4],
            "optimizer": ["adam"],
            "validation_split": [0.1],
            "maxlen": [60],
            "lr": [0.001, 0.01]
        }
        n_iter = 25
        cv = 5
        scoring = "accuracy"

        def lambda_model(E):
            model = RandomizedSearchCV(estimator=SentenceCNN(embedding=E), param_distributions=param_distributions,
                                       n_iter=n_iter, random_state=777, n_jobs=20, cv=cv, scoring=scoring)
            return model
    else:
        raise ValueError("Unknown mode: " + str(mode))
    return lambda_model


def AvgSVMRbf2():
    def lambda_model(E):
        pipe = [("emb", SentenceEmbedder(embedding=E, on_missing="raise"))]
        pipe.append(("scaler", StandardScaler()))
        pipe.append(('svm', SVC(kernel="rbf", max_iter=5e7)))
        param_grid = {
            "svm__gamma": [10. ** i for i in range(-6, 0)],
            "svm__C": [10. ** i for i in range(-5, 5)]
        }
        return SKFGridSearchCV(estimator=Pipeline(pipe),
                               param_grid=param_grid,
                               n_jobs=3,
                               skf_rng=777,
                               scoring="accuracy")

    return lambda_model


def AvgLR():
    def lambda_model(E):
        pipe = [("emb", SentenceEmbedder(embedding=E, on_missing="raise"))]
        pipe.append(("scaler", StandardScaler()))
        pipe.append(("lgr", LogisticRegression()))
        estimator = Pipeline(pipe)
        model = SKFGridSearchCV(estimator=estimator,
                                scoring="accuracy",
                                param_grid={"lgr__C": [10. ** i for i in range(-4, 5)]},
                                skf_rng=777,
                                n_jobs=3)
        return model

    return lambda_model


def AvgGNB():
    def lambda_model(E):
        pipe = [("emb", SentenceEmbedder(embedding=E, on_missing="raise"))]
        pipe.append(("scaler", StandardScaler()))
        pipe.append(("nb", GaussianNB()))
        estimator = Pipeline(pipe)
        return estimator

    return lambda_model


############################################
##                                        ##
##           SIMILARITY MODELS            ##
##                                        ##
############################################


def SimilarityDotProduct():
    def lambda_model(E):
        # DummyModel is an identity step: the dot product itself is the prediction
        return Pipeline([('emb', DoubleListEmbedder(E)),
                         ('dot', DoubleListDotProduct(method="single", pairs=[[0, 1]])),
                         ('identity', DummyModel())])

    return lambda_model


def SimilarityRidge(mode="vanilla"):
    if mode == "vanilla":
        alpha_grid = [10.0 ** n for n in range(-5, 6)]
        method_grid = ["diagonal", "double_diagonal", "quadruple_diagonal", "concat"]
        n_folds = 5
    elif mode == "linear":
        alpha_grid = [10.0 ** n for n in range(-5, 6)]
        method_grid = ["diagonal", "triple_diagonal", "concat"]
        n_folds = 5
    else:
        raise ValueError("Unknown mode: " + str(mode))

    def scorer(estimator, X, y):
        # Spearman rank correlation between predictions and gold scores
        return spearmanr(estimator.predict(X), y).correlation

    def lambda_model(E):
        return GridSearchCV(
            estimator=Pipeline([('emb', DoubleListEmbedder(E)), ('dot', DoubleListDotProduct()), ('ridge', Ridge())]),
            param_grid={'ridge__alpha': alpha_grid, 'dot__method': method_grid},
            scoring=scorer,
            cv=n_folds)

    return lambda_model


def SimilarityRandomForestDiagonal(mode="vanilla"):
    if mode == "vanilla":
        n_estimators_grid = [50, 500]
        max_features_grid = [None]
        max_depth_grid = [None]
        n_folds = 5
    elif mode == "extended":
        n_estimators_grid = [50, 500]
        max_features_grid = ["sqrt", "log2", None]
        max_depth_grid = [3, 10, None]
        n_folds = 5
    else:
        raise ValueError("Unknown mode: " + str(mode))

    def scorer(estimator, X, y):
        return spearmanr(estimator.predict(X), y).correlation

    def lambda_model(E):
        estimator = Pipeline([('emb', DoubleListEmbedder(E)), ('dot', DoubleListDotProduct(method="diagonal")),
                              ('rfr', RandomForestRegressor())])
        param_grid = {
            'rfr__n_estimators': n_estimators_grid,
            'rfr__max_features': max_features_grid,
            'rfr__max_depth': max_depth_grid,
        }
        return GridSearchCV(estimator=estimator,
                            param_grid=param_grid,
                            scoring=scorer,
                            cv=n_folds)

    return lambda_model


def SimilaritySVR(mode="vanilla"):
    if mode == "vanilla":
        C_grid = [0.01, 0.1, 1.0, 3.0]
        gamma_grid = [0.01, 0.001, 0.0001]
        method_grid = ["diagonal", "double_diagonal", "quadruple_diagonal", "concat"]
        n_folds = 5
    else:
        raise ValueError("Unknown mode: " + str(mode))

    def lambda_model(E):
        estimator = Pipeline([
            ('emb', DoubleListEmbedder(E)),
            ('dot', DoubleListDotProduct()),
            ('scaler', StandardScaler()),
            ('svm', SVR(kernel="rbf", max_iter=1e7))])
        param_grid = {
            'dot__method': method_grid,
            'svm__C': C_grid,
            'svm__gamma': gamma_grid,
        }

        def scorer(estimator, X, y):
            return spearmanr(estimator.predict(X), y).correlation

        return GridSearchCV(estimator=estimator,
                            param_grid=param_grid,
                            cv=n_folds,
                            scoring=scorer)

    return lambda_model
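

# Usage sketch for the similarity factories, assuming X is an array of
# (tokens_a, tokens_b) pairs and y holds gold similarity scores (names are
# placeholders):
#
#   model = SimilarityRidge(mode="vanilla")(E)
#   model.fit(X_train, y_train)
#   rho = spearmanr(model.predict(X_test), y_test).correlation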