Skip to content

Instantly share code, notes, and snippets.

@kayibal
Created February 4, 2017 16:50
Show Gist options
  • Save kayibal/e616f51a21118abf5cb3355a05656ba8 to your computer and use it in GitHub Desktop.
Save kayibal/e616f51a21118abf5cb3355a05656ba8 to your computer and use it in GitHub Desktop.
Best-performing model configurations for the MLP experiments
import re

import nltk
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize.regexp import RegexpTokenizer
from sklearn.decomposition import TruncatedSVD
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import chi2
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.svm import SVC
from xgboost.sklearn import XGBModel
# some monkeypatching to get probability estimates from xgboost:
# XGBModel only implements predict(); sklearn meta-estimators such as
# OneVsRestClassifier expect predict_proba() with one column per class.
def xgb_pred_proba(self, X):
    """Return an (n_samples, 2) probability matrix for a binary XGBModel.

    Column 1 holds the raw ``predict()`` output (the positive-class
    probability under the ``binary:logistic`` objective); column 0 is its
    complement so each row sums to 1.

    NOTE(review): the original filled column 0 with zeros, which left the
    rows as invalid distributions; ``1 - pred`` keeps column 1 — the one
    OneVsRestClassifier reads — byte-identical while fixing column 0.
    """
    pred = self.predict(X).reshape(-1, 1)
    return np.hstack([1.0 - pred, pred])
# Attach the shim so every XGBModel instance gains predict_proba.
XGBModel.predict_proba = xgb_pred_proba

# Module-private tokenization helpers shared by _preprocess_doc.
__tokenizer = RegexpTokenizer(r"\w+")
__stop = set(stopwords.words("german"))
# Raw string: "\D" is an invalid escape sequence in a plain literal
# (DeprecationWarning since Python 3.6, SyntaxError in newer versions).
# Pattern keeps only tokens containing no digit characters.
__word_regex = re.compile(r"\D+")
def _preprocess_doc(doc):
    """Lowercase and tokenize *doc*, dropping stopwords and digit tokens.

    Returns the list of word tokens that are not German stopwords and
    consist entirely of non-digit characters.
    """
    tokens = __tokenizer.tokenize(doc.lower())
    return [
        tok
        for tok in tokens
        if tok not in __stop and __word_regex.fullmatch(tok)
    ]
# Feature-extraction pipelines, keyed by experiment id.
features = {}
# Candidate classifiers per experiment, keyed by experiment id.
models = {}

# Character n-grams bounded at word edges, accent-stripped, with the custom
# tokenizer above; keep the 90% of features scoring highest on chi-squared.
features['mlp_00'] = make_pipeline(
    TfidfVectorizer(
        analyzer='char_wb',
        strip_accents='unicode',
        tokenizer=_preprocess_doc,
    ),
    SelectPercentile(chi2, percentile=90),
)
# Classifier candidates for experiment 'mlp_00'.
models['mlp_00'] = {
    # Bernoulli naive Bayes with a tuned smoothing prior.
    'BNB': BernoulliNB(alpha=0.118),
    # One gradient-boosted binary model per class; relies on the
    # predict_proba shim monkeypatched onto XGBModel above.
    'DTREE': OneVsRestClassifier(
        XGBModel(
            nthread=1,
            objective="binary:logistic",
            colsample_bytree=0.5,
            learning_rate=0.1,
            max_depth=20,
            min_child_weight=1,
        ),
        n_jobs=-1,
    ),
    # SVD down to 2000 dims, LDA to 13 components, then an RBF-kernel SVM.
    'LDA': Pipeline([
        ("reduction", TruncatedSVD(n_components=2000)),
        ("lda", LDA(n_components=13)),
        ("clf", SVC(kernel="rbf", C=5)),
    ]),
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment