Skip to content

Instantly share code, notes, and snippets.

@abarmat
Last active May 30, 2017 18:27
Show Gist options
  • Save abarmat/8ac1f90a5c81927110966269d3502f52 to your computer and use it in GitHub Desktop.
Save abarmat/8ac1f90a5c81927110966269d3502f52 to your computer and use it in GitHub Desktop.
import pprint
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics, cross_validation
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.svm import LinearSVC, SVC
from nltk.stem.snowball import SpanishStemmer
from nltk.corpus import stopwords
import xgboost
class MultiItemSelector(BaseEstimator, TransformerMixin):
    """Turn one or more columns of a table into a list of per-row dicts.

    The result has one ``{column: value}`` dict per row, which is the
    input shape consumed by the ``DictVectorizer`` step that follows this
    selector in ``main``.

    Parameters
    ----------
    key : hashable or list
        A single column label, or a list of column labels that should be
        selected together.
    """

    def __init__(self, key):
        self.key = key

    def fit(self, x, y=None):
        """Do nothing (the selector keeps no fitted state) and return self."""
        return self

    def transform(self, data_dict):
        """Return one dict per row of the selected column(s).

        Parameters
        ----------
        data_dict : mapping of column label -> column
            Indexed with ``self.key``. With a list key the selection must
            expose ``.values`` as a 2-D array (e.g. a pandas DataFrame);
            with a single key it only has to be iterable over its values
            (e.g. a pandas Series).

        Returns
        -------
        list of dict
            ``[{key: value}, ...]`` for a single key, or
            ``[{key_1: v_1, key_2: v_2, ...}, ...]`` for a list of keys.
        """
        selected = data_dict[self.key]
        if isinstance(self.key, list):
            # A list cannot itself be a dict key, so pair every row's
            # values with the individual column labels instead.
            rows = selected.values.tolist()
            return [dict(zip(self.key, row)) for row in rows]
        return [{self.key: value} for value in selected]
class ItemSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that extracts a single entry from its input.

    Parameters
    ----------
    key : hashable
        Label used to index the object handed to ``transform``.
    """

    def __init__(self, key):
        self.key = key

    def fit(self, x, y=None):
        """Nothing is learned from the data; return the selector itself."""
        return self

    def transform(self, data_dict):
        """Return ``data_dict[self.key]`` without modifying it."""
        selected = data_dict[self.key]
        return selected
def read_file(filename, encoding=None):
    """Load a ``;``-separated CSV file and split it into features and labels.

    Parameters
    ----------
    filename : str or file-like
        Handed unchanged to ``pandas.read_csv``.
    encoding : str, optional
        Text encoding forwarded to ``pandas.read_csv``. ``None`` (the
        default) keeps pandas' own default.

    Returns
    -------
    tuple
        ``(features, labels)`` where ``features`` holds every column except
        the last one and ``labels`` is the ``'Clase'`` column.

    Raises
    ------
    KeyError
        If the file has no ``'Clase'`` column.
    """
    table = pd.read_csv(filename, sep=';', encoding=encoding)
    # ``.iloc`` is the positional indexer; the former ``.ix`` accessor was
    # deprecated and later removed from pandas.
    features = table.iloc[:, :-1]
    labels = table['Clase']
    return features, labels
def save_prediction(filename, data):
    """Write predictions to ``filename``, one ``;``-separated line per item.

    Each output line is the item's first element followed by every number
    of its second element rounded to 4 decimals.

    Parameters
    ----------
    filename : str
        Path of the output file. It is opened in ``'w'`` mode, so existing
        content is overwritten.
    data : iterable
        Items indexable as ``item[0]`` (written via ``str``) and
        ``item[1]`` (an iterable of numbers). ``main`` passes
        ``zip(y_pred, clf.predict_proba(X_test))``.
    """
    with open(filename, 'w') as f:
        for item in data:
            fields = [str(item[0])]
            # Build a real list: on Python 3 ``map`` returns an iterator,
            # which cannot be concatenated to a list with ``+``.
            fields.extend(str(round(score, 4)) for score in item[1])
            f.write(';'.join(fields) + '\n')
def _is_missing(value):
    """Return True for ``None`` and for any float NaN (not only ``np.nan``)."""
    if value is None:
        return True
    # NaN is the only float that compares unequal to itself.
    return isinstance(value, (float, np.floating)) and value != value


def parse_doc(title, body):
    """Join a title and a body into one stripped text document.

    Parameters
    ----------
    title, body : str, bytes, None or NaN
        Missing values (``None`` or a float NaN) are treated as empty
        strings.

    Returns
    -------
    str
        ``title + ' ' + body`` with surrounding whitespace removed. A byte
        string result is decoded as latin-1; text that is already unicode
        is returned as is.
    """
    title = '' if _is_missing(title) else title
    body = '' if _is_missing(body) else body
    doc = (title + ' ' + body).strip()
    # Only byte strings need decoding; Python 3 ``str`` has no ``decode``.
    if isinstance(doc, bytes):
        doc = doc.decode('latin-1')
    return doc
def process(X, y):
    """Add a combined text column to ``X`` and convert ``y`` to an array.

    Parameters
    ----------
    X : table with ``'tit'`` and ``'des'`` columns
        Modified in place: a ``'body'`` column is added, holding
        ``parse_doc(tit, des)`` for every row.
    y : array-like
        Labels; converted with ``np.array``.

    Returns
    -------
    tuple
        ``(X, labels)`` — the same ``X`` object and the label array.
    """
    # Columns that feed the title and body arguments of parse_doc.
    text_columns = ['tit', 'des']
    rows = X[text_columns].values.tolist()
    documents = [parse_doc(title, body) for title, body in rows]
    # Reuse X's index so the new column lines up with the existing rows.
    X['body'] = pd.Series(documents, index=X.index)
    labels = np.array(y)
    return (X, labels)
def show_feature_importance(clf, top_n=10):
    """Print the highest-weighted features for every row of ``coef_``.

    For each row ``k`` of the model's coefficient matrix a ``[k + 1]``
    header is printed, followed by one ``+ <feature> : <column index>``
    line per selected feature, ordered from lowest to highest weight.

    Parameters
    ----------
    clf : pipeline-like
        Object whose ``named_steps`` mapping has a ``'vec'`` step that can
        list feature names and a ``'clf'`` step with a ``coef_`` attribute.
    top_n : int, optional
        Number of features listed per row (default 10).

    Raises
    ------
    KeyError
        If ``named_steps`` lacks a ``'vec'`` or ``'clf'`` entry.
    """
    vectorizer = clf.named_steps['vec']
    # Newer scikit-learn releases expose get_feature_names_out and dropped
    # get_feature_names; accept whichever one the vectorizer offers.
    get_names = getattr(vectorizer, 'get_feature_names_out', None)
    if get_names is None:
        get_names = vectorizer.get_feature_names
    feature_names = np.asarray(get_names())

    coef = clf.named_steps['clf'].coef_
    if hasattr(coef, 'toarray'):
        # Sparse coefficient matrices must be densified before argsort.
        coef = coef.toarray()
    coef = np.atleast_2d(np.asarray(coef))

    # Iterate over the rows that actually exist rather than assuming 5.
    for class_id, weights in enumerate(coef):
        order = np.argsort(weights)
        top = order[max(len(order) - top_n, 0):]
        print('[{}]'.format(class_id + 1))
        # NOTE(review): the second field is the feature's column index, as
        # in the original output, not its weight - confirm this is wanted.
        for feature, column in zip(feature_names[top], top):
            print('+ {} : {}'.format(feature, column))
def plot_confusion_matrix(cm, classes, title='Confusion matrix', cmap=plt.cm.Blues):
    """Draw a confusion matrix as a colour-mapped image through pyplot.

    The function neither creates a figure nor shows it; the caller does
    both.

    Parameters
    ----------
    cm : 2-D array-like
        Matrix passed to ``plt.imshow``.
    classes : sequence
        Tick labels used on both axes; its length sets the tick count.
    title : str, optional
        Plot title.
    cmap : colormap, optional
        Colour map passed to ``plt.imshow``.
    """
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    # One tick per class on each axis; only the x labels are rotated.
    positions = np.arange(len(classes))
    plt.xticks(positions, classes, rotation=45)
    plt.yticks(positions, classes)

    plt.tight_layout()
    axis_labels = ((plt.ylabel, 'True label'), (plt.xlabel, 'Predicted label'))
    for set_label, text in axis_labels:
        set_label(text)
class CustomTokenizer(object):
    """Callable tokenizer returning lower-cased ASCII-letter words.

    A token is a run of two or more ASCII letters delimited by word
    boundaries; digits, single letters and words containing other word
    characters are skipped.
    """

    # Compiled once for the class instead of on every document.
    _TOKEN_PATTERN = re.compile(r"(?u)\b[a-zA-Z][a-zA-Z]+\b")

    def __init__(self):
        # The stemmer stays available as an attribute, but __call__ does
        # not apply it to the tokens.
        self.stemmer = SpanishStemmer()

    def __call__(self, doc):
        """Return the list of lower-cased tokens found in ``doc``."""
        return [token.lower() for token in self._TOKEN_PATTERN.findall(doc)]
def _build_classifier(txt_field, num_fields):
    """Assemble the feature-union + linear SVM pipeline used by ``main``."""
    vectorizer = CountVectorizer(
        ngram_range=(2, 4),
        strip_accents='ascii',
        lowercase=False,
        stop_words=stopwords.words('spanish') + ['argentina', 'id', 'mls'],
        tokenizer=CustomTokenizer(),
        max_df=0.10
    )
    # One selector per numeric column. Handing the whole list to a single
    # MultiItemSelector makes it use the (unhashable) list as a dict key.
    transformers = [
        ('num_' + field, Pipeline([
            ('sel', MultiItemSelector(field)),
            ('dic', DictVectorizer(sparse=False)),
        ]))
        for field in num_fields
    ]
    transformers.append(
        ('txt', Pipeline([
            ('sel', ItemSelector(txt_field)),
            ('vec', vectorizer),
        ]))
    )
    return Pipeline([
        ('union', FeatureUnion(transformer_list=transformers)),
        ('clf', SVC(probability=True, kernel='linear')),
    ])


def _show_report(y_test, y_pred, plot_matrix):
    """Print accuracy, confusion matrix and classification report."""
    accuracy = metrics.accuracy_score(y_test, y_pred)
    print(accuracy)
    cm = metrics.confusion_matrix(y_test, y_pred)
    print(cm)
    if plot_matrix:
        plt.figure()
        plot_confusion_matrix(cm, range(1, 6))
        plt.show()
    report = metrics.classification_report(y_test, y_pred)
    print(report)


def _plot_xgb_feature_importance(clf):
    """Save a bar chart of the 20 largest relative XGBoost F-scores.

    Requires ``clf.named_steps`` to expose a ``'vec'`` step with feature
    names and a ``'clf'`` step with a ``booster()`` method. The SVC
    pipeline built by ``main`` satisfies neither, so ``main`` keeps this
    disabled.
    """
    feat_imp_vals = clf.named_steps['clf'].booster().get_fscore()
    feat_names = clf.named_steps['vec'].get_feature_names()
    feat_imp = {
        name: feat_imp_vals.get('f{}'.format(idx), 0)
        for idx, name in enumerate(feat_names)
    }
    # Sum the values directly: wrapping ``dict.values()`` in ``np.array``
    # does not produce a numeric array on Python 3.
    total = float(sum(feat_imp.values()))
    relative = {name: score / total for name, score in feat_imp.items()}
    top = sorted(relative.items(), key=lambda e: e[1], reverse=True)[:20]
    # Ascending order so the largest bar ends up at the top of the chart.
    df = pd.DataFrame(sorted(top, key=lambda e: e[1]), columns=['feature', 'fscore'])
    plt.figure()
    df.plot()
    df.plot(kind='barh', x='feature', y='fscore', legend=False, figsize=(20, 20))
    plt.title('XGBoost Feature Importance')
    plt.xlabel('relative importance')
    plt.gcf().savefig('feature_importance_xgb.png')


def main():
    """Train the classifier on the train CSV and evaluate it on the test CSV.

    Reads ``tp2-work.train-1.csv`` and ``tp2-work.test-1.csv``, fits the
    pipeline on the training data and predicts the test data. Depending on
    the option flags below it also cross-validates, prints an evaluation
    report and writes ``svm-pred-test.csv``.
    """
    rng = np.random.RandomState(100)

    # Options
    opt_cross_validate = False
    opt_show_report = True
    opt_plot_confusion_matrix = False
    opt_save_prediction = True
    opt_show_feature_importance = False
    opt_plot_xgb_importance = False

    # Read data
    X_train, y_train = read_file('tp2-work.train-1.csv')
    X_test, y_test = read_file('tp2-work.test-1.csv')

    # Preprocess
    X_train, y_train = process(X_train, y_train)
    X_test, y_test = process(X_test, y_test)

    # Classification pipeline
    clf = _build_classifier(txt_field='body', num_fields=['anio'])

    # Cross-validate
    if opt_cross_validate:
        # NOTE(review): this is the legacy ``sklearn.cross_validation`` API
        # imported at the top of the file; confirm the installed
        # scikit-learn still ships it.
        k_fold = cross_validation.KFold(n=len(X_train), n_folds=10, random_state=rng)
        results = cross_validation.cross_val_score(
            clf, X_train, y_train, cv=k_fold, n_jobs=-1
        )
        print(results)
        print(sum(results) / len(results))

    # Fit
    clf = clf.fit(X_train, y_train)

    # Test
    y_pred = clf.predict(X_test)

    if opt_show_report:
        _show_report(y_test, y_pred, plot_matrix=opt_plot_confusion_matrix)

    if opt_show_feature_importance:
        # NOTE(review): show_feature_importance looks up
        # ``named_steps['vec']``, but this pipeline's top-level steps are
        # 'union' and 'clf' - enabling the flag needs a matching pipeline.
        show_feature_importance(clf)

    if opt_save_prediction:
        save_prediction('svm-pred-test.csv', zip(y_pred, clf.predict_proba(X_test)))

    # Feature importance
    if opt_plot_xgb_importance:
        _plot_xgb_feature_importance(clf)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment