import pprint
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics, cross_validation
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.svm import LinearSVC, SVC
from nltk.stem.snowball import SpanishStemmer
from nltk.corpus import stopwords
import xgboost
class MultiItemSelector(BaseEstimator, TransformerMixin):
    """Select one or more columns and emit one ``{column: value}`` dict per row.

    The list-of-dicts output is the input shape consumed by
    ``DictVectorizer``.

    Parameters
    ----------
    key : hashable, or list/tuple of hashables
        Column name (or names) to extract from the object given to
        ``transform``.
    """

    def __init__(self, key):
        # Stored exactly as given; BaseEstimator discovers parameters by
        # matching constructor argument names to attributes.
        self.key = key

    def fit(self, x, y=None):
        """Nothing to learn; return the transformer itself."""
        return self

    def transform(self, data_dict):
        """Return a list with one dict per row of the selected column(s).

        ``data_dict`` must support ``data_dict[name]`` for every selected
        name and each lookup must be iterable (e.g. a DataFrame, or a dict
        of equal-length sequences).
        """
        if isinstance(self.key, (list, tuple)):
            # A list cannot itself be a dict key, and indexing a DataFrame
            # with a list yields a frame whose iteration walks columns, not
            # rows. Walk the selected columns in lockstep instead.
            columns = list(self.key)
            rows = zip(*(data_dict[column] for column in columns))
            return [dict(zip(columns, row)) for row in rows]
        return [{self.key: value} for value in data_dict[self.key]]
class ItemSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that looks up a single entry of its input by key.

    Parameters
    ----------
    key : hashable
        Key (for a DataFrame, the column name) to extract in ``transform``.
    """

    def __init__(self, key):
        self.key = key

    def fit(self, x, y=None):
        """No fitting is required; the transformer is returned unchanged."""
        return self

    def transform(self, data_dict):
        """Return ``data_dict[self.key]``."""
        selected = data_dict[self.key]
        return selected
def read_file(filename):
    """Load a ``;``-separated CSV file and split it into features and labels.

    Parameters
    ----------
    filename : str
        Path of the CSV file to read.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is a DataFrame holding every column except the
        last one and ``y`` is the ``'Clase'`` column.

    Raises
    ------
    KeyError
        If the file has no ``'Clase'`` column.
    """
    d = pd.read_csv(filename, sep=';')
    # Positional slice that drops the last column. ``.ix`` no longer exists
    # in current pandas; ``.iloc`` performs the same positional selection.
    # NOTE(review): presumably 'Clase' is that last column - confirm against
    # the data files.
    return d.iloc[:, :-1], d['Clase']
def save_prediction(filename, data):
    """Write predictions to ``filename``, one ``;``-separated line per item.

    Each line holds the predicted label followed by its probabilities
    rounded to four decimals.

    Parameters
    ----------
    filename : str
        Destination path; an existing file is overwritten.
    data : iterable
        Items where ``item[0]`` is the label and ``item[1]`` is an iterable
        of numeric probabilities.
    """
    with open(filename, 'w') as f:
        for item in data:
            # Build a real list: on Python 3 ``map`` is lazy and cannot be
            # concatenated to a list.
            proba = [str(round(e, 4)) for e in item[1]]
            f.write(';'.join([str(item[0])] + proba) + '\n')
def parse_doc(title, body):
    """Join a title and a body into one whitespace-trimmed text document.

    Parameters
    ----------
    title, body : str, bytes, NaN or None
        The two text fields. Missing values (any NaN, or ``None``) are
        treated as empty strings.

    Returns
    -------
    str
        ``title + ' ' + body`` with surrounding whitespace removed. Byte
        strings are decoded as latin-1; text that is already unicode is
        returned as is.
    """
    # ``pd.isnull`` recognises every NaN object and None, whereas an identity
    # test against ``np.nan`` only matches that one singleton.
    title = '' if pd.isnull(title) else title
    body = '' if pd.isnull(body) else body
    doc = (title + ' ' + body).strip()
    # Only byte strings need decoding; unicode text has no ``decode`` method
    # on Python 3.
    if isinstance(doc, bytes):
        doc = doc.decode('latin-1')
    return doc
def process(X, y):
    """Add a combined ``body`` text column to ``X`` and turn ``y`` into an array.

    The ``tit`` and ``des`` columns of each row are merged with
    ``parse_doc``; the result is stored in a new ``body`` column on ``X``
    itself (the frame is modified in place).

    Returns
    -------
    tuple
        ``(X, y)`` with ``X`` carrying the extra column and ``y`` converted
        with ``np.array``.
    """
    text_columns = ['tit', 'des']

    # One document per row, in row order.
    documents = []
    for title, description in X[text_columns].values.tolist():
        documents.append(parse_doc(title, description))

    # Re-use X's own index so the new column lines up row by row.
    X['body'] = pd.Series(documents, index=X.index)
    return (X, np.array(y))
def show_feature_importance(clf):
    """Print, for each of the first five classes, the ten top-ranked features.

    ``clf`` must expose ``named_steps['vec']`` with ``get_feature_names()``
    and ``named_steps['clf']`` with a ``coef_`` attribute indexable by class
    row. Features are ranked by ascending coefficient, so the last line
    printed for a class has the largest coefficient.
    """
    feature_names = np.asarray(clf.named_steps['vec'].get_feature_names())
    model = clf.named_steps['clf']

    for class_id in range(5):
        # Indices of the ten largest coefficients for this class.
        top = np.argsort(model.coef_[class_id])[-10:]
        print('[{}]'.format(class_id + 1))
        # NOTE(review): the number printed next to each feature is its index
        # in the vocabulary, not its coefficient - confirm that is intended.
        for position, name in zip(top, feature_names[top]):
            print('+ {} : {}'.format(name, position))
def plot_confusion_matrix(cm, classes, title='Confusion matrix',
                          cmap=None):
    """Draw a confusion matrix as a heat map on the current matplotlib figure.

    Parameters
    ----------
    cm : array-like
        Matrix to display, passed straight to ``plt.imshow``.
    classes : sequence
        Tick labels, used for both axes; must support ``len``.
    title : str
        Title placed above the plot.
    cmap : colormap or None
        Colormap handed to ``plt.imshow``; ``None`` lets matplotlib use its
        default.
        NOTE(review): the original default is not visible in the source -
        confirm the intended colormap.

    The figure is only drawn; the caller decides whether to show or save it.
    """
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
class CustomTokenizer(object):
    """Callable tokenizer returning lower-cased runs of two or more ASCII letters.

    Instances can be called directly with a document string.
    """

    # Compiled once for the class; matches whole words made only of a-z/A-Z
    # that are at least two characters long.
    _TOKEN_PATTERN = re.compile(r"(?u)\b[a-zA-Z][a-zA-Z]+\b")

    def __init__(self):
        self.stemmer = SpanishStemmer()

    def __call__(self, doc):
        """Return the list of lower-cased tokens found in ``doc``."""
        # The stemmer is created in __init__ but is not applied to the tokens.
        return [word.lower() for word in self._TOKEN_PATTERN.findall(doc)]
def main():
    """Train the text classifier, evaluate it and export its predictions.

    Steps:

    1. Read the train and test CSV files and add the combined ``body``
       text column to each.
    2. Build a pipeline that joins a dict-vectorised view of the ``anio``
       column with a bag-of-words view of ``body`` and feeds both to a
       linear-kernel ``SVC`` with probability estimates enabled.
    3. Optionally cross-validate, then fit on the training set and predict
       the test set.
    4. Depending on the option flags, print an evaluation report and write
       the predicted label plus class probabilities to
       ``svm-pred-test.csv``.
    """
    rng = np.random.RandomState(100)

    # Options
    opt_cross_validate = False
    opt_show_report = True
    opt_save_prediction = True
    opt_show_feature_importance = False
    # Named switches for the two plotting paths that are disabled.
    opt_plot_confusion_matrix = False
    opt_plot_xgboost_importance = False

    # Read data
    X_train, y_train = read_file('tp2-work.train-1.csv')
    X_test, y_test = read_file('tp2-work.test-1.csv')

    # Preprocess
    X_train, y_train = process(X_train, y_train)
    X_test, y_test = process(X_test, y_test)

    txt_fields = 'body'
    num_fields = ['anio']

    # Classification pipeline
    # NOTE(review): only ``stop_words`` is visible in the source for this
    # call; CustomTokenizer is defined in this file but not passed here -
    # confirm whether a ``tokenizer=`` argument was intended.
    vectorizer = CountVectorizer(
        stop_words=stopwords.words('spanish') + ['argentina', 'id', 'mls'],
    )
    clf = Pipeline([
        ('union', FeatureUnion(transformer_list=[
            ('num', Pipeline([
                ('sel', MultiItemSelector(num_fields)),
                ('dic', DictVectorizer(sparse=False)),
            ])),
            ('txt', Pipeline([
                ('sel', ItemSelector(txt_fields)),
                ('vec', vectorizer),
            ])),
        ])),
        ('clf', SVC(probability=True, kernel='linear')),
    ])

    # Cross-validate
    if opt_cross_validate:
        k_fold = cross_validation.KFold(n=len(X_train), n_folds=10, random_state=rng)
        results = cross_validation.cross_val_score(
            clf, X_train, y_train, cv=k_fold, n_jobs=-1
        )
        print('Cross-validation scores: {}'.format(results))

    # Fit
    clf = clf.fit(X_train, y_train)

    # Test
    y_pred = clf.predict(X_test)

    if opt_show_report:
        # Model performance
        accuracy = metrics.accuracy_score(y_test, y_pred)
        print('Accuracy: {}'.format(accuracy))

        # Confusion Matrix
        cm = metrics.confusion_matrix(y_test, y_pred)
        print(cm)
        if opt_plot_confusion_matrix:
            plot_confusion_matrix(cm, range(1, 6))
            plt.show()

        report = metrics.classification_report(y_test, y_pred)
        print(report)

    if opt_show_feature_importance:
        # NOTE(review): show_feature_importance looks up a top-level 'vec'
        # step, while this pipeline nests it under 'union' -> 'txt'; verify
        # before switching this option on.
        show_feature_importance(clf)

    if opt_save_prediction:
        save_prediction('svm-pred-test.csv', zip(y_pred, clf.predict_proba(X_test)))

    # Feature importance
    if opt_plot_xgboost_importance:
        # This path calls ``booster().get_fscore()`` on the final step and
        # reads a top-level 'vec' step, neither of which the SVC pipeline
        # built above provides; it is kept disabled.
        feat_imp_vals = clf.named_steps['clf'].booster().get_fscore()
        feat_names = clf.named_steps['vec'].get_feature_names()
        feat_imp = {
            f: feat_imp_vals.get('f{}'.format(idx), 0)
            for idx, f in enumerate(feat_names)
        }
        # ``sum`` over the values works on Python 2 and 3 alike, unlike
        # wrapping a dict view in ``np.array``.
        total = float(sum(feat_imp.values()))
        relative = {k: v / total for k, v in feat_imp.items()}
        top_features = sorted(relative.items(), key=lambda e: e[1], reverse=True)[:20]

        # Ascending order so the largest bar ends up at the top of the
        # horizontal bar chart.
        df = pd.DataFrame(
            sorted(top_features, key=lambda e: e[1]),
            columns=['feature', 'fscore'],
        )
        df.plot(kind='barh', x='feature', y='fscore', legend=False, figsize=(20, 20))
        plt.title('XGBoost Feature Importance')
        plt.xlabel('relative importance')
        plt.show()
if __name__ == '__main__':
