Instantly share code, notes, and snippets.

# larsmans/supervised_tf.py

Created October 9, 2014 11:19
Show Gist options
• Save larsmans/239fecd3fc6b49e50da9 to your computer and use it in GitHub Desktop.
Supervised tf (tf-chi², tf-rf) for scikit-learn
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode characters
 # supervised_tf.py -- supervised term weighting (tf-chi2, tf-rf) for scikit-learn.
"""Supervised term weighting (tf-chi2, tf-rf) for scikit-learn.

The public entry point is :class:`SupervisedTermWeights`; the underscore
functions compute the per-feature weights it applies.
"""

import numpy as np
from scipy.sparse import spdiags

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelBinarizer


def _binarize_labels(y):
    """Return a one-column-per-class indicator matrix for the labels y.

    LabelBinarizer emits a single column when there are at most two
    classes; in that case the complementary column is prepended so that
    every class has its own column.
    """
    Y = LabelBinarizer().fit_transform(y)
    if Y.shape[1] == 1:
        Y = np.append(1 - Y, Y, axis=1)
    return Y


def _class_feature_sums(Y, X):
    """Return the dense (n_classes, n_features) product Y.T . X.

    Written as ``X.T.dot(Y).T`` so that the same expression is a matrix
    product whether X is a scipy sparse matrix or a dense array (the
    ``*`` operator only means "matrix product" for sparse matrices).
    """
    return np.asarray(X.T.dot(Y).T, dtype=np.float64)


def _chisquare(f_obs, f_exp, reduce):
    """Replacement for scipy.stats.chisquare with custom reduction.

    Version from https://github.com/scipy/scipy/pull/2525 with additional
    optimizations. Only the test statistic is computed, not the p-value.

    Parameters
    ----------
    f_obs : array-like, shape (n_classes, n_features)
        Observed frequencies. If this is already a float64 array it is
        overwritten in place (no copy is made).
    f_exp : array-like, broadcastable to f_obs
        Expected frequencies.
    reduce : callable
        Called as ``reduce(scores, axis=0)`` to collapse the per-class
        scores into one score per feature.

    Returns
    -------
    The reduced chi-squared statistics.
    """
    # Reuse f_obs for the chi-squared statistics to avoid a temporary.
    chisq = np.asarray(f_obs, dtype=np.float64)
    chisq -= f_exp
    chisq **= 2
    chisq /= f_exp
    return reduce(chisq, axis=0)


def _chi2(X, y, alpha, reduce):
    """Smoothed chi-squared statistic of each feature against the classes.

    Parameters
    ----------
    X : sparse matrix or array, shape (n_samples, n_features)
        Term frequencies.
    y : array-like, shape (n_samples,)
        Class labels.
    alpha : number
        Additive smoothing applied to both the observed and the expected
        counts. NOTE(review): with alpha == 0 a feature that never occurs
        has an expected count of 0 and its score becomes nan.
    reduce : callable
        Reduction over the class axis, see ``_chisquare``.
    """
    Y = _binarize_labels(y)

    # Float from the start: an in-place "+= alpha" on an integer count
    # array would fail for a non-integer alpha.
    observed = _class_feature_sums(Y, X)

    # reshape(1, -1) keeps this a row vector for dense X as well, where
    # X.sum(axis=0) is one-dimensional.
    feature_count = np.asarray(X.sum(axis=0), dtype=np.float64).reshape(1, -1)
    class_prob = np.asarray(Y.mean(axis=0)).reshape(-1, 1)
    expected = np.dot(class_prob, feature_count)

    observed += alpha
    expected += alpha

    return _chisquare(observed, expected, reduce)


def _rf(X, y, alpha, reduce):
    """Relevance frequency. Ignores alpha.

    For every class and feature, the number of samples of that class that
    contain the feature is divided by the number of samples of all other
    classes that contain it (at least 1), and ``log(1 + ratio)`` of that is
    reduced over the classes.
    """
    Y = _binarize_labels(y)

    # Per class "document frequencies" (# of samples containing each
    # feature). X is binarized *before* summing over the samples;
    # binarizing the sums instead would only say whether a class contains
    # the feature at all.
    df = _class_feature_sums(Y, (X > 0).astype(np.float64))

    # df.sum(axis=0) - df[i] is the sum of all rows except i. It is computed
    # for all rows at once from the unmodified counts, so that no row is
    # normalized against rows that have already been divided.
    rf = df / np.maximum(1., df.sum(axis=0) - df)

    # XXX original uses log2(2 + rf)
    return reduce(np.log1p(rf, out=rf), axis=0)


class SupervisedTermWeights(BaseEstimator, TransformerMixin):
    """Supervised term weighting transformer.

    This estimator learns term weights in a supervised way, taking into
    account the frequency with which features occur in the distinct classes
    of a classification problem. It produces weighted frequencies by
    multiplying term frequencies by the learned weights, to get combinations
    such as tf-chi2, i.e., term frequency times chi2 test statistic.

    Such weightings have been found to outperform the unsupervised tf-idf
    weighting on a variety of text classification tasks (using linear
    classifiers).

    Parameters
    ----------
    weighting : {'chi2', 'rf'}, default = 'chi2'
        Weighting scheme. 'chi2' is the chi^2 test statistic; 'rf' is the
        relevance frequency of Lan et al.

    reduce : {'max', 'mean', 'sum'}, default = 'max'
        How to reduce per-class scores for each feature into a single score:
        take the max, mean or sum over the classes.

    alpha : number, default = 1
        Additive smoothing for the 'chi2' weighting. Ignored by 'rf'.

    Attributes
    ----------
    weights_ : array, shape (n_features,)
        Per-feature weights learned by ``fit``.

    References
    ----------
    Man Lan, Chew Lim Tan and Jian Su (2007). Supervised and Traditional Term
    Weighting Methods for Automatic Text Categorization. PAMI.
    """

    _WEIGHTING = {'chi2': _chi2, 'rf': _rf}

    _REDUCE = {'max': np.max, 'mean': np.mean, 'sum': np.sum}

    def __init__(self, weighting="chi2", reduce="max", alpha=1):
        # Store every constructor argument under its own name and nothing
        # else; get_params()/clone() look the parameters up by name.
        self.weighting = weighting
        self.reduce = reduce
        self.alpha = alpha

    def fit(self, X, y):
        """Learn supervised term weights from training set X, y.

        Raises KeyError if ``self.weighting`` or ``self.reduce`` is not one
        of the supported option names.

        Returns
        -------
        self
        """
        reduce = self._REDUCE[self.reduce]
        weighting = self._WEIGHTING[self.weighting]

        weights = weighting(X, y, self.alpha, reduce)
        self.weights_ = np.asarray(weights, dtype=np.float64).ravel()
        return self

    def transform(self, X, y=None):
        """Transform term frequency matrix X into a weighted frequency matrix.

        Every column of X is scaled by the weight learned for that feature;
        ``fit`` must have been called first. y is ignored.
        """
        n_features = self.weights_.shape[0]
        # Right-multiplying by a diagonal matrix scales the columns.
        return X * spdiags(self.weights_, 0, n_features, n_features)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode characters
 # Tests for supervised_tf.SupervisedTermWeights.
import scipy.sparse as sp
from numpy.testing import assert_array_equal

from sklearn.feature_extraction.text import CountVectorizer

from supervised_tf import SupervisedTermWeights


# Let's do language guessing.
docs = ["an apple a day keeps the doctor away",
        "time flies like an arrow",
        "the more the merrier",
        "the quick brown fox jumps over the lazy dog",
        "quod licet Iovi non licet bovi",
        "ut desint vires, tamen laudanda est voluntas",
        "gallia est omnis divisa in partes tres",
        "ceterum censeo carthaginem delendam esse",
        ]
y = ["en", "en", "en", "en", "la", "la", "la", "la"]

# Character bigrams within word boundaries serve as the features.
v = CountVectorizer(analyzer='char_wb', ngram_range=(2, 2))
X = v.fit_transform(docs)


def test_supervised_term_weights():
    """Check every weighting/reduction combination on the toy corpus.

    The transformed matrix must be sparse, must be the same whether it is
    obtained through fit_transform or through fit followed by transform,
    and must equal the term frequencies scaled column-wise by weights_.
    """
    X_a = X.toarray()

    for weighting in SupervisedTermWeights._WEIGHTING:
        for reduction in SupervisedTermWeights._REDUCE:
            sup_tw = SupervisedTermWeights(weighting=weighting,
                                           reduce=reduction)

            X1 = sup_tw.fit_transform(X, y)
            X2 = sup_tw.fit(X, y).transform(X)

            # Plain asserts replace the assert_true/assert_equal helpers of
            # sklearn.utils.testing, which is not available in current
            # scikit-learn releases.
            assert sp.issparse(X1)
            assert sp.issparse(X2)
            assert X1.shape == X2.shape

            X1_a = X1.toarray()
            assert_array_equal(X1_a, X2.toarray())
            assert_array_equal(X1_a, X_a * sup_tw.weights_)

### wanglan0605 commented Jan 14, 2015

Hi, when I try to run
sup_tw = SupervisedTermWeights(weighting='chi2',reduce="max")
X_sup = sup_tw.fit(X, y).transform(X)
print X_sup

The following error occurred on the line "X_sup = sup_tw.fit(X, y).transform(X)":
AttributeError: 'NoneType' object has no attribute 'transform'

### pauldes commented May 21, 2018 • edited

Hi, thanks for this code! However, I think it is incomplete.
I already added a line "return self" at the end of the fit function, but now I get an error: 'SupervisedTermWeights' object has no attribute 'weights_'.