Skip to content

Instantly share code, notes, and snippets.

@larsmans
Created October 9, 2014 11:19
Show Gist options
  • Save larsmans/239fecd3fc6b49e50da9 to your computer and use it in GitHub Desktop.
Save larsmans/239fecd3fc6b49e50da9 to your computer and use it in GitHub Desktop.
Supervised tf (tf-chi², tf-rf) for scikit-learn
import numpy as np
#from scipy.special import chdtrc
from scipy.sparse import spdiags
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelBinarizer
def _chisquare(f_obs, f_exp, reduce):
    """Compute chi-squared terms and reduce them along the first axis.

    Replacement for scipy.stats.chisquare with custom reduction.
    Version from https://github.com/scipy/scipy/pull/2525 with additional
    optimizations. Unlike scipy.stats.chisquare, no p-value is returned.

    Parameters
    ----------
    f_obs : array-like
        Observed frequencies. Never modified by this function.
    f_exp : array-like
        Expected frequencies; must broadcast to the shape of ``f_obs``.
    reduce : callable
        Called as ``reduce(terms, axis=0)`` on the element-wise terms
        ``(f_obs - f_exp) ** 2 / f_exp``, e.g. ``np.sum`` or ``np.max``.

    Returns
    -------
    chisq
        Whatever ``reduce`` returns for the element-wise terms.
    """
    # Work on a private float64 copy. np.asarray would alias a caller's
    # float64 array, and the in-place updates below would then silently
    # overwrite the caller's data.
    chisq = np.array(f_obs, dtype=np.float64)

    # The copy is ours, so it is reused for every intermediate result.
    chisq -= f_exp
    chisq **= 2
    chisq /= f_exp

    return reduce(chisq, axis=0)
def _chi2(X, y, alpha, reduce):
    """Score each feature with a smoothed chi-squared statistic per class.

    Parameters
    ----------
    X : term-frequency matrix, one row per sample
        NOTE(review): ``Y.T * X`` below must be a matrix product, which
        holds for scipy.sparse matrices -- confirm before passing other
        array types.
    y : array-like
        Class label of each sample.
    alpha : int or float
        Additive smoothing applied to both the observed and the expected
        counts.
    reduce : callable
        Called as ``reduce(scores, axis=0)`` to collapse the per-class
        scores of each feature into a single value.

    Returns
    -------
    The reduced chi-squared scores, as returned by ``_chisquare``.
    """
    Y = LabelBinarizer().fit_transform(y)
    if Y.shape[1] == 1:
        # A single indicator column means a two-class problem; prepend the
        # complement so that each class gets its own column.
        Y = np.append(1 - Y, Y, axis=1)

    # Per-class feature totals. Cast to float64 *before* smoothing: the raw
    # counts are typically integers, and an in-place ``+=`` of a fractional
    # alpha into an integer array raises a casting error.
    observed = np.asarray(Y.T * X, dtype=np.float64)
    observed += alpha

    # Force a (1, n_features) row so the outer product below also works when
    # X.sum(axis=0) comes back one-dimensional.
    feature_count = np.asarray(X.sum(axis=0)).reshape(1, -1)
    class_prob = np.asarray(Y.mean(axis=0)).reshape(-1, 1)
    expected = np.dot(class_prob, feature_count)
    expected += alpha

    return _chisquare(observed, expected, reduce)
def _rf(X, y, alpha, reduce):
    """Relevance frequency. Ignores alpha.

    For every class and feature this computes
    ``log(1 + df_class / max(1, df_other_classes))``, where ``df`` counts
    the samples that contain the feature.

    Parameters
    ----------
    X : term-frequency matrix, one row per sample
        NOTE(review): ``Y.T * ...`` below must be a matrix product, which
        holds for scipy.sparse matrices -- confirm before passing other
        array types.
    y : array-like
        Class label of each sample.
    alpha : ignored
        Accepted only so that all weighting functions share one signature.
    reduce : callable
        Called as ``reduce(scores, axis=0)`` to collapse the per-class
        scores of each feature into a single value.

    Returns
    -------
    The reduced relevance-frequency scores.
    """
    Y = LabelBinarizer().fit_transform(y)
    if Y.shape[1] == 1:
        # A single indicator column means a two-class problem; prepend the
        # complement so that each class gets its own column.
        Y = np.append(1 - Y, Y, axis=1)

    # Per class "document frequencies" (# of samples containing each feature).
    # X is binarized *before* aggregating over samples; thresholding the
    # aggregated counts instead would only record whether a class contains
    # the feature at all, not in how many samples.
    presence = (X > 0).astype(np.float64)
    df = np.asarray(Y.T * presence, dtype=np.float64)

    # Document frequency of each feature in all classes except the row's own.
    # Computed once from the untouched counts: normalising row by row in
    # place would feed already-divided rows into later rows' denominators.
    df_other = df.sum(axis=0) - df
    rf = df / np.maximum(1., df_other)

    # XXX original uses log2(2 + rf)
    return reduce(np.log1p(rf, out=rf), axis=0)
class SupervisedTermWeights(BaseEstimator, TransformerMixin):
    """Supervised term weighting transformer.

    This estimator learns term weights in a supervised way, taking into account
    the frequency with which features occur in the distinct classes of a
    classification problem. It produces weighted frequencies by multiplying
    term frequencies by the learned weights, to get combinations such as
    tf-chi2, i.e., term frequency times chi2 test statistic.

    Such weightings have been found to outperform the unsupervised tf-idf
    weighting on a variety of text classification tasks (using linear
    classifiers).

    Parameters
    ----------
    weighting : {'chi2', 'rf'}, default = 'chi2'
        Weighting scheme. 'chi2' is the chi^2 test statistic; 'rf' is the
        relevance frequency of Lan et al.
    reduce : {'max', 'mean', 'sum'}, default = 'max'
        How to reduce per-class scores for each feature into a single score:
        take the max, mean or sum over the classes.
    alpha : int or float, default = 1
        Additive smoothing passed to the weighting function. Used by 'chi2';
        ignored by 'rf'.

    Attributes
    ----------
    weights_ : ndarray of shape (n_features,)
        Per-feature weights learned by ``fit``.

    References
    ----------
    Man Lan, Chew Lim Tan and Jian Su (2007). Supervised and Traditional Term
    Weighting Methods for Automatic Text Categorization. PAMI.
    """

    # Lookup tables from the string parameters to their implementations.
    _WEIGHTING = {'chi2': _chi2, 'rf': _rf}
    _REDUCE = {'max': np.max, 'mean': np.mean, 'sum': np.sum}

    def __init__(self, weighting="chi2", reduce="max", alpha=1):
        # Store every constructor argument under its own name, unmodified,
        # so that BaseEstimator.get_params/set_params and cloning work.
        self.weighting = weighting
        self.reduce = reduce
        self.alpha = alpha

    def fit(self, X, y):
        """Learn supervised term weights from training set X, y.

        Parameters
        ----------
        X : term-frequency matrix, one row per sample
        y : array-like
            Class label of each sample.

        Returns
        -------
        self

        Raises
        ------
        KeyError
            If ``weighting`` or ``reduce`` is not one of the supported names.
        """
        reduce = self._REDUCE[self.reduce]
        weighting = self._WEIGHTING[self.weighting]

        # Flatten to a plain 1-D array so transform() can place the weights
        # on a diagonal regardless of the array type the scorer returned.
        self.weights_ = np.asarray(
            weighting(X, y, self.alpha, reduce), dtype=np.float64).ravel()

        # Returning self allows fit(X, y).transform(X) and lets the inherited
        # fit_transform work.
        return self

    def transform(self, X, y=None):
        """Transform term frequency matrix X into a weighted frequency matrix.

        Each column of X is scaled by the weight learned for that feature.
        Requires a prior call to ``fit``. ``y`` is ignored.
        """
        n_features = self.weights_.shape[0]
        # Multiplying by a diagonal matrix scales the columns of X.
        return X * spdiags(self.weights_, 0, n_features, n_features)
import scipy.sparse as sp
from sklearn.feature_extraction.text import CountVectorizer
from supervised_tf import SupervisedTermWeights
from sklearn.utils.testing import assert_array_equal, assert_equal, assert_true
# Toy language-guessing corpus: four English sentences followed by four
# Latin ones; the labels in ``y`` follow the same order.
docs = [
    "an apple a day keeps the doctor away",
    "time flies like an arrow",
    "the more the merrier",
    "the quick brown fox jumps over the lazy dog",
    "quod licet Iovi non licet bovi",
    "ut desint vires, tamen laudanda est voluntas",
    "gallia est omnis divisa in partes tres",
    "ceterum censeo carthaginem delendam esse",
]
y = ["en"] * 4 + ["la"] * 4

# Character-bigram counts serve as the term-frequency matrix under test.
v = CountVectorizer(ngram_range=(2, 2), analyzer='char_wb')
X = v.fit_transform(docs)
def test_supervised_term_weights():
    """Exercise every weighting/reduction pair on the toy corpus.

    Checks that fit_transform and fit().transform() both return sparse
    matrices of the same shape, and that the output equals the dense input
    scaled column-wise by the learned weights.
    """
    dense_X = X.toarray()

    # All (weighting, reduction) combinations, in nested-loop order.
    combos = [(scheme, reducer)
              for scheme in SupervisedTermWeights._WEIGHTING
              for reducer in SupervisedTermWeights._REDUCE]

    for scheme, reducer in combos:
        sup_tw = SupervisedTermWeights(weighting=scheme, reduce=reducer)

        X1 = sup_tw.fit_transform(X, y)
        X2 = sup_tw.fit(X, y).transform(X)

        assert_true(sp.isspmatrix(X1))
        assert_true(sp.isspmatrix(X2))
        assert_equal(X1.shape, X2.shape)

        assert_array_equal(X1.toarray(), dense_X * sup_tw.weights_)
@wanglan0605
Copy link

Hi, when I try to implement
sup_tw = SupervisedTermWeights(weighting='chi2',reduce="max")
X_sup = sup_tw.fit(X, y).transform(X)
print X_sup

the following error occurs at "X_sup = sup_tw.fit(X, y).transform(X)":
AttributeError: 'NoneType' object has no attribute 'transform'
Could you please explain it?

@pauldes
Copy link

pauldes commented May 21, 2018

Hi, thanks for this code! But I think it is incomplete.
I already added a line "return self" at the end of the fit function, but now I get an error: 'SupervisedTermWeights' object has no attribute 'weights_'.
Please help if you have improved the code by now :)

@rth
Copy link

rth commented Feb 26, 2019

thanks for this code ! but I think it is incomplete.

Added a few fixes in https://gist.github.com/rth/29455033bf33daaa074dda9807dec8f0 to pass the included tests.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment