@rth
Forked from larsmans/supervised_tf.py
Created February 26, 2019 16:31
Supervised tf (tf-chi², tf-rf) for scikit-learn
import numpy as np
#from scipy.special import chdtrc
from scipy.sparse import spdiags
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelBinarizer


def _chisquare(f_obs, f_exp, reduce):
    """Replacement for scipy.stats.chisquare with custom reduction.

    Version from https://github.com/scipy/scipy/pull/2525 with additional
    optimizations.
    """
    f_obs = np.asarray(f_obs, dtype=np.float64)
    k = len(f_obs)
    # Reuse f_obs for chi-squared statistics.
    chisq = f_obs
    chisq -= f_exp
    chisq **= 2
    chisq /= f_exp
    chisq = reduce(chisq, axis=0)
    return chisq  #, chdtrc(k - 1, chisq)
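For intuition, the helper above computes the usual one-way chi-squared statistic, sum((observed - expected)^2 / expected), column by column before the reduction. A minimal sanity check against scipy.stats.chisquare (the toy counts below are illustrative only; note that _chisquare mutates its f_obs argument, hence the copy):

from scipy.stats import chisquare

# Toy observed/expected counts: 2 classes (rows) x 3 features (columns).
# Column sums of f_obs and f_exp match, as scipy requires.
f_obs = np.array([[4., 1., 6.],
                  [2., 3., 2.]])
f_exp = np.array([[3., 2., 4.],
                  [3., 2., 4.]])

# scipy.stats.chisquare sums over axis 0, so reduce=np.sum reproduces it.
print(chisquare(f_obs, f_exp)[0])
print(_chisquare(f_obs.copy(), f_exp, np.sum))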
def _chi2(X, y, alpha, reduce):
    """Chi-squared statistics per (class, feature) pair, then reduced."""
    Y = LabelBinarizer().fit_transform(y)
    if Y.shape[1] == 1:
        Y = np.append(1 - Y, Y, axis=1)

    # Observed per-class feature counts, and the counts expected if classes
    # and features were independent; alpha smooths both.
    observed = Y.T * X
    feature_count = np.asarray(X.sum(axis=0))
    class_prob = np.asarray(Y.mean(axis=0)).reshape(-1, 1)
    expected = np.dot(class_prob, feature_count)
    observed += alpha
    expected += alpha
    return _chisquare(observed, expected, reduce)
def _rf(X, y, alpha, reduce):
    """Relevance frequency (Lan et al.). Ignores alpha."""
    Y = LabelBinarizer().fit_transform(y)
    if Y.shape[1] == 1:
        Y = np.append(1 - Y, Y, axis=1)

    # Per-class document frequencies (# of samples of each class containing
    # each feature); binarize X before aggregating so repeated occurrences
    # within a document count only once.
    rf = np.asarray(Y.T * (X > 0).astype(np.float64))
    # Take the column sums before modifying rf in place, so that every row
    # is divided by the document frequency in all *other* classes.
    total = rf.sum(axis=0)
    for i in range(Y.shape[1]):
        rf[i] /= np.maximum(1., total - rf[i])
    # XXX original uses log2(2 + rf)
    return reduce(np.log1p(rf, out=rf), axis=0)
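As a quick illustration of the rf scheme on toy data (feature 0 occurs only in class "a" documents, feature 1 in all documents, so feature 0 should receive the larger weight):

from scipy.sparse import csr_matrix

X_toy = csr_matrix(np.array([[2, 1],
                             [1, 1],
                             [0, 1],
                             [0, 1]]))
y_toy = ["a", "a", "b", "b"]
# reduce=np.max keeps, per feature, the best class's rf score.
print(_rf(X_toy, y_toy, alpha=1, reduce=np.max))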
class SupervisedTermWeights(BaseEstimator, TransformerMixin):
    """Supervised term weighting transformer.

    This estimator learns term weights in a supervised way, taking into
    account the frequency with which features occur in the distinct classes
    of a classification problem. It produces weighted frequencies by
    multiplying term frequencies by the learned weights, giving combinations
    such as tf-chi2, i.e., term frequency times the chi2 test statistic.

    Such weightings have been found to outperform the unsupervised tf-idf
    weighting on a variety of text classification tasks (using linear
    classifiers).

    Parameters
    ----------
    weighting : {'chi2', 'rf'}, default = 'chi2'
        Weighting scheme. 'chi2' is the chi^2 test statistic; 'rf' is the
        relevance frequency of Lan et al.

    reduce : {'max', 'mean', 'sum'}, default = 'max'
        How to reduce per-class scores for each feature into a single
        score: take the max, mean or sum over the classes.

    alpha : float, default = 1
        Smoothing parameter for the chi2 statistic; ignored by 'rf'.

    References
    ----------
    Man Lan, Chew Lim Tan, Jian Su and Yue Lu (2009). Supervised and
    Traditional Term Weighting Methods for Automatic Text Categorization.
    IEEE Trans. PAMI 31(4).
    """

    _WEIGHTING = {'chi2': _chi2, 'rf': _rf}
    _REDUCE = {'max': np.max, 'mean': np.mean, 'sum': np.sum}
    def __init__(self, weighting="chi2", reduce="max", alpha=1):
        self.reduce = reduce
        self.alpha = alpha
        self.weighting = weighting

    def fit(self, X, y):
        """Learn supervised term weights from training set X, y."""
        reduce_ = self._REDUCE[self.reduce]
        weighting = self._WEIGHTING[self.weighting]
        self.weights_ = weighting(X, y, self.alpha, reduce_)
        return self

    def transform(self, X, y=None):
        """Transform term frequency matrix X into a weighted frequency
        matrix.
        """
        # Multiplying by a diagonal matrix scales each column (feature) by
        # its learned weight while keeping X sparse.
        n_features = self.weights_.shape[0]
        return X * spdiags(self.weights_, 0, n_features, n_features)
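A minimal usage sketch in an end-to-end text classification pipeline; LinearSVC stands in for any linear classifier, and the parameter choices are illustrative only:

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.svm import LinearSVC

# Raw counts -> supervised tf-chi2 weighting -> linear SVM.
clf = make_pipeline(
    CountVectorizer(),
    SupervisedTermWeights(weighting="chi2", reduce="max"),
    LinearSVC(),
)
# clf.fit(train_docs, train_labels) learns the vocabulary, the term
# weights and the classifier together; clf.predict(test_docs) applies
# all three stages.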
import scipy.sparse as sp
from numpy.testing import assert_array_equal

from sklearn.feature_extraction.text import CountVectorizer

from supervised_tf import SupervisedTermWeights

# Let's do language guessing.
docs = ["an apple a day keeps the doctor away",
        "time flies like an arrow",
        "the more the merrier",
        "the quick brown fox jumps over the lazy dog",
        "quod licet Iovi non licet bovi",
        "ut desint vires, tamen laudanda est voluntas",
        "gallia est omnis divisa in partes tres",
        "ceterum censeo carthaginem delendam esse",
        ]
y = ["en", "en", "en", "en", "la", "la", "la", "la"]

v = CountVectorizer(analyzer='char_wb', ngram_range=(2, 2))
X = v.fit_transform(docs)
def test_supervised_term_weights():
    X_a = X.toarray()

    for weighting in SupervisedTermWeights._WEIGHTING:
        for reduction in SupervisedTermWeights._REDUCE:
            sup_tw = SupervisedTermWeights(weighting=weighting,
                                           reduce=reduction)
            X1 = sup_tw.fit_transform(X, y)
            X2 = sup_tw.fit(X, y).transform(X)

            assert sp.isspmatrix(X1)
            assert sp.isspmatrix(X2)
            assert X1.shape == X2.shape

            X1_a = X1.toarray()
            assert_array_equal(X1_a, X_a * sup_tw.weights_)
@rifqirf commented Jul 13, 2020

Hi, I'm new to scikit-learn. Can you give me an example with a classifier such as an SVM?

@BradKML commented Aug 10, 2021

@rifqirf Note that supervised term weighting is somewhat different from unsupervised term weighting such as tf-idf. Beyond that, there are papers showing that tf-rf performs well.
