Skip to content

Instantly share code, notes, and snippets.

@althonos
Created January 9, 2023 22:38
Show Gist options
  • Save althonos/832026d1b9bfeeeda66544eb8fe74a71 to your computer and use it in GitHub Desktop.
Save althonos/832026d1b9bfeeeda66544eb8fe74a71 to your computer and use it in GitHub Desktop.
An `sklearn` implementation of a feature selection procedure using p-value thresholds for each feature.
import fisher
import numpy
import sklearn
import sklearn.base
import sklearn.exceptions
class SelectPValueUnderThreshold(sklearn.base.TransformerMixin):
    """Select features with Fisher p-value under a certain threshold.

    For every column of the training matrix, a 2x2 contingency table is
    built from the presence of the feature (``x > 0`` versus ``x == 0``)
    and the class label (``y == 1`` versus ``y == 0``). The two-tailed
    p-value of Fisher's exact test on that table is stored, and only the
    columns whose p-value is strictly below ``threshold`` are retained.

    Samples whose feature value is negative, or whose label is neither 0
    nor 1, match none of the four cells and are therefore not counted.

    Attributes:
        threshold: Exclusive upper bound on the p-value of kept features.
        pvalues_: ``float32`` array with one two-tailed p-value per input
            column, or ``None`` before fitting.
        indices_: Integer array with the indices of the kept columns, or
            ``None`` before fitting.
        features_: Initialised to ``None`` and never assigned by this
            class; kept only for backward compatibility.
    """

    def __init__(self, threshold=1.0):
        """Create an unfitted selector.

        Args:
            threshold: Features with a two-tailed p-value strictly below
                this value are selected. With the default of ``1.0``,
                features whose p-value is exactly 1 are dropped.
        """
        self.threshold = threshold
        self.features_ = None
        self.pvalues_ = None
        # Must exist before `fit` so that `transform` can report an
        # unfitted selector with NotFittedError rather than failing with
        # an AttributeError.
        self.indices_ = None

    def fit(self, X, y):
        """Compute the per-feature p-values and the selected column indices.

        Args:
            X: Two-dimensional array of shape ``(n_samples, n_features)``
                supporting ``X[:, j]`` indexing and elementwise
                comparison.
            y: Labels of length ``n_samples``; only the values 0 and 1
                contribute to the contingency tables.

        Returns:
            The fitted selector itself.
        """
        # A plain Python list would make `y == 1` evaluate to a single
        # boolean and silently corrupt every count, so coerce up front.
        y = numpy.asarray(y)
        # The label masks do not depend on the feature: build them once.
        positive = y == 1
        negative = y == 0

        n_features = X.shape[1]
        pvalues = numpy.zeros(n_features, dtype=numpy.float32)
        for feature in range(n_features):
            x = X[:, feature]
            present = x > 0
            absent = x == 0
            result = fisher.pvalue(
                (present & positive).sum(),
                (absent & positive).sum(),
                (present & negative).sum(),
                (absent & negative).sum(),
            )
            pvalues[feature] = result.two_tail

        self.pvalues_ = pvalues
        self.indices_ = numpy.where(pvalues < self.threshold)[0]
        return self

    def transform(self, X):
        """Keep only the columns selected during `fit`.

        Args:
            X: Two-dimensional array with the same number of columns as
                the matrix passed to `fit`.

        Returns:
            ``X`` restricted to the selected columns, in their original
            order.

        Raises:
            sklearn.exceptions.NotFittedError: If `fit` has not been
                called yet.
            ValueError: If the number of columns of ``X`` differs from
                the number of features seen during `fit`.
        """
        # `getattr` also covers instances created without `__init__`.
        if getattr(self, "indices_", None) is None:
            raise sklearn.exceptions.NotFittedError("model was not fitted")
        if X.shape[1] != self.pvalues_.shape[0]:
            raise ValueError(f"X has {X.shape[1]} features, but SelectPValueUnderThreshold is expecting {self.pvalues_.shape[0]} features as input.")
        return X[:, self.indices_]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment