@richmanbtc
Created December 24, 2022 21:32
# license: CC0
from sklearn.base import BaseEstimator, TransformerMixin, clone
import lightgbm as lgb
import numpy as np
import pandas as pd
# why does this work?
# https://arxiv.org/pdf/1704.05310.pdf
# https://www.inference.vc/unsupervised-learning-by-predicting-noise-an-information-maximization-view-2/

class NatFeatureRemover(BaseEstimator, TransformerMixin):
    def __init__(self, estimator=None, remove_count=None, remove_ratio=None):
        if remove_count and remove_ratio:
            raise Exception('remove_count and remove_ratio cannot be set simultaneously')
        # default auxiliary model: a LightGBM regressor with a fixed seed
        self.estimator = lgb.LGBMRegressor(n_jobs=-1, random_state=1) if estimator is None else estimator
        self.remove_count = remove_count
        self.remove_ratio = remove_ratio

    def fit(self, X, y=None):
        X = self._validate_data(X)
        # number of features to drop, either a fixed count or a fraction of the feature count
        if self.remove_count:
            remove_count = self.remove_count
        else:
            remove_count = int(self.remove_ratio * X.shape[1])
        self.selected_features_ = nfr_calc_features(self.estimator, remove_count, X)
        return self

    def transform(self, X, y=None):
        X = self._validate_data(X)
        # keep only the columns selected in fit
        return X[:, self.selected_features_].copy()

    def inverse_transform(self, X, y=None):
        raise Exception('inverse_transform not implemented')

def nfr_calc_features(model, remove_count, X):
    model = clone(model)
    # fit the auxiliary model to a pure-noise target; features that receive high
    # importance here are ones the model can use to memorize individual rows
    # model.fit(X, np.arange(X.shape[0]))
    # model.fit(X, np.random.normal(0, 1, size=X.shape[0]))
    model.fit(X, np.random.uniform(0, 1, size=X.shape[0]))
    importances = model.feature_importances_
    features = list(range(X.shape[1]))
    feature_imp = pd.DataFrame(zip(importances, features), columns=['value', 'feature'])
    feature_imp = feature_imp.sort_values('value')
    # drop the remove_count features with the highest importance for the noise target
    for i in range(X.shape[1] - remove_count, X.shape[1]):
        features.remove(int(feature_imp['feature'].iloc[i]))
    return np.array(features)
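
# Example usage (not part of the original gist; a minimal sketch on assumed synthetic
# data): build a matrix where the last column is a row counter, which a tree model can
# often exploit to fit pure noise, then drop the most noise-predictive features with
# NatFeatureRemover. Shapes and the remove_ratio value below are illustrative only.
if __name__ == '__main__':
    rng = np.random.default_rng(0)
    n_samples, n_features = 1000, 10
    X = rng.normal(size=(n_samples, n_features))
    X[:, -1] = np.arange(n_samples)  # identifier-like column

    remover = NatFeatureRemover(remove_ratio=0.2)  # drop the top 20% noise-predictive features
    X_selected = remover.fit_transform(X)
    print('kept feature indices:', remover.selected_features_)
    print('shape before/after:', X.shape, X_selected.shape)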