Skip to content

Instantly share code, notes, and snippets.

@agramfort
Created November 24, 2019 06:22
Show Gist options
  • Save agramfort/4873f16d78fde33f0caa482febf08211 to your computer and use it in GitHub Desktop.
Save agramfort/4873f16d78fde33f0caa482febf08211 to your computer and use it in GitHub Desktop.
import numpy as np
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder
class CountOrdinalEncoder(OrdinalEncoder):
    """Encode categorical features as an integer array
    using count information.

    The categories of each feature are learned by ``OrdinalEncoder.fit``;
    afterwards each entry of ``categories_`` is re-sorted by ascending number
    of occurrences in the training data, so the rarest category comes first
    and the most frequent one comes last. Categories with the same count
    keep the relative order produced by the parent's ``fit``.

    Parameters
    ----------
    categories : 'auto' or list of array-like, default='auto'
        Forwarded unchanged to ``OrdinalEncoder``.
    dtype : number type, default=np.float64
        Forwarded unchanged to ``OrdinalEncoder``.
    """

    def __init__(self, categories='auto', dtype=np.float64):
        # Delegate to the parent constructor instead of assigning the
        # attributes by hand: any additional constructor option defined by
        # the installed scikit-learn version then gets its default value
        # rather than being left unset on the instance.
        super().__init__(categories=categories, dtype=dtype)

    def fit(self, X, y=None):
        """Fit the encoder to X and order each feature's categories by count.

        Parameters
        ----------
        X : array-like, shape [n_samples, n_features]
            The data to determine the categories of each feature.
        y : None
            Not used by this method; it is not forwarded to the parent.

        Returns
        -------
        self
        """
        super().fit(X)

        # NOTE(review): `_check_X` is a private scikit-learn helper; this
        # assumes it returns (per-feature columns, n_samples, n_features)
        # -- confirm against the installed version.
        columns, _, _ = self._check_X(X)

        # NOTE(review): the inherited `transform` may assume that each
        # `categories_` entry is sorted (e.g. for numeric columns) -- verify
        # before using this encoder on non-object dtypes.
        reordered = []
        for column, cats in zip(columns, self.categories_):
            # A category that never occurs in X gets a count of 0.
            counts = [np.sum(column == cat) for cat in cats]
            # A stable sort keeps tied categories in their original relative
            # order, so the result does not depend on the sort algorithm.
            order = np.argsort(counts, kind='stable')
            reordered.append(cats[order])
        self.categories_ = reordered
        return self
# Demo: fit and transform a single column in which 'fr' occurs three times
# and 'en' and 'es' occur twice each.
coe = CountOrdinalEncoder()
coe.fit_transform(pd.DataFrame(['fr'] * 3 + ['en'] * 2 + ['es'] * 2))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment