Last active
May 22, 2020 17:08
-
-
Save pr2tik1/ac5a67bb2452cb89d406329acc801594 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def find_non_rare_labels(self, variable, tolerance: float) -> list:
    '''
    Return the labels of a column that are NOT rare.

    A label counts as non-rare when its relative frequency, i.e. the
    number of rows carrying that label divided by the total number of
    rows in ``self.data``, is strictly greater than ``tolerance``.

    Args:
        variable: Name of the column in ``self.data`` to inspect.
        tolerance: Relative-frequency threshold; labels at or below
            this share are treated as rare and left out.

    Returns:
        List of the labels whose share of rows exceeds ``tolerance``,
        in the order produced by the group-by.
    '''
    # Divide by the full row count (not the per-column non-null count) so
    # the shares are fractions of the whole dataset.
    frequencies = (
        self.data.groupby([variable])[variable].count() / len(self.data)
    )
    return list(frequencies.loc[frequencies > tolerance].index.values)
def rare_encoding(self, variable, tolerance, rare_label='Rare'):
    '''
    Collapse the rare labels of a column into one placeholder label.

    Every value of ``self.data[variable]`` that is not in the list
    returned by ``self.find_non_rare_labels(variable, tolerance)`` is
    replaced with ``rare_label``, which lowers the column's cardinality.

    Args:
        variable: Name of the column in ``self.data`` to re-encode.
        tolerance: Threshold forwarded to ``find_non_rare_labels``.
        rare_label: Replacement written over every rare value.
            Defaults to ``'Rare'``.

    Returns:
        ``self.data`` after re-encoding. ``self.data`` is rebound to a
        fresh copy first, so the frame it referred to before the call is
        not modified.
    '''
    # Work on a copy so a frame shared with the caller is left untouched.
    self.data = self.data.copy()

    # Labels frequent enough to keep as they are.
    kept_labels = self.find_non_rare_labels(variable, tolerance)

    # Anything outside the kept labels is overwritten with the placeholder.
    # NOTE(review): values that never appear in kept_labels (missing values
    # included, unless the helper lists them) are relabelled as well.
    is_kept = self.data[variable].isin(kept_labels)
    self.data[variable] = np.where(is_kept, self.data[variable], rare_label)
    return self.data
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment