Skip to content

Instantly share code, notes, and snippets.

@pr2tik1
Last active May 22, 2020 17:08
Show Gist options
  • Save pr2tik1/ac5a67bb2452cb89d406329acc801594 to your computer and use it in GitHub Desktop.
Save pr2tik1/ac5a67bb2452cb89d406329acc801594 to your computer and use it in GitHub Desktop.
def find_non_rare_labels(self, variable, tolerance):
    """
    Collect the labels of a column that occur often enough to keep.

    The relative frequency of each label is its row count in
    ``self.data`` divided by the total number of rows in ``self.data``.

    Args:
        variable: Name of the column in ``self.data`` to inspect.
        tolerance: Frequency threshold; a label is kept only when its
            relative frequency is strictly greater than this value.

    Returns:
        List of the labels whose relative frequency exceeds ``tolerance``.
    """
    total_rows = len(self.data)
    label_counts = self.data.groupby([variable])[variable].count()
    label_share = label_counts / total_rows

    # Strict comparison: a label sitting exactly on the threshold is rare.
    above_threshold = label_share > tolerance
    return list(label_share.loc[above_threshold].index.values)
def rare_encoding(self, variable, tolerance):
    """
    Collapse infrequent labels of a column into a single 'Rare' label.

    ``self.data`` is first rebound to a copy of itself, so the frame that
    was previously referenced is not the one being modified.

    Args:
        variable: Name of the column in ``self.data`` to re-encode.
        tolerance: Frequency threshold forwarded to
            ``find_non_rare_labels`` to decide which labels are kept.

    Returns:
        ``self.data`` (the copy) with ``variable`` overwritten: labels
        reported as non-rare are kept, every other entry becomes 'Rare'.
    """
    # Work on a fresh copy before anything else touches the frame.
    self.data = self.data.copy()

    # Labels frequent enough to survive the encoding.
    labels_to_keep = self.find_non_rare_labels(variable, tolerance)

    # Keep frequent labels untouched, fold everything else into 'Rare'.
    column = self.data[variable]
    is_frequent = column.isin(labels_to_keep)
    self.data[variable] = np.where(is_frequent, column, 'Rare')

    return self.data
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment