# GermanCM/one_hot_encoding_stratified.py

## one_hot_encoding_stratified.py
# source: https://stackoverflow.com/questions/18016495/get-subset-of-most-frequent-dummy-variables-in-pandas
# func that returns a dummified DataFrame of significant dummies in a given column
def dum_sign(dummy_col, threshold=0.1):
    """Return dummy (one-hot) columns for the frequent values of a column.

    Every value whose share of the rows in ``dummy_col`` is not strictly
    greater than ``threshold`` is replaced by the label ``"others"`` before
    the column is handed to ``pandas.get_dummies``.

    Parameters
    ----------
    dummy_col : pandas.Series
        Column to encode. The caller's Series is not modified.
    threshold : float, default 0.1
        Exclusive lower bound on the fraction of rows a value must cover
        to keep its own dummy column.

    Returns
    -------
    pandas.DataFrame
        Result of ``pandas.get_dummies`` on the lumped column, with
        ``dummy_col.name`` used as the column prefix.
    """
    import pandas as pd

    # Fraction of all rows taken by each distinct value. value_counts()
    # skips missing values by default, while len() still counts those rows,
    # so the ratios are relative to the full column length.
    share = dummy_col.value_counts() / len(dummy_col)

    # Rows holding a value that is frequent enough to keep. Missing values
    # never appear in ``share``, so those rows fall into "others" as well.
    is_frequent = dummy_col.isin(share[share > threshold].index)

    # where() builds a new Series (keeping index and name), so no explicit
    # copy is needed to protect the caller's data.
    lumped = dummy_col.where(is_frequent, "others")

    return pd.get_dummies(lumped, prefix=dummy_col.name)