# steermomo/fe.py

## fe.py
# https://www.kaggle.com/cdeotte/xgb-fraud-with-magic-0-9600
# Encoding Functions
# Below are 5 encoding functions.
# (1) encode_FE does frequency encoding where it combines train and test first and then encodes.
# (2) encode_LE label encodes categorical features
# (3) encode_AG makes aggregated features such as aggregated mean and std
# (4) encode_CB combines two columns
# (5) encode_AG2 makes aggregated features that count how many unique values of one feature occur within each group.
# For more explanation about feature engineering, see the discussion here

# FREQUENCY ENCODE TOGETHER
def encode_FE(df1, df2, cols):
    """Add a frequency-encoded ``<col>_FE`` column to both frames.

    For every column in ``cols`` the relative frequency of each value is
    computed over ``df1[col]`` and ``df2[col]`` concatenated (NaN excluded),
    and each row receives the frequency of its value as ``float32``.

    The value ``-1`` is always mapped to ``-1`` instead of its frequency.
    Values without an entry in the frequency table (such as NaN) end up as
    NaN in the new column.

    Args:
        df1: First DataFrame; the new columns are written into it.
        df2: Second DataFrame; the new columns are written into it.
        cols: Iterable of column names present in both frames.
    """
    for col in cols:
        stacked = pd.concat([df1[col], df2[col]])
        freq = stacked.value_counts(dropna=True, normalize=True).to_dict()
        # -1 is kept as -1 rather than being replaced by its frequency.
        freq[-1] = -1
        out_name = col + '_FE'
        for frame in (df1, df2):
            frame[out_name] = frame[col].map(freq).astype('float32')
        print(out_name, ', ', end='')

# LABEL ENCODE
def encode_LE(col,train=X_train,test=X_test,verbose=True):
    """Label encode ``col`` in place, using one code table for both frames.

    The column values of ``train`` and ``test`` are concatenated and passed
    through ``factorize(sort=True)``; the resulting integer codes overwrite
    ``col`` in each frame.  Codes are stored as ``int16`` unless the largest
    code exceeds 32000, in which case ``int32`` is used.

    Args:
        col: Name of the column to encode (must exist in both frames).
        train: Frame whose rows come first in the concatenation.  Defaults to
            the module-level ``X_train`` as bound when this function was
            defined.
        test: Frame whose rows come second.  Defaults to the module-level
            ``X_test`` as bound when this function was defined.
        verbose: If true, print the column name as a progress marker.
    """
    n_train = len(train)
    codes, _ = pd.concat([train[col], test[col]], axis=0).factorize(sort=True)

    # Pick the narrower integer type unless the codes would not fit safely.
    code_dtype = 'int32' if codes.max() > 32000 else 'int16'
    train[col] = codes[:n_train].astype(code_dtype)
    test[col] = codes[n_train:].astype(code_dtype)

    # Release the temporary code array right away.
    del codes
    gc.collect()

    if verbose:
        print(col, ', ', end='')


# GROUP AGGREGATION MEAN AND STD
# https://www.kaggle.com/kyakovlev/ieee-fe-with-some-eda
def encode_AG(main_columns, uids, aggregations=['mean'], train_df=X_train, test_df=X_test,
              fillna=True, usena=False):
    """Add group-level aggregate features to both frames.

    For every combination of ``main_column`` in ``main_columns``, ``col`` in
    ``uids`` and ``agg_type`` in ``aggregations`` a ``float32`` column named
    ``<main_column>_<col>_<agg_type>`` is written to ``train_df`` and
    ``test_df``.  The statistic is computed over the rows of both frames
    concatenated, grouped by ``col``, and mapped back onto each row through
    its ``col`` value.

    Args:
        main_columns: Names of the columns to aggregate.
        uids: Names of the columns to group by.
        aggregations: Aggregation names accepted by pandas ``agg`` (for
            example ``'mean'`` or ``'std'``).  The default list is only
            iterated, never mutated.
        train_df: Frame that receives the new columns.  Defaults to the
            module-level ``X_train`` as bound when this function was defined.
        test_df: Frame that receives the new columns.  Defaults to the
            module-level ``X_test`` as bound when this function was defined.
        fillna: If true, NaN in a new column is replaced by -1.
        usena: If true, -1 in ``main_column`` is treated as missing (replaced
            by NaN) before aggregating.
    """
    for main_column in main_columns:
        for col in uids:
            # The combined frame and its grouping do not depend on the
            # aggregation, so build them once per (main_column, col) pair.
            combined = pd.concat([train_df[[col, main_column]],
                                  test_df[[col, main_column]]])
            if usena:
                combined.loc[combined[main_column] == -1, main_column] = np.nan
            grouped = combined.groupby(col)[main_column]

            for agg_type in aggregations:
                new_col_name = main_column+'_'+col+'_'+agg_type
                # {group value -> aggregated statistic}
                mapping = grouped.agg(agg_type).to_dict()

                for frame in (train_df, test_df):
                    encoded = frame[col].map(mapping).astype('float32')
                    if fillna:
                        # Assign the filled Series back instead of calling
                        # fillna(inplace=True) on frame[new_col_name]: that
                        # chained in-place form does not update the frame
                        # under pandas Copy-on-Write.
                        encoded = encoded.fillna(-1)
                    frame[new_col_name] = encoded

                print("'"+new_col_name+"'",', ',end='')


# COMBINE FEATURES
def encode_CB(col1,col2,df1=X_train,df2=X_test):
    """Combine two columns into one label-encoded column ``<col1>_<col2>``.

    The string forms of ``col1`` and ``col2`` are joined with ``'_'`` in each
    frame, and the resulting column is then label encoded with ``encode_LE``
    so both frames share one code table.

    Args:
        col1: Name of the first column to combine.
        col2: Name of the second column to combine.
        df1: Frame that receives the new column.  Defaults to the
            module-level ``X_train`` as bound when this function was defined.
        df2: Frame that receives the new column.  Defaults to the
            module-level ``X_test`` as bound when this function was defined.
    """
    nm = col1+'_'+col2
    for frame in (df1, df2):
        frame[nm] = frame[col1].astype(str)+'_'+frame[col2].astype(str)
    # Forward the frames explicitly.  Relying on encode_LE's own defaults
    # would encode the module-level frames even when the caller passed
    # different ones, leaving the new column in df1/df2 as raw strings.
    encode_LE(nm, train=df1, test=df2, verbose=False)
    print(nm,', ',end='')

# GROUP AGGREGATION NUNIQUE
def encode_AG2(main_columns, uids, train_df=X_train, test_df=X_test):
    """Add per-group distinct-value counts to both frames.

    For every ``main_column`` in ``main_columns`` and ``col`` in ``uids`` a
    ``float32`` column named ``<col>_<main_column>_ct`` is written to
    ``train_df`` and ``test_df``.  It holds the number of unique
    ``main_column`` values seen within the row's ``col`` group, counted over
    the rows of both frames concatenated.

    Args:
        main_columns: Names of the columns whose distinct values are counted.
        uids: Names of the columns to group by.
        train_df: Frame that receives the new columns.  Defaults to the
            module-level ``X_train`` as bound when this function was defined.
        test_df: Frame that receives the new columns.  Defaults to the
            module-level ``X_test`` as bound when this function was defined.
    """
    for main_column in main_columns:
        for col in uids:
            pair = [col, main_column]
            stacked = pd.concat([train_df[pair], test_df[pair]], axis=0)
            # {group value -> number of distinct main_column values}
            distinct = stacked.groupby(col)[main_column].nunique().to_dict()

            out_name = col+'_'+main_column+'_ct'
            train_df[out_name] = train_df[col].map(distinct).astype('float32')
            test_df[out_name] = test_df[col].map(distinct).astype('float32')
            print(out_name+', ',end='')
	# https://www.kaggle.com/cdeotte/xgb-fraud-with-magic-0-9600
	# Encoding Functions
	# Below are 5 encoding functions.
	# (1) encode_FE does frequency encoding where it combines train and test first and then encodes.
	# (2) encode_LE label encodes categorical features
	# (3) encode_AG makes aggregated features such as aggregated mean and std
	# (4) encode_CB combines two columns
	# (5) encode_AG2 makes aggregated features that count how many unique values of one feature occur within each group.
	# For more explanation about feature engineering, see the discussion here

	# FREQUENCY ENCODE TOGETHER
	def encode_FE(df1, df2, cols):
		"""Add a frequency-encoded ``<col>_FE`` column to both frames.

		For every column in ``cols`` the relative frequency of each value is
		computed over ``df1[col]`` and ``df2[col]`` concatenated (NaN
		excluded), and each row receives the frequency of its value as
		``float32``.

		The value ``-1`` is always mapped to ``-1`` instead of its frequency.
		Values without an entry in the frequency table (such as NaN) end up
		as NaN in the new column.

		Args:
			df1: First DataFrame; the new columns are written into it.
			df2: Second DataFrame; the new columns are written into it.
			cols: Iterable of column names present in both frames.
		"""
		for col in cols:
			stacked = pd.concat([df1[col], df2[col]])
			freq = stacked.value_counts(dropna=True, normalize=True).to_dict()
			# -1 is kept as -1 rather than being replaced by its frequency.
			freq[-1] = -1
			out_name = col + '_FE'
			for frame in (df1, df2):
				frame[out_name] = frame[col].map(freq).astype('float32')
			print(out_name, ', ', end='')

	# LABEL ENCODE
	def encode_LE(col,train=X_train,test=X_test,verbose=True):
		"""Label encode ``col`` in place, using one code table for both frames.

		The column values of ``train`` and ``test`` are concatenated and
		passed through ``factorize(sort=True)``; the resulting integer codes
		overwrite ``col`` in each frame.  Codes are stored as ``int16`` unless
		the largest code exceeds 32000, in which case ``int32`` is used.

		Args:
			col: Name of the column to encode (must exist in both frames).
			train: Frame whose rows come first in the concatenation.  Defaults
				to the module-level ``X_train`` as bound when this function
				was defined.
			test: Frame whose rows come second.  Defaults to the module-level
				``X_test`` as bound when this function was defined.
			verbose: If true, print the column name as a progress marker.
		"""
		n_train = len(train)
		codes, _ = pd.concat([train[col], test[col]], axis=0).factorize(sort=True)

		# Pick the narrower integer type unless the codes would not fit safely.
		code_dtype = 'int32' if codes.max() > 32000 else 'int16'
		train[col] = codes[:n_train].astype(code_dtype)
		test[col] = codes[n_train:].astype(code_dtype)

		# Release the temporary code array right away.
		del codes
		gc.collect()

		if verbose:
			print(col, ', ', end='')


	# GROUP AGGREGATION MEAN AND STD
	# https://www.kaggle.com/kyakovlev/ieee-fe-with-some-eda
	def encode_AG(main_columns, uids, aggregations=['mean'], train_df=X_train, test_df=X_test,
			fillna=True, usena=False):
		"""Add group-level aggregate features to both frames.

		For every combination of ``main_column`` in ``main_columns``, ``col``
		in ``uids`` and ``agg_type`` in ``aggregations`` a ``float32`` column
		named ``<main_column>_<col>_<agg_type>`` is written to ``train_df``
		and ``test_df``.  The statistic is computed over the rows of both
		frames concatenated, grouped by ``col``, and mapped back onto each row
		through its ``col`` value.

		Args:
			main_columns: Names of the columns to aggregate.
			uids: Names of the columns to group by.
			aggregations: Aggregation names accepted by pandas ``agg`` (for
				example ``'mean'`` or ``'std'``).  The default list is only
				iterated, never mutated.
			train_df: Frame that receives the new columns.  Defaults to the
				module-level ``X_train`` as bound when this function was
				defined.
			test_df: Frame that receives the new columns.  Defaults to the
				module-level ``X_test`` as bound when this function was
				defined.
			fillna: If true, NaN in a new column is replaced by -1.
			usena: If true, -1 in ``main_column`` is treated as missing
				(replaced by NaN) before aggregating.
		"""
		for main_column in main_columns:
			for col in uids:
				# The combined frame and its grouping do not depend on the
				# aggregation, so build them once per (main_column, col) pair.
				combined = pd.concat([train_df[[col, main_column]],
						test_df[[col, main_column]]])
				if usena:
					combined.loc[combined[main_column] == -1, main_column] = np.nan
				grouped = combined.groupby(col)[main_column]

				for agg_type in aggregations:
					new_col_name = main_column+'_'+col+'_'+agg_type
					# {group value -> aggregated statistic}
					mapping = grouped.agg(agg_type).to_dict()

					for frame in (train_df, test_df):
						encoded = frame[col].map(mapping).astype('float32')
						if fillna:
							# Assign the filled Series back instead of calling
							# fillna(inplace=True) on frame[new_col_name]: that
							# chained in-place form does not update the frame
							# under pandas Copy-on-Write.
							encoded = encoded.fillna(-1)
						frame[new_col_name] = encoded

					print("'"+new_col_name+"'",', ',end='')



	# COMBINE FEATURES
	def encode_CB(col1,col2,df1=X_train,df2=X_test):
		"""Combine two columns into one label-encoded column ``<col1>_<col2>``.

		The string forms of ``col1`` and ``col2`` are joined with ``'_'`` in
		each frame, and the resulting column is then label encoded with
		``encode_LE`` so both frames share one code table.

		Args:
			col1: Name of the first column to combine.
			col2: Name of the second column to combine.
			df1: Frame that receives the new column.  Defaults to the
				module-level ``X_train`` as bound when this function was
				defined.
			df2: Frame that receives the new column.  Defaults to the
				module-level ``X_test`` as bound when this function was
				defined.
		"""
		nm = col1+'_'+col2
		for frame in (df1, df2):
			frame[nm] = frame[col1].astype(str)+'_'+frame[col2].astype(str)
		# Forward the frames explicitly.  Relying on encode_LE's own defaults
		# would encode the module-level frames even when the caller passed
		# different ones, leaving the new column in df1/df2 as raw strings.
		encode_LE(nm, train=df1, test=df2, verbose=False)
		print(nm,', ',end='')

	# GROUP AGGREGATION NUNIQUE
	def encode_AG2(main_columns, uids, train_df=X_train, test_df=X_test):
		"""Add per-group distinct-value counts to both frames.

		For every ``main_column`` in ``main_columns`` and ``col`` in ``uids``
		a ``float32`` column named ``<col>_<main_column>_ct`` is written to
		``train_df`` and ``test_df``.  It holds the number of unique
		``main_column`` values seen within the row's ``col`` group, counted
		over the rows of both frames concatenated.

		Args:
			main_columns: Names of the columns whose distinct values are
				counted.
			uids: Names of the columns to group by.
			train_df: Frame that receives the new columns.  Defaults to the
				module-level ``X_train`` as bound when this function was
				defined.
			test_df: Frame that receives the new columns.  Defaults to the
				module-level ``X_test`` as bound when this function was
				defined.
		"""
		for main_column in main_columns:
			for col in uids:
				pair = [col, main_column]
				stacked = pd.concat([train_df[pair], test_df[pair]], axis=0)
				# {group value -> number of distinct main_column values}
				distinct = stacked.groupby(col)[main_column].nunique().to_dict()

				out_name = col+'_'+main_column+'_ct'
				train_df[out_name] = train_df[col].map(distinct).astype('float32')
				test_df[out_name] = test_df[col].map(distinct).astype('float32')
				print(out_name+', ',end='')