Skip to content

Instantly share code, notes, and snippets.

@barel-mishal
Last active October 30, 2021 12:21
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save barel-mishal/6adf26069a9dc46ebc4b18dccd680e8b to your computer and use it in GitHub Desktop.
Save barel-mishal/6adf26069a9dc46ebc4b18dccd680e8b to your computer and use it in GitHub Desktop.
Optimized removal of outliers for every feature of each id in a table, using NumPy and pandas

If you are looking for a fast way to remove outliers from your table, try this code.

The function takes:

A pandas DataFrame with an id index and numeric features (the ids are read from level 1 of the MultiIndex).

image

Calculation:

A value is treated as an outlier when it lies more than 2 standard deviations away from the mean of its feature for its id.

Logic - how this function removes the outliers:

This function uses the n-dimensional NumPy array to calculate and look up the limits for each feature and each id separately.

  1. It calculates the standard deviation and the mean for each feature and id.
  2. It generates a lower limit and an upper limit (mean minus / plus 2 standard deviations) for each feature and id.
  3. It checks that each value of a given feature and id is neither greater than the upper limit nor less than the lower limit. If it is, the value is replaced with np.nan.

To make use of the full power of NumPy, this function reshapes the 2-dimensional array (rows x features) into a 3-dimensional array of shape (ids, rows per id, features), so that each id gets its own 2-dimensional slice of rows and features.

image

*The images are from Daniel Bourke's course: https://academy.zerotomastery.io/p/learn-tensorflow

def sort_data_by_ids(df, column_name):
    """Return ``df`` with its rows ordered by the values of ``column_name``.

    Args:
        df: pandas DataFrame to sort.
        column_name: column label (or labels) handed to ``sort_values``.

    Returns:
        The sorted DataFrame produced by ``DataFrame.sort_values``.
    """
    ordered = df.sort_values(by=column_name)
    return ordered
def flat_list(d_list):
    """Flatten one level of nesting.

    Args:
        d_list: an iterable whose items are themselves iterable.

    Returns:
        A new list holding the items of every inner iterable, in order.
    """
    return [item for inner in d_list for item in inner]
def slice_df_for_floats_and_category(df, column_name):
    """Split ``df`` into its ``float64`` columns and its ``category`` columns.

    Args:
        df: pandas DataFrame to split.
        column_name: not used by this function; kept so existing callers
            keep working.

    Returns:
        A ``(float_columns, category_columns)`` tuple of DataFrames.
    """
    float_columns = df.select_dtypes(include=['float64'])
    category_columns = df.select_dtypes(include=['category'])
    return float_columns, category_columns
def get_subject_ids(df, column_name):
    """Return the distinct values found in column ``column_name`` of ``df``.

    The values come straight from ``Series.unique``.
    """
    id_column = df[column_name]
    return id_column.unique()
def calc_mean_and_std_for_df_by_ids(df, ids_values):
    """Compute the mean and standard deviation of every column per id.

    Args:
        df: pandas DataFrame holding the numeric feature columns.
        ids_values: grouping key handed to ``DataFrame.groupby`` (for
            example an array with one id per row).

    Returns:
        A DataFrame with one row per id and, for every feature, a ``mean``
        and a ``std`` column (two-level column index).
    """
    # Use the string aliases instead of np.mean / np.std: pandas already
    # dispatched those callables to its own group-by mean/std (sample std,
    # ddof=1), but doing so is deprecated and emits a FutureWarning. The
    # aliases keep exactly that behaviour without the warning.
    return df.groupby(ids_values).agg(['mean', 'std'])
def get_lims_upper_and_lower(df_means_and_stds,
                             number_of_ids,
                             number_featuers_columns,
                             by_sd_of=2):
    """Build the upper and lower outlier limits for every id and feature.

    Args:
        df_means_and_stds: DataFrame whose values are laid out as a
            (mean, std) pair per feature on every id row.
        number_of_ids: number of id rows in ``df_means_and_stds``.
        number_featuers_columns: number of features.
        by_sd_of: how many standard deviations away from the mean the
            limits are placed.

    Returns:
        ``(upper_lims, lower_lims)``, each an array of shape
        ``(number_of_ids, number_featuers_columns, 1)``.
    """
    stats = df_means_and_stds.values.reshape(
        (number_of_ids, number_featuers_columns, 2))
    # Slice (rather than index) the last axis so it is kept with length 1.
    centre = stats[..., 0:1]
    spread = stats[..., 1:2] * by_sd_of
    return centre + spread, centre - spread
def reshpe_vlaues_3d_ndarray(ndarray, axis0_dimensions, axis1_columns,
                             axis2_rows):
    """Reshape ``ndarray`` to ``(axis0_dimensions, axis1_columns, axis2_rows)``.

    Returns whatever ``ndarray.reshape`` returns for that 3-D shape.
    """
    target_shape = (axis0_dimensions, axis1_columns, axis2_rows)
    return ndarray.reshape(target_shape)
def select_and_replace_outliers(ndarry_of_features, ndarry_uppers_lims,
                                ndarry_lowers_lims):
    """Replace every value outside its limits with ``np.nan``.

    Args:
        ndarry_of_features: array of feature values.
        ndarry_uppers_lims: upper limits, broadcastable against the values.
        ndarry_lowers_lims: lower limits, broadcastable against the values.

    Returns:
        A new array in which values strictly above the upper limit or
        strictly below the lower limit are ``np.nan``; all other values
        (including those equal to a limit) are kept.
    """
    too_high = ndarry_of_features > ndarry_uppers_lims
    too_low = ndarry_of_features < ndarry_lowers_lims
    is_outlier = too_high | too_low
    return np.select([is_outlier], [np.nan], ndarry_of_features)
def back_to_2d_ndarray(ndarry_of_features, axis1, axis2):
    """Reshape ``ndarry_of_features`` to the 2-D shape ``(axis1, axis2)``."""
    target_shape = (axis1, axis2)
    return ndarry_of_features.reshape(target_shape)
def sort_data_by_index(df):
    """Return ``df`` with its rows ordered by index (``DataFrame.sort_index``)."""
    ordered = df.sort_index()
    return ordered
def get_categories_cals_names(df):
    """Return the names of every index level of ``df`` except the first."""
    level_names = df.index.names
    return level_names[1:]
def incal_get_categories_col_from_multiindex(df):
    """Return the index levels after the first one as regular columns.

    The trailing index levels are moved out of the index with
    ``reset_index`` and only those columns are returned.
    """
    trailing_levels = get_categories_cals_names(df)
    flattened = df.reset_index(level=trailing_levels)
    return flattened[trailing_levels]
def remove_outliers_mixed_df(df):
    """Replace per-id outliers in every feature column with ``np.nan``.

    For every id (level 1 of the index, cast to ``int32``) and every column,
    a value is an outlier when it is above or below the limits returned by
    ``get_lims_upper_and_lower`` (mean +/- 2 standard deviations by default).

    The rows are arranged as a 3-D array of shape
    ``(ids, rows per id, features)`` so that the limits of each id broadcast
    over that id's rows.

    Args:
        df: DataFrame with a MultiIndex whose level 1 holds the ids. All
            columns are treated as features (assumed numeric). Every id
            must have the same number of rows; the rows may be in any order.

    Returns:
        A new DataFrame with the same index, columns and row order as
        ``df`` in which outliers are replaced by ``np.nan``.

    Raises:
        ValueError: if the ids do not all have the same number of rows.
    """
    fetuers_columns = df.columns
    number_featuers_columns = len(fetuers_columns)

    # The grouping key used for the statistics; everything below is derived
    # from it so the row layout and the statistics can never disagree.
    ids_values = df.index.get_level_values(1).astype('int32')
    ids_array = np.asarray(ids_values)

    # Count the ids that are actually present (not the declared categories).
    unique_ids, rows_per_id = np.unique(ids_array, return_counts=True)
    number_of_ids = len(unique_ids)
    if number_of_ids == 0:
        # Nothing to scan; also avoids a division by zero further down.
        return df.copy()
    if (rows_per_id != rows_per_id[0]).any():
        # A 3-D block needs the same number of rows for every id; without
        # this check some unequal layouts would reshape "successfully" and
        # silently compare rows against another id's limits.
        raise ValueError(
            'every id must have the same number of rows to be reshaped '
            'into a 3d array')
    rows_of_each_id = int(rows_per_id[0])

    # Bring the rows of each id together, in the same ascending id order
    # the group-by statistics use. The sort is stable so rows of one id
    # keep their relative order.
    row_order = np.argsort(ids_array, kind='stable')
    fetuers = df.values[row_order]

    df_means_and_stds = calc_mean_and_std_for_df_by_ids(df, ids_values)
    upper_lims, lower_lims = get_lims_upper_and_lower(
        df_means_and_stds, number_of_ids, number_featuers_columns)

    # Limits become (ids, 1, features) so they broadcast over each id's rows.
    upper_lims_3d = reshpe_vlaues_3d_ndarray(
        upper_lims, number_of_ids, 1, number_featuers_columns)
    lower_lims_3d = reshpe_vlaues_3d_ndarray(
        lower_lims, number_of_ids, 1, number_featuers_columns)
    fetuers_3d = reshpe_vlaues_3d_ndarray(
        fetuers, number_of_ids, rows_of_each_id, number_featuers_columns)

    cleaned_3d = select_and_replace_outliers(
        fetuers_3d, upper_lims_3d, lower_lims_3d)
    cleaned_sorted = back_to_2d_ndarray(
        cleaned_3d, number_of_ids * rows_of_each_id,
        number_featuers_columns)

    # Undo the sort so every row lines up with the caller's index again.
    cleaned = np.empty_like(cleaned_sorted)
    cleaned[row_order] = cleaned_sorted
    return pd.DataFrame(cleaned, columns=fetuers_columns, index=df.index)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment