Skip to content

Instantly share code, notes, and snippets.

@338rajesh
Created May 6, 2023 10:12
Show Gist options
  • Save 338rajesh/5994310a75653dda176f6f37d3e824ce to your computer and use it in GitHub Desktop.
Save 338rajesh/5994310a75653dda176f6f37d3e824ce to your computer and use it in GitHub Desktop.
Remove outliers from a 2D NumPy array, based on the distribution of the data in selected columns.
import numpy as np
def remove_outliers(arr, col_indices=None, k=1.5):
    """
    Remove outlier rows from a 2D array using the inter-quartile range (IQR).

    For each selected column, a row is kept when its value lies within
    ``[q25 - k * IQR, q75 + k * IQR]``, where ``q25``/``q75`` are the 25th and
    75th percentiles of that column and ``IQR = q75 - q25``.

    Columns are processed one after another: the quartiles of a later column
    are computed on the rows that survived the earlier columns.

    Parameters
    ----------
    arr : array_like
        2D data, one observation per row. Converted with ``np.asarray``; the
        input itself is never modified.
    col_indices : sequence of int, optional
        Columns used to detect outliers. Defaults to ``[0]``.
    k : float, optional
        Multiplier applied to the IQR to obtain the fences (default 1.5).

    Returns
    -------
    np.ndarray
        The rows of ``arr`` that fall inside the fences of every processed
        column.

    Notes
    -----
    If no row of a column lies inside its fences (e.g. the column contains
    NaN), that column's filter is skipped rather than returning an empty
    array. A message with the resulting shape is printed for every column
    whose filter was applied.
    """
    arr = np.asarray(arr)
    col_indices = [0] if col_indices is None else col_indices
    for i in col_indices:
        column = arr[:, i]
        if column.size == 0:
            # Quartiles of an empty column are undefined; nothing left to filter.
            break
        # One call computes both quartiles.
        q25, q75 = np.percentile(column, [25, 75])
        margin = k * (q75 - q25)
        mask = (column >= q25 - margin) & (column <= q75 + margin)
        # Only apply the filter when at least one row would survive it.
        if mask.any():
            arr = arr[mask]
            print(f"Based on column #{i}: Array shape changed to {arr.shape}")
    return arr
# Demo: 10000 x 3 normally distributed samples (mean 50, standard deviation 5),
# filtered on columns 1 and 2 with the conventional 1.5 * IQR fences.
A_raw_data = 50 + 5 * np.random.randn(10000, 3)
A_woo_data = remove_outliers(A_raw_data, [1, 2], 1.5)
# Shapes before and after outlier removal, one per line.
print(A_raw_data.shape, A_woo_data.shape, sep="\n")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment