Skip to content

Instantly share code, notes, and snippets.

View grahamharrison68's full-sized avatar

Graham Harrison grahamharrison68

  • Lincoln College
View GitHub Profile
# %% Create a normal distribution with some outliers
# Distribution parameters: mean (mu) and standard deviation (sigma).
mu = 0
sigma = 0.1
# Draw 1000 normally distributed data points and wrap them in a single-column data frame.
s = np.random.normal(loc=mu, scale=sigma, size=1000)
df_normal = pd.DataFrame({'Col0': s})
# Histogram of the sampled values.
df_normal['Col0'].hist()
# Note that I wouldn't usually iterate through every column in the data frame removing outliers, as the data
# might not be numeric or it might be categorical. Iterating over all columns is just done for expediency here ...
# `df` is rebound inside the loop; the iteration itself still walks the column labels of the frame
# the loop started with.
for column in df:
    # Clean this column, then report how many outliers are left and plot the result.
    df = remove_all_outliers(df, column)
    print(f"{column} has {count_outliers(df, column)} outliers")
    box_and_whisker(df, column)
# %% New helper function ...
def remove_all_outliers(df_in, col_name, max_loops=100):
    """Remove outliers from one column repeatedly until none are left.

    A single removal pass can leave (or expose) further outliers, so the
    removal is repeated until ``count_outliers`` reports zero or the
    iteration cap is reached.

    Relies on the ``count_outliers(df, col)`` and ``remove_outliers(df, col)``
    helpers defined elsewhere in this file; ``remove_outliers`` is assumed to
    return the filtered data frame, which is how the rest of the script uses
    it -- TODO confirm against its definition.

    Args:
        df_in: Data frame to clean.
        col_name: Name of the column whose outliers are removed.
        max_loops: Maximum number of removal passes; a safety net against a
            loop that never converges. Defaults to 100.

    Returns:
        The data frame after the final removal pass (``df_in`` itself when it
        had no outliers to begin with).
    """
    loop_count = 0
    outlier_count = count_outliers(df_in, col_name)

    while outlier_count > 0:
        loop_count += 1
        if loop_count > max_loops:
            # Give up rather than loop forever; the caller gets the best effort so far.
            break

        df_in = remove_outliers(df_in, col_name)
        # Re-count against the filtered data so the loop can terminate.
        outlier_count = count_outliers(df_in, col_name)

    return df_in
# %% Explain the results in data ...
# Compare the Col1 boundaries captured earlier with the values actually present in the data now.
print(f"Col1 original boundaries: minimum={minimum_Col1_before:.2f}, maximum={maximum_Col1_before:.2f}")
print(f"Col1 new minimum and maximum values: minimum={df['Col1'].min():.2f}, maximum={df['Col1'].max():.2f}")
print(f"Col1 new boundaries: minimum={minimum_Col1_after:.2f}, maximum={maximum_Col1_after:.2f}")
print("")
# Outlier counts per column for the current state of the data frame.
print(f"Col0 has {count_outliers(df, 'Col0')} outliers")
print(f"Col1 has {count_outliers(df, 'Col1')} outliers")
print(f"Col2 has {count_outliers(df, 'Col2')} outliers")
# %% Remove the outliers
print(f"rows before removing: {len(df)}")
# One removal pass per column, in column order.
for _name in ('Col0', 'Col1', 'Col2'):
    df = remove_outliers(df, _name)
print(f"rows after removing: {len(df)}")
# Let's have a look at the end result. Something very strange shows up here: the data still appears to have outliers!
for _name in ('Col0', 'Col1'):
    box_and_whisker(df, _name)
# %% Plot the graphs
for _name in ('Col0', 'Col1', 'Col2'):
    box_and_whisker(df, _name)
# get_iqr_values returns (median, q1, q3, iqr, minimum, maximum); only the last two are kept for Col1.
minimum_Col1_before, maximum_Col1_before = get_iqr_values(df, 'Col1')[-2:]
# %% Count the outliers in the original data frame
for _name in ('Col0', 'Col1'):
    print(f"{_name} has {count_outliers(df, _name)} outliers")
# (modified from http://www.itl.nist.gov/div898/handbook/prc/section1/prc16.htm)
def get_iqr_values(df_in, col_name, whisker_factor=1.5):
    """Return the median, quartiles, IQR and whisker bounds of one column.

    Args:
        df_in: Data frame holding the column.
        col_name: Name of the column to summarise.
        whisker_factor: Multiple of the interquartile range added beyond the
            quartiles to place the whisker bounds. Defaults to 1.5.

    Returns:
        A tuple ``(median, q1, q3, iqr, minimum, maximum)`` where ``minimum``
        is ``q1 - whisker_factor * iqr`` and ``maximum`` is
        ``q3 + whisker_factor * iqr``.
    """
    column = df_in[col_name]
    median = column.median()
    q1 = column.quantile(0.25)  # 25th percentile / 1st quartile
    q3 = column.quantile(0.75)  # 75th percentile / 3rd quartile
    iqr = q3 - q1  # Interquartile range
    minimum = q1 - whisker_factor * iqr  # The minimum value or the |- marker in the box plot
    maximum = q3 + whisker_factor * iqr  # The maximum value or the -| marker in the box plot
    return median, q1, q3, iqr, minimum, maximum
# Build the three-column sample frame; each column comes from generate() with its own median.
df = pd.DataFrame(
    {
        'Col0': generate(median=630),
        'Col1': generate(median=740),
        'Col2': generate(median=220),
    }
)
# A second frame built from generate() with all of its default arguments.
df_test = pd.DataFrame(data=generate())
df_test.head()
def generate(median=630, err=12, outlier_err=100, size=80, outlier_size=10):
np.random.seed(median) # The seed is set to the median to force the same output each time the code is run
errs = err * np.random.rand(size) * np.random.choice((-1, 1), size)
data = median + errs
lower_errs = outlier_err * np.random.rand(outlier_size)
lower_outliers = median - err - lower_errs
upper_errs = outlier_err * np.random.rand(outlier_size)
upper_outliers = median + err + upper_errs