Created
October 17, 2020 17:05
-
-
Save grahamharrison68/70c55334fa56250031215ff9b4a93537 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# (modified from http://www.itl.nist.gov/div898/handbook/prc/section1/prc16.htm)
def get_iqr_values(df_in, col_name, whisker=1.5):
    """Return the quartile summary and outlier fences for one column.

    Args:
        df_in: DataFrame-like object; ``df_in[col_name]`` must provide
            ``median()`` and ``quantile()``.
        col_name: Label of the numeric column to summarise.
        whisker: Multiple of the interquartile range used to place the
            fences. Defaults to 1.5.

    Returns:
        A tuple ``(median, q1, q3, iqr, minimum, maximum)``. ``minimum`` and
        ``maximum`` are the lower and upper fences (``q1 - whisker * iqr``
        and ``q3 + whisker * iqr``), not the smallest and largest observed
        values.
    """
    column = df_in[col_name]
    median = column.median()
    q1 = column.quantile(0.25)  # 25th percentile / 1st quartile
    q3 = column.quantile(0.75)  # 75th percentile / 3rd quartile
    iqr = q3 - q1  # interquartile range
    minimum = q1 - whisker * iqr  # lower fence: the |- end of the box plot
    maximum = q3 + whisker * iqr  # upper fence: the -| end of the box plot
    return median, q1, q3, iqr, minimum, maximum
def get_iqr_text(df_in, col_name):
    """Format the IQR statistics of one column as a single line of text.

    Args:
        df_in: Data passed straight through to ``get_iqr_values``.
        col_name: Label of the column to summarise.

    Returns:
        A string listing median, q1, q3, iqr, minimum and maximum, each
        rendered with two decimal places.
    """
    stats = get_iqr_values(df_in, col_name)
    median, q1, q3, iqr, minimum, maximum = stats
    return f"median={median:.2f}, q1={q1:.2f}, q3={q3:.2f}, iqr={iqr:.2f}, minimum={minimum:.2f}, maximum={maximum:.2f}"
def remove_outliers(df_in, col_name):
    """Return the rows of ``df_in`` whose ``col_name`` value is not an outlier.

    A value is an outlier when it lies strictly beyond the ``minimum`` or
    ``maximum`` fence reported by ``get_iqr_values``. Values exactly on a
    fence are kept, so a column with zero interquartile range (for example a
    constant column) is returned intact instead of being emptied.

    Args:
        df_in: DataFrame to filter; it is not modified.
        col_name: Label of the numeric column to test.

    Returns:
        The subset of ``df_in`` with fence-inclusive values in ``col_name``.
        Rows whose value is missing (NaN) are dropped, because NaN does not
        compare as lying between the fences.
    """
    _, _, _, _, minimum, maximum = get_iqr_values(df_in, col_name)
    # Series.between is inclusive on both ends by default.
    within_fences = df_in[col_name].between(minimum, maximum)
    return df_in.loc[within_fences]


def count_outliers(df_in, col_name):
    """Count the rows of ``df_in`` whose ``col_name`` value is an outlier.

    Uses the same rule as ``remove_outliers``: only values strictly beyond
    the ``minimum`` or ``maximum`` fence from ``get_iqr_values`` are counted,
    so values lying exactly on a fence are not outliers.

    Args:
        df_in: DataFrame to inspect.
        col_name: Label of the numeric column to test.

    Returns:
        The number of outlier rows as an ``int``. Missing (NaN) values are
        not counted.
    """
    _, _, _, _, minimum, maximum = get_iqr_values(df_in, col_name)
    column = df_in[col_name]
    beyond_fences = (column < minimum) | (column > maximum)
    return int(beyond_fences.sum())
def box_and_whisker(df_in, col_name):
    """Display a box plot of one column, titled with its IQR statistics.

    Args:
        df_in: DataFrame holding the data.
        col_name: Label of the numeric column to plot.

    Returns:
        None. The figure is shown via ``plt.show()``.
    """
    # NOTE(review): `sns` and `plt` are not imported in the visible part of
    # the file; presumably seaborn and matplotlib.pyplot - confirm.
    title = get_iqr_text(df_in, col_name)
    # Pass the values by keyword: seaborn has treated a lone positional
    # argument differently across releases, whereas x= always draws the
    # horizontal box plot.
    sns.boxplot(x=df_in[col_name])
    plt.title(title)
    plt.show()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment