Skip to content

Instantly share code, notes, and snippets.

@grahamharrison68
Created October 17, 2020 17:05
Show Gist options
  • Save grahamharrison68/70c55334fa56250031215ff9b4a93537 to your computer and use it in GitHub Desktop.
Save grahamharrison68/70c55334fa56250031215ff9b4a93537 to your computer and use it in GitHub Desktop.
# (modified from http://www.itl.nist.gov/div898/handbook/prc/section1/prc16.htm)
def get_iqr_values(df_in, col_name):
    """Compute the box-plot summary statistics for one column.

    Args:
        df_in: Table indexable by ``col_name`` whose columns provide
            ``median()`` and ``quantile()`` (used here like a pandas
            DataFrame).
        col_name: Label of the column to summarise.

    Returns:
        Tuple ``(median, q1, q3, iqr, minimum, maximum)``. ``minimum`` and
        ``maximum`` are the whisker fences ``q1 - 1.5 * iqr`` and
        ``q3 + 1.5 * iqr``, not the smallest/largest observed values.
    """
    column = df_in[col_name]
    median = column.median()
    q1 = column.quantile(0.25)  # 25th percentile / 1st quartile
    q3 = column.quantile(0.75)  # 75th percentile / 3rd quartile
    iqr = q3 - q1  # Interquartile range
    minimum = q1 - 1.5 * iqr  # Lower fence: the |- marker in the box plot
    maximum = q3 + 1.5 * iqr  # Upper fence: the -| marker in the box plot
    return median, q1, q3, iqr, minimum, maximum
def get_iqr_text(df_in, col_name):
    """Return the IQR statistics of a column as a one-line summary string.

    The values come from ``get_iqr_values(df_in, col_name)`` and each one is
    formatted with two decimal places.
    """
    (median, q1, q3, iqr, minimum, maximum) = get_iqr_values(df_in, col_name)
    return f"median={median:.2f}, q1={q1:.2f}, q3={q3:.2f}, iqr={iqr:.2f}, minimum={minimum:.2f}, maximum={maximum:.2f}"
def remove_outliers(df_in, col_name):
    """Return the rows of ``df_in`` whose ``col_name`` value is not an outlier.

    A value is an outlier only when it lies strictly beyond the fences
    returned by ``get_iqr_values`` (``q1 - 1.5 * iqr`` / ``q3 + 1.5 * iqr``).
    Values exactly on a fence are kept, so a column with zero IQR (for
    example a constant column) is not emptied.

    Args:
        df_in: Table to filter (used here like a pandas DataFrame).
        col_name: Label of the column the fences are computed on.

    Returns:
        The subset of ``df_in`` selected with ``.loc``. With pandas, rows
        whose value is missing fail both comparisons and are dropped.
    """
    _, _, _, _, minimum, maximum = get_iqr_values(df_in, col_name)
    column = df_in[col_name]
    # Inclusive bounds: a point sitting on the fence is not beyond it.
    within_fences = (column >= minimum) & (column <= maximum)
    return df_in.loc[within_fences]


def count_outliers(df_in, col_name):
    """Return how many rows of ``df_in`` hold an outlier in ``col_name``.

    Uses the same rule as ``remove_outliers``: only values strictly below
    the lower fence or strictly above the upper fence are counted, so the
    two functions partition the rows that have a non-missing value.

    Args:
        df_in: Table to inspect (used here like a pandas DataFrame).
        col_name: Label of the column the fences are computed on.

    Returns:
        Number of outlier rows as an ``int``.
    """
    _, _, _, _, minimum, maximum = get_iqr_values(df_in, col_name)
    column = df_in[col_name]
    beyond_fences = (column < minimum) | (column > maximum)
    return int(beyond_fences.sum())
def box_and_whisker(df_in, col_name):
    """Display a box-and-whisker plot of one column, titled with its IQR stats.

    The title is the string produced by ``get_iqr_text``. Relies on ``sns``
    and ``plt`` being available in the enclosing module (conventionally
    seaborn and matplotlib.pyplot -- confirm against the file's imports).

    Args:
        df_in: Table holding the data (used here like a pandas DataFrame).
        col_name: Label of the column to plot.
    """
    title = get_iqr_text(df_in, col_name)
    # Pass the column by keyword: seaborn 0.11 warns on a positional vector
    # and from 0.12 the first positional argument is `data`, not `x`, which
    # can change how the plot is oriented.
    sns.boxplot(x=df_in[col_name])
    plt.title(title)
    plt.show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment