This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def generate(median=630, err=12, outlier_err=100, size=80, outlier_size=10): | |
np.random.seed(median) # The seed is set to the median to force the same output each time the code is run | |
errs = err * np.random.rand(size) * np.random.choice((-1, 1), size) | |
data = median + errs | |
lower_errs = outlier_err * np.random.rand(outlier_size) | |
lower_outliers = median - err - lower_errs | |
upper_errs = outlier_err * np.random.rand(outlier_size) | |
upper_outliers = median + err + upper_errs |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Wrap one generated sample in a DataFrame and preview its first rows.
df_test = pd.DataFrame(generate())
df_test.head()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# One generated sample per column, each requested with a different median.
df = pd.DataFrame({
    'Col0': generate(median=630),
    'Col1': generate(median=740),
    'Col2': generate(median=220),
})
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# (modified from http://www.itl.nist.gov/div898/handbook/prc/section1/prc16.htm)
def get_iqr_values(df_in, col_name):
    """Return the box-plot summary statistics for one column.

    Args:
        df_in: DataFrame holding the data.
        col_name: Label of the column to summarise.

    Returns:
        A tuple ``(median, q1, q3, iqr, minimum, maximum)``. ``minimum`` and
        ``maximum`` are the fences ``q1 - 1.5 * iqr`` and ``q3 + 1.5 * iqr``
        (the whisker limits of a box plot), not the smallest and largest
        values observed in the column.
    """
    column = df_in[col_name]
    median = column.median()
    q1 = column.quantile(0.25)  # 25th percentile / 1st quartile
    q3 = column.quantile(0.75)  # 75th percentile / 3rd quartile
    iqr = q3 - q1  # Interquartile range
    minimum = q1 - 1.5 * iqr  # The minimum value or the |- marker in the box plot
    maximum = q3 + 1.5 * iqr  # The maximum value or the -| marker in the box plot
    return median, q1, q3, iqr, minimum, maximum
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# %% Plot the graphs
box_and_whisker(df, 'Col0')
box_and_whisker(df, 'Col1')
box_and_whisker(df, 'Col2')

# Keep Col1's fences as computed on the untouched data; only the last two
# values of the returned tuple (minimum, maximum) are needed here.
_, _, _, _, minimum_Col1_before, maximum_Col1_before = get_iqr_values(df, 'Col1')

# %% Count the outliers in the original data frame
print(f"Col0 has {count_outliers(df, 'Col0')} outliers")
print(f"Col1 has {count_outliers(df, 'Col1')} outliers")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# %% Remove the outliers
print(f"rows before removing: {df.shape[0]}")
# Each call receives the frame returned by the previous one, so the columns
# are filtered one after another on the progressively reduced data.
df = remove_outliers(df, 'Col0')
df = remove_outliers(df, 'Col1')
df = remove_outliers(df, 'Col2')
print(f"rows after removing: {df.shape[0]}")

# Let's have a look at the end result. Something very strange shows up here:
# our data still appears to have outliers!
box_and_whisker(df, 'Col0')
box_and_whisker(df, 'Col1')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# %% Explain the results in data ...
# NOTE(review): minimum_Col1_after / maximum_Col1_after are not assigned in
# the visible source - presumably they come from get_iqr_values(df, 'Col1')
# called after the outlier removal; confirm before running this cell.
print(f"Col1 original boundaries: minimum={minimum_Col1_before:.2f}, maximum={maximum_Col1_before:.2f}")
print(f"Col1 new minimum and maximum values: minimum={df['Col1'].min():.2f}, maximum={df['Col1'].max():.2f}")
print(f"Col1 new boundaries: minimum={minimum_Col1_after:.2f}, maximum={maximum_Col1_after:.2f}")
print("")
print(f"Col0 has {count_outliers(df, 'Col0')} outliers")
print(f"Col1 has {count_outliers(df, 'Col1')} outliers")
print(f"Col2 has {count_outliers(df, 'Col2')} outliers")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# %% New helper function ... | |
def remove_all_outliers(df_in, col_name): | |
loop_count = 0 | |
outlier_count = count_outliers(df_in, col_name) | |
while outlier_count > 0: | |
loop_count += 1 | |
if (loop_count > 100): | |
break |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Note that I wouldn't usually iterate through every column in the data frame removing outliers, as the data
# might not be numeric or might be categorical. Iterating over all columns is just done for expediency here ...
# Rebinding `df` inside the loop does not disturb the iteration: the loop keeps
# walking the column labels of the frame it started with.
for column in df:
    df = remove_all_outliers(df, column)
    print(f"{column} has {count_outliers(df, column)} outliers")
    box_and_whisker(df, column)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# %% Create a normal distribution with some outliers
mu, sigma = 0, 0.1  # mean and standard deviation
s = np.random.normal(mu, sigma, 1000)  # create 1000 normally distributed data points
df_normal = pd.DataFrame({'Col0': s})
df_normal['Col0'].hist()  # histogram of the generated sample
OlderNewer