Graham Harrison grahamharrison68

## outliers_10.py
# %% Create a normal distribution with some outliers
mu = 0       # mean of the distribution
sigma = 0.1  # standard deviation of the distribution
# Draw 1000 normally distributed data points.
s = np.random.normal(loc=mu, scale=sigma, size=1000)

# Wrap the sample in a single-column frame and plot its histogram.
df_normal = pd.DataFrame({'Col0': s})
df_normal['Col0'].hist()

## outliers_9.py
# Note: iterating over every column is done purely for expediency here. In
# real data some columns might not be numeric, or might be categorical, and
# should not be passed through outlier removal.
for column in df.columns:
    df = remove_all_outliers(df, column)
    print(f"{column} has {count_outliers(df, column)} outliers")
    box_and_whisker(df, column)

## outliers_8.py
# %% New helper function ...
def remove_all_outliers(df_in, col_name):
    """Repeatedly remove outliers from one column until none are reported.

    A single removal pass can leave values that still count as outliers
    afterwards, so the removal is repeated until ``count_outliers`` reports
    zero or the pass limit is reached.

    Relies on the file's ``count_outliers`` and ``remove_outliers`` helpers
    (not visible in this snippet); ``remove_outliers(df, col)`` is assumed to
    return the filtered frame, as it is used elsewhere in this file.

    Args:
        df_in: Data frame to clean.
        col_name: Name of the column whose outliers are removed.

    Returns:
        The data frame after the final removal pass (``df_in`` itself when
        no outliers were found).
    """
    max_passes = 100  # safety cap so a pathological column cannot loop forever

    for _ in range(max_passes):
        if count_outliers(df_in, col_name) <= 0:
            break
        df_in = remove_outliers(df_in, col_name)

    return df_in

## outliers_7.py
# %% Report how the Col1 boundaries compare with the values now in the data
print(f"Col1 original boundaries: minium={minimum_Col1_before:.2f}, maximum={maximum_Col1_before:.2f}")
print(f"Col1 new minimum and maximum values: minium={df['Col1'].min():.2f}, maximum={df['Col1'].max():.2f}")
print(f"Col1 new boundaries: minium={minimum_Col1_after:.2f}, maximum={maximum_Col1_after:.2f}")
print()
# Outlier count for every column of the frame.
for name in ('Col0', 'Col1', 'Col2'):
    print(f"{name} has {count_outliers(df, name)} outliers")

## outliers_6.py
# %% Run one outlier-removal pass over each column and report the row counts
print(f"rows before removing: {df.shape[0]}")
for name in ('Col0', 'Col1', 'Col2'):
    df = remove_outliers(df, name)
print(f"rows after removing: {df.shape[0]}")

# Plot the end result. Strangely, the data still appears to contain outliers!
for name in ('Col0', 'Col1'):
    box_and_whisker(df, name)

## outliers_5.py
# %% Draw a box-and-whisker plot for every column
for name in ('Col0', 'Col1', 'Col2'):
    box_and_whisker(df, name)

# Keep Col1's lower/upper boundaries so they can be compared later on.
_, _, _, _, minimum_Col1_before, maximum_Col1_before = get_iqr_values(df, 'Col1')

# %% Count the outliers in the original data frame
for name in ('Col0', 'Col1'):
    print(f"{name} has {count_outliers(df, name)} outliers")

## outliers_4.py
# (modified from http://www.itl.nist.gov/div898/handbook/prc/section1/prc16.htm)
def get_iqr_values(df_in, col_name):
    """Return the box-plot statistics for one column of a data frame.

    Args:
        df_in: Data frame holding the column.
        col_name: Name of the column to summarise.

    Returns:
        A tuple ``(median, q1, q3, iqr, minimum, maximum)`` where ``q1`` and
        ``q3`` are the 25th and 75th percentiles, ``iqr`` is ``q3 - q1`` and
        ``minimum`` / ``maximum`` are the whisker positions
        ``q1 - 1.5 * iqr`` and ``q3 + 1.5 * iqr``.
    """
    values = df_in[col_name]

    median = values.median()
    q1 = values.quantile(0.25)  # 25th percentile / 1st quartile
    q3 = values.quantile(0.75)  # 75th percentile / 3rd quartile
    iqr = q3 - q1               # interquartile range

    whisker_reach = 1.5 * iqr
    minimum = q1 - whisker_reach  # the |- marker in the box plot
    maximum = q3 + whisker_reach  # the -| marker in the box plot
    return median, q1, q3, iqr, minimum, maximum

## outliers_3.py
# One generated series per column, each built around its own median.
column_medians = {'Col0': 630, 'Col1': 740, 'Col2': 220}
df = pd.DataFrame({name: generate(median=value) for name, value in column_medians.items()})

## outliers_2.py
# Wrap one generated series (default arguments) in a frame and peek at its first rows.
sample = generate()
df_test = pd.DataFrame(sample)
df_test.head()

## generate.py
def generate(median=630, err=12, outlier_err=100, size=80, outlier_size=10):
    """Generate a reproducible 1-D sample centred on ``median`` with outliers.

    The sample consists of ``size`` values within ``err`` of ``median``,
    followed by ``outlier_size`` low values (at most ``median - err``) and
    ``outlier_size`` high values (at least ``median + err``), each spread
    over a range of ``outlier_err``.

    Args:
        median: Centre of the core values; also used as the random seed.
        err: Maximum distance of a core value from ``median``.
        outlier_err: Spread of the outliers beyond the core band.
        size: Number of core values.
        outlier_size: Number of outliers generated on each side.

    Returns:
        A NumPy array of length ``size + 2 * outlier_size`` laid out as
        core values, then lower outliers, then upper outliers.
    """
    # The seed is set to the median to force the same output each time the code is run.
    np.random.seed(median)

    # Core values: uniform magnitude in [0, err) with a random sign.
    errs = err * np.random.rand(size) * np.random.choice((-1, 1), size)
    data = median + errs

    lower_errs = outlier_err * np.random.rand(outlier_size)
    lower_outliers = median - err - lower_errs

    upper_errs = outlier_err * np.random.rand(outlier_size)
    upper_outliers = median + err + upper_errs

    # Previously the three arrays were computed and then dropped, so callers
    # received None; hand back the combined sample instead.
    return np.concatenate((data, lower_outliers, upper_outliers))
	# %% Create a normal distribution with some outliers
	mu, sigma = 0, 0.1  # mean and standard deviation
	s = np.random.normal(mu, sigma, 1000)  # create 1000 normally distributed data points

	df_normal = pd.DataFrame({'Col0': s})
	df_normal['Col0'].hist()
	# Note: iterating over every column is done purely for expediency here. In
	# real data some columns might not be numeric, or might be categorical.
	# The loop body below had lost its indentation, which made the loop a
	# syntax error; it is re-indented so the three statements run per column.
	for column in df:
		df = remove_all_outliers(df, column)
		print(f"{column} has {count_outliers(df, column)} outliers")
		box_and_whisker(df, column)
	# %% New helper function ...
	def remove_all_outliers(df_in, col_name):
		"""Repeatedly remove outliers from one column until none are reported.

		The body had lost its indentation and stopped before removing
		anything or returning a value. It now runs removal passes until
		``count_outliers`` reports zero or the pass limit is hit, then returns
		the frame.

		Relies on the file's ``count_outliers`` and ``remove_outliers``
		helpers (not visible in this snippet); ``remove_outliers(df, col)`` is
		assumed to return the filtered frame, as it is used elsewhere in
		this file.

		Args:
			df_in: Data frame to clean.
			col_name: Name of the column whose outliers are removed.

		Returns:
			The data frame after the final removal pass.
		"""
		max_passes = 100  # safety cap so a pathological column cannot loop forever

		for _ in range(max_passes):
			if count_outliers(df_in, col_name) <= 0:
				break
			df_in = remove_outliers(df_in, col_name)

		return df_in
	# %% Report how the Col1 boundaries compare with the values now in the data
	print(f"Col1 original boundaries: minium={minimum_Col1_before:.2f}, maximum={maximum_Col1_before:.2f}")
	print(f"Col1 new minimum and maximum values: minium={df['Col1'].min():.2f}, maximum={df['Col1'].max():.2f}")
	print(f"Col1 new boundaries: minium={minimum_Col1_after:.2f}, maximum={maximum_Col1_after:.2f}")
	print()
	for name in ('Col0', 'Col1', 'Col2'):
		print(f"{name} has {count_outliers(df, name)} outliers")
	# %% Run one outlier-removal pass over each column and report the row counts
	print(f"rows before removing: {df.shape[0]}")
	for name in ('Col0', 'Col1', 'Col2'):
		df = remove_outliers(df, name)
	print(f"rows after removing: {df.shape[0]}")

	# Plot the end result. Strangely, the data still appears to contain outliers!
	for name in ('Col0', 'Col1'):
		box_and_whisker(df, name)
	# %% Draw a box-and-whisker plot for every column
	for name in ('Col0', 'Col1', 'Col2'):
		box_and_whisker(df, name)

	# Keep Col1's lower/upper boundaries so they can be compared later on.
	_, _, _, _, minimum_Col1_before, maximum_Col1_before = get_iqr_values(df, 'Col1')

	# %% Count the outliers in the original data frame
	for name in ('Col0', 'Col1'):
		print(f"{name} has {count_outliers(df, name)} outliers")
	# (modified from http://www.itl.nist.gov/div898/handbook/prc/section1/prc16.htm)
	def get_iqr_values(df_in, col_name):
		"""Return the box-plot statistics for one column of a data frame.

		The function body had lost its indentation (a syntax error); it is
		restored here with the computation unchanged.

		Args:
			df_in: Data frame holding the column.
			col_name: Name of the column to summarise.

		Returns:
			A tuple ``(median, q1, q3, iqr, minimum, maximum)`` where
			``minimum`` / ``maximum`` are ``q1 - 1.5 * iqr`` and
			``q3 + 1.5 * iqr``.
		"""
		median = df_in[col_name].median()
		q1 = df_in[col_name].quantile(0.25)  # 25th percentile / 1st quartile
		q3 = df_in[col_name].quantile(0.75)  # 75th percentile / 3rd quartile
		iqr = q3 - q1  # interquartile range
		minimum = q1 - 1.5 * iqr  # the minimum value or the |- marker in the box plot
		maximum = q3 + 1.5 * iqr  # the maximum value or the -| marker in the box plot
		return median, q1, q3, iqr, minimum, maximum
	def generate(median=630, err=12, outlier_err=100, size=80, outlier_size=10):
		"""Generate a reproducible 1-D sample centred on ``median`` with outliers.

		The body had lost its indentation and ended without returning the
		arrays it built. It now returns them joined as core values, then lower
		outliers, then upper outliers.

		Args:
			median: Centre of the core values; also used as the random seed.
			err: Maximum distance of a core value from ``median``.
			outlier_err: Spread of the outliers beyond the core band.
			size: Number of core values.
			outlier_size: Number of outliers generated on each side.

		Returns:
			A NumPy array of length ``size + 2 * outlier_size``.
		"""
		# The seed is set to the median to force the same output each time the code is run.
		np.random.seed(median)

		# Core values: uniform magnitude in [0, err) with a random sign.
		errs = err * np.random.rand(size) * np.random.choice((-1, 1), size)
		data = median + errs

		lower_errs = outlier_err * np.random.rand(outlier_size)
		lower_outliers = median - err - lower_errs

		upper_errs = outlier_err * np.random.rand(outlier_size)
		upper_outliers = median + err + upper_errs

		return np.concatenate((data, lower_outliers, upper_outliers))