sachinsdate/interval_estimate.py

## interval_estimate.py
import math
import matplotlib.pyplot as plt
from scipy.stats import invweibull
from scipy.stats import norm
import numpy as np
import pandas as pd


#Load the data file
df = pd.read_csv('DOHMH_Beach_Water_Quality_Data.csv', header=0, infer_datetime_format=True, parse_dates=['Sample_Date'])

#filter out the data for our beach of interest, which is the MIDLAND BEACH
df_midland = df[df['Beach_Name']=='MIDLAND BEACH']
#print the data frame
print(df_midland)

#replace all these NaNs with 0
df_midland.fillna(value=0,inplace=True)

#print out the summary statistics for the sample
df_midland['Enterococci_Results'].describe()

#print out one more statistic which is the most frequently occurring value a.k.a. the mode
print(df_midland['Enterococci_Results'].mode())

#The following plot shows the frequency distribution of sample values:
plt.hist(df_midland['Enterococci_Results'], bins=100)
plt.xlabel('Number of Enterococci detected in the sample')
plt.ylabel('Number of samples')
plt.show()

#Calculate the interval estimate for the population mean mu

#sample size n
n = len(df_midland['Enterococci_Results'])
#sample mean Y
Y = df_midland['Enterococci_Results'].mean()
#sample standard deviation
S = df_midland['Enterococci_Results'].std()
#significance alpha (1-alpha)*100 = 95%
alpha = 0.05
#p-value for required alpha
p = alpha / 2
#z value for the specified p-value
z_p=norm.ppf(1-p)
#mu_low
mu_low = Y-z_p*S/math.sqrt(n)
#mu_high
mu_high = Y+z_p*S/math.sqrt(n)
print('95% Confidence intervals for the population mean (mu)='+str((mu_low, mu_high)))

#############################################################################
#plot the pdf of the inverse Weibull distribution.
fig = plt.figure()
fig.suptitle('Probability Density Function f(x)')
plt.xlabel('x')
plt.ylabel('Probability density')

c = 100
x = np.linspace(invweibull.ppf(0.00000001, c), invweibull.ppf(0.999999999, c), 10000)
y = invweibull.pdf(x, c)

plt.plot(x, y, 'r-', linewidth=1, alpha=0.6, color='black')

mu_l = 0.99
mu_h = 1.035
#shaded_x = x[np.logical_and(x >= mu_l, x <= mu_h)]
#plt.fill_between(shaded_x, invweibull.pdf(shaded_x, c), color='blue', alpha=0.65, linewidth=0)

shaded_x_low = x[np.logical_and(x >= 0, x <= mu_l)]
plt.fill_between(shaded_x_low, invweibull.pdf(shaded_x_low, c), color='blue', alpha=0.65, linewidth=0)

shaded_x_high = x[np.logical_and(x >= mu_h, x <= 10000)]
plt.fill_between(shaded_x_high, invweibull.pdf(shaded_x_high, c), color='blue', alpha=0.65, linewidth=0)

plt.show()

#############################################################################
#plot the pdf of the Normal distribution.
from scipy.stats import norm


fig = plt.figure()
fig.suptitle('Probability Density Function f(x)')
plt.xlabel('x')
plt.ylabel('Probability density')

x = np.linspace(norm.ppf(0.00000001), norm.ppf(0.999999999), 10000)
y = norm.pdf(x)

plt.plot(x, y, 'r-', linewidth=1, alpha=0.6, color='black')

z_l = -1000
z_h = -1.645
shaded_x = x[np.logical_and(x >= z_l, x <= z_h)]
plt.fill_between(shaded_x, norm.pdf(shaded_x), color='blue', alpha=0.65, linewidth=0)

plt.show()
	import math
	import matplotlib.pyplot as plt
	from scipy.stats import invweibull
	from scipy.stats import norm
	import numpy as np
	import pandas as pd


	#Load the data file
	df = pd.read_csv('DOHMH_Beach_Water_Quality_Data.csv', header=0, infer_datetime_format=True, parse_dates=['Sample_Date'])

	#filter out the data for our beach of interest, which is the MIDLAND BEACH
	df_midland = df[df['Beach_Name']=='MIDLAND BEACH']
	#print the data frame
	print(df_midland)

	#replace all these NaNs with 0
	df_midland.fillna(value=0,inplace=True)

	#print out the summary statistics for the sample
	df_midland['Enterococci_Results'].describe()

	#print out one more statistic which is the most frequently occurring value a.k.a. the mode
	print(df_midland['Enterococci_Results'].mode())

	#The following plot shows the frequency distribution of sample values:
	plt.hist(df_midland['Enterococci_Results'], bins=100)
	plt.xlabel('Number of Enterococci detected in the sample')
	plt.ylabel('Number of samples')
	plt.show()

	#Calculate the interval estimate for the population mean mu

	#sample size n
	n = len(df_midland['Enterococci_Results'])
	#sample mean Y
	Y = df_midland['Enterococci_Results'].mean()
	#sample standard deviation
	S = df_midland['Enterococci_Results'].std()
	#significance alpha (1-alpha)*100 = 95%
	alpha = 0.05
	#p-value for required alpha
	p = alpha / 2
	#z value for the specified p-value
	z_p=norm.ppf(1-p)
	#mu_low
	mu_low = Y-z_p*S/math.sqrt(n)
	#mu_high
	mu_high = Y+z_p*S/math.sqrt(n)
	print('95% Confidence intervals for the population mean (mu)='+str((mu_low, mu_high)))

	#############################################################################
	#plot the pdf of the inverse Weibull distribution.
	fig = plt.figure()
	fig.suptitle('Probability Density Function f(x)')
	plt.xlabel('x')
	plt.ylabel('Probability density')

	c = 100
	x = np.linspace(invweibull.ppf(0.00000001, c), invweibull.ppf(0.999999999, c), 10000)
	y = invweibull.pdf(x, c)

	plt.plot(x, y, 'r-', linewidth=1, alpha=0.6, color='black')

	mu_l = 0.99
	mu_h = 1.035
	#shaded_x = x[np.logical_and(x >= mu_l, x <= mu_h)]
	#plt.fill_between(shaded_x, invweibull.pdf(shaded_x, c), color='blue', alpha=0.65, linewidth=0)

	shaded_x_low = x[np.logical_and(x >= 0, x <= mu_l)]
	plt.fill_between(shaded_x_low, invweibull.pdf(shaded_x_low, c), color='blue', alpha=0.65, linewidth=0)

	shaded_x_high = x[np.logical_and(x >= mu_h, x <= 10000)]
	plt.fill_between(shaded_x_high, invweibull.pdf(shaded_x_high, c), color='blue', alpha=0.65, linewidth=0)

	plt.show()

	#############################################################################
	#plot the pdf of the Normal distribution.
	from scipy.stats import norm


	fig = plt.figure()
	fig.suptitle('Probability Density Function f(x)')
	plt.xlabel('x')
	plt.ylabel('Probability density')

	x = np.linspace(norm.ppf(0.00000001), norm.ppf(0.999999999), 10000)
	y = norm.pdf(x)

	plt.plot(x, y, 'r-', linewidth=1, alpha=0.6, color='black')

	z_l = -1000
	z_h = -1.645
	shaded_x = x[np.logical_and(x >= z_l, x <= z_h)]
	plt.fill_between(shaded_x, norm.pdf(shaded_x), color='blue', alpha=0.65, linewidth=0)

	plt.show()