paucoma/est_population.py

## est_population.py
# Script to simulate Population Estimation function by paui
# Inspired to investigate after watching Matts (standupmaths) video:
#  How to estimate a population using statisticians
#  https://www.youtube.com/watch?v=MTmnVBJ9gCI
#
#  Capture-Recapture Method
# Single-Shot version
#  eN - population estimate
#  M - number of marked samples before re-capture
#  C - recapture sample size
#  R - number of marked samples found in re-capture
#
#  eN = M*C/R , to avoid R = 0 case --> eN = ()(M+1)*(C+1)/(R+1))-1
#
# Multiple iterations Method
#
#  eN = \Sum_1*N {M[i]*C[i]} / \Sum_1^N {R[i]}
#
#  M increments with each new capture, unmarked samples get marked
#
# Variance estimate formula from: https://onlinecourses.science.psu.edu/stat506/node/58/
#   Variance Estimate:
#  var_eN[r] = abs(round((M[r]*C[r]*(M[r]-R[r])*(C[r]-R[r]))/(R[r]*R[r]*R[r])))
# To deal with the case when x = 0 and we do not want to estimate τ by infinity, a modified estimator for τ is:
#        (M[r]+1)*(C[r]+1)/()
# Capture probability estimate from: http://ag.unr.edu/sedinger/Courses/ERS488-688/lectures/openjolly-seber.pdf

# Matts specific numbers were:
# M=64, C=67, R=21 --> eN=204 , Var=916 --> stddev = 30
# completely off, so probably
#  - the sampling wasn't independent enough
#  - the population wasn't a closed set durring the sampling periods

# ------------------------------
# --- Variables to play with ---
myPopulationSize = 47500  # must be integer > 0

# Definition of (re)Captures "C[]"
# --------------------------------
# Option A - Formulated sample size
if (1):
    #lets define total number of rounds to simulate
    nRounds = 20
    #lets define our capture size for each recapture round
    C=[None]*nRounds
    for i in range(0,nRounds) :
        C[i]= 40
else:
# Option B - Directly specified [set above if to false to enter here]
    C=[64,32,16,8]
    nRounds = len(C)

# The first element in C array is the initial sample size captured and marked

# --- End Variables ---
# ---------------------

#from statistics import median
import random
import math
# empty set to start with
setN = set()
# add elements to the set
for i in range(1,myPopulationSize):
    setN.add(i)

setC=[None]*nRounds
setM=[None]*nRounds
M=[None]*nRounds
R=[None]*nRounds
p=[None]*nRounds
#LP_eN=[None]*nRounds
#SeLP_eN=[None]*nRounds
eN=[None]*nRounds
stdev_eN=[None]*nRounds
confLow_eN=[None]*nRounds
confHigh_eN=[None]*nRounds
boundLow_eN=[None]*nRounds
boundHigh_eN=[None]*nRounds
rN=[]

print("Round\tMarked \t p \t\t est \t pest \tstdev \t\tLow \tHigh")
#Lets get our first capture
setC[0]=set(random.sample(setN,C[0]))
setM[0]=setC[0] #we mark all in round 1
rN.clear()
for r in range(1,len(C)):
    M[r] = len(setM[r-1]) #number of marked at the begining of round r
    setC[r] = set(random.sample(setN,C[r])) # Generating the Recapture round r
    R[r] = len(set(setM[r-1]&setC[r])) # number of recaptures round r
    setM[r] = set(setM[r-1]|setC[r]) # we mark the new finds in capture round r
    p[r] = round(1.0*R[r]/M[r],9) #capture probability of round r
    #Lets calculate our population estimate
    rN.append(r)
    topSum = botSum = 0

    #for i in rN :
    #  Lincoln-Petersen Estimator
    #    topSum = topSum + (M[i])*(C[i])
    #    botSum = botSum + (R[i]+1E-6)#1E-6 for zero case
    #LP_eN[r] = round(1.0*topSum/botSum) #population estimation on round r
    # 20/02/19
    #for i in rN :
    # Seber 1982 evolution of Lincoln-Petersen method (also avoids R[i] = 0 case)
    #    topSum = topSum + (M[i]+1)*(C[i]+1)
    #    botSum = botSum + (R[i]+1)
    #SeLP_eN[r] = round(1.0*topSum/botSum-len(rN)) #population estimation on round r
    # 20/02/20 : since we are going to skip R[r] = 0 cases -> back to initial variable calculation
    if p[r] > 0 :
        for i in rN :
        #  Lincoln-Petersen Estimator
            topSum = topSum + (M[i])*(C[i])
            botSum = botSum + (R[i])
        eN[r] = round(1.0*topSum/botSum) #population estimation on round r
    #eN[r] = round(1.0*topSum/botSum-len(rN)) #population estimation on round r
    #var_eN[r] = abs(round((M[r]*C[r]*(M[r]-R[r])*(C[r]-R[r]))/(R[r]*R[r]*R[r])))
    #20/02/19: added the +1 below to avoid zero case error, mostly insignificant in comparison to R[r]^3
    #var_eN[r] = abs(round((M[r]*C[r]*(M[r]-R[r])*(C[r]-R[r]))/(R[r]*R[r]*R[r]+1)))
    #20/02/20: variance estimation "Error" from original approximation is 50% undershoot if R=1;12%,R=2;4%,R=3
    # Modifying to overshoot, by how much depends on how many R[r] s' you add to both top bottom and +1 or +0.1 ,...
    # Play with relative significance of "R"'s to zero eliminating constant "+1,+0.1,..."
    # we want the zero case eliminator (+0.1) to influence as little as possible
    # {...}*(R[r]+0.01)/((R[r]^3)*R[r]+0.01) -> 0%,R=1; +0.4%,R=2; +0.3%,R=3; +0.2%,R=4
    # var_eN[r] = abs(round((M[r]*C[r]*(M[r]-R[r])*(C[r]-R[r])*(R[r]+0.01))/((R[r]*R[r]*R[r])*R[r]+0.01)))
    # 20/02/20 : since we are going to skip R[r] = 0 cases -> back to initial variable calculation
        # simplified math.sqrt(var_eN[r]) -> stdev_eN[]
        stdev_eN[r] = round(math.sqrt(abs((M[r]*C[r]*(M[r]-R[r])*(C[r]-R[r]))/(R[r]*R[r]*R[r]))))
    #Confidence Level	0.70 0.75 0.80 0.85 0.90  0.92 0.95 0.96 0.98 0.99
    #z                  1.04 1.15 1.28 1.44 1.645 0.75 1.96 2.05 2.33 2.58
        confLow_eN[r] = 1.96*math.sqrt(((1-R[r]/M[r])*(R[r]/C[r])*(1-R[r]/C[r]))/(C[r]-1))+1/(2*C[r])
        confHigh_eN[r] = R[r]/C[r]-confLow_eN[r]
        confLow_eN[r] = R[r]/C[r]+confLow_eN[r]
        boundLow_eN[r] = round(M[r]/confLow_eN[r])
        boundHigh_eN[r] = round(M[r]/confHigh_eN[r])
    #std deviation =sqrt(Var)
        #print("  %d\t %d \t %1.3f \t %d \t %d \t%3.2f \t\t%3.2f\t%3.2f"%(r,M[r],p[r],eN[r],round(C[r]/(p[r]+1E-6)),stdev_eN[r],M[r]/confLow_eN[r],M[r]/confHigh_eN[r]))
        #lets define when an estimate is valid... conditions:
        # 2*stddev -> 95% of cases contemplated
        # z= 1.96 --> 0.95 confidence level for boundries
        # 1. is estimate > 2*stddev ?
        # 2. is estimate between boundries ?
        # ?3. is pest < est ? <--- maybe --> in large numbers always falls behind
        #if (eN[r]>(stdev_eN[r])) & (eN[r]>boundLow_eN[r]) & (eN[r]<boundHigh_eN[r]):
        #boundries for large numbers doesnt seem to work out
        if (eN[r]>(stdev_eN[r])) :
            print("  %d\t %d \t %1.4f \t %d \t %d \t%d \t\t%d\t%d"%(r,M[r],p[r],eN[r],round(C[r]/(p[r])),stdev_eN[r],boundLow_eN[r],boundHigh_eN[r]))

#eN.remove(None) #None is not accepted in the function median
#print("median of last %d rounds: %d"%(int(nRounds/2),median(eN[int(nRounds/-2):])))
print("real Popualtion %d"%len(setN))
	# Script to simulate Population Estimation function by paui
	# Inspired to investigate after watching Matts (standupmaths) video:
	# How to estimate a population using statisticians
	# https://www.youtube.com/watch?v=MTmnVBJ9gCI
	#
	# Capture-Recapture Method
	# Single-Shot version
	# eN - population estimate
	# M - number of marked samples before re-capture
	# C - recapture sample size
	# R - number of marked samples found in re-capture
	#
	# eN = MC/R , to avoid R = 0 case --> eN = ()(M+1)(C+1)/(R+1))-1
	#
	# Multiple iterations Method
	#
	# eN = \Sum_1N {M[i]C[i]} / \Sum_1^N {R[i]}
	#
	# M increments with each new capture, unmarked samples get marked
	#
	# Variance estimate formula from: https://onlinecourses.science.psu.edu/stat506/node/58/
	# Variance Estimate:
	# var_eN[r] = abs(round((M[r]C[r](M[r]-R[r])(C[r]-R[r]))/(R[r]R[r]*R[r])))
	# To deal with the case when x = 0 and we do not want to estimate τ by infinity, a modified estimator for τ is:
	# (M[r]+1)*(C[r]+1)/()
	# Capture probability estimate from: http://ag.unr.edu/sedinger/Courses/ERS488-688/lectures/openjolly-seber.pdf

	# Matts specific numbers were:
	# M=64, C=67, R=21 --> eN=204 , Var=916 --> stddev = 30
	# completely off, so probably
	# - the sampling wasn't independent enough
	# - the population wasn't a closed set durring the sampling periods

	# ------------------------------
	# --- Variables to play with ---
	myPopulationSize = 47500 # must be integer > 0

	# Definition of (re)Captures "C[]"
	# --------------------------------
	# Option A - Formulated sample size
	if (1):
	#lets define total number of rounds to simulate
	nRounds = 20
	#lets define our capture size for each recapture round
	C=[None]*nRounds
	for i in range(0,nRounds) :
	C[i]= 40
	else:
	# Option B - Directly specified [set above if to false to enter here]
	C=[64,32,16,8]
	nRounds = len(C)

	# The first element in C array is the initial sample size captured and marked

	# --- End Variables ---
	# ---------------------

	#from statistics import median
	import random
	import math
	# empty set to start with
	setN = set()
	# add elements to the set
	for i in range(1,myPopulationSize):
	setN.add(i)

	setC=[None]*nRounds
	setM=[None]*nRounds
	M=[None]*nRounds
	R=[None]*nRounds
	p=[None]*nRounds
	#LP_eN=[None]*nRounds
	#SeLP_eN=[None]*nRounds
	eN=[None]*nRounds
	stdev_eN=[None]*nRounds
	confLow_eN=[None]*nRounds
	confHigh_eN=[None]*nRounds
	boundLow_eN=[None]*nRounds
	boundHigh_eN=[None]*nRounds
	rN=[]

	print("Round\tMarked \t p \t\t est \t pest \tstdev \t\tLow \tHigh")
	#Lets get our first capture
	setC[0]=set(random.sample(setN,C[0]))
	setM[0]=setC[0] #we mark all in round 1
	rN.clear()
	for r in range(1,len(C)):
	M[r] = len(setM[r-1]) #number of marked at the begining of round r
	setC[r] = set(random.sample(setN,C[r])) # Generating the Recapture round r
	R[r] = len(set(setM[r-1]&setC[r])) # number of recaptures round r
	setM[r] = set(setM[r-1]\|setC[r]) # we mark the new finds in capture round r
	p[r] = round(1.0*R[r]/M[r],9) #capture probability of round r
	#Lets calculate our population estimate
	rN.append(r)
	topSum = botSum = 0

	#for i in rN :
	# Lincoln-Petersen Estimator
	# topSum = topSum + (M[i])*(C[i])
	# botSum = botSum + (R[i]+1E-6)#1E-6 for zero case
	#LP_eN[r] = round(1.0*topSum/botSum) #population estimation on round r
	# 20/02/19
	#for i in rN :
	# Seber 1982 evolution of Lincoln-Petersen method (also avoids R[i] = 0 case)
	# topSum = topSum + (M[i]+1)*(C[i]+1)
	# botSum = botSum + (R[i]+1)
	#SeLP_eN[r] = round(1.0*topSum/botSum-len(rN)) #population estimation on round r
	# 20/02/20 : since we are going to skip R[r] = 0 cases -> back to initial variable calculation
	if p[r] > 0 :
	for i in rN :
	# Lincoln-Petersen Estimator
	topSum = topSum + (M[i])*(C[i])
	botSum = botSum + (R[i])
	eN[r] = round(1.0*topSum/botSum) #population estimation on round r
	#eN[r] = round(1.0*topSum/botSum-len(rN)) #population estimation on round r
	#var_eN[r] = abs(round((M[r]C[r](M[r]-R[r])(C[r]-R[r]))/(R[r]R[r]*R[r])))
	#20/02/19: added the +1 below to avoid zero case error, mostly insignificant in comparison to R[r]^3
	#var_eN[r] = abs(round((M[r]C[r](M[r]-R[r])(C[r]-R[r]))/(R[r]R[r]*R[r]+1)))
	#20/02/20: variance estimation "Error" from original approximation is 50% undershoot if R=1;12%,R=2;4%,R=3
	# Modifying to overshoot, by how much depends on how many R[r] s' you add to both top bottom and +1 or +0.1 ,...
	# Play with relative significance of "R"'s to zero eliminating constant "+1,+0.1,..."
	# we want the zero case eliminator (+0.1) to influence as little as possible
	# {...}(R[r]+0.01)/((R[r]^3)R[r]+0.01) -> 0%,R=1; +0.4%,R=2; +0.3%,R=3; +0.2%,R=4
	# var_eN[r] = abs(round((M[r]C[r](M[r]-R[r])(C[r]-R[r])(R[r]+0.01))/((R[r]R[r]R[r])*R[r]+0.01)))
	# 20/02/20 : since we are going to skip R[r] = 0 cases -> back to initial variable calculation
	# simplified math.sqrt(var_eN[r]) -> stdev_eN[]
	stdev_eN[r] = round(math.sqrt(abs((M[r]C[r](M[r]-R[r])(C[r]-R[r]))/(R[r]R[r]*R[r]))))
	#Confidence Level 0.70 0.75 0.80 0.85 0.90 0.92 0.95 0.96 0.98 0.99
	#z 1.04 1.15 1.28 1.44 1.645 0.75 1.96 2.05 2.33 2.58
	confLow_eN[r] = 1.96math.sqrt(((1-R[r]/M[r])(R[r]/C[r])(1-R[r]/C[r]))/(C[r]-1))+1/(2C[r])
	confHigh_eN[r] = R[r]/C[r]-confLow_eN[r]
	confLow_eN[r] = R[r]/C[r]+confLow_eN[r]
	boundLow_eN[r] = round(M[r]/confLow_eN[r])
	boundHigh_eN[r] = round(M[r]/confHigh_eN[r])
	#std deviation =sqrt(Var)
	#print(" %d\t %d \t %1.3f \t %d \t %d \t%3.2f \t\t%3.2f\t%3.2f"%(r,M[r],p[r],eN[r],round(C[r]/(p[r]+1E-6)),stdev_eN[r],M[r]/confLow_eN[r],M[r]/confHigh_eN[r]))
	#lets define when an estimate is valid... conditions:
	# 2*stddev -> 95% of cases contemplated
	# z= 1.96 --> 0.95 confidence level for boundries
	# 1. is estimate > 2*stddev ?
	# 2. is estimate between boundries ?
	# ?3. is pest < est ? <--- maybe --> in large numbers always falls behind
	#if (eN[r]>(stdev_eN[r])) & (eN[r]>boundLow_eN[r]) & (eN[r]<boundHigh_eN[r]):
	#boundries for large numbers doesnt seem to work out
	if (eN[r]>(stdev_eN[r])) :
	print(" %d\t %d \t %1.4f \t %d \t %d \t%d \t\t%d\t%d"%(r,M[r],p[r],eN[r],round(C[r]/(p[r])),stdev_eN[r],boundLow_eN[r],boundHigh_eN[r]))

	#eN.remove(None) #None is not accepted in the function median
	#print("median of last %d rounds: %d"%(int(nRounds/2),median(eN[int(nRounds/-2):])))
	print("real Popualtion %d"%len(setN))