|
# coding=utf-8 |
|
|
|
# Copyright 2015 Pang Yan Han, Philip |
|
# License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html |
|
|
|
from __future__ import print_function |
|
from math import exp, log |
|
|
|
from thinkstats import Mean |
|
|
|
import random |
|
|
|
import scipy.optimize |
|
|
|
import Cdf |
|
import myplot |
|
|
|
def ex5_13(): |
|
# We'll do the simulation 1 year by 1 year |
|
# Each year, about 1 / 1000 people will contract the particular cancer. |
|
# So we assume that for each year, P(person gets cancer) = 1 / 1000 |
|
prob_get_cancer = 1.0 / 1000 |
|
|
|
# Performs a simulation and returns the simulated data |
|
def _perform_simulation(nr_cohorts): |
|
cohort_size = 100 |
|
nr_years = 10 |
|
simulated_data = [] |
|
# we perform simulation one cohort at a time |
|
for _ in range(nr_cohorts): |
|
cancer_cases_in_cohort = 0 |
|
# and for each cohort, we perform the simulation 1 year at a |
|
# time for `nr_years` years |
|
for _ in range(nr_years): |
|
nr_contracted_cancer_in_this_round = 0 |
|
# Once a patient contracts cancer, he/she is counted as a |
|
# case. Therefore we don't need to perform simulation for |
|
# that person anymore. The `nr_in_cohort_without_cancer` |
|
# stores the number of people in the cohort who have not yet |
|
# contracted cancer (we sound evil here but I don't know how |
|
# else to put it) |
|
nr_in_cohort_without_cancer = cohort_size - cancer_cases_in_cohort |
|
for i in range(nr_in_cohort_without_cancer): |
|
# this simulates a Ber(p) random variable, where p is |
|
# the probability of the person contracting cancer |
|
if random.uniform(0, 1) <= prob_get_cancer: |
|
nr_contracted_cancer_in_this_round += 1 |
|
cancer_cases_in_cohort += nr_contracted_cancer_in_this_round |
|
# we're done simulating the cohort and have the total number of |
|
# cancer cases in the cohort over 10 years. append it to our |
|
# data set. |
|
simulated_data.append(cancer_cases_in_cohort) |
|
return simulated_data |
|
|
|
# number of cohorts we are simulating |
|
# NOTE: Change this to a number you like. |
|
nr_cohorts = 1000 |
|
simulated_data = _perform_simulation(nr_cohorts) |
|
empirical_cdf = Cdf.MakeCdfFromList( |
|
simulated_data, |
|
name="cancer cases" |
|
) |
|
myplot.Cdf(empirical_cdf) |
|
myplot.Save( |
|
root="ex5-13-p1-cdf", |
|
formats=["pdf", "png",], |
|
xlabel="cancer cases", |
|
ylabel="CDF(x)", |
|
title="CDF plot", |
|
) |
|
|
|
# Ok, let's try to see if an exponential distribution will be a |
|
# good model for our data. |
|
scale = myplot.Cdf(empirical_cdf, transform="exponential") |
|
myplot.Save( |
|
root="ex5-13-p1-ccdf-logy-scale", |
|
formats=["pdf", "png",], |
|
xlabel="cancer cases", |
|
ylabel="CCDF(x) (log scale)", |
|
title="CCDF plot", |
|
**scale |
|
) |
|
|
|
# Ok, let's see if the Pareto will be a good model for our data. |
|
scale = myplot.Cdf(empirical_cdf, transform="pareto") |
|
myplot.Save( |
|
root="ex5-13-p1-ccdf-log-log-scale", |
|
formats=["pdf", "png",], |
|
xlabel="cancer cases (log scale)", |
|
ylabel="CCDF(x) (log scale)", |
|
title="CCDF plot (log log scale)", |
|
**scale |
|
) |
|
|
|
# Let's do a CDF plot with log x scale to see if the resulting plot |
|
# looks similar to that of a normal CDF. If yes, then the Log-normal |
|
# distribution will be a good model for the data. |
|
scale = myplot.Cdf(empirical_cdf, xscale="log") |
|
myplot.Save( |
|
root="ex5-13-p1-cdf-logx-scale", |
|
formats=["pdf", "png",], |
|
xlabel="cancer cases (log scale)", |
|
ylabel="CDF", |
|
title="CDF plot (log x scale)", |
|
**scale |
|
) |
|
|
|
# Let's see if the Weibull distribution will be a good model for our |
|
# data. |
|
scale = myplot.Cdf(empirical_cdf, transform="weibull") |
|
myplot.Save( |
|
root="ex5-13-p1-one-over-log-ccdf-log-log-scale", |
|
formats=["pdf", "png",], |
|
xlabel="cancer cases (log scale)", |
|
ylabel="log(1 / CCDF(x)) (log scale)", |
|
title="log(1 / CCDF) plot (log log scale)", |
|
**scale |
|
) |
|
|
|
# K, seems like the Exponential and Weibull distributions are the |
|
# best models for our data. |
|
# |
|
# We use the Method of Moments and MLE and obtained the exact same |
|
# parameter for lambda, which is 1 / Xbar . |
|
# |
|
# So we shall use an Exponential distribution with |
|
# lambda = 1 / Xbar as our model and plot it with our data and see |
|
# if it's a good fit. |
|
|
|
# First, we compute lambda = 1 / Xbar |
|
xBar = Mean(simulated_data) |
|
exp_lambda = 1.0 / xBar |
|
# Here, we plot the Exp(lambda) CDF, over increments of 0.01 to make |
|
# it more smooth when we plot it. Otherwise, we'll be getting a plot |
|
# which looks similar to the empirical CDF, which is only defined at |
|
# discrete points. |
|
max_val = max(simulated_data) |
|
exp_lambda_pdf = lambda x: exp_lambda * exp(-exp_lambda * x) |
|
val_to_density = {} |
|
i, incr = 0, 0.01 |
|
while i <= max_val: |
|
val_to_density[i] = exp_lambda_pdf(i) |
|
i += incr |
|
exp_model_cdf = Cdf.MakeCdfFromDict(val_to_density, |
|
name="model" |
|
) |
|
myplot.Cdfs([empirical_cdf, exp_model_cdf], transform="exponential") |
|
# Plot with log y scale of: |
|
# 1. Empirical CCDF |
|
# 2. CCDF of Exp(lambda) model |
|
scale = dict(yscale="log") |
|
myplot.Save( |
|
root="ex5-13-p1-ccdf-with-ccdf-of-exp-model-logy-scale", |
|
formats=["pdf", "png",], |
|
xlabel="cancer cases", |
|
ylabel="CCDF(x) (log scale)", |
|
title="CCDF & CCDF of Exp({}) model (log y scale)".format( |
|
exp_lambda |
|
), |
|
**scale |
|
) |
|
# Plot with: |
|
# 1. Empirical CDF |
|
# 2. Exp(lambda) model |
|
myplot.Cdfs([empirical_cdf, exp_model_cdf]) |
|
myplot.Save( |
|
root="ex5-13-p1-cdf-with-exp-model", |
|
formats=["pdf", "png",], |
|
xlabel="cancer cases", |
|
ylabel="CDF", |
|
title="CDF & Exp({}) model".format(exp_lambda), |
|
) |
|
|
|
# Perform another simulation to get test data, we'll be needing this |
|
# later when we validate our models |
|
test_data = _perform_simulation(nr_cohorts) |
|
test_data_cdf = Cdf.MakeCdfFromList(test_data, name="test data") |
|
|
|
# Exp model shifted to the left by `exp_model_shift`, where |
|
# `exp_model_shift` is defined as: |
|
# |
|
# P(Exp(lambda) <= exp_model_shift) |
|
# = probability of cohorts having 0 cancer cases |
|
exp_model_shift = exp_model_cdf.Value(empirical_cdf.Prob(0)) |
|
# Redo the Exp(lambda) CDF. We start off by computing the density |
|
# function from `i = exp_model_shift` instead of 0, until `i` |
|
# reaches `max_val + exp_model_shift`. |
|
# However, those densities will be attributed to |
|
# `i - exp_model_shift` instead of to `i` because we want to shift |
|
# the CDF by `exp_model_shift` units to the left on the x-axis. |
|
i, incr, stop = exp_model_shift, 0.01, max_val + exp_model_shift |
|
val_to_density = {} |
|
while i <= stop: |
|
val_to_density[i - exp_model_shift] = exp_lambda_pdf(i) |
|
i += incr |
|
# For CDF(0) of our Exponential model to start at |
|
# P(cohorts with <= 0 cancer cases), we need to sum all the |
|
# densities from 0 to `exp_model_shift` and add them to the density |
|
# at `exp_model_shift`. We do so in the |
|
# `sum_of_densities_from_zero_to_exp_model_shift` variable. |
|
sum_of_densities_from_zero_to_exp_model_shift = 0 |
|
i = 0 |
|
while i < exp_model_shift: |
|
sum_of_densities_from_zero_to_exp_model_shift += \ |
|
exp_lambda_pdf(i) |
|
i += incr |
|
# then we add it to density at the `exp_model_shift` value which is |
|
# indexed by 0 in the `val_to_density` dict |
|
val_to_density[0] = val_to_density[0] + \ |
|
sum_of_densities_from_zero_to_exp_model_shift |
|
shifted_exp_model_cdf = Cdf.MakeCdfFromDict(val_to_density, |
|
name="model" |
|
) |
|
myplot.Cdfs([empirical_cdf, shifted_exp_model_cdf]) |
|
myplot.Save( |
|
root="ex5-13-p1-cdf-with-shifted-exp-model", |
|
formats=["pdf", "png",], |
|
xlabel="cancer cases", |
|
ylabel="CDF", |
|
title="CDF with Exp({}) model shifted by {} to the left".format( |
|
exp_lambda, exp_model_shift |
|
), |
|
) |
|
|
|
# A good chunk of our simulated data consists of zeros. |
|
# For parameter estimation using the Maximum Likelihood Estimate, we |
|
# need to perform some calculations involving log on the data. |
|
# However, we know that log(0) is undefined. So we remove all zeros |
|
# from the data set. |
|
X_without_zero = list(filter(lambda x: x > 0, simulated_data)) |
|
X_without_zero_len = len(X_without_zero) |
|
log_X_without_zero = [log(x) for x in X_without_zero] |
|
one_over_n_sum_of_log_X_without_zero = \ |
|
sum(log_X_without_zero) / float(X_without_zero_len) |
|
# equation we're trying to find roots |
|
def _eqn_for_k(k): |
|
numer = 0.0 |
|
for i in range(X_without_zero_len): |
|
numer += (X_without_zero[i] ** k) * (log_X_without_zero[i]) |
|
denom = 0.0 |
|
for i in range(X_without_zero_len): |
|
denom += (X_without_zero[i] ** k) |
|
return 1.0 / (numer / denom - one_over_n_sum_of_log_X_without_zero) - k |
|
# Plot graph from x = 0.1 to 10 to see roughly where it crosses |
|
# y = 0. Values of x close to where the graph crosses y = 0 will be |
|
# a good initial guess for `scipy.optimize.fsolve`. |
|
xs, ys = [], [] |
|
x, incr, stop = 0.1, 0.1, 10 |
|
while x <= stop: |
|
xs.append(x) |
|
ys.append(_eqn_for_k(x)) |
|
x += incr |
|
myplot.Plot(xs, ys, label="plot") |
|
myplot.Save( |
|
root="ex5-13-p1-weibull-k-parameter-plot", |
|
formats=["pdf", "png",], |
|
xlabel="x", |
|
ylabel="y", |
|
title="weibull k parameter plot", |
|
) |
|
# This is a rough estimate from looking at the k parameter plot |
|
initial_guess = 2.0 |
|
weibull_model_shape = scipy.optimize.fsolve(_eqn_for_k, |
|
[initial_guess] |
|
)[0] |
|
weibull_model_scale_on_full_data = ( |
|
sum((x ** weibull_model_shape for x in simulated_data)) / \ |
|
float(nr_cohorts) |
|
) ** (1.0 / weibull_model_shape) |
|
weibull_model_scale_on_non_zero_data = ( |
|
sum((x ** weibull_model_shape for x in X_without_zero)) / \ |
|
float(X_without_zero_len) |
|
) ** (1.0 / weibull_model_shape) |
|
print("shape = {}, scale on full data = {}, scale on non-zero data = {}".format( |
|
weibull_model_shape, weibull_model_scale_on_full_data, |
|
weibull_model_scale_on_non_zero_data |
|
)) |
|
|
|
def _weibull_pdf(shape, scale, x): |
|
return (shape / scale * (x / scale) ** (shape - 1) * |
|
exp(-((x / scale) ** shape)) |
|
) |
|
for scale, root_suffix in [ |
|
(weibull_model_scale_on_full_data, "scale-on-full-data",), |
|
(weibull_model_scale_on_non_zero_data, "scale-on-non-zero-data",), |
|
]: |
|
weibull_val_to_density = {} |
|
i, incr, max_val = 0, 0.01, max(simulated_data) |
|
while i <= max_val: |
|
weibull_val_to_density[i] = _weibull_pdf( |
|
weibull_model_shape, scale, i |
|
) |
|
i += incr |
|
weibull_model_cdf = Cdf.MakeCdfFromDict(weibull_val_to_density, |
|
name="model" |
|
) |
|
myplot.Cdfs([empirical_cdf, weibull_model_cdf]) |
|
myplot.Save( |
|
root="ex5-13-p1-cdf-with-weibull-model-{}".format( |
|
root_suffix |
|
), |
|
formats=["pdf", "png",], |
|
xlabel="cancer cases", |
|
ylabel="CDF(x)", |
|
title="CDF & Weibull model", |
|
) |
|
|
|
weibull_val_minus_one_to_density = dict( |
|
map(lambda x: (x[0] - 1, x[1]), |
|
weibull_val_to_density.items() |
|
) |
|
) |
|
weibull_model_left_shifted_cdf = Cdf.MakeCdfFromDict( |
|
weibull_val_minus_one_to_density, name="model" |
|
) |
|
myplot.Cdfs([empirical_cdf, weibull_model_left_shifted_cdf]) |
|
myplot.Save( |
|
root="ex5-13-p1-cdf-with-weibull-model-left-shifted-by-one-{}".format( |
|
root_suffix |
|
), |
|
formats=["pdf", "png",], |
|
xlabel="cancer cases", |
|
ylabel="CDF(x)", |
|
title="CDF & Weibull model (left shifted by 1 unit)", |
|
) |
|
|
|
sum_of_negative_and_zero_density = sum( |
|
map(lambda y: y[1], |
|
filter(lambda x: x[0] <= 0, |
|
weibull_val_minus_one_to_density.items() |
|
) |
|
) |
|
) |
|
weibull_val_to_density = dict( |
|
filter(lambda x: x[0] > 0, |
|
weibull_val_minus_one_to_density.items() |
|
) |
|
) |
|
weibull_val_to_density[0] = sum_of_negative_and_zero_density |
|
weibull_model_negative_grouped_with_zero_cdf = Cdf.MakeCdfFromDict( |
|
weibull_val_to_density, name="model" |
|
) |
|
myplot.Cdfs( |
|
[empirical_cdf, weibull_model_negative_grouped_with_zero_cdf] |
|
) |
|
myplot.Save( |
|
root="ex5-13-p1-cdf-with-weibull-model-negative-densities-accum-with-zero-{}".format(root_suffix), |
|
formats=["pdf", "png",], |
|
xlabel="cancer cases", |
|
ylabel="CDF(x)", |
|
title="CDF & Weibull model (r.v's <= 0 are grouped under 0)", |
|
) |
|
|
|
# See if things generalize for the Weibull distribution |
|
myplot.Cdfs([test_data_cdf, weibull_model_negative_grouped_with_zero_cdf]) |
|
myplot.Save( |
|
root="ex5-13-p1-test-data-cdf-with-weibull-model-negative-densities-accum-with-zero-{}".format(root_suffix), |
|
formats=["pdf", "png",], |
|
xlabel="cancer cases", |
|
ylabel="CDF(x)", |
|
title="Test data CDF & Weibull model", |
|
) |
|
|
|
# see if things generalize for the Exponential distribution |
|
myplot.Cdfs([test_data_cdf, shifted_exp_model_cdf]) |
|
myplot.Save( |
|
root="ex5-13-p1-test-data-cdf-with-shifted-exp-model", |
|
formats=["pdf", "png",], |
|
xlabel="cancer cases", |
|
ylabel="CDF(x)", |
|
title="Test data CDF & Exp({}) model shifted by {} to the left".format( |
|
exp_lambda, exp_model_shift |
|
), |
|
) |
|
|
|
if __name__ == "__main__": |
|
ex5_13() |