Instantly share code, notes, and snippets.

yanhan/Cdf.py Secret

Last active Sep 19, 2015
Code for Exercise 5.13 Part 1 of Think Stats

• Python 2.7
• virtualenv

Setup

``````# install build dependencies for matplotlib
sudo apt-get update
sudo apt-get -y build-dep matplotlib

# setup virtualenv
virtualenv venv
. venv/bin/activate
pip install -r requirements.txt
``````

Run

``````. venv/bin/activate
python ch05.py
``````
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode characters
 """This file contains code for use with "Think Stats", by Allen B. Downey, available from greenteapress.com Copyright 2008 Allen B. Downey. Distributed under the GNU General Public License at gnu.org/licenses/gpl.html. """ """Functions for building CDFs (cumulative distribution functions).""" import bisect import math import random import Pmf class Cdf(object): """Represents a cumulative distribution function. Attributes: xs: sequence of values ps: sequence of probabilities name: string used as a graph label. """ def __init__(self, xs=None, ps=None, name=''): self.xs = [] if xs is None else xs self.ps = [] if ps is None else ps self.name = name def Values(self): """Returns a sorted list of values. """ return self.xs def Items(self): """Returns a sorted sequence of (value, probability) pairs. Note: in Python3, returns an iterator. """ return zip(self.xs, self.ps) def Append(self, x, p): """Add an (x, p) pair to the end of this CDF. Note: this us normally used to build a CDF from scratch, not to modify existing CDFs. It is up to the caller to make sure that the result is a legal CDF. """ self.xs.append(x) self.ps.append(p) def Prob(self, x): """Returns CDF(x), the probability that corresponds to value x. Args: x: number Returns: float probability """ if x < self.xs[0]: return 0.0 index = bisect.bisect(self.xs, x) p = self.ps[index-1] return p def Value(self, p): """Returns InverseCDF(p), the value that corresponds to probability p. Args: p: number in the range [0, 1] Returns: number value """ if p < 0 or p > 1: raise ValueError('Probability p must be in range [0, 1]') if p == 0: return self.xs[0] if p == 1: return self.xs[-1] index = bisect.bisect(self.ps, p) if p == self.ps[index-1]: return self.xs[index-1] else: return self.xs[index] def Percentile(self, p): """Returns the value that corresponds to percentile p. Args: p: number in the range [0, 100] Returns: number value """ return self.Value(p / 100.0) def Random(self): """Chooses a random value from this distribution.""" return self.Value(random.random()) def Sample(self, n): """Generates a random sample from this distribution. Args: n: int length of the sample """ return [self.Random() for i in range(n)] def Mean(self): """Computes the mean of a CDF. Returns: float mean """ old_p = 0 total = 0.0 for x, new_p in zip(self.xs, self.ps): p = new_p - old_p total += p * x old_p = new_p return total def _Round(self, multiplier=1000.0): """ An entry is added to the cdf only if the percentile differs from the previous value in a significant digit, where the number of significant digits is determined by multiplier. The default is 1000, which keeps log10(1000) = 3 significant digits. """ # TODO(write this method) pass def Render(self): """Generates a sequence of points suitable for plotting. An empirical CDF is a step function; linear interpolation can be misleading. Returns: tuple of (xs, ps) """ xs = [self.xs[0]] ps = [0.0] for i, p in enumerate(self.ps): xs.append(self.xs[i]) ps.append(p) try: xs.append(self.xs[i+1]) ps.append(p) except IndexError: pass return xs, ps def MakeCdfFromItems(items, name=''): """Makes a cdf from an unsorted sequence of (value, frequency) pairs. Args: items: unsorted sequence of (value, frequency) pairs name: string name for this CDF Returns: cdf: list of (value, fraction) pairs """ runsum = 0 xs = [] cs = [] for value, count in sorted(items): runsum += count xs.append(value) cs.append(runsum) total = float(runsum) ps = [c/total for c in cs] cdf = Cdf(xs, ps, name) return cdf def MakeCdfFromDict(d, name=''): """Makes a CDF from a dictionary that maps values to frequencies. Args: d: dictionary that maps values to frequencies. name: string name for the data. Returns: Cdf object """ return MakeCdfFromItems(d.iteritems(), name) def MakeCdfFromHist(hist, name=''): """Makes a CDF from a Hist object. Args: hist: Pmf.Hist object name: string name for the data. Returns: Cdf object """ return MakeCdfFromItems(hist.Items(), name) def MakeCdfFromPmf(pmf, name=None): """Makes a CDF from a Pmf object. Args: pmf: Pmf.Pmf object name: string name for the data. Returns: Cdf object """ if name == None: name = pmf.name return MakeCdfFromItems(pmf.Items(), name) def MakeCdfFromList(seq, name=''): """Creates a CDF from an unsorted sequence. Args: seq: unsorted sequence of sortable values name: string name for the cdf Returns: Cdf object """ hist = Pmf.MakeHistFromList(seq) return MakeCdfFromHist(hist, name)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode characters
 # coding=utf-8 # Copyright 2015 Pang Yan Han, Philip # License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html from __future__ import print_function from math import exp, log from thinkstats import Mean import random import scipy.optimize import Cdf import myplot def ex5_13(): # We'll do the simulation 1 year by 1 year # Each year, about 1 / 1000 people will contract the particular cancer. # So we assume that for each year, P(person gets cancer) = 1 / 1000 prob_get_cancer = 1.0 / 1000 # Performs a simulation and returns the simulated data def _perform_simulation(nr_cohorts): cohort_size = 100 nr_years = 10 simulated_data = [] # we perform simulation one cohort at a time for _ in range(nr_cohorts): cancer_cases_in_cohort = 0 # and for each cohort, we perform the simulation 1 year at a # time for `nr_years` years for _ in range(nr_years): nr_contracted_cancer_in_this_round = 0 # Once a patient contracts cancer, he/she is counted as a # case. Therefore we don't need to perform simulation for # that person anymore. The `nr_in_cohort_without_cancer` # stores the number of people in the cohort who have not yet # contracted cancer (we sound evil here but I don't know how # else to put it) nr_in_cohort_without_cancer = cohort_size - cancer_cases_in_cohort for i in range(nr_in_cohort_without_cancer): # this simulates a Ber(p) random variable, where p is # the probability of the person contracting cancer if random.uniform(0, 1) <= prob_get_cancer: nr_contracted_cancer_in_this_round += 1 cancer_cases_in_cohort += nr_contracted_cancer_in_this_round # we're done simulating the cohort and have the total number of # cancer cases in the cohort over 10 years. append it to our # data set. simulated_data.append(cancer_cases_in_cohort) return simulated_data # number of cohorts we are simulating # NOTE: Change this to a number you like. nr_cohorts = 1000 simulated_data = _perform_simulation(nr_cohorts) empirical_cdf = Cdf.MakeCdfFromList( simulated_data, name="cancer cases" ) myplot.Cdf(empirical_cdf) myplot.Save( root="ex5-13-p1-cdf", formats=["pdf", "png",], xlabel="cancer cases", ylabel="CDF(x)", title="CDF plot", ) # Ok, let's try to see if an exponential distribution will be a # good model for our data. scale = myplot.Cdf(empirical_cdf, transform="exponential") myplot.Save( root="ex5-13-p1-ccdf-logy-scale", formats=["pdf", "png",], xlabel="cancer cases", ylabel="CCDF(x) (log scale)", title="CCDF plot", **scale ) # Ok, let's see if the Pareto will be a good model for our data. scale = myplot.Cdf(empirical_cdf, transform="pareto") myplot.Save( root="ex5-13-p1-ccdf-log-log-scale", formats=["pdf", "png",], xlabel="cancer cases (log scale)", ylabel="CCDF(x) (log scale)", title="CCDF plot (log log scale)", **scale ) # Let's do a CDF plot with log x scale to see if the resulting plot # looks similar to that of a normal CDF. If yes, then the Log-normal # distribution will be a good model for the data. scale = myplot.Cdf(empirical_cdf, xscale="log") myplot.Save( root="ex5-13-p1-cdf-logx-scale", formats=["pdf", "png",], xlabel="cancer cases (log scale)", ylabel="CDF", title="CDF plot (log x scale)", **scale ) # Let's see if the Weibull distribution will be a good model for our # data. scale = myplot.Cdf(empirical_cdf, transform="weibull") myplot.Save( root="ex5-13-p1-one-over-log-ccdf-log-log-scale", formats=["pdf", "png",], xlabel="cancer cases (log scale)", ylabel="log(1 / CCDF(x)) (log scale)", title="log(1 / CCDF) plot (log log scale)", **scale ) # K, seems like the Exponential and Weibull distributions are the # best models for our data. # # We use the Method of Moments and MLE and obtained the exact same # parameter for lambda, which is 1 / Xbar . # # So we shall use an Exponential distribution with # lambda = 1 / Xbar as our model and plot it with our data and see # if it's a good fit. # First, we compute lambda = 1 / Xbar xBar = Mean(simulated_data) exp_lambda = 1.0 / xBar # Here, we plot the Exp(lambda) CDF, over increments of 0.01 to make # it more smooth when we plot it. Otherwise, we'll be getting a plot # which looks similar to the empirical CDF, which is only defined at # discrete points. max_val = max(simulated_data) exp_lambda_pdf = lambda x: exp_lambda * exp(-exp_lambda * x) val_to_density = {} i, incr = 0, 0.01 while i <= max_val: val_to_density[i] = exp_lambda_pdf(i) i += incr exp_model_cdf = Cdf.MakeCdfFromDict(val_to_density, name="model" ) myplot.Cdfs([empirical_cdf, exp_model_cdf], transform="exponential") # Plot with log y scale of: # 1. Empirical CCDF # 2. CCDF of Exp(lambda) model scale = dict(yscale="log") myplot.Save( root="ex5-13-p1-ccdf-with-ccdf-of-exp-model-logy-scale", formats=["pdf", "png",], xlabel="cancer cases", ylabel="CCDF(x) (log scale)", title="CCDF & CCDF of Exp({}) model (log y scale)".format( exp_lambda ), **scale ) # Plot with: # 1. Empirical CDF # 2. Exp(lambda) model myplot.Cdfs([empirical_cdf, exp_model_cdf]) myplot.Save( root="ex5-13-p1-cdf-with-exp-model", formats=["pdf", "png",], xlabel="cancer cases", ylabel="CDF", title="CDF & Exp({}) model".format(exp_lambda), ) # Perform another simulation to get test data, we'll be needing this # later when we validate our models test_data = _perform_simulation(nr_cohorts) test_data_cdf = Cdf.MakeCdfFromList(test_data, name="test data") # Exp model shifted to the left by `exp_model_shift`, where # `exp_model_shift` is defined as: # # P(Exp(lambda) <= exp_model_shift) # = probability of cohorts having 0 cancer cases exp_model_shift = exp_model_cdf.Value(empirical_cdf.Prob(0)) # Redo the Exp(lambda) CDF. We start off by computing the density # function from `i = exp_model_shift` instead of 0, until `i` # reaches `max_val + exp_model_shift`. # However, those densities will be attributed to # `i - exp_model_shift` instead of to `i` because we want to shift # the CDF by `exp_model_shift` units to the left on the x-axis. i, incr, stop = exp_model_shift, 0.01, max_val + exp_model_shift val_to_density = {} while i <= stop: val_to_density[i - exp_model_shift] = exp_lambda_pdf(i) i += incr # For CDF(0) of our Exponential model to start at # P(cohorts with <= 0 cancer cases), we need to sum all the # densities from 0 to `exp_model_shift` and add them to the density # at `exp_model_shift`. We do so in the # `sum_of_densities_from_zero_to_exp_model_shift` variable. sum_of_densities_from_zero_to_exp_model_shift = 0 i = 0 while i < exp_model_shift: sum_of_densities_from_zero_to_exp_model_shift += \ exp_lambda_pdf(i) i += incr # then we add it to density at the `exp_model_shift` value which is # indexed by 0 in the `val_to_density` dict val_to_density[0] = val_to_density[0] + \ sum_of_densities_from_zero_to_exp_model_shift shifted_exp_model_cdf = Cdf.MakeCdfFromDict(val_to_density, name="model" ) myplot.Cdfs([empirical_cdf, shifted_exp_model_cdf]) myplot.Save( root="ex5-13-p1-cdf-with-shifted-exp-model", formats=["pdf", "png",], xlabel="cancer cases", ylabel="CDF", title="CDF with Exp({}) model shifted by {} to the left".format( exp_lambda, exp_model_shift ), ) # A good chunk of our simulated data consists of zeros. # For parameter estimation using the Maximum Likelihood Estimate, we # need to perform some calculations involving log on the data. # However, we know that log(0) is undefined. So we remove all zeros # from the data set. X_without_zero = list(filter(lambda x: x > 0, simulated_data)) X_without_zero_len = len(X_without_zero) log_X_without_zero = [log(x) for x in X_without_zero] one_over_n_sum_of_log_X_without_zero = \ sum(log_X_without_zero) / float(X_without_zero_len) # equation we're trying to find roots def _eqn_for_k(k): numer = 0.0 for i in range(X_without_zero_len): numer += (X_without_zero[i] ** k) * (log_X_without_zero[i]) denom = 0.0 for i in range(X_without_zero_len): denom += (X_without_zero[i] ** k) return 1.0 / (numer / denom - one_over_n_sum_of_log_X_without_zero) - k # Plot graph from x = 0.1 to 10 to see roughly where it crosses # y = 0. Values of x close to where the graph crosses y = 0 will be # a good initial guess for `scipy.optimize.fsolve`. xs, ys = [], [] x, incr, stop = 0.1, 0.1, 10 while x <= stop: xs.append(x) ys.append(_eqn_for_k(x)) x += incr myplot.Plot(xs, ys, label="plot") myplot.Save( root="ex5-13-p1-weibull-k-parameter-plot", formats=["pdf", "png",], xlabel="x", ylabel="y", title="weibull k parameter plot", ) # This is a rough estimate from looking at the k parameter plot initial_guess = 2.0 weibull_model_shape = scipy.optimize.fsolve(_eqn_for_k, [initial_guess] )[0] weibull_model_scale_on_full_data = ( sum((x ** weibull_model_shape for x in simulated_data)) / \ float(nr_cohorts) ) ** (1.0 / weibull_model_shape) weibull_model_scale_on_non_zero_data = ( sum((x ** weibull_model_shape for x in X_without_zero)) / \ float(X_without_zero_len) ) ** (1.0 / weibull_model_shape) print("shape = {}, scale on full data = {}, scale on non-zero data = {}".format( weibull_model_shape, weibull_model_scale_on_full_data, weibull_model_scale_on_non_zero_data )) def _weibull_pdf(shape, scale, x): return (shape / scale * (x / scale) ** (shape - 1) * exp(-((x / scale) ** shape)) ) for scale, root_suffix in [ (weibull_model_scale_on_full_data, "scale-on-full-data",), (weibull_model_scale_on_non_zero_data, "scale-on-non-zero-data",), ]: weibull_val_to_density = {} i, incr, max_val = 0, 0.01, max(simulated_data) while i <= max_val: weibull_val_to_density[i] = _weibull_pdf( weibull_model_shape, scale, i ) i += incr weibull_model_cdf = Cdf.MakeCdfFromDict(weibull_val_to_density, name="model" ) myplot.Cdfs([empirical_cdf, weibull_model_cdf]) myplot.Save( root="ex5-13-p1-cdf-with-weibull-model-{}".format( root_suffix ), formats=["pdf", "png",], xlabel="cancer cases", ylabel="CDF(x)", title="CDF & Weibull model", ) weibull_val_minus_one_to_density = dict( map(lambda x: (x[0] - 1, x[1]), weibull_val_to_density.items() ) ) weibull_model_left_shifted_cdf = Cdf.MakeCdfFromDict( weibull_val_minus_one_to_density, name="model" ) myplot.Cdfs([empirical_cdf, weibull_model_left_shifted_cdf]) myplot.Save( root="ex5-13-p1-cdf-with-weibull-model-left-shifted-by-one-{}".format( root_suffix ), formats=["pdf", "png",], xlabel="cancer cases", ylabel="CDF(x)", title="CDF & Weibull model (left shifted by 1 unit)", ) sum_of_negative_and_zero_density = sum( map(lambda y: y[1], filter(lambda x: x[0] <= 0, weibull_val_minus_one_to_density.items() ) ) ) weibull_val_to_density = dict( filter(lambda x: x[0] > 0, weibull_val_minus_one_to_density.items() ) ) weibull_val_to_density[0] = sum_of_negative_and_zero_density weibull_model_negative_grouped_with_zero_cdf = Cdf.MakeCdfFromDict( weibull_val_to_density, name="model" ) myplot.Cdfs( [empirical_cdf, weibull_model_negative_grouped_with_zero_cdf] ) myplot.Save( root="ex5-13-p1-cdf-with-weibull-model-negative-densities-accum-with-zero-{}".format(root_suffix), formats=["pdf", "png",], xlabel="cancer cases", ylabel="CDF(x)", title="CDF & Weibull model (r.v's <= 0 are grouped under 0)", ) # See if things generalize for the Weibull distribution myplot.Cdfs([test_data_cdf, weibull_model_negative_grouped_with_zero_cdf]) myplot.Save( root="ex5-13-p1-test-data-cdf-with-weibull-model-negative-densities-accum-with-zero-{}".format(root_suffix), formats=["pdf", "png",], xlabel="cancer cases", ylabel="CDF(x)", title="Test data CDF & Weibull model", ) # see if things generalize for the Exponential distribution myplot.Cdfs([test_data_cdf, shifted_exp_model_cdf]) myplot.Save( root="ex5-13-p1-test-data-cdf-with-shifted-exp-model", formats=["pdf", "png",], xlabel="cancer cases", ylabel="CDF(x)", title="Test data CDF & Exp({}) model shifted by {} to the left".format( exp_lambda, exp_model_shift ), ) if __name__ == "__main__": ex5_13()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode characters