nmatzke/220_week10_lab_v2.R

## 220_week10_lab_v2.R


#######################################################
# WEEK 10 LAB
#######################################################

#######################################################
# PART 1: ADVANTAGES/DISADVANTAGES OF MODELS
#######################################################
#
# Please form groups of 2-4 people, and discuss the readings
# (Asimov and O'Neill) for 10 minutes.  Try to make sure
# everyone understands the basic points of each.
#
#######################################################

#######################################################
# PART 2: INTRODUCTION TO ML INFERENCE
########################################################
#
# We will begin our exploration of model-based inference
# with a very simple dataset and model.
#
# Many biological problems, in many different fields,
# from ecology to epidemiology, are concerned with
# trying to estimate the prevalence or frequency of
# something.
#
# For example:
#
# - Out of 1000 ponds in a region, how many contain a
#   particular endangered frog species?
#
# - Out of a million people, how many are resistant
#   to measles (due to e.g. vaccination or previous
#   infection)?
#
# These examples (leaving out numerous real-world
# complexities) boil down to this: there are 2
# possible outcomes, and we are interested in how
# often each outcome occurs.
#
# Another process with 2 outcomes is coin-flipping.
#
# Coin-flipping, and any other 2-outcome process,
# can be described with a one-parameter model, using
# a binomial distribution ("bi-nomial" = "two names").
#
# (We will use coin-flipping as an example, but you can
# mentally substitute any other binomial process if
# you like.)
#
# The single parameter describing a binomial process
# is "p", the probability of one of the outcomes. The
# probability of the other outcome is 1-p.
#
# Because there are so many different probabilities and
# "p" terms used in probability
# and statistics, I will use "P_heads" - the probability
# that a coin will produce "Heads" on a single flip - to
# describe this parameter.
#
########################################################

# Let's consider some coin-flip data.
#
# Here are 100 coin flips:

coin_flips = c('H','T','H','T','H','H','T','H','H','H','T','H','H','T','T','T','T','H','H','H','H','H','H','H','H','H','H','H','H','H','H','H','H','T','T','T','H','T','T','T','H','T','T','T','H','H','H','T','T','H','H','H','T','H','H','H','T','T','H','H','H','H','H','H','H','T','T','H','H','H','H','T','T','H','H','H','T','T','H','H','H','H','H','H','T','T','T','H','H','H','H','H','H','T','H','T','H','H','T','T')

# Look at the data
coin_flips

# What is your guess at "P_heads", the probability of heads?
#

# In the case of binomial data, we actually have a simple
# formula to calculate the best estimate of P_heads:

# Find the heads
heads_TF = (coin_flips == "H")
heads_TF

# Find the tails
tails_TF = (coin_flips == "T")
tails_TF

numHeads = sum(heads_TF)
numHeads

numTails = sum(tails_TF)
numTails

numTotal = length(coin_flips)
numTotal

# Here's the formula:
P_heads_ML_estimate = numHeads / numTotal
P_heads_ML_estimate

# PLEASE ANSWER LAB 10 QUESTION #1, NOW:
#
# 1. What number (exactly) was reported for P_heads_ML_estimate?
#
# (Obviously the answer is 0.65, but I want you to realize that
# the computer will auto-grade this.  So, 0.65 will be marked correct,
# but .65, 0.650, etc., will be marked incorrect.  Extra spaces may
# also cause the answer to be marked incorrect, e.g. "0.65" is correct,
# but " 0.65" or "0.65 " are incorrect. For future answers, when numbers
# are asked for, be sure to report the exact number R gave, without
# spaces etc.)


# Well, duh, that seems pretty obvious.  At least it would have been, if we
# weren't thinking of coins, where we have a strong prior belief that the
# coin is probably fair.
#
# It turns out that this formula can be justified through a technique
# known as Maximum Likelihood.
#
# What does it mean to say we have a "maximum likelihood" estimate of P_heads?
#
# "Likelihood", in statistics, means "the probability of the data under the model"
#
# So, a "Maximum Likelihood estimate" of P_heads is the value of the parameter
# P_heads that maximizes the probability of seeing the data you saw, namely,
# 65 Heads and 35 Tails.
#


# Comparing likelihoods for different possible parameter values of P_heads
#
# Let's calculate the probability of these same coin flip data under the
# hypothesis/model that P_heads=0.5
#
# We'll be very inefficient, and use a for-loop, and
# if/else statements

# Loop through all 100 flips
# Make a list of the probability of
# each datum
P_heads_guess = 0.5

# Empty list of probabilities
probs_list = rep(NA, times=length(coin_flips))
probs_list

for (i in 1:length(coin_flips))
    {
    # Print an update
    cat("\nAnalysing coin flip #", i, "/", length(coin_flips), sep="")

    # Get the current coin flip
    coin_flip = coin_flips[i]

    # If the coin flip is heads, give that datum
    # probability P_heads_guess.
    # If tails, give it (1-P_heads_guess)

    if (coin_flip == "H")
        {
        probs_list[i] = P_heads_guess
        } # End if heads

    if (coin_flip == "T")
        {
        probs_list[i] = (1-P_heads_guess)
        } # End if tails
    } # End for-loop

# Look at the resulting probabilities
probs_list

# We get the probability of all the data by multiplying
# all the probabilities, with the prod() function.
likelihood_of_data_given_P_heads_guess1 = prod(probs_list)
likelihood_of_data_given_P_heads_guess1

# PLEASE ANSWER LAB 10, QUESTION #2, NOW:
# 2. What number (exactly) was reported for likelihood_of_data_given_P_heads_guess1?


# That's a pretty small number!  You'll see that it's
# just 0.5^100:
0.5^100

# A probability of 0.5 is not small, but multiply it
# 100 values of 0.5 together, and you get a small value.
# That's the probability of that specific sequence of
# heads/tails, given the hypothesis that the true
# probability is P_heads_guess.

# Let's try another parameter value:

# Loop through all 100 flips
# Make a list of the probability of
# each datum
P_heads_guess = 0.7

# Empty list of probabilities
probs_list = rep(NA, times=length(coin_flips))
probs_list

for (i in 1:length(coin_flips))
    {
    # Print an update
    cat("\nAnalysing coin flip #", i, "/", length(coin_flips), sep="")

    # Get the current coin flip
    coin_flip = coin_flips[i]

    # If the coin flip is heads, give that datum
    # probability P_heads_guess.
    # If tails, give it (1-P_heads_guess)

    if (coin_flip == "H")
        {
        probs_list[i] = P_heads_guess
        } # End if heads

    if (coin_flip == "T")
        {
        probs_list[i] = (1-P_heads_guess)
        } # End if tails
    } # End for-loop

# Look at the resulting probabilities
probs_list

# We get the probability of all the data by multiplying
# all the probabilities
likelihood_of_data_given_P_heads_guess2 = prod(probs_list)
likelihood_of_data_given_P_heads_guess2

# PLEASE ANSWER LAB 10, QUESTION #3, NOW:
# 3. What number (exactly) was reported for likelihood_of_data_given_P_heads_guess2?

# We got a different likelihood. It's also very small.
# But that's not important. What's important is,
# how many times higher is it?

likelihood_of_data_given_P_heads_guess2 / likelihood_of_data_given_P_heads_guess1

# PLEASE ANSWER LAB 10, QUESTION #4, NOW:
# 4. What number (exactly) was reported for
# likelihood_of_data_given_P_heads_guess2 / likelihood_of_data_given_P_heads_guess1 ?


# Whoa!  That's a lot higher!  This means the coin flip data is 54 times more
# probable under the hypothesis that P_heads=0.7 than under the
# hypothesis that P_heads=0.5.

# Maximum likelihood: Hopefully you can see that the data is in some sense
# "best explained" by the parameter value (the valueof P_heads) that
# maximizes the probability of the data. This is what we call the Maximum
# Likelihood solution.

# To try different values of P_heads, we could keep copying and pasting code,
# but that seems annoying.  Let's define an R "function" instead.
#
# Functions, in R, collect a number of commands, and run them when you
# type the name of the function. Functions are simple programs. All of
# the R commands you have ever run, e.g. mean(), prod(), etc. are all
# just functions that R has pre-programmed.
#
# NOTES ON FUNCTIONS IN R:
# * To run, functions have to be followed by (). Usually, but not always,
#   inputs go into the ().  One example of a function that needs no inputs
#   is getwd(), which gets your current working directory (wd) and prints it
#   to screen:

getwd()

# * To see the code that is running inside the function, you can often
#   just type the name of the function, without the (). This works for
#   most R functions except for "primitive"  ones (these are core R
#   functions, like getwd, mean, length, etc., that are programmed
#   a different way to be super-fast).
#
# * Typing the names of functions and figuring out exactly what is
#   happening in the code is a great way to learn R and learn a
#   specific scientific field, since you can see *exactly* what
#   some scientist is doing in their analysis.
#
# * For functions that are part of packages, you can get the help
#   page by type a ? before the function name. E.g., ?mean will open
#   the help page for the "mean" function.  R help is often not
#   written for beginners, but it is better than nothing.
#
# * When you try to run a function and get an error, this usually
#   means you left out a required input, or you put in the wrong
#   kind of input (e.g., if the input is supposed to be a number,
#   and you put in a word like "hello", you will get an error
#   message.
#
# When you type

mean("hello")

# ...what happens?

# PLEASE ANSWER LAB 10, QUESTION #5, NOW:
# 5. When you run 'mean("hello")', paste in the last part of the error message you get back.
# Your paste should being with "argument" and end with "NA".


# Let's define a function
#
# This function calculates the probability of a
# sequence of coin flip data, given a value of P_heads_guess
calc_prob_coin_flip_data <- function(P_heads_guess, coin_flips)
    {
    # Empty list of probabilities
    probs_list = rep(NA, times=length(coin_flips))
    probs_list

    for (i in 1:length(coin_flips))
        {
        # Print an update
        #cat("\nAnalysing coin flip #", i, "/", length(coin_flips), sep="")

        # Get the current coin flip
        coin_flip = coin_flips[i]

        # If the coin flip is heads, give that datum
        # probability P_heads_guess.
        # If tails, give it (1-P_heads_guess)

        if (coin_flip == "H")
            {
            probs_list[i] = P_heads_guess
            } # End if heads

        if (coin_flip == "T")
            {
            probs_list[i] = (1-P_heads_guess)
            } # End if tails
        } # End for-loop

    # Look at the resulting probabilities
    probs_list

    # We get the probability of all the data by multiplying
    # all the probabilities
    likelihood_of_data_given_P_heads_guess = prod(probs_list)

    # Return result
    return(likelihood_of_data_given_P_heads_guess)
    }

# Now, we can just use this function:
calc_prob_coin_flip_data(P_heads_guess=0.5, coin_flips=coin_flips)
calc_prob_coin_flip_data(P_heads_guess=0.6, coin_flips=coin_flips)
calc_prob_coin_flip_data(P_heads_guess=0.7, coin_flips=coin_flips)

# Look at that!  We did all of that work in a split-second.

# PLEASE ANSWER LAB 10, QUESTION #6, NOW:
# 6. What number (exactly) was reported for
# calc_prob_coin_flip_data(P_heads_guess=0.6, coin_flips=coin_flips) ?

# In fact, we can make another for-loop, and search for the ML
# value of P_heads by trying all of the values and plotting them.

# Sequence of 50 possible values of P_heads between 0 and 1
P_heads_values_to_try = seq(from=0, to=1, length.out=50)
likelihoods = rep(NA, times=length(P_heads_values_to_try))

for (i in 1:length(P_heads_values_to_try))
    {
    # Get the current guess at P_heads_guess
    P_heads_guess = P_heads_values_to_try[i]

    # Calculate likelihood of the coin flip data under
    # this value of P_heads
    likelihood = calc_prob_coin_flip_data(P_heads_guess=P_heads_guess, coin_flips=coin_flips)

    # Store the likelihood value
    likelihoods[i] = likelihood
    } # End for-loop

# Here are the resulting likelihoods:
likelihoods

# Let's try plotting the likelihoods to see if there's a peak
plot(x=P_heads_values_to_try, y=likelihoods)
lines(x=P_heads_values_to_try, y=likelihoods)

# Whoa! That's quite a peak!  You can see that the likelihoods
# vary over several orders of magnitude.
#
# Partially because of this extreme variation, we often use the
# logarithm of the likelihood, also known as
#
# * log-likelihood
# * lnL
#
# In R, the log() function returns the natural logarithm, while
# log10 returns the logarithms at base 10. I mention this because
# on some calculators, the LN button returns the natural log,
# and the LOG button returns the log10 logarithm.
#
#
# There are several reasons scientists, and scientific papers,
# often use the log-likelihood instead of the raw likelihood.
# These include:
#
# * Computers have a minimum numerical precision. If a likelihood
#   value gets too small, it gets rounded to 0. You can see this
#   happen if you calculate the raw likelihood of over 1000 coin-flips
#   using the parameter P_heads = 0.5
#
# Machine precision can vary between computers, so I will give you
# the results on my machine (but, try it on yours):

0.5^1073

# ...equals 9.881313e-324, which is a very small number

0.5^1074

# ...equals 4.940656e-324, which is half the size of the previous number,
# which makes sense, because we just multiplied it by 0.5
#
# However, on my machine,

0.5^1075

# ...equals 0. Instead of one more coin-flip causing the likelihood to be
# halved again, zero is infinitely less than 4.940656e-324. So the
# computer has made a "mistake" here.
#
# Log-likelihoods do not have this problem. Compare:

1073 * log(0.5)
1074 * log(0.5)
1075 * log(0.5)
1076 * log(0.5)

# We can convert log-likelihoods to raw-likelihood with exp(), which reverses
# the log() operation.

exp(1073 * log(0.5))
exp(1074 * log(0.5))
exp(1075 * log(0.5))
exp(1076 * log(0.5))

# You can see that, working in log-likelihood space, we have no
# numerical precision issues, but with raw likelihoods, we do.

# PLEASE ANSWER LAB 10, QUESTION #7, NOW:
# 7. What number (exactly) was reported for 1075 * log(0.5) ?


# Other reasons scientists use log-likelihoods instead of likelihoods:
#
# * The log-likelihood and raw-likelihood will have their maximums
#   at the same parameter values.
#
# * Taking the logarithm of an equation converts multiplication to
#   addition, which can make calculus much easier.
#
# * Log-likelihoods are easier to read and interpret than raw
#   likelihoods, once you get used to log-likelihoods. For any
#   large dataset, the raw likelihood will be a very tiny
#   number, hard to read/remember/interpret.
#
# * It turns out that a number of fundamental statistical measures
#   and tests use the log-likelihood as an input. These include
#   the Likelihood Ratio Test (LRT) and Akaike Information Criterion (AIC).
#   We will talk about these in future weeks.


# PLOTTING THE LOG-LIKELIHOOD

# We will repeat the code to calculate the likelihood
# for a sequence of 50 possible values of P_heads between 0 and 1:

P_heads_values_to_try = seq(from=0, to=1, length.out=50)
likelihoods = rep(NA, times=length(P_heads_values_to_try))

for (i in 1:length(P_heads_values_to_try))
    {
    # Get the current guess at P_heads_guess
    P_heads_guess = P_heads_values_to_try[i]

    # Calculate likelihood of the coin flip data under
    # this value of P_heads
    likelihood = calc_prob_coin_flip_data(P_heads_guess=P_heads_guess, coin_flips=coin_flips)

    # Store the likelihood value
    likelihoods[i] = likelihood
    } # End for-loop

# Here are the resulting likelihoods:
likelihoods

# Let's take the log (the natural log, i.e. the base is exp(1)).
log_likelihoods = log(likelihoods, base=exp(1))

plot(x=P_heads_values_to_try, y=log_likelihoods)
lines(x=P_heads_values_to_try, y=log_likelihoods)

# Let's plot the raw- and log-likelihood together:
par(mfrow=c(2,1))
plot(x=P_heads_values_to_try, y=likelihoods, main="Likelihood (L) of the data")
lines(x=P_heads_values_to_try, y=likelihoods)

plot(x=P_heads_values_to_try, y=log_likelihoods, main="Log-likelihood (LnL) of the data")
lines(x=P_heads_values_to_try, y=log_likelihoods)

# PLEASE ANSWER LAB 10, QUESTION #8, NOW:
# 8. Compare the raw-likelihood curve and the log-likelihood curve. Does it look
# like the top of each curve appears in about the same location on the
# x-axis? (i.e., at the same parameter value for P_heads?)


#
# Maximum likelihood optimization
#
# You can see that the maximum likelihood of the data occurs when
# P_heads is somewhere around 0.6 or 0.7.  What is it
# exactly?
#
# We could just keep trying more values until we find whatever
# precision we desire.  But, R has a function for
# maximum likelihood optimization!
#
# It's called optim().  Optim() takes a function as an input.
# Fortunately, we've already written a function!
#
# Let's modify our function a bit to return the log-likelihood,
# and print the result:

# Function that calculates the probability of coin flip data
# given a value of P_heads_guess
calc_prob_coin_flip_data2 <- function(P_heads_guess, coin_flips)
    {
    # Empty list of probabilities
    probs_list = rep(NA, times=length(coin_flips))
    probs_list

    for (i in 1:length(coin_flips))
        {
        # Print an update
        #cat("\nAnalysing coin flip #", i, "/", length(coin_flips), sep="")

        # Get the current coin flip
        coin_flip = coin_flips[i]

        # If the coin flip is heads, give that datum
        # probability P_heads_guess.
        # If tails, give it (1-P_heads_guess)

        if (coin_flip == "H")
            {
            probs_list[i] = P_heads_guess
            } # End if heads

        if (coin_flip == "T")
            {
            probs_list[i] = (1-P_heads_guess)
            } # End if tails
        } # End for-loop

    # Look at the resulting probabilities
    probs_list

    # We get the probability of all the data by multiplying
    # all the probabilities
    likelihood_of_data_given_P_heads_guess = prod(probs_list)

    # Get the log-likelihood
    LnL = log(likelihood_of_data_given_P_heads_guess)
    LnL

    # Error correction: if -Inf, reset to a low value
    if (is.finite(LnL) == FALSE)
        {
        LnL = -1000
        }

    # Print some output
    print_txt = paste("\nWhen P_heads=", P_heads_guess, ", LnL=", LnL, sep="")
    cat(print_txt)

    # Return result
    return(LnL)
    }

# Try the function out:
LnL = calc_prob_coin_flip_data2(P_heads_guess=0.1, coin_flips=coin_flips)
LnL = calc_prob_coin_flip_data2(P_heads_guess=0.2, coin_flips=coin_flips)
LnL = calc_prob_coin_flip_data2(P_heads_guess=0.3, coin_flips=coin_flips)

# Looks like it works!  Let's use optim() to search for he
# best P_heads value:

# Set a starting value of P_heads
starting_value = 0.1

# Set the limits of the search
limit_bottom = 0
limit_top = 1

optim_result = optim(par=starting_value, fn=calc_prob_coin_flip_data2, coin_flips=coin_flips, method="L-BFGS-B", lower=limit_bottom, upper=limit_top, control=list(fnscale=-1))

# You can see the search print out as it proceeds.

# Let's see what ML search decided on:
optim_result

# Let's compare the LnL from ML search, with the binomial mean
optim_result$par

# Here's the formula:
P_heads_ML_estimate = numHeads / numTotal
P_heads_ML_estimate

# PLEASE ANSWER LAB 10, QUESTION #9, NOW:
# 9. Is the ML value of P_heads_guess found by numerical optimization
# (found in optim_result$par ) very close to the ML value found
# by analytical solution (P_heads_ML_estimate)?

# PLEASE ANSWER LAB 10, QUESTION #10, NOW:
# 10. What was the maximized log-likelihood found by optim (hint: look at
# optim_result$value)?  Numerical optimization results might vary
# slightly, so round this value to 2 decimal places in your answer.


# Where does that formula -- the "analytic solution" for P_heads_ML_estimate
# come from?  We will discuss this further in future lectures, but here is a summary.
#
# It turns out that, in simple cases, we can use calculus to derive
# "analytical solutions" for maximizing the likelihood.  Basically, this involves:
#
# - taking the derivative of the likelihood function
#   (the derivative is the slope of a function)
# - setting the formula for the derivative to equal 0
#   (because, when the derivative=0, you are at a maximum or minimum)
# - solving for the parameter value that causes the derivative to
#   equal 0.
#
# This same basic calculus-based procedure provides a justification for
# the formulas that are used in e.g. linear regression.
#
# The nice things about calculus-derived formulas for maximizing likelihood
# -- known as "analytic solutions" -- include:
#
# - it is very fast
# - the solution can be found without a computer (this was useful back in
#   the 20th century, when statistics was being developed without computers)
# - there is no chance of optimization error (optimization errors can occur
#   if likelihood surfaces are too flat, too bumpy, etc.)
#
# This raises the question:
# Why would anyone ever go through all the optimization rigamarole, when
# they could just calculate P_heads directly?
#
# Well, only in simple cases do we have a formula for the maximum likelihood
# estimate.  The optim() strategy works whether or not
# there is a simple formula.
#
# In real life science, with complex models of biological processes (such as
# epidemiological models), ML optimization gets used A LOT, but most scientists
# don't learn it until graduate school, if then.
#
# WHAT WE LEARNED
#
# You can now see that there is nothing magical, or even particularly
# difficult, about these concepts:
#
# - likelihood
# - log-likelihood
# - maximum likelihood (ML)
# - inferring a parameter through ML
#
#
# IMPLICATIONS
#
# Despite their simplicity, it turns out that likelihood is one of
# the core concepts in model-based inference, and statistics
# generally.  Bayesian analyses have likelihood as a core component
# (we will talk about this more later), and methods from introductory
# statistics (linear models etc.)
#######################################################


#######################################################
# PART 3: BUILDING BLOCKS OF MODELS IN R
#######################################################
#
# R has a number of functions that speed up the process
# of building models, and simulating from them.
#
# For example, for binomial processes, these are:
#
# dbinom(x, size, prob, log) -- the probability density
# (the likelihood) of x "heads"
#
# size = the number of tries (the number of coin flips)
#
# prob = the parameter value (e.g. P_heads)
#
# log = should the likelihood be log-transformed when output?
#
#
# The other functions are:
#
# rbinom(n, size, prob) -- simulate n binomial outcomes,
# using "size" and "prob" as above
#
# pbinom(q, ...) - used for calculating tail probabilities
# (proportion of the distribution less than the value q)
#
# qbinom(p, ...) - reverse of pbinom()
#
#
# Any time you hear about "probability densities", "distributions"
# "random number generators", "random draws", or "simulations"
# in R, functions like these are being used.
#
# For uniform distributions, the equivalent functions are:
#
# dunif, runif, punif, qunif
#
# For normal (=Gaussian/bell-curve) distributions, the equivalents are:
#
#
# dnorm, rnorm, pnorm, qnorm


# Random seeds, and random number generators

# Random numbers in computers are not truly random. To make random numbers
# repeatable, a "seed" number is always used.

# Compare:

rnorm(n=1, mean=6, sd=1)
rnorm(n=1, mean=6, sd=1)
rnorm(n=1, mean=6, sd=1)

set.seed(54321)
rnorm(n=1, mean=6, sd=1)
rnorm(n=1, mean=6, sd=1)

set.seed(54321)
rnorm(n=1, mean=6, sd=1)
rnorm(n=1, mean=6, sd=1)

set.seed(54321)
rnorm(n=1, mean=6, sd=1)
rnorm(n=1, mean=6, sd=1)

# By re-setting the seed, we re-set the random number generator

# PLEASE ANSWER LAB 10, QUESTION #11, NOW:
# 11. If you set the random number seed to 54322, what is the first
# random number generated by rnorm(n=1, mean=6, sd=1)?


# Generate 1000 random numbers from a normal distribution, and
# then plot a histogram of them, with this code:

random_numbers = rnorm(n=1000, mean=6, sd=1)
hist(random_numbers)

# PLEASE ANSWER LAB 10, QUESTION #12, NOW:
# 12. What shape do you see?


# PLEASE ANSWER LAB 10, QUESTION #13, NOW:
# 13. I am 6 feet tall. What is the raw likelihood of my height,
# assuming that I am a random draw from a population that is
# 7 feet tall, with a standard deviation of 0.5 feet?

dnorm(x=6.0, mean=7.0, sd=0.5)

# PLEASE ANSWER LAB 10, QUESTION #14, NOW:
# 14. What is the log-likelihood of my height?
log(dnorm(x=6.0, mean=7.0, sd=0.5))

# PLEASE ANSWER LAB 10, QUESTION #15, NOW:
# 15. If we change the model of my population from mean height=7
# feet to mean height=6 feet, will the likelihood of my 6-foot
# height go up or down?

# PLEASE ANSWER LAB 10, QUESTION #16, NOW:
# 16. What is the raw likelihood of my height, assuming I am a
# random draw from a population that is 6 feet tall, with
# a standard deviation of 0.5 feet?

dnorm(x=6.0, mean=6.0, sd=0.5)

# NOTE: The "d" in "dnorm" refers to probability density. For continuous data,
# the probability of any exact value is 0, because there are an infinite number
# of possible values for a continuous variable. So we talk about the probability
# density instead of the exact probability (or probability mass). Probability
# densities work the same as probability masses for our purposes.


# PLEASE ANSWER LAB 10, QUESTION #17, NOW:
# 17. What is the corresponding log-likelihood?


# PLEASE ANSWER LAB 10, QUESTION #18, NOW:
# 18. What shape does dnorm() make, for a variety of
# height values?  Try:
#
height_values = seq(4, 8, 0.01)
probability_densities = dnorm(x=height_values, mean=6.0, sd=0.5)
plot(x=height_values, y=probability_densities)

#######################################################
# END OF LAB 10 WORKSHEET
# DISCUSSION POST ASSIGNMENT CONTINUES BELOW
#######################################################


#######################################################
# BIOSCI 220 Week 10 Lab Exercise
# PART 4
# Reading, plotting, and interpreting COVID data
#######################################################

# Download the CSV (comma-separated value) file from:
# https://github.com/owid/covid-19-data/tree/master/public/data
#
# Specifically:
# https://covid.ourworldindata.org/data/owid-covid-data.csv
#
# Canvas backup of "owid-covid-data.csv" is:
# https://canvas.auckland.ac.nz/files/5181566/download?download_frd=1
#
# Two methods to get the data:
#
# 1. Save locally, then open in R. The tricky thing about this is that
#    *you* have to figure out where you have saved the data, then get
#    R to find that file.

# If I saved the data to the directory
# "/drives/GDrive/__classes/BIOSCI220/2020_term2/week02_lectures/"
# I could store this "path" in "fn" (fn=filename),
# then read it with read.csv():

# fn = "/drives/GDrive/__classes/BIOSCI220/2020_term2/week02_lectures/owid-covid-data.csv"
# data = read.csv(file=fn)
# dim(data)

# 2. Read the file straight off the internet (easier for now)
#

fn = "https://covid.ourworldindata.org/data/owid-covid-data.csv"
data = read.csv(file=fn)
dim(data)


#######################################################
# Exploring a data table
#
# A lot of work in R is done with "data frames". These
# are just tables of data.
#
# Data tables can be huge, so there are bunch of functions
# that let you explore what is in a data table.
#
# Try these commands. Figure out what each one does.
#
# Remember, you can do e.g. "?dim" to get the help page
# for the function.
#
#######################################################

dim(data)
nrow(data)
ncol(data)
head(data)
tail(data)
names(data)
class(data)


#######################################################
# Accessing particular columns of data
#######################################################
# With R data.frames, you can access individual columns
# using "$columnname"

# However, this dataset is much huger than in May 2020, such
# that now it causes problems when printed to the screen
# (so, DON'T try to print the whole dataset to screen, you
# may get a frozen screen).

# We can use the function "unique" to
# get just one entry for each country

unique(data$location)


#######################################################
# Plotting the data
# Below, I have a script to plot the number of new
# cases, per day, for New Zealand
#######################################################

# Specify the country name
country_name = "New Zealand"

# The "==" symbol gives TRUE or FALSE
# Here, we are seeing, for each row, if it matches
# "New Zealand".
#
TF = data$location == country_name

# R treats FALSE as 0, and TRUE as 1, so doing sum(TF)
# gives us the number of TRUE values
sum(TF)

# We can subset the table "data" to a smaller table, "d",
# by using square brackets. Square brackets indicate which
# [rows,columns] of the data table you want, where blank =
# "give everything".
#
# So, this command subsets the table "data" to all rows
# where location == "New Zealand"
d = data[TF,]

# How big is the new table?
dim(d)

# Let's plot date versus cases, for New Zealand
plot(x=d$date, y=d$new_cases)

# Why didn't that work?  Look at d$date:
d$date
class(d$date)

# If the "class" of the data is "character", and if R can't easily
# convert it to numbers (class "numeric"), then R gives an error.

# We can fix this by forcing R to read the dates as datatype date,
# using the "as.Date" function:

plot(x=as.Date(d$date), y=d$new_cases)


# You can see that R has plotted the dates on the x-axis, and
# the new cases on the y-axis. R uses a bunch of defaults in its
# plots, but we can do better.
#
# I am just going to make a better plot. It is up to you to
# READ this code, and look through the relevant help files,
# ?plot and ?par,
# to learn what these various inputs do.

plot(x=as.Date(d$date), y=d$new_cases, xlab="Date", ylab="Number of new cases", pch=1, col="red", cex=0.5)
titletxt = paste0("Number of new cases per day in: ", country_name)
title(titletxt)
lines(x=as.Date(d$date), y=d$new_cases, lty="solid", lwd=2, col="red")


# Hmm, that line has some gaps, suggesting there are some days with no data.
# What is going on?
unique(d$new_cases)

# Let's try replacing the "NA" values with 0:
TF = is.na(d$new_cases)
d$new_cases[TF] = 0

# And, plot again:
plot(x=as.Date(d$date), y=d$new_cases, xlab="Date", ylab="Number of new cases", pch=1, col="red", cex=0.5)
titletxt = paste0("Number of new cases per day in: ", country_name)
title(titletxt)
lines(x=as.Date(d$date), y=d$new_cases, lty="solid", lwd=2, col="red")


#######################################################
# That looks a little better, but it still seems like quite a jagged plot.
# Some of this is randomness in the daily counts, and some is
# probably errors in the data.
#
# We might get a clearer picture if we had a smoother version of the data.
#
# Also, for SIR models, we will want to know how many COVID
# cases are active at any particular point in time.
#
# The conventional wisdom is that once someone gets COVID,
# they are infectious for about 14 days. This may not be
# perfectly accurate, but it is a reasonable starting point.
#
# The code below keeps a running count of cases for the
# last 14 days. It also counts how many have recovered.
#######################################################

# Subset to your country, replace "NA" values with 0:
TF = data$location == country_name
d2 = data[TF, ]
TF = is.na(d2$new_cases)
d2$new_cases[TF] = 0

# Keep a count of active cases, and recovered cases
active_cases = rep(0.0, times=nrow(d2))
recovered_cases = rep(0.0, times=nrow(d2))

# Here, we do a "for-loop". It will leap through every row
# of the table, and do a calculation.
#
# NOTE: to make a for-loop run, you have to run
# the WHOLE thing, from the "for" to the final closing
# bracket "}", at once!
#
for (i in 1:nrow(d2))
	{
	# Keep track of the starting row number
	if (i <= 14)
		{
		startrow = 1
		endrow = i
		recovered_cases[i] = 0
		} else {
		startrow = startrow + 1
		endrow = i
		sum_of_recovered_cases = sum(d2$new_cases[1:(startrow-1)])
		recovered_cases[i] = sum_of_recovered_cases
		}

	# Add up the last 14 days of active cases
	sum_of_active_cases_14days = sum(d2$new_cases[startrow:endrow])

	# Store the result:
	active_cases[i] = sum_of_active_cases_14days
	}

# Plot the active cases, AND recovered cases
# Plot a plot of white dots (just to set the plot scale)
xvals = c(as.Date(d$date), as.Date(d$date))
yvals = c(active_cases, recovered_cases)
plot(x=xvals, y=yvals, xlab="Date", ylab="Number of cases", pch=".", col="white", cex=0.5)
titletxt = paste0("Number of active & recovered cases in: ", country_name)
title(titletxt)

# Notes:
# for help on plots, ?plot
# for help on point characters (pch), ?points
# for other graphical parameters (e.g. lty = line type), ?par

# Add points & line for active cases
points(x=as.Date(d$date), y=active_cases, pch=1, col="red", cex=0.6)
lines(x=as.Date(d$date), y=active_cases, lty="solid", lwd=2, col="red2")

# Add points & line for recovered cases
points(x=as.Date(d$date), y=recovered_cases, pch=2, col="green3", cex=0.5)
lines(x=as.Date(d$date), y=recovered_cases, lty="solid", lwd=2, col="green4")

# Add vertical line with abline, and text, for Level 4 lockdown start
lockdown_time = as.Date("2020-03-26")
abline(v=lockdown_time, col="black", lty="dashed", lwd=2)
text(x=lockdown_time, y=0.99*max(yvals), labels="lockdown begins", pos=2, srt=90, cex=0.7)

# Add vertical line with abline, and text, for Level 4 lockdown end
lockdown_time = as.Date("2020-04-28")
abline(v=lockdown_time, col="black", lty="dashed", lwd=2)
text(x=lockdown_time, y=0.99*max(yvals), labels="lockdown ends", pos=2, srt=90, cex=0.7)


# Add horizontal line with abline
threshold = 1000
abline(h=threshold, col="grey", lty="dotted", lwd=2)
text_x = min(xvals) + 0.05*(max(xvals) - min(xvals))
text(x=text_x, y=threshold, labels="(1000 cases)", pos=3, srt=0, cex=0.7, col="grey")

# Add legend
legend(x="right", legend=c("active cases", "recovered"), col=c("red2", "green4"), lwd=c(2,2), pch=c(1,2))


#######################################################
# Tasks:
#
# Work through the example R script, plotting New Zealand’s COVID history
#
# 0. Do the Week 10 Worksheet, if you haven't already
#
# 1. Do Week 10 quiz (this will force you to work through the readings & example script!)
#
# 2. Make a nice plot, in R, using the COVID data for one or more countries
# (a country other than New Zealand, which is the example)
#    A nice plot means: title, labeled axes, legend if multiple colors are used, etc.
#
#    If warranted, draw some vertical lines to represent major events (e.g. lockdowns starting and
#     ending; use Google to find these. Yes, sometimes the story will be complex as
#     some countries had regional lockdowns, etc.  Just try to label the biggest events
#     influencing the curve. Your plot will be marked based on whether it communicates the
#     basics of the Covid epidemic in the country, not on every last detail, so use your
#     own best judgement about what to include.)
#
# 3. Screen capture your plot & post to Canvas, along with a paragraph interpreting
# the plot, meaning: you provide a verbal model of what you think is going on in that country.
#
# 4. Comment on 2 other peoples’ posts – say what is interesting, ask a
# question, etc. (make sure you are polite & professional)
#
# This assignment should be doable in lab, so everything is due 5 pm Friday, May 21.
#
# The main forum for help is the physical lab sessions. We will also monitor Piazza,
# BUT DO NOT EXPECT PIAZZA HELP OUTSIDE OF WEEKDAY HOURS.
#
#######################################################

#######################################################
# HINTS
#######################################################
# I am deliberately “throwing you into the deep end” with the R script.
#
# Your job is to run the script, and, from reading the script, learn the basic R commands to plot data.
#
# This is how everyone learns R!
#
# Yes, it is OK to copy and modify the R code
#
# Rubric: 10 points total
# 2 - plot readable (not NZ-only; you could compare NZ to elsewhere)
#       * The plot MUST be viewable IN THE CANVAS POST, it MUST NOT be
#         a separate attachment
# 2 - plot colors, lines, style; plot has appropriate title, axis labels, legend if needed, etc.
# 4 - paragraph interpretation
# 2 - 2 polite, professional responses
#
#######################################################


#######################################################
# Plotting to PDF
#
# Because the graphics window in RStudio can be different
# sizes and shapes, depending on your setup, it can be
# difficult to write code that makes a plot "look good",
# especially in a small window.
#
# One solution is to send your plots to a PDF. This will
# *probably* work on your desktop/laptop -- the main thing that
# is required is that the computer have a PDF viewer installed.
#
# If it *doesn't* work, you can either (a) install a PDF viewer, or
# (b) e.g. for a lab computer where you can't install things, you
# can manually open the PDF in a web browser.
#######################################################


#######################################################
# The first example plot, as a PDF
#######################################################

pdffn = "number_of_new_cases_NZ.pdf"  # filename of PDF
pdf(file=pdffn, width=10, height=8)    # pdf() opens the PDF for writing - won't close until dev.off()!!!

# Code for your plot
plot(x=as.Date(d$date), y=d$new_cases, xlab="Date", ylab="Number of new cases", pch=1, col="red", cex=0.5)
titletxt = paste0("Number of new cases per day in: ", country_name)
title(titletxt)
lines(x=as.Date(d$date), y=d$new_cases, lty="solid", lwd=2, col="red")

dev.off()	# closes the PDF for writing; after this, new plots go to screen again
cmdstr = paste0("open ", pdffn)  # create the command to open the PDF
system(cmdstr)                   # open the PDF in the operating system


#######################################################
# The second example plot, as a PDF
#######################################################

pdffn = "number_of_active_recovered_cases_NZ.pdf"  # filename of PDF
pdf(file=pdffn, width=10, height=8)    # pdf() opens the PDF for writing - won't close until dev.off()!!!

# Code for your plot
# Plot the active cases, AND recovered cases
# Plot a plot of white dots (just to set the plot scale)
xvals = c(as.Date(d$date), as.Date(d$date))
yvals = c(active_cases, recovered_cases)
plot(x=xvals, y=yvals, xlab="Date", ylab="Number of cases", pch=".", col="white", cex=0.5)
titletxt = paste0("Number of active & recovered cases in: ", country_name)
title(titletxt)

# Notes:
# for help on plots, ?plot
# for help on point characters (pch), ?points
# for other graphical parameters (e.g. lty = line type), ?par

# Add points & line for active cases
points(x=as.Date(d$date), y=active_cases, pch=1, col="red", cex=0.6)
lines(x=as.Date(d$date), y=active_cases, lty="solid", lwd=2, col="red2")

# Add points & line for recovered cases
points(x=as.Date(d$date), y=recovered_cases, pch=2, col="green3", cex=0.5)
lines(x=as.Date(d$date), y=recovered_cases, lty="solid", lwd=2, col="green4")

# Add vertical line with abline, and text, for Level 4 lockdown start
lockdown_time = as.Date("2020-03-26")
abline(v=lockdown_time, col="black", lty="dashed", lwd=2)
text(x=lockdown_time, y=0.99*max(yvals), labels="lockdown begins", pos=2, srt=90, cex=0.7)

# Add vertical line with abline, and text, for Level 4 lockdown end
lockdown_time = as.Date("2020-04-28")
abline(v=lockdown_time, col="black", lty="dashed", lwd=2)
text(x=lockdown_time, y=0.99*max(yvals), labels="lockdown ends", pos=2, srt=90, cex=0.7)


# Add horizontal line with abline
threshold = 1000
abline(h=threshold, col="grey", lty="dotted", lwd=2)
text_x = min(xvals) + 0.05*(max(xvals) - min(xvals))
text(x=text_x, y=threshold, labels="(1000 cases)", pos=3, srt=0, cex=0.7, col="grey")

# Add legend
legend(x="right", legend=c("active cases", "recovered"), col=c("red2", "green4"), lwd=c(2,2), pch=c(1,2))

dev.off()	# closes the PDF for writing; after this, new plots go to screen again
cmdstr = paste0("open ", pdffn)  # create the command to open the PDF
system(cmdstr)                   # open the PDF in the operating system