agila5/PITs.md

## PITs.md

      
    Raw
  

              PITs.md
            
          
    # packages
library(countreg) # for CDF of Hurdle Poisson distribution
#> Loading required package: MASS

# Take nonrandomized PIT function from Czado et al. (2009)
czado_pit <- function(x, Px, Px1, n.bins=10, y.max=2.75, my.title="PIT Histogram")
{
  a.mat <- matrix(0,n.bins,length(x))
  k.vec <- pmax(ceiling(n.bins*Px1),1)
  m.vec <- ceiling(n.bins*Px)
  d.vec <- Px-Px1
  for (i in 1:length(x))
  {
    if (k.vec[i]==m.vec[i]) {a.mat[k.vec[i],i]=1}
    else 
    { 
      a.mat[k.vec[i],i]=((k.vec[i]/n.bins)-Px1[i])/d.vec[i]
      if ((k.vec[i]+1)<=(m.vec[i]-1))
      {for (j in ((k.vec[i]+1):(m.vec[i]-1))) {a.mat[j,i]=(1/(n.bins*d.vec[i]))}}
      a.mat[m.vec[i],i]=(Px[i]-((m.vec[i]-1)/n.bins))/d.vec[i]     
    }
  }
  a <- apply(a.mat,1,sum)
  a <- (n.bins*a)/(length(x))
  p <- (0:n.bins)/n.bins
  PIT <- "Probability Integral Transform"
  RF <- "Relative Frequency"
  plot(p, p, ylim=c(0,y.max), type="n", xlab=PIT, ylab=RF, main=my.title) 
  temp1 <- ((1:n.bins)-1)/n.bins
  temp2 <- ((1:n.bins)/n.bins)
  o.vec <- rep(0,n.bins)
  segments(temp1,o.vec,temp1,a)
  segments(temp1,a,temp2,a)
  segments(temp2,o.vec,temp2,a)
  segments(0,0,1,0)
}

# Simulate data from the following model: 
# y_i ~ Zero-Hurdle Poisson(lambda_i, pi_i), i = 1, ..., n
# i.e. p(Y_i = y_i) = pi_i{y_i == 0} + (1 - pi_i) * Poisson(y_i; lambda_i) {y_i >= 0}
# ref: https://data.princeton.edu/wws509/notes/countmoments
# where
# log(lambda_i) = eta_i = 1 + x, x ~ N(0, 1)
# logit(pi_i) = 3.5 - eta_i, which means that the higher is lambda_i, the lower
# is the probability of being zero.
# I think that the previous model is a reasonable model for car crashes counts
# (which is my current research project).

# First I simulate eta
n <- 500
set.seed(2)
b0 <- 1
b1 <- 1
x <- rnorm(n)
eta <- b0 + b1 * x

# then lambda and pi, whose distribution is very skewed: 
lambda <- exp(eta)
pi <- exp(3.5 - eta) / (1 + exp(3.5 - eta))
hist(pi, breaks = 15, xlim = c(0, 1)) 

# and then y
y <- rhpois(n, lambda, 1 - pi)

# These are the "regular" PITs (i.e. the forecast distribution, F, is exactly
# the same distribution that generated the data so it should be a "perfect"
# model).
Py <- phpois(y, lambda, 1 - pi)
hist(Py, breaks = 20, xlim = c(0, 1)) # far from being uniform between 0 and 1

# Randomized PITs
v <- runif(n)
Py1 <- phpois(y - 1, lambda, 1 - pi)
randomized_pits <- Py1 + v * (Py - Py1)
hist(randomized_pits, breaks = 15, xlim = c(0, 1))

# The previous plot seems quite uniform but the problem is that Py1 == 0 for most
# observations, i.e.
mean(Py1 == 0)
#> [1] 0.838
# so we are actually representing v * Py, which is not that informative in this
# case since we are just randomizing the previous PITs values conditioning on y
# > 0.

# Nonrandomized PITs
czado_pit(y, Py, Py1, n.bins = 20, y.max = 1.5)

# The problem here is in the left tail on the distribution because "nothing
# happens" there. In fact, since
min(Py)
#> [1] 0.4566757
# is far from 0 so (u - P_(x-1)) / (Px - P_(x -1)) is just u / min(Py) for all u
# < min(Py) and we are just representing a straight line (i.e. F(u) = u / 0.45),
# which implies that the PIT histogram is flat.

# Obviously both problems fade away when the distribution of pi is less skewed
# but this is not the case for car crashes counts data.

# The question here is: does it make sense to check the uniformity between
# min(pi) and 1? I'm not 100% sure about that since in this example we are
# still looking at a "non-uniform" histogram between 0.45 and 1. Is it possible
# to correct the randomized and non-randomized PITS histogram?
^{Created on 2020-04-14 by the reprex package (v0.3.0)}