fonnesbeck/R vs Python.ipynb

## R vs Python.ipynb
{
 "metadata": {
  "name": "",
  "signature": "sha256:81db4918c624da531d3c6b8321242710aa07d49e4ea9b174796a291ca62a17ac"
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "%load_ext rmagic"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 1
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "%%R\n",
      "\n",
      "## Write simulation for the Ciombor trial \n",
      "# cd ~/research_drive/AdaptiveTrialDesign/simulations/survivalOutcomes/numberOfEventsStoppingRule/\n",
      "# setwd(\"~/research_drive/AdaptiveTrialDesign/simulations/survivalOutcomes/numberOfEventsStoppingRule/\")\n",
      "options(width = 120, scipen = 6)                 # This sets the width of the output. It's useful for making tabular output display correctly if the rows are really wide.\n",
      "\n",
      "# pinvgamma returns the probability that a random variable from the inverse gamma distribution (with the given parameters) is less than q.\n",
      "pinvgamma = function(q, shape = 1, rate = 1, scale = 1/rate,\n",
      "   lower.tail = TRUE, log.p = FALSE) {\n",
      "    pgamma(q = 1/q,shape,rate,scale,!lower.tail,log.p)\n",
      "}\n",
      "\n",
      "# qinvgamma: quantile function of the inverse-gamma distribution. It's kind of the reverse of the pinvgamma function.\n",
      "qinvgamma = function(p, shape = 1, rate = 1, scale = 1/rate,\n",
      "\t\t\tlower.tail = TRUE, log.p = FALSE) {\n",
      "    return( 1 / qgamma(p,shape,rate,scale,!lower.tail,log.p))\n",
      "}\n",
      "\n",
      "\n",
      "exponentialSurvivalFutilityStoppingNumEvents <- function(nreps = 100,   # number of complete trials simulated\n",
      "   M = 1000,                                     # number of trials simulated to get the predicted probabilites for the interim analysis at every trial.\n",
      "   nmax,                                         # maximum number of patients accrued, which occurs when the trial does not stop early\n",
      "   nominalProb,                                  # cut off for the posterior probability above which the trial is considered a success\n",
      "   futilitybound,                                # cut off for the predicted probability of success below which the trial is stopped for futility\n",
      "   nullmedian,                                   # the value of the median under the null hypothesis. This is used in testing\n",
      "   treatmentmedian,                              # the true median survival time for the treatment\n",
      "   checkpoints,                                  # vector of numbers of events at which to take the interim looks (not number of subjects accrued or time since the trial started).  = c(6)\n",
      "   recruitmenttime,                              # duration of time during which patients are still accruing\n",
      "   followuptime,                                 # duration of time after accrual is closed during which the patients are still being followed.\n",
      "   gammaalpha,                                   # alpha paramerter for the gamma prior for the rate parameter of the exponential time to event\n",
      "   gammabeta,                                    # beta paramerter for the gamma prior for the rate parameter of the exponential time to event  = (6/log(2))\n",
      "   verbose = FALSE,                              # Controls whether progress is printed to screen\n",
      "   seed = 50){                                   # random seed. The purpose of this is to make sure we can reproduce our results. If the nmax is large enough, though, the results should be the same to about 3 decimal points.\n",
      " \n",
      "   set.seed(seed)\n",
      "   log_2 <- log(2)                               # Trying to calculate this outside the loop to save time\n",
      "   nullmean <- nullmedian/log_2                  # This converts the input, in terms of the median of the distribution, into the mean, which some of the functions require.\n",
      "   treatmentmean <- treatmentmedian/log_2        # (The clinicians know the median values rather than the mean.)\n",
      "   maxtime <- recruitmenttime + followuptime     # This is the maximum time the trial will run, if there is no early stopping.\n",
      "\n",
      "  ####################################################################\n",
      "  ## For each complete trial, record final sample size, reason for stopping, time the trial ended, and conclusion of the trial.\n",
      "  ## finalsamplesize, reasonforstopping, finalPostProb\n",
      "  # All of these make empty data structures to store results that are generated at each iteration of the loops. Pre-allocation of the structures, specifying the size is more efficient in R.\n",
      "  finalsamplesize <- rep(NA, nreps)              # For each of the \"nreps\" trials simulated, this records the sample size at which the trial ended. \n",
      "  finalPostProb <- rep(NA, nreps)                # Each trial is considered a success if the final posterior probability that theta is greater than the null value is greater than nominalProb. This stores all those probabilities. \n",
      "  #finalThetaLower <- rep(NA, nreps)              # This is for the lower bound of a confidence interval for theta, one for each of the simulated trials.\n",
      "  #finalThetaUpper <- rep(NA, nreps)\n",
      "  stopearly <- factor(rep(NA, nreps), levels = c(\"No\", \"Yes\"))  # This records whether each trial ended early for either futility or efficacy.\n",
      "  endtime <- rep(NA, nreps)                      # Records the length of time that the trial lasted. \n",
      "  interimtime <- matrix(NA, nrow = nreps, ncol = length(checkpoints))   # Records the time point of each of the interim looks, for each simulated trial. (Since the trigger for doing an interim look is based on number of events, the time of each look can vary.)\n",
      "  numberofevents <- rep(NA, nreps)               # The number of events (deaths) in each trial.(Each patient in each trial has a simulated death time. They are only counted as an event if the death time is BEFORE the trial ends.)\n",
      "  exposuretime <- rep(NA, nreps)                 # The total time all the patients were in the study, found by summing each patient's time in the study, within one simulated trial.  \n",
      "  numberofeventsInterim <- rep(NA, nreps)        # The number of events at the last interim time point. (This should probaly be expanded to a matrix to record the number of events at EACH interim point.)\n",
      "  exposuretimeInterim <- rep(NA, nreps)          # The total exposure time at the time point of the last interim analysis. \n",
      "  posteriorProbsInterim <- rep(NA, nreps)        # The posterior probability that theta is bigger than the null value, at the interim time point.\n",
      "  predprobsuccessInterim <- rep(NA, nreps)       # The probability that the trial would be a success if continued until nmax, calculated based on data collected until the interim time point.\n",
      "  theta_hat <- numeric(nreps)                    # This will store the estimates of theta for each trial. \n",
      "  ####################################################################\n",
      "  ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%\n",
      "  dat_entrytime <- seq(from = 0, to = recruitmenttime, length.out = nmax)     # Generates a uniform time for entry into the study for each patient. Note this is outside the main loop, so the entry times are the same for every trial in this case.   \n",
      "for(reps in seq_len(nreps)){                   # The main loop. Each iteration of this simulates a complete trial.\n",
      "  ## Generate data for all patients. These should be constant for one trial. \n",
      "\n",
      "  dat_times <- rexp(n = nmax, rate = 1/treatmentmean)                 # This is the \"lifetime\" of each patient.\n",
      "    # Calculate the time of death (in time since beginning of study) for each person\n",
      "  dat_timeOfDeath <- dat_entrytime + dat_times   # This is the time of death. It takes into account when the entered the study.\n",
      "  dat_deathOrder <- rank(dat_timeOfDeath)        # This ranks all of the death times to mark the person who died first, second, etc.\n",
      "  ##########################################################################\n",
      "  ### sample size analysis (efficacy and futility)\n",
      "  # Loop through the interim looks in checkpoints\n",
      "\n",
      "  for(i in seq_len(length(checkpoints))){        #  This is the \"inner\" loop, that goes through all of the checkpoints. At each one of these, an analysis of the data collected so far is conducted to determine whether to end the trial early for futility or efficacy.\n",
      "    #############################################################################\n",
      "    ##  Calculate CURRENT total exposure time and number of events, which are the\n",
      "    ##  sufficient statistics at the time of interim analysis.\n",
      "    ##  The simulated final times to events will be based on this.\n",
      "    #############################################################################  \n",
      "\n",
      "    lengthofstudy <- dat_timeOfDeath[dat_deathOrder == checkpoints[i]] # This is the \"current\" time for the current interim analysis, which is based on the death time of a particular patient.\n",
      "    interimtime[reps, i] <- lengthofstudy\n",
      "    subset <- dat_entrytime < lengthofstudy                            # This makes a subsetting criteria, which is those patients who have already entered the study.\n",
      "    currentdata_entrytime <- dat_entrytime[subset]                     # The \"currentdata\" prefix indicates this group of patients who have already entered the study.\n",
      "    currentdata_times <- dat_times[subset]\n",
      "\n",
      "    currentdata_censoringtime <- lengthofstudy - currentdata_entrytime # Censoring time is the total time we had to observe the patient, from the time the patient entered until now.\n",
      "    currentdata_observedt <- pmin.int(currentdata_times, currentdata_censoringtime) # The observedt is the (pairwise) minimum of the times and the censoringtime. time will be lower if they died before the current time \n",
      "    currentdata_deathOrder <- dat_deathOrder[subset]\n",
      "    currentdata_delta <- currentdata_deathOrder <= checkpoints[i]      # Logical statement indicating whether each person died or not before the current time.\n",
      "    #currentdata_delta <- abs(currentdata_observedt - currentdata_times) < .Machine$double.eps #This is an alternative to the last two lines, but it will take more time.\n",
      "\n",
      "    exposuretimeInterim[reps] <- exposuretimeTreatment <- sum(currentdata_observedt) # Calculate the total exposure time for all patients in this trial so far.\n",
      "    numberofeventsInterim[reps] <- numberofeventsTreatment <- sum(currentdata_delta) # Calculate the number of deaths in this trial so far.\n",
      "\n",
      "    posteriorProbsInterim[reps] <- pinvgamma(q = nullmedian, shape = (gammaalpha + numberofeventsTreatment), \n",
      "           rate = (log_2*gammabeta + log_2*exposuretimeTreatment), lower.tail = FALSE)   # Calculate probability that theta is greater than the null median survival time. \n",
      "                                                                                         # Note that this probability depends on the total number of events and the total exposure time.\n",
      "    #############################################################################################\n",
      "    ##  calculate predictive probability of success in maximum sample size (nmax) (plus followuptime additional follow up)\n",
      "    #################################################################################################   \n",
      "    posteriorProbs <- rep(NA, M)\n",
      "\n",
      "    # Need to figure out who has already died at this time point to know who to simulate times for in the MC loop. This calculation is very similar to the previous calculations, but here \n",
      "    # we calculate the quantities for all the patients, not just those who have already entered the study. \n",
      "\n",
      "    dat_censoringtime <- lengthofstudy - dat_entrytime\n",
      "    dat_observedt <- pmin.int(dat_times, dat_censoringtime)\n",
      "    dat_delta <- dat_observedt == dat_times  \n",
      "    dat_observedt[dat_censoringtime < 0] <- 0                         # Those who haven't entered the study yet\n",
      "    # Step 2: Simulate final time-to-event for each subject (who is still alive), conditional on the current censored time.\n",
      "      # This should be for all subjects in dat, to be used in step 4. \n",
      "    dat_newtime <- rep(NA, nmax)                                      # newtime will be the new, imputed times vector. For those who have already died at this point. Those who have not will get imputed times.\n",
      "    dat_newtime[dat_delta] <- dat_observedt[dat_delta]                # Those who have already died \n",
      "\n",
      "    for(iter in seq_len(M)){                                          # This is the MC loop (MC for Monte Carlo simulation)\n",
      "\t# Step 1: Simulate hazard rate based on current data.\n",
      "\thazardTreatment <- rgamma(n = 1, shape = (gammaalpha + numberofeventsTreatment), scale = 1/(gammabeta + exposuretimeTreatment))  # Generates a random variable from the specified gamma probability distribution.\n",
      "\n",
      "\t# Step 2: Simulate final time-to-event for each subject (who is still alive), conditional on the current censored time.\n",
      "\t  # This should be for all subjects in dat, to be used in step 4. \n",
      "\n",
      "\tdat_newtime[!dat_delta] <- dat_observedt[!dat_delta] + rexp(n = sum(!dat_delta), rate = hazardTreatment) # For the patients that haven't died yet, their new time is their old time plus a random time drawn from the exponential distribution.\n",
      "\n",
      "\t# Step4: (Stopping for futility) Bayesian \"test\" for maximum sample size and an additional followuptime\n",
      "\n",
      "\tmc_censoringtime <- maxtime - dat_entrytime                # maxtime is like the temporary length of study\n",
      "\tmc_observedt <- pmin.int(dat_newtime, mc_censoringtime)    # Using the simulated times from step 2. (?)\n",
      "\tmc_delta <- mc_observedt == dat_newtime                    # delta indicates whether the patient died. 0 indicates the time was censored.\n",
      "\n",
      "\ttempexposuretime <- sum(mc_observedt)\n",
      "\ttempnumberofevents <- sum(mc_delta)\n",
      "\tposteriorProbs[iter] <- pinvgamma(q = nullmedian, shape = (gammaalpha + tempnumberofevents), \n",
      "           rate = (log_2*gammabeta + log_2*tempexposuretime), lower.tail = FALSE)    # This is the \"analysis\" for this set of fake data.\n",
      "    }                                                              # end of MC reps (iter in 1:M)\n",
      "\n",
      "    predprobsuccessInterim[reps] <- predprobsuccessMax <- sum(posteriorProbs > nominalProb)/M  #Gives percent of trials in the MC simulation that had P(theta > nominalProb), theta = median\n",
      "\n",
      "    if(predprobsuccessMax < futilitybound){                        # This is the criteria for stopping early for futility. If the criteria is NOT satisfied, it goes to the next interim checkpoint (or final analysis if this is the last checkpoint).\n",
      "        stopearly[reps] <- \"Yes\"\n",
      "        #cat(paste(\"stopearly = \", stopearly[reps], \"\\t\"))\n",
      "\tbreak                                                      # This breaks out of the checkpoints loop\n",
      "    }                                                              # end of if()\n",
      "  }                                                                # end of i in 1:length(checkpoints)\n",
      "  if(is.na(stopearly[reps])){stopearly[reps] <- \"No\"}\n",
      "\n",
      "\n",
      "  # Record final sample size and the ending time of the study.\n",
      "  if(stopearly[reps] == \"Yes\"){\n",
      "    finalsamplesize[reps] <- sum(subset)\n",
      "    endtime[reps] <- lengthofstudy\n",
      "    finalPostProb[reps] <- NA\n",
      "    exposuretime[reps] <- exposuretimeTreatment\n",
      "    numberofevents[reps] <- numberofeventsTreatment\n",
      "  } else{                                                           \n",
      "    finalsamplesize[reps] <- nmax\n",
      "    endtime[reps] <- maxtime\n",
      "    ###########################################################\n",
      "    ###########################################################\n",
      "    ### Final analysis: bayesian                               # This only happens if the trial was not stopped for futility.\n",
      "    # If the trial wasn't stopped for futility, the time of this final analysis would be at maxtime, which includes followuptime.\n",
      "\n",
      "    dat_censoringtime <- maxtime - dat_entrytime\n",
      "    dat_observedt <- pmin.int(dat_times, dat_censoringtime)\n",
      "    dat_delta <- dat_observedt == dat_times                        # delta indicates dead. 0 indicates the time was censored.\n",
      "    numberofevents[reps] <- sum(dat_delta)\n",
      "    exposuretime[reps] <- sum(dat_observedt)\n",
      "\n",
      "    finalPostProb[reps] <- pinvgamma(q = nullmedian, shape = (gammaalpha + numberofevents[reps]), \n",
      "       rate = (log_2*gammabeta + log_2*exposuretime[reps]), lower.tail = FALSE)\n",
      "    #finalThetaLower[reps] <- qinvgamma(p = 0.05, shape = (gammaalpha + numberofevents[reps]), \n",
      "    #   rate = (log_2*gammabeta + log_2*exposuretime[reps]), lower.tail = TRUE)\n",
      "    #finalThetaUpper[reps] <- qinvgamma(p = 0.05, shape = (gammaalpha + numberofevents[reps]), \n",
      "    #   rate = (log_2*gammabeta + log_2*exposuretime[reps]), lower.tail = FALSE)\n",
      "\n",
      "    ##############################################################\n",
      "    ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%\n",
      "      }                                                            # end of else\n",
      "  # Get estimate of theta, the median PFS. The estimator we are using is the mean of the posterior distribution of theta.\n",
      "  theta_hat[reps] <- (log_2*gammabeta + log_2*exposuretime[reps])/(gammaalpha + numberofevents[reps] - 1)\n",
      "  #qinvgamma(p = 0.5, shape = (gammaalpha + numberofevents[reps]), rate = (log_2*gammabeta + log_2*exposuretime[reps])) This gives the median of the posterior distribution. \n",
      " \n",
      "  if(verbose == TRUE){cat(paste(reps, \" \", stopearly[reps], \"\\n\"))}                     # This displays the current number of repetitions on the screen.\n",
      "  }                                                                # end of reps in 1:nreps\n",
      "\n",
      "  # This part makes several objects for the output. \n",
      "  results <- data.frame(stopearly, finalsamplesize, endtime, numberofeventsInterim, numberofevents, exposuretimeInterim, exposuretime,\n",
      "    posteriorProbsInterim, predprobsuccessInterim, \n",
      "      finalPostProb)\n",
      "  interimTimesSummary <- data.frame(lookNumber = seq_len(length(checkpoints)), numberOfEvents = checkpoints, meanTime = colMeans(interimtime, na.rm = TRUE), \n",
      "     sd = apply(interimtime, FUN = sd, MARGIN = 2, na.rm = TRUE))\n",
      "  output <- data.frame(trueMedian = treatmentmedian,\n",
      "    nullMedian = nullmedian, \n",
      "    futilityBound = futilitybound, \n",
      "    avgTime = mean(endtime), \n",
      "    avgEvents = mean(numberofevents),\n",
      "    avgExposure = mean(exposuretime),\n",
      "    \"E(N)\" = mean(finalsamplesize), \n",
      "    PET = mean(stopearly == \"Yes\"),\n",
      "    \"power\" = sum(finalPostProb > nominalProb, na.rm = TRUE)/nreps,\n",
      "    numberOfTrials = nreps, numberOfSim = M)\n",
      "   bias <- data.frame(trueMedian = treatmentmedian, \n",
      "    nullMedian = nullmedian,\n",
      "    \"number_of_looks\" = length(checkpoints),\n",
      "    \"mean(theta_hat)\" = mean(theta_hat[finalsamplesize == nmax]), \n",
      "    \"median(theta_hat)\" = median(theta_hat[finalsamplesize == nmax]), \n",
      "    \"theta 0.025 percentile\" = quantile(theta_hat[finalsamplesize == nmax], probs = 0.025, names = FALSE),\n",
      "    \"theta 0.975 percentile\" = quantile(theta_hat[finalsamplesize == nmax], probs = 0.975, names = FALSE),\n",
      "    \"bias\" = mean(theta_hat[finalsamplesize == nmax]) - treatmentmedian,\n",
      "    \"sd(bias)\" = sd(theta_hat[finalsamplesize == nmax] - treatmentmedian))\n",
      "    #percent90CIdisagreesPostProb = mean(finalPostProb > nominalProb & (nullmedian > finalThetaLower & nullmedian < finalThetaUpper), na.rm = TRUE)\n",
      "\n",
      "   # This is what is to be returned by this function. Each of them has a name so that the user can refer to any part of the output he wants.\n",
      "   return(list(results = results, output = output, bias = bias, interimTimesSummary = interimTimesSummary, theta_hat = theta_hat))\n",
      "}                                                                   # end of function exponentialSurvivalFutilityStoppingNumEvents\n",
      "\n",
      "\n",
      "system.time(R.results <- exponentialSurvivalFutilityStoppingNumEvents(nreps = 100, M = 100, nmax = 48,       \n",
      "                                                                      nominalProb = 0.92, futilitybound = 0.25, nullmedian = 3, treatmentmedian = 3,      \n",
      "                                                                      checkpoints = 15,        \n",
      "                                                                      recruitmenttime = 12, followuptime = 6, gammaalpha = 3, gammabeta = (6/log(2)), seed = 50)\n",
      ")"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "display_data",
       "text": [
        "   user  system elapsed \n",
        "  2.226   0.007   2.233 \n"
       ]
      }
     ],
     "prompt_number": 2
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "########################################################################################\n",
      "### Code to replicate R file, titled exponentialSurvivalFutilityStoppingNumEvents.R\n",
      "### Purpose: to see speed improvement of Python vs. R\n",
      "### Author: Ben Saville\n",
      "### Date: Feb 10, 2014\n",
      "########################################################################################\n",
      "\n",
      "from __future__ import division          ### Needed in Python 2 to get correct values if dividing by integers, e.g. 9/10 = 0.9, and not rounding down to 0\n",
      "\n",
      "import numpy as np\n",
      "import scipy as sc\n",
      "import scipy.stats as ss\n",
      "import math\n",
      "import pandas as pd\n",
      "import time\n",
      "\n",
      "from scipy.special import gammainc, gamma\n",
      "\n",
      "invgammacdf = lambda x, a, b: 1 - gammainc(a, b/x)/gamma(a)\n",
      "\n",
      "\n",
      "start_time = time.time()\n",
      "\n",
      "\n",
      "nreps = 100\n",
      "M = 100\n",
      "nmax = 48\n",
      "nominalProb = 0.92\n",
      "futilitybound = 0.25\n",
      "nullmedian = 3\n",
      "treatmentmedian = 3    \n",
      "recruitmenttime = 12\n",
      "followuptime = 6\n",
      "gammaalpha = 3\n",
      "gammabeta = (6/math.log(2))\n",
      "seed = 50\n",
      "checkpoints = [15]\n",
      "\n",
      "   \n",
      "log_2 = math.log(2)                               # Trying to calculate this outside the loop to save time\n",
      "nullmean = nullmedian/log_2                  # This converts the input in terms of the median of the distribution, into the mean, which some of the functions require.\n",
      "treatmentmean = treatmentmedian/log_2        # (The clinicians know the median values rather than the mean.)\n",
      "maxtime = recruitmenttime + followuptime     # This is the maximum time the trial will run, if there is no early stopping.\n",
      "\n",
      "####################################################################\n",
      "## For each complete trial, record final sample size, reason for stopping, time the trial ended, and conclusion of the trial.\n",
      "## finalsamplesize, reasonforstopping, finalPostProb\n",
      "# All of these make empty data structures to store results that are generated at each iteration of the loops. Pre-allocation of the structures, specifying the size is more efficient in R.\n",
      "finalsamplesize = np.array([np.nan]*nreps)             # For each of the \"nreps\" trials simulated, this records the sample size at which the trial ended. \n",
      "finalPostProb = np.array([np.nan]*nreps)             # Each trial is considered a success if the final posterior probability that theta is greater than the null value is greater than nominalProb. This stores all those probabilities. \n",
      "#finalThetaLower = rep(NA, nreps)              # This is for the lower bound of a confidence interval for theta, one for each of the simulated trials.\n",
      "#finalThetaUpper = rep(NA, nreps)\n",
      "stopearly = np.array([False]*nreps)    # This records whether each trial ended early for either futility or efficacy.\n",
      "endtime = np.array([np.nan]*nreps)                       # Records the length of time that the trial lasted. \n",
      "\n",
      "interimtime = np.zeros( (nreps,len(checkpoints)) )\n",
      "interimtime[:] = np.NAN   # Records the time point of each of the interim looks, for each simulated trial. (Since the trigger for doing an interim look is based on number of events, the time of each look can vary.)\n",
      "numberofevents = np.array([np.nan]*nreps)               # The number of events (deaths) in each trial.(Each patient in each trial has a simulated death time. They are only counted as an event if the death time is BEFORE the trial ends.)\n",
      "exposuretime = np.array([np.nan]*nreps)                  # The total time all the patients were in the study, found by summing each patient's time in the study, within one simulated trial.  \n",
      "numberofeventsInterim = np.array([np.nan]*nreps)         # The number of events at the last interim time point. (This should probaly be expanded to a matrix to record the number of events at EACH interim point.)\n",
      "exposuretimeInterim = np.array([np.nan]*nreps)           # The total exposure time at the time point of the last interim analysis. \n",
      "posteriorProbsInterim = np.array([np.nan]*nreps)         # The posterior probability that theta is bigger than the null value, at the interim time point.\n",
      "predprobsuccessInterim = np.array([np.nan]*nreps)        # The probability that the trial would be a success if continued until nmax, calculated based on data collected until the interim time point.\n",
      "theta_hat = np.array([np.nan]*nreps)                     # This will store the estimates of theta for each trial. \n",
      "\n",
      "dat_entrytime = np.linspace(start=0,stop=recruitmenttime,num=nmax) # Generates a uniform time for entry into the study for each patient. Note this is outside the main loop, so the entry times are the same for every trial in this case.   \n",
      "\n",
      "#reps = 0\n",
      "\n",
      "for reps in xrange(nreps):\n",
      "    #for(reps in seq_len(nreps)){                   # The main loop. Each iteration of this simulates a complete trial.\n",
      "    ## Generate data for all patients. These should be constant for one trial. \n",
      "    # rate=1/scale; This is the \"lifetime\" of each patient.\n",
      "    dat_times = np.random.exponential(scale=treatmentmean, size=nmax)     \n",
      "    # Calculate the time of death (in time since beginning of study) for each person\n",
      "    # This is the time of death. It takes into account when the entered the study.\n",
      "    dat_timeOfDeath = dat_entrytime + dat_times   \n",
      "    \n",
      "    # This ranks all of the death times to mark the person who died first, second, etc.\n",
      "    dat_deathOrder = ss.rankdata(dat_timeOfDeath)  \n",
      "    \n",
      "    ##########################################################################\n",
      "    ### sample size analysis (efficacy and futility)\n",
      "    # Loop through the interim looks in checkpoints\n",
      "    #i=0\n",
      "    for i in xrange(len(checkpoints)):\n",
      "        #for(i in seq_len(length(checkpoints))){        #  This is the \"inner\" loop, that goes through all of the checkpoints. At each one of these, an analysis of the data collected so far is conducted to determine whether to end the trial early for futility or efficacy.\n",
      "            #############################################################################\n",
      "            ##  Calculate CURRENT total exposure time and number of events, which are the\n",
      "            ##  sufficient statistics at the time of interim analysis.\n",
      "            ##  The simulated final times to events will be based on this.\n",
      "            #############################################################################  \n",
      "        # This is the \"current\" time for the current interim analysis, which is based on the death time of a particular patient.\n",
      "        lengthofstudy = dat_timeOfDeath[dat_deathOrder == checkpoints[i]] \n",
      "        interimtime[reps, i] = lengthofstudy\n",
      "        # This makes a subsetting criteria, which is those patients who have already entered the study.\n",
      "        subset = dat_entrytime < lengthofstudy                            \n",
      "        # The \"currentdata\" prefix indicates this group of patients who have already entered the study.\n",
      "        currentdata_entrytime = dat_entrytime[subset]                     \n",
      "        currentdata_times = dat_times[subset]\n",
      "        # Censoring time is the total time we had to observe the patient, from the time the patient entered until now.\n",
      "        currentdata_censoringtime = lengthofstudy - currentdata_entrytime \n",
      "        # The observedt is the (pairwise) minimum of the times and the censoringtime. time will be lower if they died before the current time \n",
      "        currentdata_observedt = np.minimum(currentdata_times, currentdata_censoringtime) \n",
      "        currentdata_deathOrder = dat_deathOrder[subset]\n",
      "        # Logical statement indicating whether each person died or not before the current time.\n",
      "        currentdata_delta = currentdata_deathOrder <= checkpoints[i]      \n",
      "        #currentdata_delta = abs(currentdata_observedt - currentdata_times) < .Machine$double.eps #This is an alternative to the last two lines, but it will take more time.\n",
      "        # Calculate the total exposure time for all patients in this trial so far.\n",
      "        exposuretimeInterim[reps] = exposuretimeTreatment = currentdata_observedt.sum() \n",
      "        # Calculate the number of deaths in this trial so far.\n",
      "        numberofeventsInterim[reps] = numberofeventsTreatment = currentdata_delta.sum()\n",
      "        posteriorProbsInterim[reps] = 1 - invgammacdf(x=nullmedian, a=(gammaalpha + numberofeventsTreatment), b=(log_2*gammabeta + log_2*exposuretimeTreatment)  )  # Calculate probability that theta is greater than the null median survival time. \n",
      "        # Note that this probability depends on the total number of events and the total exposure time.\n",
      "        #############################################################################################\n",
      "        ##  calculate predictive probability of success in maximum sample size (nmax) (plus followuptime additional follow up)\n",
      "        #################################################################################################   \n",
      "        posteriorProbs = np.array([np.nan]*M)\n",
      "        # Need to figure out who has already died at this time point to know who to simulate times for in the MC loop. This calculation is very similar to the previous calculations, but here \n",
      "        # we calculate the quantities for all the patients, not just those who have already entered the study. \n",
      "        dat_censoringtime = lengthofstudy - dat_entrytime\n",
      "        dat_observedt = np.minimum(dat_times, dat_censoringtime)\n",
      "        dat_delta = (dat_observedt == dat_times ) \n",
      "         # Those who haven't entered the study yet\n",
      "        dat_observedt[dat_censoringtime < 0] = 0                        \n",
      "        # Step 2: Simulate final time-to-event for each subject (who is still alive), conditional on the current censored time.\n",
      "        # This should be for all subjects in dat, to be used in step 4. \n",
      "        dat_newtime = np.array([np.nan]*nmax)\n",
      "        # newtime will be the new, imputed times vector. For those who have already died at this point. Those who have not will get imputed times.\n",
      "        #dat_newtime = np.array(['NaN']*nmax)                              \n",
      "        # Those who have already died \n",
      "        dat_newtime[(dat_delta)] = dat_observedt[dat_delta]                \n",
      "\n",
      "        for iter in xrange(M):\n",
      "            # This is the MC loop (MC for Monte Carlo simulation)\n",
      "            #for(iter in seq_len(M)){                                          \n",
      "              # Step 1: Simulate hazard rate based on current data.\n",
      "            hazardTreatment = np.random.gamma(gammaalpha + numberofeventsTreatment, scale=1/(gammabeta + exposuretimeTreatment), size=1)\n",
      "            # Generates a random variable from the specified gamma probability distribution.\n",
      "            # Step 2: Simulate final time-to-event for each subject (who is still alive), conditional on the current censored time.\n",
      "            # This should be for all subjects in dat, to be used in step 4. \n",
      "            dat_newtime[-dat_delta] = dat_observedt[-dat_delta] + np.random.exponential(scale=1/hazardTreatment, size=(-dat_delta).sum())  # For the patients that haven't died yet, their new time is their old time plus a random time drawn from the exponential distribution.\n",
      "            # Step4: (Stopping for futility) Bayesian \"test\" for maximum sample size and an additional followuptime\n",
      "            # maxtime is like the temporary length of study\n",
      "            mc_censoringtime = maxtime - dat_entrytime                \n",
      "            # Using the simulated times from step 2. (?)\n",
      "            mc_observedt = np.minimum(dat_newtime, mc_censoringtime)    \n",
      "            # delta indicates whether the patient died. 0 indicates the time was censored.\n",
      "            mc_delta = (mc_observedt == dat_newtime)                    \n",
      "            tempexposuretime = mc_observedt.sum()\n",
      "            tempnumberofevents = mc_delta.sum()\n",
      "            posteriorProbs[iter] = 1 - invgammacdf(x=nullmedian, \n",
      "                                                a=(gammaalpha + tempnumberofevents), \n",
      "                                                b=(log_2*gammabeta + log_2*tempexposuretime)  )     # This is the \"analysis\" for this set of fake data.\n",
      "            #}                                                         # end of MC reps (iter in 1:M)\n",
      "        #posteriorProbs\n",
      "        #####################################################################\n",
      "        predprobsuccessInterim[reps] = predprobsuccessMax = (posteriorProbs > nominalProb).sum()/M  #Gives percent of trials in the MC simulation that had P(theta > nominalProb), theta = median\n",
      "        if(predprobsuccessMax < futilitybound):                       # This is the criteria for stopping early for futility. If the criteria is NOT satisfied, it goes to the next interim checkpoint (or final analysis if this is the last checkpoint).\n",
      "            stopearly[reps] = True\n",
      "          #cat(paste(\"stopearly = \", stopearly[reps], \"\\t\"))\n",
      "            break                                                      # This breaks out of the checkpoints loop\n",
      "                                                                      # end of if()\n",
      "        #}                                                                # end of i in 1:length(checkpoints)\n",
      "\n",
      "    # Record final sample size and the ending time of the study.\n",
      "    if(stopearly[reps]):\n",
      "        finalsamplesize[reps] = subset.sum()\n",
      "        endtime[reps] = lengthofstudy\n",
      "        finalPostProb[reps] = np.nan\n",
      "        exposuretime[reps] = exposuretimeTreatment\n",
      "        numberofevents[reps] = numberofeventsTreatment\n",
      "    else:                                                           \n",
      "        finalsamplesize[reps] = nmax\n",
      "        endtime[reps] = maxtime\n",
      "    ###########################################################\n",
      "    ###########################################################\n",
      "    ### Final analysis: bayesian                               # This only happens if the trial was not stopped for futility.\n",
      "    # If the trial wasn't stopped for futility, the time of this final analysis would be at maxtime, which includes followuptime.\n",
      "        dat_censoringtime = maxtime - dat_entrytime\n",
      "        dat_observedt = np.minimum(dat_times, dat_censoringtime)\n",
      "        dat_delta = (dat_observedt == dat_times )                       # delta indicates dead. 0 indicates the time was censored.\n",
      "        numberofevents[reps] = dat_delta.sum()\n",
      "        exposuretime[reps] = dat_observedt.sum()\n",
      "        finalPostProb[reps] = 1 - ss.invgamma.cdf(x=nullmedian, a=(gammaalpha + numberofevents[reps]), \n",
      "                            scale=(log_2*gammabeta + log_2*exposuretime[reps])  )\n",
      "    #finalThetaLower[reps] = qinvgamma(p = 0.05, shape = (gammaalpha + numberofevents[reps]), \n",
      "    #   rate = (log_2*gammabeta + log_2*exposuretime[reps]), lower.tail = TRUE)\n",
      "    #finalThetaUpper[reps] = qinvgamma(p = 0.05, shape = (gammaalpha + numberofevents[reps]), \n",
      "    #   rate = (log_2*gammabeta + log_2*exposuretime[reps]), lower.tail = FALSE)\n",
      "    ##############################################################\n",
      "    ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%\n",
      "        #}                                                            # end of else\n",
      "    # Get estimate of theta, the median PFS. The estimator we are using is the mean of the posterior distribution of theta.\n",
      "    theta_hat[reps] = (log_2*gammabeta + log_2*exposuretime[reps])/(gammaalpha + numberofevents[reps] - 1)\n",
      "    #qinvgamma(p = 0.5, shape = (gammaalpha + numberofevents[reps]), rate = (log_2*gammabeta + log_2*exposuretime[reps])) This gives the median of the posterior distribution. \n",
      "\n",
      "    #if(verbose == TRUE):\n",
      "    #    print([reps, stopearly[reps]])                      # This displays the current number of repetitions on the screen.\n",
      "    #}                                                                # end of reps in 1:nreps\n",
      "# This part makes several objects for the output. \n",
      "results = np.matrix((stopearly, finalsamplesize, endtime, numberofeventsInterim, numberofevents, exposuretimeInterim, exposuretime,\n",
      "                  posteriorProbsInterim, predprobsuccessInterim, \n",
      "                  finalPostProb)).T \n",
      "#interimTimesSummary = data.frame(lookNumber = seq_len(length(checkpoints)), numberOfEvents = checkpoints, meanTime = colMeans(interimtime, na.rm = TRUE), \n",
      "#                              sd = apply(interimtime, FUN = sd, MARGIN = 2, na.rm = TRUE))\n",
      "# output = pd.dataframe('trueMedian': treatmentmedian,\n",
      "#                  'nullMedian': nullmedian, \n",
      "#                  'futilityBound': futilitybound, \n",
      "#                  'avgTime': np.mean(endtime), \n",
      "#                  'avgEvents': np.mean(numberofevents),\n",
      "#                  'avgExposure': np.mean(exposuretime),\n",
      "#                  'E(N)': np.mean(finalsamplesize), \n",
      "#                  'PET': np.mean(stopearly == \"Yes\"),\n",
      "#                  'power': np.mean(finalPostProb > nominalProb),\n",
      "#                  'numberOfTrials': nreps,\n",
      "#                  'numberOfSim': M)\n",
      "output = pd.Series( (treatmentmedian, nullmedian, futilitybound,np.mean(endtime), np.mean(numberofevents),\n",
      "np.mean(exposuretime),np.mean(finalsamplesize), \n",
      "np.mean(stopearly == \"Yes\"),\n",
      "(finalPostProb[finalPostProb!=\"NaN\"] > nominalProb).sum() / nreps,\n",
      "nreps, M), index=['truemedian','nullmedian','futilityBound','avgTime',\n",
      "'avgEvents','avgExposure','E(N)','PET','power','numberOfTrials','numberOfSim'])\n",
      "                 # np.mean(endtime), \n",
      "                 # np.mean(numberofevents),\n",
      "                 # np.mean(exposuretime),\n",
      "                 # np.mean(finalsamplesize), \n",
      "                 # np.mean(stopearly == \"Yes\"),\n",
      "                 # np.mean(finalPostProb > nominalProb),\n",
      "                 # nreps,\n",
      "                 # M)\n",
      "# bias = data.frame(trueMedian = treatmentmedian, \n",
      "#                nullMedian = nullmedian,\n",
      "#                \"number_of_looks\" = length(checkpoints),\n",
      "#                \"mean(theta_hat)\" = mean(theta_hat[finalsamplesize == nmax]), \n",
      "#                \"median(theta_hat)\" = median(theta_hat[finalsamplesize == nmax]), \n",
      "#                \"theta 0.025 percentile\" = quantile(theta_hat[finalsamplesize == nmax], probs = 0.025, names = FALSE),\n",
      "#                \"theta 0.975 percentile\" = quantile(theta_hat[finalsamplesize == nmax], probs = 0.975, names = FALSE),\n",
      "#                \"bias\" = mean(theta_hat[finalsamplesize == nmax]) - treatmentmedian,\n",
      "#                \"sd(bias)\" = sd(theta_hat[finalsamplesize == nmax] - treatmentmedian))\n",
      "# This is what is to be returned by this function. Each of them has a name so that the user can refer to any part of the output he wants.\n",
      "#return output\n",
      "#return([output])list(results = results, output = output, bias = bias, interimTimesSummary = interimTimesSummary, theta_hat = theta_hat))\n",
      "#}                                                                   # end of function exponentialSurvivalFutilityStoppingNumEvents\n",
      "\n",
      "output\n",
      "\n",
      "#outputsave=output\n",
      "\n",
      "print time.time() - start_time, \"seconds\""
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "1.11642193794 seconds\n"
       ]
      }
     ],
     "prompt_number": 4
    }
   ],
   "metadata": {}
  }
 ]
}