Skip to content

Instantly share code, notes, and snippets.

@wetherc
Last active September 27, 2017 17:03
Show Gist options
  • Save wetherc/95f713a06939eca85265b8b5db55f803 to your computer and use it in GitHub Desktop.
Save wetherc/95f713a06939eca85265b8b5db55f803 to your computer and use it in GitHub Desktop.
Group policies by periods of continuous coverage
library(data.table)
################################################################################
#
# Generate some dummy data
#
################################################################################
data <- data.frame(
policyId = "A",
rowId = c(1:5),
pni_startDate = as.Date(
c(
"2015-01-01",
"2015-05-31",
"2015-07-08",
"2016-01-10",
"2016-10-15"
)
),
pni_endDate = as.Date(
c(
"2015-06-01",
"2015-07-08",
"2016-01-08",
"2016-10-15",
"2017-10-15"
)
),
stringsAsFactors = FALSE
)
data <- rbind(data, data)
data[6:10, 1] <- "B"
data[data$policyId == "B", 3:4] <- data[data$policyId == "B", 3:4] + 1
View(data)
################################################################################
#
# Lead the policy start dates and determine whether we see a
# discontinuity in coverage. We're looking for something like
#
# | Person ID | Policy ID | Start Date | End Date |
# |-----------|-----------|------------|------------|
# | A | 1 | 2015-01-01 | 2015-07-01 | --+
# | A | 2 | 2015-07-01 | 2016-01-01 | + These policies overlap
# | A | 3 | 2015-12-31 | 2016-04-01 | --+
#
# There's a break in coverage from
# 2016-04-01 through 2016-05-01
#
# | A | 4 | 2016-05-01 | 2016-11-01 | --+
# | A | 5 | 2016-11-01 | 2017-05-01 | --+ These policies overlap
#
#
# Person A had two periods of continuous coverage: from
# 2015-01-01 through 2016-04-01, and from 2016-05-01 through
# 2017-05-01. I want to get just those beginning and end dates
# for the two periods of continuous coverage.
#
# The first step being to determine where the breaks in coverage
# occur; we're doing that with the lead operation here --- it just
# appends a new column to the dataset containing the start date of
# the policy on the next row and says, "Yup, it's continuous" if
# the delta between the two dates is <= 0.
#
################################################################################
data <- data.table(data)
data <- data[,
nextPolicyStartDate := shift(
pni_startDate,
n = 1L,
fill = NA,
type = "lead"
), by = policyId
]
data$isContiguous <- ifelse(
data$nextPolicyStartDate - data$pni_endDate <= 0,
TRUE,
FALSE
)
View(data)
################################################################################
#
# Get the max policy end date for each period of continuous coverage
# for each policy in the dataset.
#
# This is a terrible way to do anything and I should be
# ashamed of it.
#
################################################################################
data$policyGroupEndDate <- as.Date(NA)
data$policyGroupId <- as.integer(NA)
j <- 0L
# For each row in our data
for(i in 1:nrow(data)) {
# Skip over all rows except the very first
# for each set of rows in which coverage is
# continuous
if(i <= j) next
# Record the starting point
j <<- i
# Increment our counter by 1 for every policy
# until we see a break in coverage
while(data[j, "isContiguous"] %in% TRUE) {
j <<- j + 1
}
# From our starting row through j, record the end date of
# the policy, j, that caused a break in coverage
# along with its row number so we can uniquely
# identify each coverage group later on
data[i:j, "policyGroupEndDate"] <- data[j, "pni_endDate"]
data[i:j, "policyGroupId"] <- j
}
View(data)
################################################################################
#
# Return only the initial policy in each coverage
# group (i.e., the one with the earliest start date)
#
################################################################################
setkey(data, policyGroupId)
data <- data[,
n := frankv(pni_startDate, order = 1L, ties.method = "first"),
by = key(data)
][n == 1]
View(data)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment