Last active
September 27, 2017 17:03
-
-
Save wetherc/95f713a06939eca85265b8b5db55f803 to your computer and use it in GitHub Desktop.
Group policies by periods of continuous coverage
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(data.table) | |
################################################################################ | |
# | |
# Generate some dummy data | |
# | |
################################################################################ | |
data <- data.frame( | |
policyId = "A", | |
rowId = c(1:5), | |
pni_startDate = as.Date( | |
c( | |
"2015-01-01", | |
"2015-05-31", | |
"2015-07-08", | |
"2016-01-10", | |
"2016-10-15" | |
) | |
), | |
pni_endDate = as.Date( | |
c( | |
"2015-06-01", | |
"2015-07-08", | |
"2016-01-08", | |
"2016-10-15", | |
"2017-10-15" | |
) | |
), | |
stringsAsFactors = FALSE | |
) | |
data <- rbind(data, data) | |
data[6:10, 1] <- "B" | |
data[data$policyId == "B", 3:4] <- data[data$policyId == "B", 3:4] + 1 | |
View(data) | |
################################################################################ | |
# | |
# Lead the policy start dates and determine whether we see a | |
# discontinuity in coverage. We're looking for something like | |
# | |
# | Person ID | Policy ID | Start Date | End Date | | |
# |-----------|-----------|------------|------------| | |
# | A | 1 | 2015-01-01 | 2015-07-01 | --+ | |
# | A | 2 | 2015-07-01 | 2016-01-01 | + These policies overlap | |
# | A | 3 | 2015-12-31 | 2016-04-01 | --+ | |
# | |
# There's a break in coverage from | |
# 2016-04-01 through 2016-05-01 | |
# | |
# | A | 4 | 2016-05-01 | 2016-11-01 | --+ | |
# | A | 5 | 2016-11-01 | 2017-05-01 | --+ These policies overlap | |
# | |
# | |
# Person A had two periods of continuous coverage: from | |
# 2015-01-01 through 2016-04-01, and from 2016-05-01 through | |
# 2017-05-01. I want to get just those beginning and end dates | |
# for the two periods of continuous coverage. | |
# | |
# The first step being to determine where the breaks in coverage | |
# occur; we're doing that with the lead operation here --- it just | |
# appends a new column to the dataset containing the start date of | |
# the policy on the next row and says, "Yup, it's continuous" if | |
# the delta between the two dates is <= 0. | |
# | |
################################################################################ | |
data <- data.table(data) | |
data <- data[, | |
nextPolicyStartDate := shift( | |
pni_startDate, | |
n = 1L, | |
fill = NA, | |
type = "lead" | |
), by = policyId | |
] | |
data$isContiguous <- ifelse( | |
data$nextPolicyStartDate - data$pni_endDate <= 0, | |
TRUE, | |
FALSE | |
) | |
View(data) | |
################################################################################ | |
# | |
# Get the max policy end date for each period of continuous coverage | |
# for each policy in the dataset. | |
# | |
# This is a terrible way to do anything and I should be | |
# ashamed of it. | |
# | |
################################################################################ | |
data$policyGroupEndDate <- as.Date(NA) | |
data$policyGroupId <- as.integer(NA) | |
j <- 0L | |
# For each row in our data | |
for(i in 1:nrow(data)) { | |
# Skip over all rows except the very first | |
# for each set of rows in which coverage is | |
# continuous | |
if(i <= j) next | |
# Record the starting point | |
j <<- i | |
# Increment our counter by 1 for every policy | |
# until we see a break in coverage | |
while(data[j, "isContiguous"] %in% TRUE) { | |
j <<- j + 1 | |
} | |
# From our starting row through j, record the end date of | |
# the policy, j, that caused a break in coverage | |
# along with its row number so we can uniquely | |
# identify each coverage group later on | |
data[i:j, "policyGroupEndDate"] <- data[j, "pni_endDate"] | |
data[i:j, "policyGroupId"] <- j | |
} | |
View(data) | |
################################################################################ | |
# | |
# Return only the initial policy in each coverage | |
# group (i.e., the one with the earliest start date) | |
# | |
################################################################################ | |
setkey(data, policyGroupId) | |
data <- data[, | |
n := frankv(pni_startDate, order = 1L, ties.method = "first"), | |
by = key(data) | |
][n == 1] | |
View(data) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment