wetherc/groupByCoverageWindow.R

## groupByCoverageWindow.R
library(data.table)

################################################################################
#
# Generate some dummy data
#
################################################################################

data <- data.frame(
  policyId = "A",
  rowId = c(1:5),
  pni_startDate = as.Date(
    c(
      "2015-01-01",
      "2015-05-31",
      "2015-07-08",
      "2016-01-10",
      "2016-10-15"
    )
  ),
  pni_endDate = as.Date(
    c(
      "2015-06-01",
      "2015-07-08",
      "2016-01-08",
      "2016-10-15",
      "2017-10-15"
    )
  ),
  stringsAsFactors = FALSE
)

data <- rbind(data, data)
data[6:10, 1] <- "B"
data[data$policyId == "B", 3:4] <- data[data$policyId == "B", 3:4] + 1

View(data)


################################################################################
#
# Lead the policy start dates and determine whether we see a
# discontinuity in coverage. We're looking for something like
#
# | Person ID | Policy ID | Start Date |  End Date  |
# |-----------|-----------|------------|------------|
# | A         | 1         | 2015-01-01 | 2015-07-01 | --+
# | A         | 2         | 2015-07-01 | 2016-01-01 |   + These policies overlap
# | A         | 3         | 2015-12-31 | 2016-04-01 | --+
#
#                         There's a break in coverage from
#                         2016-04-01 through 2016-05-01
#
# | A         | 4         | 2016-05-01 | 2016-11-01 | --+
# | A         | 5         | 2016-11-01 | 2017-05-01 | --+ These policies overlap
#
#
# Person A had two periods of continuous coverage: from
# 2015-01-01 through 2016-04-01, and from 2016-05-01 through
# 2017-05-01. I want to get just those beginning and end dates
# for the two periods of continuous coverage.
#
# The first step being to determine where the breaks in coverage
# occur; we're doing that with the lead operation here --- it just
# appends a new column to the dataset containing the start date of
# the policy on the next row and says, "Yup, it's continuous" if
# the delta between the two dates is <= 0.
#
################################################################################

data <- data.table(data)
data <- data[,
  nextPolicyStartDate := shift(
    pni_startDate,
    n = 1L,
    fill = NA,
    type = "lead"
  ), by = policyId
]

data$isContiguous <- ifelse(
  data$nextPolicyStartDate - data$pni_endDate <= 0,
  TRUE,
  FALSE
)

View(data)


################################################################################
#
# Get the max policy end date for each period of continuous coverage
# for each policy in the dataset.
#
# This is a terrible way to do anything and I should be
# ashamed of it.
#
################################################################################

data$policyGroupEndDate <- as.Date(NA)
data$policyGroupId <- as.integer(NA)

j <- 0L

# For each row in our data
for(i in 1:nrow(data)) {
  # Skip over all rows except the very first
  # for each set of rows in which coverage is
  # continuous
  if(i <= j) next

  # Record the starting point
  j <<- i

  # Increment our counter by 1 for every policy
  # until we see a break in coverage
  while(data[j, "isContiguous"] %in% TRUE) {
    j <<- j + 1
  }

  # From our starting row through j, record the end date of
  # the policy, j, that caused a break in coverage
  # along with its row number so we can uniquely
  # identify each coverage group later on
  data[i:j, "policyGroupEndDate"] <- data[j, "pni_endDate"]
  data[i:j, "policyGroupId"] <- j
}

View(data)


################################################################################
#
# Return only the initial policy in each coverage
# group (i.e., the one with the earliest start date)
#
################################################################################

setkey(data, policyGroupId)
data <- data[,
  n := frankv(pni_startDate, order = 1L, ties.method = "first"),
  by = key(data)
][n == 1]

View(data)
	library(data.table)

	################################################################################
	#
	# Generate some dummy data
	#
	################################################################################

	data <- data.frame(
	policyId = "A",
	rowId = c(1:5),
	pni_startDate = as.Date(
	c(
	"2015-01-01",
	"2015-05-31",
	"2015-07-08",
	"2016-01-10",
	"2016-10-15"
	)
	),
	pni_endDate = as.Date(
	c(
	"2015-06-01",
	"2015-07-08",
	"2016-01-08",
	"2016-10-15",
	"2017-10-15"
	)
	),
	stringsAsFactors = FALSE
	)

	data <- rbind(data, data)
	data[6:10, 1] <- "B"
	data[data$policyId == "B", 3:4] <- data[data$policyId == "B", 3:4] + 1

	View(data)


	################################################################################
	#
	# Lead the policy start dates and determine whether we see a
	# discontinuity in coverage. We're looking for something like
	#
	# \| Person ID \| Policy ID \| Start Date \| End Date \|
	# \|-----------\|-----------\|------------\|------------\|
	# \| A \| 1 \| 2015-01-01 \| 2015-07-01 \| --+
	# \| A \| 2 \| 2015-07-01 \| 2016-01-01 \| + These policies overlap
	# \| A \| 3 \| 2015-12-31 \| 2016-04-01 \| --+
	#
	# There's a break in coverage from
	# 2016-04-01 through 2016-05-01
	#
	# \| A \| 4 \| 2016-05-01 \| 2016-11-01 \| --+
	# \| A \| 5 \| 2016-11-01 \| 2017-05-01 \| --+ These policies overlap
	#
	#
	# Person A had two periods of continuous coverage: from
	# 2015-01-01 through 2016-04-01, and from 2016-05-01 through
	# 2017-05-01. I want to get just those beginning and end dates
	# for the two periods of continuous coverage.
	#
	# The first step being to determine where the breaks in coverage
	# occur; we're doing that with the lead operation here --- it just
	# appends a new column to the dataset containing the start date of
	# the policy on the next row and says, "Yup, it's continuous" if
	# the delta between the two dates is <= 0.
	#
	################################################################################

	data <- data.table(data)
	data <- data[,
	nextPolicyStartDate := shift(
	pni_startDate,
	n = 1L,
	fill = NA,
	type = "lead"
	), by = policyId
	]

	data$isContiguous <- ifelse(
	data$nextPolicyStartDate - data$pni_endDate <= 0,
	TRUE,
	FALSE
	)

	View(data)


	################################################################################
	#
	# Get the max policy end date for each period of continuous coverage
	# for each policy in the dataset.
	#
	# This is a terrible way to do anything and I should be
	# ashamed of it.
	#
	################################################################################

	data$policyGroupEndDate <- as.Date(NA)
	data$policyGroupId <- as.integer(NA)

	j <- 0L

	# For each row in our data
	for(i in 1:nrow(data)) {
	# Skip over all rows except the very first
	# for each set of rows in which coverage is
	# continuous
	if(i <= j) next

	# Record the starting point
	j <<- i

	# Increment our counter by 1 for every policy
	# until we see a break in coverage
	while(data[j, "isContiguous"] %in% TRUE) {
	j <<- j + 1
	}

	# From our starting row through j, record the end date of
	# the policy, j, that caused a break in coverage
	# along with its row number so we can uniquely
	# identify each coverage group later on
	data[i:j, "policyGroupEndDate"] <- data[j, "pni_endDate"]
	data[i:j, "policyGroupId"] <- j
	}

	View(data)


	################################################################################
	#
	# Return only the initial policy in each coverage
	# group (i.e., the one with the earliest start date)
	#
	################################################################################

	setkey(data, policyGroupId)
	data <- data[,
	n := frankv(pni_startDate, order = 1L, ties.method = "first"),
	by = key(data)
	][n == 1]

	View(data)