# grosscol/impute_missing.R

## impute_missing.R
library(dplyr)
# Example input: 24 observations of `val`, keyed by ID and Date.
# The literal stores ID and Date as factors (integer codes + .Label); Date is
# converted to class Date right after construction so that min(), max() and
# seq() on it work in the steps that follow.
# Codes are laid out one ID run per line: xx (6), yy (5), zz (9), xyz (4).
df <- structure(
  list(
    ID = structure(
      c(
        1L, 1L, 1L, 1L, 1L, 1L,
        3L, 3L, 3L, 3L, 3L,
        4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
        2L, 2L, 2L, 2L
      ),
      .Label = c("xx", "xyz", "yy", "zz"),
      class = "factor"
    ),
    Date = structure(
      c(
        8L, 9L, 10L, 11L, 12L, 13L,
        14L, 15L, 16L, 17L, 18L,
        1L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 19L,
        20L, 21L, 22L, 23L
      ),
      .Label = c(
        "1989-09-12", "1989-09-13", "1989-09-14", "1989-09-19",
        "1989-09-23", "1990-01-12", "1990-01-13", "1996-09-12",
        "1996-09-13", "1996-09-16", "1996-09-17", "1996-09-18",
        "1996-09-19", "2000-09-12", "2000-09-13", "2000-11-10",
        "2000-11-11", "2000-11-12", "2001-09-07", "2001-09-08",
        "2001-09-09", "2001-09-10", "2001-09-11"
      ),
      class = "factor"
    ),
    # The last four values (the "xyz" rows) are missing in the input itself.
    val = c(
      3, 5, 9, 3, 5, 6,
      8, 7, 9, 5, 3,
      2, 8, 8, 5, 3, 2, 1, 5, 7,
      NA, NA, NA, NA
    )
  ),
  .Names = c("ID", "Date", "val"),
  # Compact form for 24 automatic row names.
  row.names = c(NA, -24L),
  class = "data.frame"
)

# Must be a separate statement: `df` does not exist until structure() returns.
df$Date <- as.Date(df$Date)

# Observation window per ID: earliest (dmin) and latest (dmax) Date in df.
ranges_df <- summarize(
  group_by(df, ID),
  dmin = min(Date),
  dmax = max(Date)
)

# Expand each ID's [dmin, dmax] window into one row per calendar day.
# Inside do(), `.` is the current group's slice of ranges_df.
alldays <- ranges_df %>%
  group_by(ID) %>%
  do(data.frame(Date = seq(.$dmin, .$dmax, by = "1 day")))

# Attach the observed values to the full day grid; days with no matching row
# in df come back with val = NA. The keys are spelled out so the join does not
# rely on (or print a message about) guessing the shared columns.
imputed_df <- left_join(alldays, df, by = c("ID", "Date"))

# Per-ID report: number of rows in the day grid, how many of them have no
# value, and that share as a percentage. n() counts rows, so a day with
# several observations in df is counted once per observation.
summarize(
  group_by(imputed_df, ID),
  total = n(),
  missing = sum(is.na(val)),
  percent_missing = missing / total * 100
)

# NOTE(review): these statements repeat the day expansion, join and summary
# that already ran just above; consider keeping only one copy.

# One row per calendar day between each ID's dmin and dmax.
alldays <- ranges_df %>%
  group_by(ID) %>%
  do(data.frame(Date = seq(.$dmin, .$dmax, by = "1 day")))

# Days without a matching row in df get val = NA. Keys are explicit so the
# join does not depend on guessing the shared columns.
imputed_df <- left_join(alldays, df, by = c("ID", "Date"))

# Per-ID row count, rows with missing val, and the percentage missing.
imputed_df %>%
  group_by(ID) %>%
  summarize(
    total = n(),
    missing = sum(is.na(val)),
    percent_missing = missing / total * 100
  )
	library(dplyr)
	# Example input: 24 observations of `val`, keyed by ID and Date.
	# The literal stores ID and Date as factors (integer codes + .Label); Date
	# is converted to class Date right after construction so that min(), max()
	# and seq() on it work in the steps that follow.
	# Codes are laid out one ID run per line: xx (6), yy (5), zz (9), xyz (4).
	df <- structure(
	  list(
	    ID = structure(
	      c(
	        1L, 1L, 1L, 1L, 1L, 1L,
	        3L, 3L, 3L, 3L, 3L,
	        4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
	        2L, 2L, 2L, 2L
	      ),
	      .Label = c("xx", "xyz", "yy", "zz"),
	      class = "factor"
	    ),
	    Date = structure(
	      c(
	        8L, 9L, 10L, 11L, 12L, 13L,
	        14L, 15L, 16L, 17L, 18L,
	        1L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 19L,
	        20L, 21L, 22L, 23L
	      ),
	      .Label = c(
	        "1989-09-12", "1989-09-13", "1989-09-14", "1989-09-19",
	        "1989-09-23", "1990-01-12", "1990-01-13", "1996-09-12",
	        "1996-09-13", "1996-09-16", "1996-09-17", "1996-09-18",
	        "1996-09-19", "2000-09-12", "2000-09-13", "2000-11-10",
	        "2000-11-11", "2000-11-12", "2001-09-07", "2001-09-08",
	        "2001-09-09", "2001-09-10", "2001-09-11"
	      ),
	      class = "factor"
	    ),
	    # The last four values (the "xyz" rows) are missing in the input itself.
	    val = c(
	      3, 5, 9, 3, 5, 6,
	      8, 7, 9, 5, 3,
	      2, 8, 8, 5, 3, 2, 1, 5, 7,
	      NA, NA, NA, NA
	    )
	  ),
	  .Names = c("ID", "Date", "val"),
	  # Compact form for 24 automatic row names.
	  row.names = c(NA, -24L),
	  class = "data.frame"
	)

	# Must be a separate statement: it reads the `df` just built above.
	df$Date <- as.Date(df$Date)

	# Observation window per ID: earliest (dmin) and latest (dmax) Date in df.
	ranges_df <- summarize(
	  group_by(df, ID),
	  dmin = min(Date),
	  dmax = max(Date)
	)

	# Expand each ID's [dmin, dmax] window into one row per calendar day.
	# Inside do(), `.` is the current group's slice of ranges_df.
	alldays <- ranges_df %>%
	  group_by(ID) %>%
	  do(data.frame(Date = seq(.$dmin, .$dmax, by = "1 day")))

	# Attach the observed values to the full day grid; days with no matching
	# row in df come back with val = NA. The keys are spelled out so the join
	# does not rely on (or print a message about) guessing the shared columns.
	imputed_df <- left_join(alldays, df, by = c("ID", "Date"))

	# Per-ID report: number of rows in the day grid, how many of them have no
	# value, and that share as a percentage. n() counts rows, so a day with
	# several observations in df is counted once per observation.
	summarize(
	  group_by(imputed_df, ID),
	  total = n(),
	  missing = sum(is.na(val)),
	  percent_missing = missing / total * 100
	)

	# NOTE(review): these statements repeat the day expansion, join and summary
	# that already ran just above; consider keeping only one copy.

	# One row per calendar day between each ID's dmin and dmax.
	alldays <- ranges_df %>%
	  group_by(ID) %>%
	  do(data.frame(Date = seq(.$dmin, .$dmax, by = "1 day")))

	# Days without a matching row in df get val = NA. Keys are explicit so the
	# join does not depend on guessing the shared columns.
	imputed_df <- left_join(alldays, df, by = c("ID", "Date"))

	# Per-ID row count, rows with missing val, and the percentage missing.
	imputed_df %>%
	  group_by(ID) %>%
	  summarize(
	    total = n(),
	    missing = sum(is.na(val)),
	    percent_missing = missing / total * 100
	  )