Skip to content

Instantly share code, notes, and snippets.

@grosscol
Created January 23, 2018 20:54
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save grosscol/09ca5a4e48f1adfe4968c4d4a7652ee5 to your computer and use it in GitHub Desktop.
Save grosscol/09ca5a4e48f1adfe4968c4d4a7652ee5 to your computer and use it in GitHub Desktop.
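Impute missing dates per ID: expand each ID's observations to a complete daily sequence and report the percentage of days with no value.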
library(dplyr)
# Example data: one row per observation, with a factor ID, the observation
# date, and a numeric reading.
#
# The frame is built with data.frame() and a real Date column. The previous
# structure() call referenced `df` inside its own row.names argument (before
# `df` existed) and left Date as a factor, which min(), max() and seq() cannot
# work with.
#
# Things worth knowing about the data:
#   * ID "zz" has two rows for 1989-09-12.
#   * All four rows of ID "xyz" have val = NA.
#   * Factor levels are listed explicitly (alphabetical), not in order of
#     appearance.
df <- data.frame(
  ID = factor(
    rep(c("xx", "yy", "zz", "xyz"), times = c(6L, 5L, 9L, 4L)),
    levels = c("xx", "xyz", "yy", "zz")
  ),
  Date = as.Date(c(
    # xx
    "1996-09-12", "1996-09-13", "1996-09-16",
    "1996-09-17", "1996-09-18", "1996-09-19",
    # yy
    "2000-09-12", "2000-09-13", "2000-11-10", "2000-11-11", "2000-11-12",
    # zz
    "1989-09-12", "1989-09-12", "1989-09-13", "1989-09-14", "1989-09-19",
    "1989-09-23", "1990-01-12", "1990-01-13", "2001-09-07",
    # xyz
    "2001-09-08", "2001-09-09", "2001-09-10", "2001-09-11"
  )),
  val = c(
    # xx
    3, 5, 9, 3, 5, 6,
    # yy
    8, 7, 9, 5, 3,
    # zz
    2, 8, 8, 5, 3, 2, 1, 5, 7,
    # xyz
    NA, NA, NA, NA
  )
)
# First (dmin) and last (dmax) observation date for every ID; one row per ID.
ranges_df <- summarize(
  group_by(df, ID),
  dmin = min(Date),
  dmax = max(Date)
)
# Expand every ID to one row per calendar day between its first and last
# observation. ranges_df holds one row per ID, so inside do() `.` is that
# ID's single (dmin, dmax) row.
alldays <- ranges_df %>%
  group_by(ID) %>%
  do(data.frame(Date = seq(.$dmin, .$dmax, by = "1 day")))

# Attach the observed readings to the full calendar; days with no matching
# observation get val = NA. The join keys are spelled out so that a column
# added to either table later cannot silently change what is matched.
imputed_df <- left_join(alldays, df, by = c("ID", "Date"))

# Per-ID report: number of rows, number of missing readings, and the
# percentage missing. A reading that is NA in df counts as missing too, and a
# date that occurs more than once in df contributes one row per occurrence to
# `total`.
# This pipeline used to appear twice in a row, recomputing identical objects
# and printing the report twice; it now runs once.
imputed_df %>%
  group_by(ID) %>%
  summarize(
    total = n(),
    missing = sum(is.na(val)),
    percent_missing = missing / total * 100
  )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment