Last active
August 15, 2019 13:11
-
-
Save sharlagelfand/970bd5a2113d710725d13d09113beb7a to your computer and use it in GitHub Desktop.
using testthat on data cleaning/derivation steps to ensure that they work as expected
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Deriving the "overall working status" of someone's employment positions | |
# in Ontario, where it is the highest of Full Time > Part Time > Casual. | |
# e.g. if someone has a full time and a part time position, their overall | |
# working status is full time. If they have two part time positions, it's part time. | |
library(dplyr)

# Sample positions, one row per position held. The table is built
# column-wise: the i-th entries of the three vectors describe one position.
sample_df <- tibble::tibble(
  id = c(1, 1, 2, 2, 3, 4, 4, 5, 5),
  working_status = c(
    "Full Time", "Casual", "Full Time", "Part Time", "Casual",
    "Casual", "Full Time", "Part Time", "Part Time"
  ),
  province = c(
    "Ontario", "Ontario", "Alberta", "Ontario", "Ontario",
    "Alberta", "Alberta", "Ontario", "Ontario"
  )
)
# Rather than doing something like this:
# write the derivation inline, run it, and spot-check the result by hand...
# Ad-hoc derivation of each id's highest working status.
# This version does not filter on province; the spot check for ID 2 below
# shows the consequence.
df_with_overall_working_status <- sample_df %>%
  # Rank each status by its position: Casual = 1, Part Time = 2, Full Time = 3.
  mutate(
    status_rank = match(working_status, c("Casual", "Part Time", "Full Time"))
  ) %>%
  group_by(id) %>%
  # Keep only the position(s) that hold the top rank within each id.
  filter(status_rank == max(status_rank)) %>%
  ungroup() %>%
  # Rename the status column and collapse tied positions into one row per id.
  distinct(id, overall_working_status = working_status)
# Check that it works by spot checking each id... | |
# Check that ID 1 is full time | |
filter(df_with_overall_working_status, id == 1)
#> # A tibble: 1 x 2 | |
#> id overall_working_status | |
#> <dbl> <chr> | |
#> 1 1 Full Time | |
# ok looks good | |
# Check that ID 5 is part time | |
filter(df_with_overall_working_status, id == 5)
#> # A tibble: 1 x 2 | |
#> id overall_working_status | |
#> <dbl> <chr> | |
#> 1 5 Part Time | |
# Check that ID 2 is part time since their full time position is outside of Ontario | |
filter(df_with_overall_working_status, id == 2)
#> # A tibble: 1 x 2 | |
#> id overall_working_status | |
#> <dbl> <chr> | |
#> 1 2 Full Time | |
# Oops, looks like I forgot to filter out non-Ontario positions. I could go back, add a line to the code above, and re-check... nah
# Instead, move the derivation code into a function.
# (dplyr verbs are namespaced with dplyr:: so the function does not rely on
# dplyr being attached.)

#' Derive each person's overall working status from their Ontario positions
#'
#' Keeps only positions in Ontario, ranks each position's working status
#' (Full Time > Part Time > Casual), and returns the highest-ranked status
#' per `id`.
#'
#' Positions whose `working_status` is missing or is not one of the three
#' recognised labels are ignored, so they cannot knock out an `id` that also
#' holds a recognised position.
#'
#' @param x A data frame with columns `id`, `working_status` and `province`.
#' @return A data frame with columns `id` and `overall_working_status`,
#'   holding one row per `id` that has at least one recognised position in
#'   Ontario.
derive_overall_working_status <- function(x) {
  ontario_positions <- dplyr::filter(x, province == "Ontario")

  ranked <- dplyr::mutate(
    ontario_positions,
    working_status_ordinal = dplyr::case_when(
      working_status == "Full Time" ~ 3,
      working_status == "Part Time" ~ 2,
      working_status == "Casual" ~ 1
    )
  )

  # Statuses that match no case above get an NA rank. Drop those rows before
  # grouping: otherwise max() is NA for the whole id and the equality filter
  # below removes every one of that id's positions, valid ones included.
  ranked <- dplyr::filter(ranked, !is.na(working_status_ordinal))

  by_id <- dplyr::group_by(ranked, id)
  by_id <- dplyr::mutate(
    by_id,
    highest_working_status_ordinal = max(working_status_ordinal)
  )
  top_positions <- dplyr::filter(
    by_id,
    working_status_ordinal == highest_working_status_ordinal
  )
  top_positions <- dplyr::ungroup(top_positions)

  overall <- dplyr::select(
    top_positions,
    id,
    overall_working_status = working_status
  )
  # Several positions can share the top status; keep a single row per id.
  dplyr::distinct(overall, id, overall_working_status)
}
# Now actually codify what I expect the output of derive_overall_working_status(sample_df) to look like | |
# Expected result, built column-wise.
# id 4 is missing because they don't have any positions in Ontario.
expected_output <- tibble::tibble(
  id = c(1, 2, 3, 5),
  overall_working_status = c("Full Time", "Part Time", "Casual", "Part Time")
)
# And run a test that derive_overall_working_status(sample_df) == expected_output, rather than checking ad-hoc | |
library(testthat)

# Compare the derived table with the hand-written expectation in one go.
test_that(
  "derive_overall_working_status gets the highest working status and excludes positions outside of Ontario",
  {
    actual_output <- derive_overall_working_status(sample_df)
    expect_identical(actual_output, expected_output)
  }
)

# No news is good news -- silent if test passes!
emo::ji("tada")
#> 🎉 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment