# sharlagelfand/testthat_analysis

## testthat_analysis
# Deriving the "overall working status" of someone's employment positions
# in Ontario, where it is the highest of Full Time > Part Time > Casual.
# e.g. if someone has a full time and a part time position, their overall
# working status is full time. If they have two part time positions, it's part time.
library(dplyr)

# Toy employment data: one row per position held by a person (`id`).
# Columns are spelled out as parallel vectors; the trailing comments mark
# which id each group of entries belongs to.
sample_df <- tibble::tibble(
  id = c(1, 1, 2, 2, 3, 4, 4, 5, 5),
  working_status = c(
    "Full Time", "Casual",     # id 1
    "Full Time", "Part Time",  # id 2
    "Casual",                  # id 3
    "Casual", "Full Time",     # id 4
    "Part Time", "Part Time"   # id 5
  ),
  province = c(
    "Ontario", "Ontario",      # id 1
    "Alberta", "Ontario",      # id 2
    "Ontario",                 # id 3
    "Alberta", "Alberta",      # id 4
    "Ontario", "Ontario"       # id 5
  )
)

# Rather than doing e.g...

# Write and run the derivation code ad hoc
# Ad-hoc derivation: rank each status, keep the top-ranked position(s) per id,
# then collapse ties to a single row. Note that province is not considered
# anywhere in this pipeline.
df_with_overall_working_status <- sample_df %>%
  mutate(
    working_status_ordinal = case_when(
      working_status == "Casual" ~ 1,
      working_status == "Part Time" ~ 2,
      working_status == "Full Time" ~ 3
    )
  ) %>%
  group_by(id) %>%
  filter(working_status_ordinal == max(working_status_ordinal)) %>%
  ungroup() %>%
  transmute(id, overall_working_status = working_status) %>%
  distinct()

# Check that it works by spot checking each id...

# Check that ID 1 is full time

# Print the derived row for id 1.
filter(df_with_overall_working_status, id == 1)
#> # A tibble: 1 x 2
#>      id overall_working_status
#>   <dbl> <chr>
#> 1     1 Full Time

# ok looks good

# Check that ID 5 is part time

# Print the derived row for id 5.
filter(df_with_overall_working_status, id == 5)
#> # A tibble: 1 x 2
#>      id overall_working_status
#>   <dbl> <chr>
#> 1     5 Part Time

# Check that ID 2 is part time since their full time position is outside of Ontario

# Print the derived row for id 2.
filter(df_with_overall_working_status, id == 2)
#> # A tibble: 1 x 2
#>      id overall_working_status
#>   <dbl> <chr>
#> 1     2 Full Time

# Oops looks like I forgot to filter out non-Ontario positions, now let me go back and add a line to the code above and re check... nah

# Instead, move the derivation code into a function
# (I would namespace dplyr:: on all these in a package but I already have it loaded so w/e)

# Derive each person's overall working status from their Ontario positions.
#
# The overall status is the highest-ranked status among a person's positions,
# where Full Time > Part Time > Casual. Positions outside Ontario are ignored,
# so a person with no Ontario positions does not appear in the result.
# Positions whose status is missing or is not one of the three ranked values
# are ignored as well, so they cannot hide a person's other, valid positions.
#
# x: data frame with one row per position and the columns `id`,
#    `working_status` and `province`.
#
# Returns one row per remaining `id`, with the columns `id` and
# `overall_working_status`.
derive_overall_working_status <- function(x) {
  x %>%
    filter(province == "Ontario") %>%
    mutate(working_status_ordinal = case_when(
      working_status == "Full Time" ~ 3,
      working_status == "Part Time" ~ 2,
      working_status == "Casual" ~ 1
    )) %>%
    # case_when() yields NA for any other status. Left in place, that NA would
    # make max() NA for the whole id, and the comparison below would then drop
    # every row for that id, including the valid ones.
    filter(!is.na(working_status_ordinal)) %>%
    group_by(id) %>%
    filter(working_status_ordinal == max(working_status_ordinal)) %>%
    ungroup() %>%
    select(id, overall_working_status = working_status) %>%
    # Several positions can tie for the highest status; keep one row per id.
    distinct()
}

# Now actually codify what I expect the output of derive_overall_working_status(sample_df) to look like

# Hand-written expectation for the Ontario-only derivation on sample_df.
# 4 is missing because they don't have any positions in Ontario.
expected_output <- tibble::tibble(
  id = c(1, 2, 3, 5),
  overall_working_status = c("Full Time", "Part Time", "Casual", "Part Time")
)

# And run a test that derive_overall_working_status(sample_df) == expected_output, rather than checking ad-hoc

library(testthat)

# Compare the function's result on sample_df with the hand-written expectation.
test_that("derive_overall_working_status gets the highest working status and excludes positions outside of Ontario", {
  expect_identical(
    object = derive_overall_working_status(sample_df),
    expected = expected_output
  )
})

# No news is good news -- silent if test passes!

# Print a celebratory emoji via the emo package.
emo::ji("tada")
#> 🎉
	# Deriving the "overall working status" of someone's employment positions
	# in Ontario, where it is the highest of Full Time > Part Time > Casual.
	# e.g. if someone has a full time and a part time position, their overall
	# working status is full time. If they have two part time positions, it's part time.
	library(dplyr)

	# Toy employment data: one row per position held by a person (`id`).
	# Columns are spelled out as parallel vectors; the trailing comments mark
	# which id each group of entries belongs to.
	sample_df <- tibble::tibble(
	  id = c(1, 1, 2, 2, 3, 4, 4, 5, 5),
	  working_status = c(
	    "Full Time", "Casual",     # id 1
	    "Full Time", "Part Time",  # id 2
	    "Casual",                  # id 3
	    "Casual", "Full Time",     # id 4
	    "Part Time", "Part Time"   # id 5
	  ),
	  province = c(
	    "Ontario", "Ontario",      # id 1
	    "Alberta", "Ontario",      # id 2
	    "Ontario",                 # id 3
	    "Alberta", "Alberta",      # id 4
	    "Ontario", "Ontario"       # id 5
	  )
	)

	# Rather than doing e.g...

	# Write and run the derivation code ad hoc
	# Ad-hoc derivation: rank each status, keep the top-ranked position(s) per id,
	# then collapse ties to a single row. Note that province is not considered
	# anywhere in this pipeline.
	df_with_overall_working_status <- sample_df %>%
	  mutate(
	    working_status_ordinal = case_when(
	      working_status == "Casual" ~ 1,
	      working_status == "Part Time" ~ 2,
	      working_status == "Full Time" ~ 3
	    )
	  ) %>%
	  group_by(id) %>%
	  filter(working_status_ordinal == max(working_status_ordinal)) %>%
	  ungroup() %>%
	  transmute(id, overall_working_status = working_status) %>%
	  distinct()

	# Check that it works by spot checking each id...

	# Check that ID 1 is full time

	# Print the derived row for id 1.
	filter(df_with_overall_working_status, id == 1)
	#> # A tibble: 1 x 2
	#> id overall_working_status
	#> <dbl> <chr>
	#> 1 1 Full Time

	# ok looks good

	# Check that ID 5 is part time

	# Print the derived row for id 5.
	filter(df_with_overall_working_status, id == 5)
	#> # A tibble: 1 x 2
	#> id overall_working_status
	#> <dbl> <chr>
	#> 1 5 Part Time

	# Check that ID 2 is part time since their full time position is outside of Ontario

	# Print the derived row for id 2.
	filter(df_with_overall_working_status, id == 2)
	#> # A tibble: 1 x 2
	#> id overall_working_status
	#> <dbl> <chr>
	#> 1 2 Full Time

	# Oops looks like I forgot to filter out non-Ontario positions, now let me go back and add a line to the code above and re check... nah

	# Instead, move the derivation code into a function
	# (I would namespace dplyr:: on all these in a package but I already have it loaded so w/e)

	# Derive each person's overall working status from their Ontario positions.
	#
	# The overall status is the highest-ranked status among a person's positions,
	# where Full Time > Part Time > Casual. Positions outside Ontario are ignored,
	# so a person with no Ontario positions does not appear in the result.
	# Positions whose status is missing or is not one of the three ranked values
	# are ignored as well, so they cannot hide a person's other, valid positions.
	#
	# x: data frame with one row per position and the columns `id`,
	#    `working_status` and `province`.
	#
	# Returns one row per remaining `id`, with the columns `id` and
	# `overall_working_status`.
	derive_overall_working_status <- function(x) {
	  x %>%
	    filter(province == "Ontario") %>%
	    mutate(working_status_ordinal = case_when(
	      working_status == "Full Time" ~ 3,
	      working_status == "Part Time" ~ 2,
	      working_status == "Casual" ~ 1
	    )) %>%
	    # case_when() yields NA for any other status. Left in place, that NA would
	    # make max() NA for the whole id, and the comparison below would then drop
	    # every row for that id, including the valid ones.
	    filter(!is.na(working_status_ordinal)) %>%
	    group_by(id) %>%
	    filter(working_status_ordinal == max(working_status_ordinal)) %>%
	    ungroup() %>%
	    select(id, overall_working_status = working_status) %>%
	    # Several positions can tie for the highest status; keep one row per id.
	    distinct()
	}

	# Now actually codify what I expect the output of derive_overall_working_status(sample_df) to look like

	# Hand-written expectation for the Ontario-only derivation on sample_df.
	# 4 is missing because they don't have any positions in Ontario.
	expected_output <- tibble::tibble(
	  id = c(1, 2, 3, 5),
	  overall_working_status = c("Full Time", "Part Time", "Casual", "Part Time")
	)

	# And run a test that derive_overall_working_status(sample_df) == expected_output, rather than checking ad-hoc

	library(testthat)

	# Compare the function's result on sample_df with the hand-written expectation.
	test_that("derive_overall_working_status gets the highest working status and excludes positions outside of Ontario", {
	  expect_identical(
	    object = derive_overall_working_status(sample_df),
	    expected = expected_output
	  )
	})

	# No news is good news -- silent if test passes!

	# Print a celebratory emoji via the emo package.
	emo::ji("tada")
	#> 🎉