Last active
March 9, 2022 11:11
-
-
Save avallecam/bcdb8ca98b40e7276a7269b72e564d85 to your computer and use it in GitHub Desktop.
cleaning solutions: combine case_when and stringr functions in different messy-data scenarios
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
``` r | |
# Package setup -----------------------------------------------------------
# Install any missing dependency, then attach the ones used unqualified.
# requireNamespace() only checks availability (preferred over require(),
# which attaches as a side effect and whose FALSE return is easy to misuse).
for (pkg in c("tidyverse", "snakecase", "janitor")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}
# Only tidyverse is attached; snakecase and janitor are called via `::` below.
library(tidyverse)
# Read raw data ------------------------------------------------------------
# Scenario:
# - pre-processed in Excel
# - manually filled cells
# Note: 0177 is a numeric literal, so the leading zero is lost on read —
# that id is repaired further down.
raw_data <- tibble(
  id_person   = c(7177, 0177, 7178),
  test_result = c("positiv", "positivo", "pos"),
  food_place  = c("mercado belen", "mer. belen", "en Mercado Belen"),
  birth_place = c("Ánchash", "huánuco", NA_character_)
)
raw_data
#> # A tibble: 3 x 4
#>   id_person test_result food_place       birth_place
#>       <dbl> <chr>       <chr>            <chr>
#> 1      7177 positiv     mercado belen    Ánchash
#> 2       177 positivo    mer. belen       huánuco
#> 3      7178 pos         en Mercado Belen <NA>
# Clean test_result ---------------------------------------------------------
# Combo: if_else + str_detect — any value containing "pos" is harmonized
# to "positive"; everything else passes through unchanged.
raw_data %>%
  select(test_result) %>%
  mutate(
    test_result_clean = if_else(
      str_detect(test_result, "pos"),
      "positive",
      test_result
    )
  )
#> # A tibble: 3 x 2
#>   test_result test_result_clean
#>   <chr>       <chr>
#> 1 positiv     positive
#> 2 positivo    positive
#> 3 pos         positive
# Clean food_place -----------------------------------------------------------
# Combo:
# - snakecase (normalizes case, spaces, and punctuation in one pass)
# - case_when + str_detect
# Fix: output column was misspelled `food_placet_clean`; renamed to
# `food_place_clean`.
raw_data %>%
  select(food_place) %>%
  mutate(food_place = snakecase::to_snake_case(food_place)) %>%
  mutate(food_place_clean = case_when(
    str_detect(food_place, "belen") ~ "market_belen",
    TRUE ~ food_place
  ))
#> # A tibble: 3 x 2
#>   food_place       food_place_clean
#>   <chr>            <chr>
#> 1 mercado_belen    market_belen
#> 2 mer_belen        market_belen
#> 3 en_mercado_belen market_belen
# Clean id_person ------------------------------------------------------------
# Combo: as.character + str_pad.
# Restores leading zeros dropped by Excel/numeric parsing. str_pad replaces
# the original hand-rolled regex (which only fixed length-3 ids): any id
# shorter than 4 characters is left-padded with zeros, so 2- and 1-digit
# ids are repaired too. Ids already >= 4 characters are untouched.
raw_data %>%
  select(id_person) %>%
  mutate(id_person = as.character(id_person)) %>%
  mutate(id_person_clean = str_pad(id_person,
                                   width = 4,
                                   side = "left",
                                   pad = "0"))
#> # A tibble: 3 x 2
#>   id_person id_person_clean
#>   <chr>     <chr>
#> 1 7177      7177
#> 2 177       0177
#> 3 7178      7178
# Clean birth_place ----------------------------------------------------------
# Simulate a scenario with repeated observations by stacking the column
# onto itself.
sim_data <- bind_rows(
  raw_data %>% select(birth_place),
  raw_data %>% select(birth_place)
)

# Issue: make_clean_names() deduplicates its output, so each replicate
# gets a trailing numeric suffix — and NA becomes the literal string "na".
sim_data %>%
  mutate(birth_place = janitor::make_clean_names(birth_place))
#> # A tibble: 6 x 1
#>   birth_place
#>   <chr>
#> 1 anchash
#> 2 huanuco
#> 3 na
#> 4 anchash_2
#> 5 huanuco_2
#> 6 na_2

# Combo: if_else + janitor::make_clean_names + str_replace + regex.
# Step 1 keeps true NAs as NA instead of the "na" string; step 2 strips
# the "_<digits>" deduplication suffix.
sim_data %>%
  mutate(
    birth_place = if_else(
      is.na(birth_place),
      NA_character_,
      janitor::make_clean_names(birth_place)
    )
  ) %>%
  mutate(
    birth_place = str_replace(string = birth_place,
                              pattern = "(.+)_(\\d+)",
                              replacement = "\\1")
  )
#> # A tibble: 6 x 1
#>   birth_place
#>   <chr>
#> 1 anchash
#> 2 huanuco
#> 3 <NA>
#> 4 anchash
#> 5 huanuco
#> 6 <NA>
``` | |
<sup>Created on 2022-03-09 by the [reprex package](https://reprex.tidyverse.org) (v2.0.1)</sup> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment