Last active
October 21, 2021 21:50
-
-
Save tshynik/0270e84753bec4ed627e98bcc2a7a806 to your computer and use it in GitHub Desktop.
Faking a dataset based on real data & column names
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Setup ----
# data.table supplies fread()/fwrite() and `:=`; dplyr supplies bind_cols(),
# group_by() and tally(); magrittr supplies the %>% pipe.
library(data.table)
library(dplyr)
library(magrittr)

# Real export that the fake dataset is modelled on. The path is relative to the
# current working directory.
x <- fread("AZMyVWarehouseStandardText20210915-15249545275.txt")
#' Draw a random sample of rows from the named column(s) of a table
#'
#' Row positions are drawn afresh on every call, so calling this once per
#' column and binding the results side by side breaks the link between values
#' that sat on the same row of the source data.
#'
#' @param colname Character vector naming the column(s) to return.
#' @param data A data.table (or plain data.frame) to sample from. The default
#'   is evaluated lazily and refers to whatever object `x` is visible from
#'   where this function is defined.
#' @param size Number of rows to draw.
#' @param replace Sample row positions with replacement? The default `FALSE`
#'   keeps the original behaviour, which errors when `size` exceeds
#'   `nrow(data)`; pass `TRUE` to draw more rows than the source has.
#' @return The sampled rows of `data`, restricted to `colname`, as a table
#'   (never collapsed to a bare vector).
scramble_data <- function(colname, data = x, size = 20, replace = FALSE) {
  missing_cols <- setdiff(colname, names(data))
  if (length(missing_cols) > 0) {
    stop(
      "Unknown column(s): ", paste(missing_cols, collapse = ", "),
      call. = FALSE
    )
  }

  # sample.int() is what sample(n, size) dispatches to for a positive count,
  # but it also behaves sensibly when the table has zero rows.
  rows <- sample.int(nrow(data), size = size, replace = replace)

  if (inherits(data, "data.table")) {
    # `with = FALSE` makes data.table treat `colname` as a vector of names
    # rather than as an expression to evaluate inside the table.
    data[rows, colname, with = FALSE]
  } else {
    # `drop = FALSE` keeps a one-column result as a data.frame.
    data[rows, colname, drop = FALSE]
  }
}
# Smoke test: sample a single column and print the result.
scramble_data(colname = "CountyFileID", data = x)
# Sample every column of `x` independently (one scramble_data() call per
# column name), then bind the one-column pieces side by side.
y <- lapply(
  X = names(x),
  FUN = scramble_data,
  data = x
) %>%
  bind_cols()

# Later steps modify `y` with data.table's `:=`. setDT() converts `y` to a
# data.table by reference, so that works whatever class bind_cols() returned.
setDT(y)

# check output:
y
# Anonymize phone numbers even further, if necessary: overwrite the first three
# characters with "555" and keep characters 4-10.
# Convert the whole column to character first, so the row-filtered update
# below does not have to change the column's type.
y[, PhoneNumber := as.character(PhoneNumber)]
# Leave missing and blank numbers alone; otherwise paste0() would turn NA into
# the literal "555NA" and an empty string into "555".
y[
  !is.na(PhoneNumber) & nzchar(PhoneNumber),
  PhoneNumber := paste0("555", substr(PhoneNumber, 4, 10))
]
# Add more variability than strictly random, for testing purposes.
# First print how the scores are distributed in the real data, for reference:
x %>%
  group_by(PhoneQualityScore) %>%
  tally()

# Then overwrite the sampled scores with draws from a fixed pool in which the
# empty string appears five times and each named level once.
y$PhoneQualityScore <- sample(
  x = c("", "", "", "", "", "Very High", "High", "Low", "Medium"),
  size = nrow(y), # one draw per fake row, whatever sample size built `y`
  replace = TRUE
)
# Write the fake dataset to a tab-separated file in the working directory.
fwrite(y, "fake_std_text_export.txt", sep = "\t")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment