# tshynik/fake_dataset.R

## fake_dataset.R
library(data.table)
library(dplyr)
library(magrittr)

# Read the source export that the fake dataset is sampled from. The path is
# relative, so the file has to be in the current working directory.
x <- fread("AZMyVWarehouseStandardText20210915-15249545275.txt")

#' Draw a random sample of rows from selected column(s) of a table.
#'
#' Because each call draws its own row indices, calling this once per column
#' and binding the results mixes values that came from different source rows.
#'
#' @param colname Character vector naming the column(s) to sample from.
#' @param data Table to sample from: a data.table or a plain data.frame.
#'   Defaults to the global `x` loaded earlier in this script.
#' @param size Number of rows to draw.
#' @return A table of the same kind as `data` holding only `colname`, with
#'   `size` rows. Rows are drawn without replacement when `data` has at least
#'   `size` rows and with replacement otherwise.
scramble_data <- function(colname, data = x, size = 20) {
  stopifnot(
    is.character(colname),
    is.data.frame(data),
    is.numeric(size), length(size) == 1L, !is.na(size), size >= 0
  )
  unknown <- setdiff(colname, names(data))
  if (length(unknown) > 0L) {
    stop(
      "column(s) not found in `data`: ", paste(unknown, collapse = ", "),
      call. = FALSE
    )
  }

  n_rows <- nrow(data)
  if (n_rows == 0L && size > 0) {
    stop("`data` has no rows to sample from", call. = FALSE)
  }

  # Asking for more rows than exist would make sample() fail; fall back to
  # sampling with replacement in that case only.
  rows <- sample.int(n_rows, size = size, replace = size > n_rows)

  if (inherits(data, "data.table")) {
    # with = FALSE makes data.table treat `colname` as column names, not as
    # an expression to evaluate.
    data[rows, colname, with = FALSE]
  } else {
    # drop = FALSE keeps a one-column result as a data.frame.
    data[rows, colname, drop = FALSE]
  }
}

# Smoke test: sample a single column and print the result.
scramble_data(colname = "CountyFileID", data = x)

# Sample every column of `x` separately, then place the one-column samples
# side by side. Each column gets its own random rows, so a row of `y` mixes
# values from different rows of `x`.
y <- lapply(
  X = names(x),
  FUN = scramble_data,
  data = x
) %>%
  bind_cols()

# `:=` below needs a valid data.table; bind_cols() does not promise one, so
# (re)establish it by reference.
setDT(y)

# check output:
y

# Anonymize phone numbers even further: overwrite the first three characters
# with "555". Missing or blank numbers are left untouched instead of being
# turned into "555NA" / "555".
y[, PhoneNumber := {
  phone <- as.character(PhoneNumber)
  has_phone <- !is.na(phone) & nzchar(phone)
  phone[has_phone] <- paste0("555", substr(phone[has_phone], 4, 10))
  phone
}]

# Add more variability than strictly random, for testing purposes.
# First print how the scores are distributed in the real data, for reference.
x %>%
  group_by(PhoneQualityScore) %>%
  tally()
# Then overwrite the sampled scores with a hand-picked mix (blank is the most
# likely value). The size follows `y` so it cannot drift from the row count.
y$PhoneQualityScore <- sample(
  x = c("", "", "", "", "", "Very High", "High", "Low", "Medium"),
  size = nrow(y),
  replace = TRUE
)

# Write the fake dataset as a tab-separated file.
y %>% fwrite("fake_std_text_export.txt", sep = "\t")
	library(data.table)
	library(dplyr)
	library(magrittr)

	# NOTE(review): everything from the tab-indented library() calls above to
	# the end of the file repeats the earlier script line for line; this looks
	# like an accidental paste. Confirm and drop one copy.
	# Read the source export that the fake dataset is sampled from. The path is
	# relative, so the file has to be in the current working directory.
	x <- fread("AZMyVWarehouseStandardText20210915-15249545275.txt")

	#' Draw a random sample of rows from selected column(s) of a table.
	#'
	#' Because each call draws its own row indices, calling this once per column
	#' and binding the results mixes values that came from different source rows.
	#'
	#' @param colname Character vector naming the column(s) to sample from.
	#' @param data Table to sample from: a data.table or a plain data.frame.
	#'   Defaults to the global `x` loaded earlier in this script.
	#' @param size Number of rows to draw.
	#' @return A table of the same kind as `data` holding only `colname`, with
	#'   `size` rows. Rows are drawn without replacement when `data` has at least
	#'   `size` rows and with replacement otherwise.
	scramble_data <- function(colname, data = x, size = 20) {
	  stopifnot(
	    is.character(colname),
	    is.data.frame(data),
	    is.numeric(size), length(size) == 1L, !is.na(size), size >= 0
	  )
	  unknown <- setdiff(colname, names(data))
	  if (length(unknown) > 0L) {
	    stop(
	      "column(s) not found in `data`: ", paste(unknown, collapse = ", "),
	      call. = FALSE
	    )
	  }

	  n_rows <- nrow(data)
	  if (n_rows == 0L && size > 0) {
	    stop("`data` has no rows to sample from", call. = FALSE)
	  }

	  # Asking for more rows than exist would make sample() fail; fall back to
	  # sampling with replacement in that case only.
	  rows <- sample.int(n_rows, size = size, replace = size > n_rows)

	  if (inherits(data, "data.table")) {
	    # with = FALSE makes data.table treat `colname` as column names, not as
	    # an expression to evaluate.
	    data[rows, colname, with = FALSE]
	  } else {
	    # drop = FALSE keeps a one-column result as a data.frame.
	    data[rows, colname, drop = FALSE]
	  }
	}

	# Smoke test: sample a single column and print the result.
	scramble_data(colname = "CountyFileID", data = x)

	# Sample every column of `x` separately, then place the one-column samples
	# side by side. Each column gets its own random rows, so a row of `y` mixes
	# values from different rows of `x`.
	y <- lapply(
	  X = names(x),
	  FUN = scramble_data,
	  data = x
	) %>%
	  bind_cols()

	# `:=` below needs a valid data.table; bind_cols() does not promise one, so
	# (re)establish it by reference.
	setDT(y)

	# check output:
	y

	# Anonymize phone numbers even further: overwrite the first three characters
	# with "555". Missing or blank numbers are left untouched instead of being
	# turned into "555NA" / "555".
	y[, PhoneNumber := {
	  phone <- as.character(PhoneNumber)
	  has_phone <- !is.na(phone) & nzchar(phone)
	  phone[has_phone] <- paste0("555", substr(phone[has_phone], 4, 10))
	  phone
	}]

	# Add more variability than strictly random, for testing purposes.
	# First print how the scores are distributed in the real data, for reference.
	x %>%
	  group_by(PhoneQualityScore) %>%
	  tally()
	# Then overwrite the sampled scores with a hand-picked mix (blank is the most
	# likely value). The size follows `y` so it cannot drift from the row count.
	y$PhoneQualityScore <- sample(
	  x = c("", "", "", "", "", "Very High", "High", "Low", "Medium"),
	  size = nrow(y),
	  replace = TRUE
	)

	# Write the fake dataset as a tab-separated file.
	y %>% fwrite("fake_std_text_export.txt", sep = "\t")