Create a gist now

Instantly share code, notes, and snippets.

@abicky /create_sample_data.R Secret
Last active Apr 22, 2017

Embed
What would you like to do?
# cf. https://abicky.net/2015/05/04/155600/
create_sample_data <- function() {
# AB テストのデータ
set.seed(0)
N <- 100000
# 0-23 時のアクセス比
HOUR_ACCESS_PROB <- c(rep(0.1, 9), rep(1, 12), rep(0.1, 3))
# 0-6(Sun, Mon, ..., Fri) のアクセス比
WEEK_ACCESS_PROB <- rep(1, 7)
TARGET_DATES <- as.POSIXlt(seq(as.POSIXct("2015-04-01", tz = "Asia/Tokyo"), by = "1 day", len = 7))
MAX_LOG_TIME <- as.integer(as.POSIXct("2015-05-05")) - 1
A_IMP <- rpois(1, N)
B_IMP <- rpois(1, N)
A_CVR <- 0.300
B_CVR <- 0.290
# imp から cv までの平均秒
CV_INTERVAL = 30
# 1 日の平均アクセス回数(アクセスする場合の数)
ACCESS_COUNT_PER_DAY <- 3
MAX_USER_ID = 999999999
PAGES <- c("page1", "page2", "page3")
STATES <- c("new", "normal", "active", "inactive")
INIT_PAGE_PROB <- structure(c(0.7, 0.2, 0.1), names = PAGES)
PAGE_TRANS_PROB <- matrix(c(
0.1, 0.4, 0.5,
0.8, 0.1, 0.1,
0.8, 0.1, 0.1
), 3, byrow = TRUE, dimnames = list(PAGES, PAGES))
A_STATE_TRANS_PROB <- matrix(c(
0.3, 0.2, 0.2, 0.3,
0.0, 0.4, 0.2, 0.4,
0.0, 0.1, 0.9, 0.0,
0.0, 0.1, 0.0, 0.9
), 4, byrow = TRUE, dimnames = list(STATES, STATES))
B_STATE_TRANS_PROB <- matrix(c(
0.4, 0.2, 0.2, 0.2,
0.0, 0.4, 0.2, 0.4,
0.0, 0.1, 0.9, 0.0,
0.0, 0.1, 0.0, 0.9
), 4, byrow = TRUE, dimnames = list(STATES, STATES))
page_interval_time <- function(page) {
mean_interval <- switch(page, page1 = 30, 10)
return(as.integer(rnbinom(1, 1, mu = mean_interval - 1) + 1))
}
session_interval_time <- function(state) {
interval_days <- switch(
state,
new = sample(1:3, 1, prob = c(0.7, 0.2, 0.1)),
normal = (rpois(1, 2) + 1),
active = sample(1:3, 1, prob = c(0.7, 0.2, 0.1)),
inactive = sample(12, 1) * 30
)
return(as.integer(interval_days * 86400 + rnorm(1, 0, 3600)))
}
access_count_per_session <- function() {
mean_access_count <- 3
return(rpois(1, mean_access_count - 1) + 1)
}
create_event_logs <- function(user_ids, cvr, state_trans_prob, pattern) {
cat(sprintf("Creating logs of '%s' pattern ", pattern))
imp <- length(user_ids)
hours <- sample(24, imp, replace = TRUE, prob = HOUR_ACCESS_PROB) - 1L
secs <- hours * 3600L + sample(3600, imp, replace = TRUE) - 1L
dates <- sample(TARGET_DATES, imp, replace = TRUE, prob = WEEK_ACCESS_PROB[TARGET_DATES$wday + 1])
imp_times <- as.integer(as.POSIXct(dates)) + secs
imp_logs <- data.frame(time = imp_times, user_id = user_ids, event = factor("imp"), pattern = factor(pattern))
cv_idx <- which(sample(c(FALSE, TRUE), imp, replace = TRUE, prob = c(1 - cvr, cvr)))
cv_count <- length(cv_idx)
cv_times <- imp_times[cv_idx] + as.integer(rnbinom(cv_count, 1, mu = CV_INTERVAL - 1) + 1)
cv_user_ids <- user_ids[cv_idx]
cv_logs <- data.frame(time = cv_times, user_id = cv_user_ids, event = factor("cv"), pattern = factor(pattern))
block_size <- 100000
log_times <- integer(block_size)
log_user_ids <- integer(block_size)
log_pages <- character(block_size)
current_row <- 0L
progress <- 0
progress_interval <- floor(length(cv_user_ids) / 10)
for (i in seq_along(cv_user_ids)) {
progress <- progress + 1
if (progress %% progress_interval == 0) {
cat(".")
}
user_id <- cv_user_ids[i]
time <- cv_times[i]
state <- "new"
repeat {
page <- sample(PAGES, 1, prob = INIT_PAGE_PROB)
access_count <- access_count_per_session()
repeat {
current_row <- current_row + 1L
if (current_row > length(log_times)) {
log_times <- c(log_times, integer(length(log_times)))
log_user_ids <- c(log_user_ids, integer(length(log_user_ids)))
log_pages <- c(log_pages, character(length(log_pages)))
}
log_times[current_row] <- time
log_user_ids[current_row] <- user_id
log_pages[current_row] <- page
access_count <- access_count - 1
if (access_count == 0) {
break
}
time <- time + page_interval_time(page)
page <- sample(PAGES, 1, prob = PAGE_TRANS_PROB[page, ])
}
state <- sample(STATES, 1, prob = state_trans_prob[state, ])
time <- time + session_interval_time(state)
if (time > MAX_LOG_TIME) {
break
}
}
}
access_logs <- data.frame(
time = log_times[1:current_row],
user_id = log_user_ids[1:current_row],
page = log_pages[1:current_row]
)
cat(" done\n")
return(list(event_logs = rbind(imp_logs, cv_logs), access_logs = access_logs))
}
user_ids <- sample(MAX_USER_ID, A_IMP + B_IMP)
a_logs <- create_event_logs(user_ids[1:A_IMP], A_CVR, A_STATE_TRANS_PROB, "a")
b_logs <- create_event_logs(user_ids[-(1:A_IMP)], B_CVR, B_STATE_TRANS_PROB, "b")
event_logs <- rbind(a_logs$event_logs, b_logs$event_logs)
access_logs <- rbind(a_logs$access_logs, b_logs$access_logs)
dir <- tempdir()
event_log_file <- sprintf("%s/event_logs.tsv", dir)
access_log_file <- sprintf("%s/access_logs.tsv", dir)
cat("Writing files")
write.table(event_logs, file = event_log_file, sep = "\t", row.names = FALSE, quote = FALSE)
write.table(access_logs, file = access_log_file, sep = "\t", row.names = FALSE, quote = FALSE)
cat(" done\n")
return(list(event_log_file = event_log_file, access_log_file = access_log_file))
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment