-
-
Save abicky/3a4789c3fd163a71606c to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# cf. https://abicky.net/2015/05/04/155600/ | |
create_sample_data <- function() { | |
# AB テストのデータ | |
set.seed(0) | |
N <- 100000 | |
# 0-23 時のアクセス比 | |
HOUR_ACCESS_PROB <- c(rep(0.1, 9), rep(1, 12), rep(0.1, 3)) | |
# 0-6(Sun, Mon, ..., Fri) のアクセス比 | |
WEEK_ACCESS_PROB <- rep(1, 7) | |
TARGET_DATES <- as.POSIXlt(seq(as.POSIXct("2015-04-01", tz = "Asia/Tokyo"), by = "1 day", len = 7)) | |
MAX_LOG_TIME <- as.integer(as.POSIXct("2015-05-05")) - 1 | |
A_IMP <- rpois(1, N) | |
B_IMP <- rpois(1, N) | |
A_CVR <- 0.300 | |
B_CVR <- 0.290 | |
# imp から cv までの平均秒 | |
CV_INTERVAL = 30 | |
# 1 日の平均アクセス回数(アクセスする場合の数) | |
ACCESS_COUNT_PER_DAY <- 3 | |
MAX_USER_ID = 999999999 | |
PAGES <- c("page1", "page2", "page3") | |
STATES <- c("new", "normal", "active", "inactive") | |
INIT_PAGE_PROB <- structure(c(0.7, 0.2, 0.1), names = PAGES) | |
PAGE_TRANS_PROB <- matrix(c( | |
0.1, 0.4, 0.5, | |
0.8, 0.1, 0.1, | |
0.8, 0.1, 0.1 | |
), 3, byrow = TRUE, dimnames = list(PAGES, PAGES)) | |
A_STATE_TRANS_PROB <- matrix(c( | |
0.3, 0.2, 0.2, 0.3, | |
0.0, 0.4, 0.2, 0.4, | |
0.0, 0.1, 0.9, 0.0, | |
0.0, 0.1, 0.0, 0.9 | |
), 4, byrow = TRUE, dimnames = list(STATES, STATES)) | |
B_STATE_TRANS_PROB <- matrix(c( | |
0.4, 0.2, 0.2, 0.2, | |
0.0, 0.4, 0.2, 0.4, | |
0.0, 0.1, 0.9, 0.0, | |
0.0, 0.1, 0.0, 0.9 | |
), 4, byrow = TRUE, dimnames = list(STATES, STATES)) | |
page_interval_time <- function(page) { | |
mean_interval <- switch(page, page1 = 30, 10) | |
return(as.integer(rnbinom(1, 1, mu = mean_interval - 1) + 1)) | |
} | |
session_interval_time <- function(state) { | |
interval_days <- switch( | |
state, | |
new = sample(1:3, 1, prob = c(0.7, 0.2, 0.1)), | |
normal = (rpois(1, 2) + 1), | |
active = sample(1:3, 1, prob = c(0.7, 0.2, 0.1)), | |
inactive = sample(12, 1) * 30 | |
) | |
return(as.integer(interval_days * 86400 + rnorm(1, 0, 3600))) | |
} | |
access_count_per_session <- function() { | |
mean_access_count <- 3 | |
return(rpois(1, mean_access_count - 1) + 1) | |
} | |
create_event_logs <- function(user_ids, cvr, state_trans_prob, pattern) { | |
cat(sprintf("Creating logs of '%s' pattern ", pattern)) | |
imp <- length(user_ids) | |
hours <- sample(24, imp, replace = TRUE, prob = HOUR_ACCESS_PROB) - 1L | |
secs <- hours * 3600L + sample(3600, imp, replace = TRUE) - 1L | |
dates <- sample(TARGET_DATES, imp, replace = TRUE, prob = WEEK_ACCESS_PROB[TARGET_DATES$wday + 1]) | |
imp_times <- as.integer(as.POSIXct(dates)) + secs | |
imp_logs <- data.frame(time = imp_times, user_id = user_ids, event = factor("imp"), pattern = factor(pattern)) | |
cv_idx <- which(sample(c(FALSE, TRUE), imp, replace = TRUE, prob = c(1 - cvr, cvr))) | |
cv_count <- length(cv_idx) | |
cv_times <- imp_times[cv_idx] + as.integer(rnbinom(cv_count, 1, mu = CV_INTERVAL - 1) + 1) | |
cv_user_ids <- user_ids[cv_idx] | |
cv_logs <- data.frame(time = cv_times, user_id = cv_user_ids, event = factor("cv"), pattern = factor(pattern)) | |
block_size <- 100000 | |
log_times <- integer(block_size) | |
log_user_ids <- integer(block_size) | |
log_pages <- character(block_size) | |
current_row <- 0L | |
progress <- 0 | |
progress_interval <- floor(length(cv_user_ids) / 10) | |
for (i in seq_along(cv_user_ids)) { | |
progress <- progress + 1 | |
if (progress %% progress_interval == 0) { | |
cat(".") | |
} | |
user_id <- cv_user_ids[i] | |
time <- cv_times[i] | |
state <- "new" | |
repeat { | |
page <- sample(PAGES, 1, prob = INIT_PAGE_PROB) | |
access_count <- access_count_per_session() | |
repeat { | |
current_row <- current_row + 1L | |
if (current_row > length(log_times)) { | |
log_times <- c(log_times, integer(length(log_times))) | |
log_user_ids <- c(log_user_ids, integer(length(log_user_ids))) | |
log_pages <- c(log_pages, character(length(log_pages))) | |
} | |
log_times[current_row] <- time | |
log_user_ids[current_row] <- user_id | |
log_pages[current_row] <- page | |
access_count <- access_count - 1 | |
if (access_count == 0) { | |
break | |
} | |
time <- time + page_interval_time(page) | |
page <- sample(PAGES, 1, prob = PAGE_TRANS_PROB[page, ]) | |
} | |
state <- sample(STATES, 1, prob = state_trans_prob[state, ]) | |
time <- time + session_interval_time(state) | |
if (time > MAX_LOG_TIME) { | |
break | |
} | |
} | |
} | |
access_logs <- data.frame( | |
time = log_times[1:current_row], | |
user_id = log_user_ids[1:current_row], | |
page = log_pages[1:current_row] | |
) | |
cat(" done\n") | |
return(list(event_logs = rbind(imp_logs, cv_logs), access_logs = access_logs)) | |
} | |
user_ids <- sample(MAX_USER_ID, A_IMP + B_IMP) | |
a_logs <- create_event_logs(user_ids[1:A_IMP], A_CVR, A_STATE_TRANS_PROB, "a") | |
b_logs <- create_event_logs(user_ids[-(1:A_IMP)], B_CVR, B_STATE_TRANS_PROB, "b") | |
event_logs <- rbind(a_logs$event_logs, b_logs$event_logs) | |
access_logs <- rbind(a_logs$access_logs, b_logs$access_logs) | |
dir <- tempdir() | |
event_log_file <- sprintf("%s/event_logs.tsv", dir) | |
access_log_file <- sprintf("%s/access_logs.tsv", dir) | |
cat("Writing files") | |
write.table(event_logs, file = event_log_file, sep = "\t", row.names = FALSE, quote = FALSE) | |
write.table(access_logs, file = access_log_file, sep = "\t", row.names = FALSE, quote = FALSE) | |
cat(" done\n") | |
return(list(event_log_file = event_log_file, access_log_file = access_log_file)) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment