# Copyright 2015- Takeshi Arabiki
# License: MIT License (http://opensource.org/licenses/MIT)
library(data.table)
# Simulated web access log, built once when this script is sourced.
#
# The anonymous function below is invoked immediately, so every simulation
# constant and helper stays private and only the resulting data.table is bound
# to `access_logs`. Result columns:
#   time    - access time as integer seconds since the Unix epoch
#   user_id - integer user identifier
#   page    - name of the page accessed
#   idate   - date part of `time`, as produced by data.table's IDateTime()
#   itime   - time-of-day part of `time`, as produced by IDateTime()
#
# The random number generator is seeded, so repeated runs in the same
# environment draw the same sequence. BASE_TIME / MAX_TIME are parsed without
# an explicit time zone, so the absolute epoch values depend on the session's
# time zone.
access_logs <- (function() {
  set.seed(0)

  # ---- Simulation parameters ----
  USER_COUNT <- 10000
  PAGES <- c("page1", "page2", "page3")
  STATES <- c("new", "normal", "active", "inactive")

  # Probability of each page being the first page of a session.
  INIT_PAGE_PROB <- structure(c(0.7, 0.2, 0.1), names = PAGES)

  # Page-to-page transition probabilities (row = current, column = next).
  PAGE_TRANS_PROB <- matrix(c(
    0.1, 0.4, 0.5,
    0.8, 0.1, 0.1,
    0.8, 0.1, 0.1
  ), 3, byrow = TRUE, dimnames = list(PAGES, PAGES))

  # User-state transition probabilities (row = current, column = next).
  STATE_TRANS_PROB <- matrix(c(
    0.4, 0.2, 0.2, 0.2,
    0.0, 0.4, 0.2, 0.4,
    0.0, 0.1, 0.9, 0.0,
    0.0, 0.1, 0.0, 0.9
  ), 4, byrow = TRUE, dimnames = list(STATES, STATES))

  # Seconds spent on `page` before the next access within the same session.
  # Drawn from a negative binomial shifted by one, so the result is always
  # >= 1; the mean is 30 seconds for "page1" and 10 seconds for other pages.
  page_interval_time <- function(page) {
    mean_interval <- switch(page, page1 = 30, 10)
    as.integer(rnbinom(1, 1, mu = mean_interval - 1) + 1)
  }

  # Seconds until the user's next session, given the state just entered:
  # a state-dependent whole number of days plus Gaussian jitter with a
  # standard deviation of one hour.
  session_interval_time <- function(state) {
    interval_days <- switch(
      state,
      new = sample(1:3, 1, prob = c(0.7, 0.2, 0.1)),
      normal = rpois(1, 2) + 1,
      active = sample(1:3, 1, prob = c(0.7, 0.2, 0.1)),
      inactive = sample(12, 1) * 30
    )
    as.integer(interval_days * 86400 + rnorm(1, 0, 3600))
  }

  # Number of page accesses in one session: shifted Poisson, always >= 1,
  # with mean `mean_access_count`.
  access_count_per_session <- function() {
    mean_access_count <- 3
    rpois(1, mean_access_count - 1) + 1L
  }

  # Relative weight of each hour of day (0-23) for a user's first access:
  # hours 0-8 and 21-23 are weighted 0.1, hours 9-20 are weighted 1.
  HOUR_ACCESS_RATIO <- c(rep(0.1, 9), rep(1, 12), rep(0.1, 3))
  BASE_TIME <- as.integer(as.POSIXct("2015-05-03"))
  MAX_TIME <- as.integer(as.POSIXct("2015-06-06"))
  MAX_USER_ID <- 999999999
  BLOCK_SIZE <- 100000

  # ---- Per-user starting points ----
  user_ids <- sample(MAX_USER_ID, USER_COUNT)
  hours <- sample(24, USER_COUNT, replace = TRUE, prob = HOUR_ACCESS_RATIO) - 1L
  # Every user's first access falls on the BASE_TIME day: the sampled hour
  # plus a uniformly drawn second (0-3599) within that hour.
  first_access_times <- BASE_TIME + hours * 3600L +
    sample(3600, USER_COUNT, replace = TRUE) - 1L

  # ---- Output buffers ----
  # Preallocated and doubled on demand so rows are not appended one by one.
  log_times <- integer(BLOCK_SIZE)
  log_user_ids <- integer(BLOCK_SIZE)
  log_pages <- character(BLOCK_SIZE)
  row_count <- 0L

  # Double the length of `buf`, padding with the type's zero value.
  grow <- function(buf) c(buf, vector(typeof(buf), length(buf)))

  # ---- Simulation ----
  for (i in seq_along(user_ids)) {
    user_id <- user_ids[i]
    now <- first_access_times[i]
    state <- "new"

    # One iteration per session.
    repeat {
      page <- sample(PAGES, 1, prob = INIT_PAGE_PROB)
      access_count <- access_count_per_session()

      for (k in seq_len(access_count)) {
        # Every access after the first advances the clock and moves to the
        # next page before being logged.
        if (k > 1L) {
          now <- now + page_interval_time(page)
          page <- sample(PAGES, 1, prob = PAGE_TRANS_PROB[page, ])
        }

        row_count <- row_count + 1L
        if (row_count > length(log_times)) {
          log_times <- grow(log_times)
          log_user_ids <- grow(log_user_ids)
          log_pages <- grow(log_pages)
        }
        log_times[row_count] <- now
        log_user_ids[row_count] <- user_id
        log_pages[row_count] <- page
      }

      state <- sample(STATES, 1, prob = STATE_TRANS_PROB[state, ])
      now <- now + session_interval_time(state)
      # NOTE(review): MAX_TIME is only checked between sessions, so accesses
      # inside a session that starts just before MAX_TIME can carry
      # timestamps at or after it. Confirm whether that is intended.
      if (now >= MAX_TIME) {
        break
      }
    }
  }

  # ---- Assemble the result ----
  used <- seq_len(row_count)
  access_logs <- data.table(
    time = log_times[used],
    user_id = log_user_ids[used],
    page = log_pages[used]
  )
  access_logs[
    ,
    c("idate", "itime") := IDateTime(as.POSIXlt(time, origin = "1970-01-01"))
  ]
  access_logs
})()
# (Web-page footer text that followed the script was removed; it is not R code.)