Skip to content

Instantly share code, notes, and snippets.

@abicky
Last active July 30, 2017 22:57
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save abicky/dfdeaec9efded25b733a to your computer and use it in GitHub Desktop.
Save abicky/dfdeaec9efded25b733a to your computer and use it in GitHub Desktop.
# Copyright 2015- Takeshi Arabiki
# License: MIT License (http://opensource.org/licenses/MIT)
library(data.table)
access_logs <- (function() {
  # Build a reproducible synthetic page-access log. Every user gets a series
  # of browsing sessions; each page view becomes one row of the resulting
  # data.table with columns time, user_id, page, idate and itime.
  # The RNG is seeded here, so the order of random draws below matters.
  set.seed(0)

  # Simulation parameters ----
  USER_COUNT <- 10000
  PAGES <- c("page1", "page2", "page3")
  STATES <- c("new", "normal", "active", "inactive")

  # Probability of each page being the first page of a session.
  INIT_PAGE_PROB <- setNames(c(0.7, 0.2, 0.1), PAGES)

  # Row = current page, column = next page within the same session.
  PAGE_TRANS_PROB <- matrix(
    c(
      0.1, 0.4, 0.5,
      0.8, 0.1, 0.1,
      0.8, 0.1, 0.1
    ),
    nrow = length(PAGES),
    byrow = TRUE,
    dimnames = list(PAGES, PAGES)
  )

  # Row = current user state, column = state after the session ends.
  STATE_TRANS_PROB <- matrix(
    c(
      0.4, 0.2, 0.2, 0.2,
      0.0, 0.4, 0.2, 0.4,
      0.0, 0.1, 0.9, 0.0,
      0.0, 0.1, 0.0, 0.9
    ),
    nrow = length(STATES),
    byrow = TRUE,
    dimnames = list(STATES, STATES)
  )

  # Random helpers ----

  # Seconds between two page views of one session: one plus a negative
  # binomial draw (size 1) whose mean is chosen so that the total averages
  # 30 after "page1" and 10 after any other page. Always at least 1.
  page_interval_time <- function(page) {
    mean_interval <- if (page == "page1") 30 else 10
    as.integer(1 + rnbinom(1, size = 1, mu = mean_interval - 1))
  }

  # Seconds between the end of one session and the start of the next,
  # given the user's (already updated) state: a state-dependent number of
  # days plus normal jitter with a standard deviation of 3600 seconds.
  session_interval_time <- function(state) {
    gap_days <- switch(
      state,
      new = , # same distribution as "active": falls through to it
      active = sample(1:3, 1, prob = c(0.7, 0.2, 0.1)),
      normal = rpois(1, 2) + 1,
      inactive = sample(12, 1) * 30
    )
    # The jitter must be drawn after the day count to keep the RNG order.
    jitter <- rnorm(1, 0, 3600)
    as.integer(gap_days * 86400 + jitter)
  }

  # Number of page views in a session: 1 + Poisson(mean - 1), so at least 1.
  access_count_per_session <- function() {
    mean_access_count <- 3
    1 + rpois(1, mean_access_count - 1)
  }

  # Relative weight of each hour of the day (0-23) for a user's first
  # access: hours 9-20 are ten times as likely as the remaining hours.
  HOUR_ACCESS_RATIO <- rep(c(0.1, 1, 0.1), times = c(9, 12, 3))

  # Simulation window as epoch seconds (parsed in the session time zone).
  BASE_TIME <- as.integer(as.POSIXct("2015-05-03"))
  MAX_TIME <- as.integer(as.POSIXct("2015-06-06"))
  MAX_USER_ID <- 999999999
  BLOCK_SIZE <- 100000

  # Per-user setup: distinct ids and a first access on the base day ----
  user_ids <- sample(MAX_USER_ID, USER_COUNT)
  start_hours <- sample(
    24, USER_COUNT,
    replace = TRUE, prob = HOUR_ACCESS_RATIO
  ) - 1L
  start_seconds <- sample(3600, USER_COUNT, replace = TRUE) - 1L
  first_access_times <- BASE_TIME + start_hours * 3600L + start_seconds

  # Output buffers, doubled in size whenever they fill up.
  buf_time <- integer(BLOCK_SIZE)
  buf_user <- integer(BLOCK_SIZE)
  buf_page <- character(BLOCK_SIZE)
  n_rows <- 0L

  for (u in seq_along(user_ids)) {
    uid <- user_ids[u]
    clock <- first_access_times[u]
    state <- "new"

    # One iteration per session; at least one session is always simulated.
    repeat {
      current_page <- sample(PAGES, 1, prob = INIT_PAGE_PROB)
      n_views <- access_count_per_session()

      for (view in seq_len(n_views)) {
        # Every view after the first advances the clock and moves to the
        # next page according to the page transition matrix.
        if (view > 1L) {
          clock <- clock + page_interval_time(current_page)
          current_page <- sample(
            PAGES, 1,
            prob = PAGE_TRANS_PROB[current_page, ]
          )
        }

        n_rows <- n_rows + 1L
        if (n_rows > length(buf_time)) {
          grow_by <- length(buf_time)
          buf_time <- c(buf_time, integer(grow_by))
          buf_user <- c(buf_user, integer(grow_by))
          buf_page <- c(buf_page, character(grow_by))
        }
        buf_time[n_rows] <- clock
        buf_user[n_rows] <- uid
        buf_page[n_rows] <- current_page
      }

      # Update the user's state, then wait until the next session starts.
      state <- sample(STATES, 1, prob = STATE_TRANS_PROB[state, ])
      clock <- clock + session_interval_time(state)
      if (clock >= MAX_TIME) {
        break
      }
    }
  }

  # Keep only the filled part of the buffers and derive date/time columns.
  written <- seq_len(n_rows)
  logs <- data.table(
    time = buf_time[written],
    user_id = buf_user[written],
    page = buf_page[written]
  )
  logs[
    ,
    c("idate", "itime") := IDateTime(as.POSIXlt(time, origin = "1970-01-01"))
  ]
  logs
})()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment