Skip to content

Instantly share code, notes, and snippets.

@eibanez
Last active August 29, 2015 14:18
Show Gist options
  • Save eibanez/794021626bdbb7061ae3 to your computer and use it in GitHub Desktop.
Save eibanez/794021626bdbb7061ae3 to your computer and use it in GitHub Desktop.
# load contents of test.R first
library(RSQLite)
library(data.table)
library(dplyr)
test <- query_test(db, "interval", "Generator", "Generation")
query_test <- function(db, time, col, prop, columns = "name", time.range = NULL, filter = NULL, phase = 4) {
# Open connection
thesql <- src_sqlite(db$filename)
# Query interval data
# Get the table names that store the data
if (identical(prop, "*")) {
t.name <- db$properties[[1]]
} else if(length(prop) == 1L) {
t.name <- db$properties[[1]] %>% filter(property == prop)
} else {
t.name <- db$properties[[1]] %>% filter(property %in% prop)
}
t.name <- t.name %>%
filter(collection == col, phase_id == phase, is_summary == 0) %>%
select(collection, property, table_name) %>%
mutate(table_name = gsub("data_interval_", "", table_name))
# Get max/min time existing in the table to be queried
# In case time table has more time stamps than those in the dataset
time.limit <- t.name %>%
group_by(collection, property) %>%
do(
tbl(thesql, .$table_name) %>%
filter(phase_id == phase) %>%
summarize(time_from = min(time_from), time_to = max(time_to)) %>%
collect
)
min.time.data <- min(time.limit$time_from)
max.time.data <- max(time.limit$time_to)
message("min.time.data: ", min.time.data)
message("max.time.data: ", max.time.data)
# Collect time data
time.data <- tbl(thesql, "time") %>%
filter(phase_id == phase) %>%
filter(between(time, min.time.data, max.time.data)) %>%
rplexos:::filter_rplexos_time(time.range) %>%
select(time) %>%
collect
write.csv(time.data, "test-timedata.csv", row.names = FALSE)
# If time data is empty, return an empty data frame
if (nrow(time.data) == 0L) {
DBI::dbDisconnect(thesql$con)
return(data.frame())
}
# Convert into R time-data format
time.data$time <- lubridate::ymd_hms(time.data$time)
# Get interval data
out <- t.name %>%
group_by(collection, property) %>%
do(tbl(thesql, .$table_name) %>%
filter(phase_id == phase) %>%
rplexos:::filter_rplexos(filter) %>%
rplexos:::filter_rplexos_time(time.range, modified = TRUE) %>%
select(-time_to) %>%
rename(time = time_from) %>%
rplexos:::select_rplexos(columns, add.key = TRUE) %>%
collect
) %>%
ungroup %>%
mutate(time = lubridate::ymd_hms(time))
# Mask out data
out <- out %>%
mutate(name = name %>% factor %>% as.numeric,
value = value %>% factor %>% as.numeric)
write.csv(out, "test-out.csv", row.names = FALSE)
# Expand data
# This will be easier when dplyr supports rolling joins
out2 <- data.table(out, key = "key,time")
cj2 <- CJ(key = unique(out$key), time = time.data$time)
write.csv(cj2, "test-cj2.csv", row.names = FALSE)
message("key: \t", length(unique(out$key)))
message("time:\t", length(time.data$time))
message("time2:\t", length(unique(time.data$time)))
message("out2:\t", nrow(out2))
message("cj2: \t", nrow(cj2))
out3 <- out2[cj2, roll = TRUE, allow.cartesian = TRUE]
out <- out3 %>%
as.data.frame(stringsAsFactors = FALSE) %>%
select(-key)
write.csv(out3, "test-out3.csv", row.names = FALSE)
message("out3:\t", nrow(out3))
# Restore time zone
attributes(out$time) <- attributes(time.data$time)
# Disconnect database
DBI::dbDisconnect(thesql$con)
# Return value
return(out)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment