Skip to content

Instantly share code, notes, and snippets.

@turbomam
Last active July 7, 2020 16:12
Show Gist options
  • Save turbomam/f620e9e9f042b643e47de730328e3e83 to your computer and use it in GitHub Desktop.
Save turbomam/f620e9e9f042b643e47de730328e3e83 to your computer and use it in GitHub Desktop.
options(java.parameters = "-Xmx6g")
# see also https://jangorecki.gitlab.io/data.cube/library/data.table/html/dcast.data.table.html
library(config)
# library(dplyr)
# library(ggplot2)
library(httr)
# library(igraph)
library(jsonlite)
# library(randomForest)
library(rdflib)
# library(readr)
# library(readxl)
# library(reshape2)
# library(RJDBC)
library(solrium)
# library(stringdist)
# library(stringr)
# library(tm)
# library(uuid)
# # train
# library(splitstackshape)
#
# ### validation
# library(ROCR)
# library(caret)
# library(xgboost)
# # also try party and xgboost
# Ensure that large integers aren't cast to scientific notation,
# for example when being posted into a SQL query.
options(scipen = 999)
# Report the current working directory.
print("Default file path set to:")
print(getwd())
# Load the settings from a YAML file in the user's home directory. The
# resulting `config` object is read by the helper functions defined below.
config.file <- "~/graph_db_common.yaml"
config <- config::get(file = config.file)
# Split a vector into consecutive chunks of roughly equal size.
#
# Args:
#   vec: vector (or list) to split; element order is preserved.
#   chunk.count: requested number of chunks (a positive whole number).
#
# Returns:
#   A list of sub-vectors, named by the bin number each chunk was assigned
#   ("1", "2", ...). Bins that receive no elements (possible when
#   chunk.count exceeds length(vec)) are omitted. An empty `vec` gives an
#   empty list.
chunk.vec <- function(vec, chunk.count) {
  if (length(vec) == 0 || chunk.count == 1) {
    # cut() rejects fewer than two intervals and cannot derive a range from
    # zero-length input, so put everything (or nothing) in a single group.
    chunk.ids <- rep.int(1L, length(vec))
  } else {
    # Bin the element positions into chunk.count equal-width intervals.
    chunk.ids <- cut(seq_along(vec), chunk.count, labels = FALSE)
  }
  split(vec, chunk.ids)
}
# Tabulate the distinct values of a vector as a two-column data frame.
#
# Args:
#   my.vector: vector whose values are counted with table(). NA values are
#     not counted, as table() drops them by default.
#
# Returns:
#   A data frame with a character column `value` and a numeric column
#   `count`, one row per distinct value, in the order table() reports them.
#   Empty input yields a zero-row data frame with the same two columns.
make.table.frame <- function(my.vector) {
  counts <- table(my.vector)
  data.frame(
    # names() is NULL for an empty table; as.character() turns that into
    # character(0) so the column still exists.
    value = as.character(names(counts)),
    count = as.numeric(counts),
    stringsAsFactors = FALSE
  )
}
# Upload a local RDF file into a named graph of the configured repository.
#
# The target URL is built from config$my.graphdb.base and
# config$my.selected.repo, and the request uses HTTP basic authentication
# with config$my.graphdb.username / config$my.graphdb.pw.
#
# Args:
#   some.graph.name: graph identifier appended to the `graph=` query parameter.
#   some.local.file: path of the file to upload as the request body.
#   some.rdf.format: value sent as the request's Content-Type.
#
# Returns:
#   The response body as a string (invisibly, via print()).
import.from.local.file <- function(some.graph.name,
                                   some.local.file,
                                   some.rdf.format) {
  # Echo the arguments so the console log shows what is being loaded.
  for (echoed.argument in list(some.graph.name,
                               some.local.file,
                               some.rdf.format)) {
    print(echoed.argument)
  }

  graph.store.url <- paste0(
    config$my.graphdb.base,
    '/repositories/',
    config$my.selected.repo,
    '/rdf-graphs/service?graph=',
    some.graph.name
  )
  print(graph.store.url)

  basic.credentials <- authenticate(
    config$my.graphdb.username,
    config$my.graphdb.pw,
    type = 'basic'
  )
  upload.response <- httr::POST(
    url = graph.store.url,
    body = upload_file(some.local.file),
    content_type(some.rdf.format),
    basic.credentials
  )

  # The response body is printed as-is.
  print('Errors will be listed below:')
  response.text <- rawToChar(upload.response$content)
  print(response.text)
}
# Ask the server to import RDF from a URL into a named graph.
#
# A JSON document with the fields `context`, `data` and (optionally)
# `format` is POSTed to the file-level `url.post.endpoint`, authenticated
# with the file-level `saved.authentication`. The request body and the raw
# response body are written to the console.
#
# Args:
#   some.graph.name: value for the JSON `context` field.
#   some.ontology.url: value for the JSON `data` field.
#   some.rdf.format: value for the JSON `format` field. When it is NULL, NA
#     or an empty string, the field is left out of the request.
#
# Returns:
#   NULL, invisibly (the result of the final cat()).
import.from.url <- function(some.graph.name,
                            some.ontology.url,
                            some.rdf.format) {
  print(some.graph.name)
  print(some.ontology.url)
  print(some.rdf.format)

  import.settings <- list(context = some.graph.name,
                          data = some.ontology.url)

  # Guard the length and NA checks first: nchar(NULL) is zero-length and
  # nchar(NA) is 2, so nchar() alone is not a reliable emptiness test.
  has.format <- length(some.rdf.format) == 1 &&
    !is.na(some.rdf.format) &&
    nchar(some.rdf.format) > 0
  if (has.format) {
    import.settings$format <- some.rdf.format
  }

  # Serialise with jsonlite so that quotes, backslashes and control
  # characters in the inputs are escaped instead of breaking the JSON.
  update.body <- as.character(
    jsonlite::toJSON(import.settings, auto_unbox = TRUE, pretty = TRUE)
  )

  cat("\n")
  cat(update.body)
  cat("\n\n")

  post.res <- POST(
    url.post.endpoint,
    body = update.body,
    content_type("application/json"),
    accept("application/json"),
    saved.authentication
  )
  cat(rawToChar(post.res$content))
}
# Fetch the context identifiers currently reported by the configured
# repository.
#
# Sends a GET to <config$my.graphdb.base>/repositories/<repo>/contexts using
# the file-level `saved.authentication`, parses the response body as JSON
# and extracts results$bindings$contextID$value.
#
# Returns:
#   Whatever that extraction yields: NULL when the path is absent from the
#   parsed response, otherwise presumably a character vector of context
#   IRIs (the response schema is not visible here).
get.context.report <- function() {
  contexts.url <- paste0(
    config$my.graphdb.base,
    "/repositories/",
    config$my.selected.repo,
    "/contexts"
  )
  contexts.response <- GET(url = contexts.url, saved.authentication)
  parsed.contexts <-
    jsonlite::fromJSON(rawToChar(contexts.response$content))
  parsed.contexts$results$bindings$contextID$value
}
# Poll the repository until every expected named graph has appeared.
#
# Reads these globals, which must be set by the caller: `expectation` (the
# graph names to wait for), `last.post.status` and `last.post.time` (used
# only in the progress message). Sleeps config$monitor.pause.seconds
# between checks and returns once nothing is pending.
monitor.named.graphs <- function() {
  repeat {
    status.line <- paste0(
      Sys.time(),
      ": '",
      last.post.status,
      "' submitted at ",
      last.post.time
    )
    print(status.line)

    # If the report is NULL, setdiff() returns all of `expectation`, so
    # every expected graph simply stays pending.
    loaded.graphs <- get.context.report()
    missing.graphs <- sort(setdiff(expectation, loaded.graphs))
    if (length(missing.graphs) == 0) {
      print("Update complete")
      break
    }

    # paste0() is vectorised, so this prints one entry per pending graph.
    print(paste0("still waiting for: ", missing.graphs))
    print(paste0("Next check in ",
                 config$monitor.pause.seconds,
                 " seconds."))
    Sys.sleep(config$monitor.pause.seconds)
  }
}
# Run a query against a repository and return the bindings as a data frame.
#
# Args:
#   query: query text; runs of spaces are collapsed to a single space
#     before it is sent.
#   endpoint: base URL of the server (default config$my.graphdb.base).
#   repo: repository id appended after "/repositories/"
#     (default config$my.selected.repo).
#   auth: httr authentication object passed to GET
#     (default the file-level saved.authentication).
#
# Returns:
#   A data frame with one column per bound variable, holding only each
#   binding's `value` field. The ".value" suffix is stripped from the
#   column names. The response body is parsed as JSON with a
#   results$bindings element.
q2j2df <- function(query,
                   endpoint = config$my.graphdb.base,
                   repo = config$my.selected.repo,
                   auth = saved.authentication) {
  minquery <- gsub(pattern = " +",
                   replacement = " ",
                   x = query)
  rdfres <- httr::GET(
    url = paste0(endpoint,
                 "/repositories/",
                 repo),
    query = list(query = minquery),
    auth
  )
  # Convert the raw JSON results into a flat data frame whose columns are
  # named <variable>.<field>.
  rdfres <-
    jsonlite::fromJSON(rawToChar(rdfres$content))
  rdfres <- rdfres$results$bindings
  rdfres <-
    do.call(what = cbind.data.frame, args = rdfres)
  keepers <- colnames(rdfres)
  keepers <- keepers[grepl(pattern = "value$", x = keepers)]
  # drop = FALSE keeps a data frame even when only one variable is bound;
  # without it the single column collapses to a vector and the column
  # renaming below fails.
  rdfres <- rdfres[, keepers, drop = FALSE]
  # Beautify column labels.
  colnames(rdfres) <-
    gsub(pattern = '\\.value$',
         replacement = '',
         x = colnames(rdfres))
  rdfres
}
# Shared connection settings derived from `config`. The helper functions in
# this file look these up at call time.

# Credentials for HTTP basic authentication.
saved.authentication <- authenticate(
  config$my.graphdb.username,
  config$my.graphdb.pw,
  type = "basic"
)

# Repository URL (query endpoint).
select.endpoint <- paste0(
  config$my.graphdb.base,
  "/repositories/",
  config$my.selected.repo
)

# The statements endpoint lives directly under the repository URL.
update.endpoint <- paste0(select.endpoint, "/statements")

# Endpoint that import.from.url() posts its JSON request to.
url.post.endpoint <- paste0(
  config$my.graphdb.base,
  "/rest/data/import/upload/",
  config$my.selected.repo,
  "/url"
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment