Skip to content

Instantly share code, notes, and snippets.

@tts
Last active February 28, 2021 13:26
Show Gist options
  • Save tts/0c87ad15017fb21eea7c3287b698370f to your computer and use it in GitHub Desktop.
Save tts/0c87ad15017fb21eea7c3287b698370f to your computer and use it in GitHub Desktop.
Handling a Boolean query
library(stringr)
library(tidyverse)
#---------------------------------------------------------------
# https://twitter.com/nealhaddaway/status/1362512900716433409
#
# His example query
#--------------------------------------------------------------
q <- "((arable OR agricult* OR farm* OR crop* OR cultivat* OR field*) AND
(plough* OR plow* OR till* OR 'direct drill*' OR fertili* OR biosolid* OR
'bio solid' OR organic OR manur* or sewage OR compost* OR amendment* OR biochar*
OR digestate* OR 'crop residue*' OR 'crop straw*' OR mulch* OR 'crop rotat*' OR 'break crop*'
OR 'grass ley' OR 'clover ley' OR legume* OR 'bioenergy crop*' OR 'cover crop*' OR 'grass clover'
OR 'cropping system*' OR 'crop system' OR 'winter crop*' OR 'spring crop*' OR 'summer fallow*'
OR 'catch crop*' OR intercrop* OR conservation) AND (CH4 OR methane OR CO2 OR 'carbon dioxide'
OR N2O OR 'nitrous oxide' OR GHG* OR 'greenhouse gas*' OR 'green-house gas*') AND (flux* OR dynamic*
OR emission* OR exchang* OR balanc*))"
#--------------------------------------------------------
# Mask hyphens: swap every "-" for the helper letter Å, so
# a hyphenated term is made up of word characters only.
# The hyphen is matched as a plain string, not as a regex.
#--------------------------------------------------------
q <- gsub("-", "Å", q, fixed = TRUE)
#------------------------------------------------------------------
# Pattern for quoted phrases (terms wrapped in apostrophes).
# It is used below to replace the white space inside a phrase with
# the helper char Ä, i.e. to turn the phrase into word chars only.
#
# A match starts right after an apostrophe and covers one word, at
# most one white space char, an optional second word and an optional
# trailing *.
# NOTE(review): phrases of three or more words look only partially
# covered by this pattern - confirm before feeding it such queries.
#
# https://stackoverflow.com/a/50205051
#------------------------------------------------------------------
repl <- regex("
(?<=') # positive lookbehind for an ' char
\\w+ # one or more word chars
(\\s?) # optional white space
\\w* # optional word(s)
\\*? # optional ending * char
", comments = TRUE)
# Return `str` with every white space character swapped for the helper char Ä.
replace_whitespace <- function(str) {
  placeholder <- "Ä"
  str_replace_all(string = str, pattern = "\\s", replacement = placeholder)
}
# Run the phrase pattern over the query; each match is passed to
# replace_whitespace() and replaced by what that function returns
q_whitespace_replaced <- str_replace_all(q, repl, replace_whitespace)
#-------------------------------------------------------------
# Token pattern, followed by the extraction of all tokens.
# `words` is a list; its first element holds the query's tokens
#-------------------------------------------------------------
word_chars <- regex("
[\\w\\*]+ # one or more word characters or a * char
", comments = TRUE)
words <- q_whitespace_replaced %>%
  str_extract_all(word_chars)
#---------------------------------------
# Detect OR clauses before the next AND
#---------------------------------------
# Collect the tokens of one OR clause: everything from position `start`
# up to (but not including) the next AND operator.
#
# Arguments:
#   w      A list whose first element is the character vector of query
#          tokens (the shape `words` has in this script).
#   start  Index of the first token to scan.
#   end    Index of the last token to scan.
#
# Returns an unnamed two-element list:
#   [[1]]  a list of length `end`; every scanned non-AND token i is stored
#          in slot i under the name "col<i>", all other slots stay empty.
#   [[2]]  the index of the AND token that ended the scan, or 1 when no
#          AND was met (including the case that there was nothing to scan).
do_q <- function(w, start, end) {
  tokens <- w[[1]]
  l <- vector("list", end)
  stopped <- 1
  # `start:end` counts downwards when start > end, which would re-scan
  # earlier tokens; treat an inverted range as "nothing left to scan".
  if (start > end) {
    return(list(l, stopped))
  }
  for (i in start:end) {
    # Compare the whole token: a prefix test such as "^AND" would mistake
    # search terms like "ANDROID*" for the operator. isTRUE() keeps a
    # missing token (NA) from being read as AND.
    if (isTRUE(tokens[i] == "AND")) {
      stopped <- i
      break
    }
    l[i] <- tokens[i]
    names(l)[i] <- paste0("col", i)
  }
  list(l, stopped)
}
#-------------------------------
# Store all OR clauses in a list
#-------------------------------
# One do_q() call per OR clause. do_q() reports the position of the AND it
# stopped at, so the following scan starts one token later. It reports 1
# when no AND was met, which makes the next start position 2 and ends the
# loop.
res_list <- vector("list")
list_length <- length(words[[1]])
i <- 0
next_list_item <- 1
repeat {
  i <- i + 1
  res <- do_q(words, next_list_item, list_length)
  # res[1] is the clause's token list, res[2] the stop position
  res_list[i] <- res[1]
  next_list_item <- unlist(res[2]) + 1
  if (next_list_item == 2) {
    break
  }
}
#--------------------------------------------------------------------------------------------------
# Keep only list items with a name "col*", i.e. the tokens of an OR clause
#
# https://community.rstudio.com/t/use-of-grep-to-sub-list-the-list-based-on-its-items-name-in-r/53798/2
#--------------------------------------------------------------------------------------------------
# For every OR group keep only the slots that do_q() named "col<i>"; the
# other slots of a group are empty placeholders.
# lapply() builds the result in one go and also copes with an empty
# res_list, where 1:length(res_list) would iterate over c(1, 0).
res_list_ors <- lapply(res_list, function(group) {
  group[grep("col.*", names(group))]
})
#-------------------------------
# Construct the Boolean query
#-------------------------------
# Turn each OR group back into text. The tokens of a group (terms and their
# OR operators) are joined with single spaces.
or_groups <- vapply(res_list_ors, function(group) {
  terms <- as.character(unlist(group, use.names = FALSE))
  # A token carrying the help Ä char was a quoted phrase: turn every Ä back
  # into a space and re-add the apostrophes. Plain-string matching is used so
  # that no letter of the phrase itself (e.g. a capital X) can keep the
  # phrase from being restored.
  is_phrase <- grepl("Ä", terms, fixed = TRUE)
  terms[is_phrase] <- paste0(
    "'", gsub("Ä", " ", terms[is_phrase], fixed = TRUE), "'"
  )
  # Replace the help Å char with the original hyphen
  terms <- gsub("Å", "-", terms, fixed = TRUE)
  paste(terms, collapse = " ")
}, character(1))
#-------------------------------------------
# Join all OR groups with AND, and wrap the
# whole string with double parentheses
#-------------------------------------------
bstring <- paste0("((", paste(or_groups, collapse = ") AND ("), "))")
(bstring)
# [1] "((arable OR agricult* OR farm* OR crop* OR cultivat* OR field*) AND
# (plough* OR plow* OR till* OR 'direct drill*' OR fertili* OR biosolid* OR 'bio solid'
# OR organic OR manur* or sewage OR compost* OR amendment* OR biochar* OR digestate* OR 'crop residue*'
# OR 'crop straw*' OR mulch* OR 'crop rotat*' OR 'break crop*' OR 'grass ley' OR 'clover ley' OR legume*
# OR 'bioenergy crop*' OR 'cover crop*' OR 'grass clover' OR 'cropping system*' OR 'crop system' OR 'winter crop*'
# OR 'spring crop*' OR 'summer fallow*' OR 'catch crop*' OR intercrop* OR conservation) AND (CH4 OR methane OR CO2
# OR 'carbon dioxide' OR N2O OR 'nitrous oxide' OR GHG* OR 'greenhouse gas*' OR 'green-house gas*') AND (flux*
# OR dynamic* OR emission* OR exchang* OR balanc*))"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment