Last active
February 28, 2021 13:26
-
-
Save tts/0c87ad15017fb21eea7c3287b698370f to your computer and use it in GitHub Desktop.
Handling a Boolean query
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(stringr) | |
library(tidyverse) | |
#--------------------------------------------------------------- | |
# https://twitter.com/nealhaddaway/status/1362512900716433409 | |
# | |
# His example query | |
#-------------------------------------------------------------- | |
# The example query, kept as one multi-line string. Phrases are wrapped in
# apostrophes, terms inside a parenthesised group are joined with OR, and the
# groups are joined with AND.
q <- "((arable OR agricult* OR farm* OR crop* OR cultivat* OR field*) AND
(plough* OR plow* OR till* OR 'direct drill*' OR fertili* OR biosolid* OR
'bio solid' OR organic OR manur* or sewage OR compost* OR amendment* OR biochar*
OR digestate* OR 'crop residue*' OR 'crop straw*' OR mulch* OR 'crop rotat*' OR 'break crop*'
OR 'grass ley' OR 'clover ley' OR legume* OR 'bioenergy crop*' OR 'cover crop*' OR 'grass clover'
OR 'cropping system*' OR 'crop system' OR 'winter crop*' OR 'spring crop*' OR 'summer fallow*'
OR 'catch crop*' OR intercrop* OR conservation) AND (CH4 OR methane OR CO2 OR 'carbon dioxide'
OR N2O OR 'nitrous oxide' OR GHG* OR 'greenhouse gas*' OR 'green-house gas*') AND (flux* OR dynamic*
OR emission* OR exchang* OR balanc*))"
#----------------------------------
# First, replace hyphens with an Å,
# i.e. make them word chars, so that a
# hyphenated term is extracted as one
# word further down
#----------------------------------
# The hyphen is a literal, not a pattern, hence fixed = TRUE
q <- gsub("-", "Å", q, fixed = TRUE)
#------------------------------------------------------------------ | |
# Detect two-word terms aka phrases (within apostrophes), | |
# and replace white space in them with a Ä, ie make them word chars | |
# | |
# https://stackoverflow.com/a/50205051 | |
#------------------------------------------------------------------ | |
# stringr pattern, written in free-spacing mode (comments = TRUE), that matches
# the text directly following an apostrophe: a word, at most one whitespace
# character, an optional second word and an optional trailing *.
# Only the first two words of a quoted phrase are covered by this pattern.
repl <- regex("
  (?<=')   # positive lookbehind for an ' char
  \\w+     # one or more word chars
  (\\s?)   # optional white space
  \\w*     # optional second word
  \\*?     # optional ending * char
  ", comments = TRUE)
# Replace every whitespace character in `str` with the helper char Ä.
#
# Used as the replacement function for the phrase pattern, so that a quoted
# phrase is later extracted as one single word.
#
# str: character vector to transform.
# Returns `str` with each whitespace character replaced by "Ä".
replace_whitespace <- function(str) {
  str_replace_all(str, "\\s", "Ä")
}
# Glue each quoted phrase together: the whitespace inside every match of
# `repl` is replaced by the helper char Ä
q_whitespace_replaced <- q %>%
  str_replace_all(repl, replace_whitespace)
# A word is a run of word characters and/or * chars; parentheses, apostrophes
# and whitespace therefore act as separators
word_chars <- regex("
  [\\w\\*]+   # one or more word characters or a * char
  ", comments = TRUE)
#------------------------
# Extract words to a list
#------------------------
# str_extract_all() returns a list with one character vector per input
# string, so the words of the query are in words[[1]]
words <- str_extract_all(q_whitespace_replaced, word_chars)
#--------------------------------------- | |
# Detect OR clauses before the next AND | |
#--------------------------------------- | |
# Collect the words of one clause, i.e. everything from `start` up to (but not
# including) the next AND operator.
#
# w:     list whose first element is the character vector of all words.
# start: index of the first word to look at.
# end:   index of the last word to look at; also the length of the result list.
#
# Returns an unnamed list of two elements:
#   [[1]] a list of length `end`; position i holds word i, named "col<i>",
#         for every word collected, all other positions stay NULL
#   [[2]] the index of the AND that stopped the scan, or 1 when no AND was
#         found in start..end (an AND at index 1 gives the same value)
do_q <- function(w, start, end) {
  tokens <- w[[1]]
  l <- vector("list", end)
  stopped <- 1
  # Nothing left to scan. Without this guard start:end would count downwards
  # and walk back over words that were already handled.
  if (start > end) {
    return(list(l, stopped))
  }
  for (i in seq(start, end)) {
    # Only the operator itself ends a clause; a term that merely begins
    # with "AND" (e.g. ANDROID*) is an ordinary word
    if (isTRUE(tokens[i] == "AND")) {
      stopped <- i
      break
    }
    l[i] <- tokens[i]
    names(l)[i] <- paste0("col", i)
  }
  list(l, stopped)
}
#------------------------------- | |
# Store all OR clauses in a list | |
#------------------------------- | |
res_list <- vector("list")
list_length <- length(words[[1]])
# First round: scan from the first word up to the first AND
i <- 1
res <- do_q(words, 1, list_length)
res_list[i] <- res[1]
# Continue right after the AND that stopped the scan
next_list_item <- unlist(res[2]) + 1
# and the rest.
# do_q() reports 1 when it found no further AND, which shows up here as
# next_list_item == 2 and ends the loop. The upper bound stops the loop when
# the very last word is an AND, so that do_q() is never asked to start past
# the end of the words.
while (next_list_item != 2 && next_list_item <= list_length) {
  i <- i + 1
  res <- do_q(words, next_list_item, list_length)
  res_list[i] <- res[1]
  next_list_item <- unlist(res[2]) + 1
}
#--------------------------------------------------------------------------------------------------
# Save only list items with a name "col*", i.e. those that belong to an OR clause
#
# https://community.rstudio.com/t/use-of-grep-to-sub-list-the-list-based-on-its-items-name-in-r/53798/2
#--------------------------------------------------------------------------------------------------
# For every clause keep only the items whose name contains "col"; the
# positions that were never filled have no such name and are dropped.
# lapply() also copes with an empty res_list, which 1:length() does not.
res_list_ors <- lapply(res_list, function(clause) {
  clause[grep("col.*", names(clause))]
})
#------------------------------- | |
# Construct the Boolean query | |
#------------------------------- | |
bstring <- ""
for (i in seq_along(res_list_ors)) {
  # One clause as a plain character vector of words
  terms <- unlist(res_list_ors[[i]])
  # Remove the help Ä char, and add apostrophes, ie restore phrases.
  # The part before the Ä may contain any character.
  b_raw <- gsub("^(.+)Ä(.*)$", "'\\1 \\2'", terms)
  # Replace the help Å char with the original hyphen
  b_raw <- gsub("Å", "-", b_raw, fixed = TRUE)
  # Concat OR terms (the OR words themselves are part of the clause)
  b <- paste(b_raw, collapse = " ")
  # Concat all ORs with AND; the superfluous leading ") AND " is removed later
  bstring <- paste0(bstring, ") AND (", b)
}
#------------------------------------------- | |
# Clean: remove the leading AND, | |
# and wrap the whole string with parentheses | |
#------------------------------------------- | |
# The pattern is anchored, so at most one replacement can happen: sub() is
# enough. The closing parenthesis is escaped so that it is a literal in every
# regex engine.
bstring <- sub("^\\) AND ", "", bstring)
# Opening "(" for the whole query, closing "))" for the last clause + query
bstring <- paste0("(", bstring, "))")
(bstring)
# [1] "((arable OR agricult* OR farm* OR crop* OR cultivat* OR field*) AND | |
# (plough* OR plow* OR till* OR 'direct drill*' OR fertili* OR biosolid* OR 'bio solid' | |
# OR organic OR manur* or sewage OR compost* OR amendment* OR biochar* OR digestate* OR 'crop residue*' | |
# OR 'crop straw*' OR mulch* OR 'crop rotat*' OR 'break crop*' OR 'grass ley' OR 'clover ley' OR legume* | |
# OR 'bioenergy crop*' OR 'cover crop*' OR 'grass clover' OR 'cropping system*' OR 'crop system' OR 'winter crop*' | |
# OR 'spring crop*' OR 'summer fallow*' OR 'catch crop*' OR intercrop* OR conservation) AND (CH4 OR methane OR CO2 | |
# OR 'carbon dioxide' OR N2O OR 'nitrous oxide' OR GHG* OR 'greenhouse gas*' OR 'green-house gas*') AND (flux* | |
# OR dynamic* OR emission* OR exchang* OR balanc*))" |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment