Skip to content

Instantly share code, notes, and snippets.

@mskyttner
Created June 11, 2021 10:52
Show Gist options
  • Save mskyttner/256b63db0d5bb542cd21aa9e9d160cc0 to your computer and use it in GitHub Desktop.
Save mskyttner/256b63db0d5bb542cd21aa9e9d160cc0 to your computer and use it in GitHub Desktop.
R script that builds a SQLite database with an FTS5 full-text search index from a directory of text files
# first in ~/repos do this:
# git clone https://github.com/jart/cosmopolitan.git
library(purrr)
library(RSQLite)
library(dplyr)
#' Build a SQLite database with an FTS5 index over source files in a repo
#'
#' Recursively lists the files under `repo` whose names match `regex_filter`,
#' reads each one (dropping lines that start with a box-drawing character or
#' with `/*`), and stores the result in a fresh SQLite database at `mydb`
#' together with an FTS5 virtual table named `fts`.
#'
#' @param repo Directory to scan recursively.
#' @param mydb Path of the SQLite file to create. An existing file at this
#'   path is deleted first.
#' @param regex_filter Regular expression passed to `dir(pattern = )` to
#'   select the files to index.
#' @return Called for its side effects; returns the (invisible) result of the
#'   final `message()` call.
create_cosmosources_db <- function(repo = "~/repos/cosmopolitan",
                                   mydb = "/tmp/cosmosources.db",
                                   regex_filter = "\\.c$") {

  # File paths relative to `repo`, e.g. "libc/calls/foo.c".
  rel_paths <- dir(repo, recursive = TRUE, full.names = FALSE,
                   pattern = regex_filter)

  # Read one file and return a one-row tibble. All four columns are derived
  # from the same relative path here, so a file that fails to read can never
  # shift the remaining rows out of alignment.
  clean_source <- function(rel_path) {
    code <- grep("^([│╞╚])|^(/[*])",
                 readLines(con = file.path(repo, rel_path), warn = FALSE),
                 invert = TRUE, value = TRUE, perl = TRUE) %>%
      paste(collapse = "\n")
    tibble(
      file = basename(rel_path),
      code = code,
      path = dirname(rel_path),  # directory only, without the file name
      fullpath = rel_path
    )
  }

  # Unreadable files yield NULL, which bind_rows() silently drops.
  read_source <- possibly(.f = clean_source, otherwise = NULL)

  sources <- bind_rows(map(rel_paths, read_source))

  # Fail early and clearly, before touching any existing database file.
  if (nrow(sources) == 0L) {
    stop("No readable files matching '", regex_filter, "' found under ",
         repo, call. = FALSE)
  }

  if (file.exists(mydb)) unlink(mydb)

  con <- dbConnect(SQLite(), mydb)
  on.exit(dbDisconnect(con), add = TRUE)

  # NOTE(review): dplyr's copy_to() presumably creates a temporary table on
  # database backends by default, in which case only `fts` persists in the
  # file - confirm if the plain `sources` table is needed afterwards.
  copy_to(con, sources, name = "sources", overwrite = TRUE)

  dbExecute(
    con,
    "create virtual table fts using fts5(
    fullpath, file, path, code
    );"
  )

  dbExecute(
    con,
    "insert into fts select fullpath, file, path, code from sources;"
  )

  # Extract the scalar count instead of passing a data frame to message().
  n_keys <- dbGetQuery(con, "select count(*) from fts;")[[1]]

  message("Added FTS index for ", n_keys, " entries.")
  message("Database created at ", mydb)
}
#' Run an FTS5 MATCH query against the `fts` table
#'
#' Builds a select statement that returns the file name, highlighted versions
#' of FTS columns 0 (aliased `location`) and 3 (aliased `code`), a snippet of
#' column 3 (aliased `snip`) and the FTS rank (aliased `score`), ordered by
#' rank.
#'
#' @param con A DBI connection to a database containing the `fts` table.
#' @param query FTS5 MATCH expression, e.g. `"code: lua*"`. It is bound as a
#'   statement parameter, so quotes inside it cannot break or alter the SQL.
#' @param n_words_snip Maximum number of tokens in the returned snippet.
#' @return A tibble with columns `file`, `location`, `code`, `snip`, `score`.
search_db <- function(con, query, n_words_snip = 6) {

  stopifnot(
    is.character(query), length(query) == 1L, !is.na(query),
    is.numeric(n_words_snip), length(n_words_snip) == 1L,
    !is.na(n_words_snip), n_words_snip >= 1
  )

  highlights <- paste(collapse = ", ", sprintf(
    "highlight(fts, %s, '<b>', '</b>') %s",
    c(0, 3), c("location", "code")))

  # Format as an integer so nothing but a number can reach the SQL text.
  snippet <- sprintf("snippet(fts, 3, '<b>', '</b>', '', %d) as snip",
                     as.integer(n_words_snip))

  # The MATCH expression is a `?` placeholder, filled in via `params` below.
  sql <- sprintf(paste(
    "select file, %s, %s, rank as score from fts",
    "where fts match ? order by rank;"),
    highlights, snippet)

  message("Searching for ", query, " using ", sql)

  dbGetQuery(con, sql, params = list(query)) %>% as_tibble()
}
# Build the database: .c sources of cosmopolitan plus an FTS5 index
repo <- "~/repos/cosmopolitan"
mydb <- "/tmp/cosmosources.db"
create_cosmosources_db(repo = repo, mydb = mydb)

# Run a few FTS MATCH queries against it
# (see https://www.sqlitetutorial.net/sqlite-full-text-search/)
con <- dbConnect(SQLite(), mydb)
dbListTables(con)
select(search_db(con, "code: lua*"), file, snip)
search_db(con, "python")
search_db(con, "rust")
search_db(con, "crashreport")
search_db(con, "arm")
dbDisconnect(con)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment