-
-
Save mskyttner/256b63db0d5bb542cd21aa9e9d160cc0 to your computer and use it in GitHub Desktop.
R script to generate a SQLite3 database with an FTS5 full-text search index from a directory of text files
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# First, in ~/repos, run:
#   git clone https://github.com/jart/cosmopolitan.git
library(purrr) | |
library(RSQLite) | |
library(dplyr) | |
#' Build a SQLite database with an FTS5 index over the source files of a repo
#'
#' Recursively lists the files under `repo` whose names match `regex_filter`,
#' reads each one, drops the lines that start with a box-drawing character
#' (`│`, `╞`, `╚`) or with `/*`, and stores the remaining text in a `sources`
#' table. An FTS5 virtual table `fts` with the columns `fullpath`, `file`,
#' `path` and `code` is then populated from `sources`.
#'
#' Any existing file at `mydb` is deleted first. Files that cannot be read are
#' skipped (with a warning giving the count) instead of aborting the build.
#'
#' @param repo Directory to scan (tilde expansion is applied).
#' @param mydb Path of the SQLite database file to create.
#' @param regex_filter Regular expression selecting the file names to index.
#' @return Called for its side effects; the result of the final `message()`
#'   call (invisible `NULL`).
create_cosmosources_db <- function(repo = "~/repos/cosmopolitan",
                                   mydb = "/tmp/cosmosources.db",
                                   regex_filter = "\\.c$") {

  if (!dir.exists(repo)) {
    stop("Repository directory not found: ", repo, call. = FALSE)
  }

  # Paths relative to `repo`. The listing is taken exactly once so that each
  # row's metadata is derived from the very path its contents were read from.
  rel_paths <- dir(repo, recursive = TRUE, full.names = FALSE,
                   pattern = regex_filter)
  if (length(rel_paths) == 0L) {
    stop("No files matching '", regex_filter, "' found under ", repo,
         call. = FALSE)
  }

  # Read one file and return a one-row tibble describing it.
  clean_source <- function(rel_path) {
    lines <- readLines(con = file.path(repo, rel_path), warn = FALSE)
    kept <- grep("^([│╞╚])|^(/[*])", lines,
                 invert = TRUE, value = TRUE, perl = TRUE)
    tibble(
      file = basename(rel_path),
      code = paste(kept, collapse = "\n"),
      # `path` is the containing directory and `fullpath` the file's path,
      # both relative to `repo`, so the file name is not repeated in
      # `fullpath`.
      path = dirname(rel_path),
      fullpath = rel_path
    )
  }

  # NULL (rather than NA_character_) so that failed reads are simply dropped
  # when the per-file tibbles are row-bound.
  read_source <- possibly(.f = clean_source, otherwise = NULL)

  sources <- rel_paths %>%
    map(read_source) %>%
    bind_rows()

  if (nrow(sources) == 0L) {
    stop("None of the ", length(rel_paths), " matching files could be read.",
         call. = FALSE)
  }
  n_skipped <- length(rel_paths) - nrow(sources)
  if (n_skipped > 0L) {
    warning("Skipped ", n_skipped, " file(s) that could not be read.",
            call. = FALSE)
  }

  if (file.exists(mydb)) unlink(mydb)

  con <- dbConnect(SQLite(), mydb)
  on.exit(dbDisconnect(con), add = TRUE)

  copy_to(con, sources, name = "sources", overwrite = TRUE)

  con %>% dbExecute(
    "create virtual table fts using fts5(
      fullpath, file, path, code
    );")

  con %>% dbExecute(
    "insert into fts select fullpath, file, path, code from sources;"
  )

  # Extract the scalar count instead of passing a data frame to message().
  n_keys <- dbGetQuery(con, "select count(*) from fts;")[[1]]

  message("Added FTS index for ", n_keys, " entries.")
  message("Database created at ", mydb)
}
#' Run an FTS5 MATCH query against the `fts` table
#'
#' Selects `file`, the highlighted `fullpath` (returned as `location`), the
#' highlighted `code`, a short snippet of `code` (returned as `snip`) and the
#' FTS5 `rank` (returned as `score`), ordered by rank. Matches are wrapped in
#' `<b>`/`</b>`.
#'
#' The query text is bound as a statement parameter rather than pasted into
#' the SQL string, so queries containing quotes cannot break or alter the
#' statement.
#'
#' @param con An open DBI connection to a database that contains the `fts`
#'   table.
#' @param query A single string holding an FTS5 MATCH expression, e.g.
#'   `"code: lua*"`.
#' @param n_words_snip Maximum number of tokens in the snippet; FTS5 requires
#'   a value between 1 and 64.
#' @return A tibble with the columns `file`, `location`, `code`, `snip` and
#'   `score`.
search_db <- function(con, query, n_words_snip = 6) {

  if (!is.character(query) || length(query) != 1L || is.na(query)) {
    stop("`query` must be a single, non-missing string.", call. = FALSE)
  }

  # Coerce before interpolating so only a validated integer reaches the SQL.
  if (length(n_words_snip) != 1L) {
    stop("`n_words_snip` must be a single value.", call. = FALSE)
  }
  n_words_snip <- suppressWarnings(as.integer(n_words_snip))
  if (is.na(n_words_snip) || n_words_snip < 1L || n_words_snip > 64L) {
    stop("`n_words_snip` must be an integer between 1 and 64.", call. = FALSE)
  }

  # Columns 0 and 3 of `fts` are `fullpath` and `code` respectively.
  highlights <- paste(collapse = ", ", sprintf(
    "highlight(fts, %d, '<b>', '</b>') %s",
    c(0L, 3L), c("location", "code")))

  snippet <- sprintf("snippet(fts, 3, '<b>', '</b>', '', %d) as snip",
                     n_words_snip)

  # `?` is a positional placeholder filled from `params` below.
  sql <- sprintf(paste(
    "select file, %s, %s, rank as score from fts",
    "where fts match ? order by rank;"),
    highlights, snippet)

  message("Searching for ", query, " using ", sql)

  con %>%
    dbGetQuery(sql, params = list(query)) %>%
    as_tibble()
}
# Build the database of cosmopolitan .c sources together with its FTS5 index.
repo <- "~/repos/cosmopolitan"
mydb <- "/tmp/cosmosources.db"
create_cosmosources_db(repo = repo, mydb = mydb)

# Try a handful of FTS MATCH queries against the freshly built database
# (background: https://www.sqlitetutorial.net/sqlite-full-text-search/).
con <- dbConnect(SQLite(), mydb)

dbListTables(con)

# Column-restricted prefix query, keeping only the file name and the snippet.
select(search_db(con, "code: lua*"), file, snip)

# Plain single-term queries across all indexed columns.
search_db(con, "python")
search_db(con, "rust")
search_db(con, "crashreport")
search_db(con, "arm")

dbDisconnect(con)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment