@BERENZ
Created May 5, 2024 20:16
RcppHNSW vs rnndescent
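## Compare two approximate nearest-neighbour libraries, RcppHNSW and rnndescent,
## for linking records between the census and CIS example datasets.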
library(RcppHNSW)
library(rnndescent)
library(tokenizers)
library(text2vec)
library(data.table) ## needed for setDT() and the data.table syntax used below
library(igraph)
## read and process data
census <- read.csv("https://raw.githubusercontent.com/djvanderlaan/tutorial-reclin-uros2021/main/data/census.csv")
cis <- read.csv("https://raw.githubusercontent.com/djvanderlaan/tutorial-reclin-uros2021/main/data/cis.csv")
setDT(census)
setDT(cis)
census[is.na(dob_day), dob_day := ""]
census[is.na(dob_mon), dob_mon := ""]
census[is.na(dob_year), dob_year := ""]
cis[is.na(dob_day), dob_day := ""]
cis[is.na(dob_mon), dob_mon := ""]
cis[is.na(dob_year), dob_year := ""]
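## concatenate the identifying fields into a single string per record;
## character shingles of these strings are the features for the ANN search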
census[, txt:=paste0(pername1, pername2, sex, dob_day, dob_mon, dob_year, enumcap, enumpc)]
cis[, txt:=paste0(pername1, pername2, sex, dob_day, dob_mon, dob_year, enumcap, enumpc)]
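## row identifiers, used later to check predicted matches against the truth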
census[, x:=1:.N]
cis[, y:=1:.N]
## create shingles and sparse mat
x_tokens <- text2vec::itoken_parallel(
  iterable = census$txt,
  tokenizer = function(x) tokenizers::tokenize_character_shingles(x, n = 2),
  n_chunks = 10, progressbar = FALSE)
x_voc <- text2vec::create_vocabulary(x_tokens)
x_vec <- text2vec::vocab_vectorizer(x_voc)
x_dtm <- text2vec::create_dtm(x_tokens, x_vec)
y_tokens <- text2vec::itoken_parallel(
  iterable = cis$txt,
  tokenizer = function(x) tokenizers::tokenize_character_shingles(x, n = 2),
  n_chunks = 10, progressbar = FALSE)
y_voc <- text2vec::create_vocabulary(y_tokens)
y_vec <- text2vec::vocab_vectorizer(y_voc)
y_dtm <- text2vec::create_dtm(y_tokens, y_vec)
colnames_xy <- intersect(colnames(x_dtm), colnames(y_dtm))
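## keep only the shingles present in both vocabularies so that
## both document-term matrices share the same feature space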
## hnsw
hnsw_index <- RcppHNSW::hnsw_build(X = as.matrix(x_dtm[, colnames_xy]),
                                   distance = "cosine", n_threads = 4)
hnsw_result <- RcppHNSW::hnsw_search(X = as.matrix(y_dtm[, colnames_xy]),
                                     ann = hnsw_index, k = 1, n_threads = 4)
## nndescent
nndes_index <- rnndescent::rnnd_build(data = as.matrix(x_dtm[, colnames_xy]),
                                      k = 40, metric = "cosine",
                                      low_memory = FALSE, n_threads = 4)
nndes_result <- rnndescent::rnnd_query(nndes_index, query = as.matrix(y_dtm[, colnames_xy]),
                                       k = 1, n_threads = 4)
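## both results contain an idx matrix with one row per cis record;
## column 1 holds the index of the (approximate) nearest census record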
## compare with truth
matches <- merge(x = census[, .(x, person_id)],
                 y = cis[, .(y, person_id)],
                 by = "person_id")
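## for every true pair, look up which census row each method returned for cis record y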
matches[, hnsw:= hnsw_result$idx[,1][matches$y]]
matches[, nndesc:= nndes_result$idx[,1][matches$y]]
## share of correct neighbours
matches[, .(hnsw = mean(hnsw == x), nndesc = mean(nndesc == x))]