Created
May 5, 2024 20:16
-
-
Save BERENZ/42e8fa513376f947c28f40810d62341b to your computer and use it in GitHub Desktop.
RcppHNSW vs rnndescent
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(data.table)
library(RcppHNSW)
library(rnndescent)
library(tokenizers)
library(text2vec)
library(igraph)
## settings
# Progress-bar toggle passed to text2vec; the script referenced `verbose`
# without ever defining it.
verbose <- FALSE
# Thread count shared by both nearest-neighbour libraries.
n_threads <- 4L

## read and process data
census <- read.csv("https://raw.githubusercontent.com/djvanderlaan/tutorial-reclin-uros2021/main/data/census.csv")
cis <- read.csv("https://raw.githubusercontent.com/djvanderlaan/tutorial-reclin-uros2021/main/data/cis.csv")
setDT(census)
setDT(cis)

# Replace missing date-of-birth parts with "" (by reference).
# The columns are converted to character first: if read.csv() typed them as
# numeric, "" could not be stored in them and paste0() below would embed the
# literal text "NA" in the key.
dob_cols <- c("dob_day", "dob_mon", "dob_year")
blank_missing_dob <- function(dt) {
  dt[, (dob_cols) := lapply(.SD, function(v) {
    v <- as.character(v)
    v[is.na(v)] <- ""
    v
  }), .SDcols = dob_cols]
  invisible(dt)
}
blank_missing_dob(census)
blank_missing_dob(cis)

# Concatenate the identifying fields into one string per record.
census[, txt := paste0(pername1, pername2, sex, dob_day, dob_mon, dob_year,
                       enumcap, enumpc)]
cis[, txt := paste0(pername1, pername2, sex, dob_day, dob_mon, dob_year,
                    enumcap, enumpc)]

# Row positions, used later to compare returned neighbour indices with truth.
census[, x := seq_len(.N)]
cis[, y := seq_len(.N)]

## create shingles and sparse mat
# Character 2-shingles of the concatenated key.
shingle_tokenizer <- function(x) {
  tokenizers::tokenize_character_shingles(x, n = 2)
}

x_tokens <- text2vec::itoken_parallel(
  iterable = census$txt,
  tokenizer = shingle_tokenizer,
  n_chunks = 10, progressbar = verbose)
x_voc <- text2vec::create_vocabulary(x_tokens)
x_vec <- text2vec::vocab_vectorizer(x_voc)
x_dtm <- text2vec::create_dtm(x_tokens, x_vec)

y_tokens <- text2vec::itoken_parallel(
  iterable = cis$txt,
  tokenizer = shingle_tokenizer,
  n_chunks = 10, progressbar = verbose)
y_voc <- text2vec::create_vocabulary(y_tokens)
y_vec <- text2vec::vocab_vectorizer(y_voc)
y_dtm <- text2vec::create_dtm(y_tokens, y_vec)

# Keep only shingles present in both vocabularies so that the two matrices
# share the same columns in the same order.
colnames_xy <- intersect(colnames(x_dtm), colnames(y_dtm))

# Densify once and reuse for both libraries (previously computed twice each).
x_mat <- as.matrix(x_dtm[, colnames_xy])
y_mat <- as.matrix(y_dtm[, colnames_xy])

## hnsw
hnsw_index <- RcppHNSW::hnsw_build(X = x_mat, n_threads = n_threads,
                                   distance = "cosine")
hnsw_result <- RcppHNSW::hnsw_search(X = y_mat, ann = hnsw_index,
                                     k = 1, n_threads = n_threads)

## nndescent
nndes_index <- rnndescent::rnnd_build(data = x_mat, k = 40,
                                      metric = "cosine", low_memory = FALSE,
                                      n_threads = n_threads)
nndes_result <- rnndescent::rnnd_query(nndes_index, query = y_mat, k = 1,
                                       n_threads = n_threads)

## compare with truth
# True pairs: records sharing a person_id in both files.
matches <- merge(x = census[, .(x, person_id)],
                 y = cis[, .(y, person_id)],
                 by = "person_id")
# For each true pair, look up the census row returned as the single nearest
# neighbour of the cis record.
matches[, hnsw := hnsw_result$idx[, 1][matches$y]]
matches[, nndesc := nndes_result$idx[, 1][matches$y]]

## share of correct neighbours
matches[, .(hnsw = mean(hnsw == x), nndesc = mean(nndesc == x))]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment