# pr130/find_similar_words.R

## find_similar_words.R
library(stringr)
library(dplyr)
library(stringdist)

# Example data: a column of words in which neighbouring entries may be
# near-duplicates of each other (e.g. spelling variants).
df <- tibble(word = c("Agrilus pilosivittatus",
                      "Agrilus pilosovittatus",
                      "foo",
                      "bar",
                      "baz",
                      "something else",
                      "some stuff",
                      "some stff"))

# Create a lag variable (this "shifts" the whole column down by one row, so
# every word sits next to the word before it; the first row gets NA) and
# record the row position of every word.
df <- df %>%
  mutate(
    word_before = lag(word),
    row_num = row_number()
  )

# look at the new data (see the "shift"?)
df

# Calculate the Levenshtein distance between each word and the word before it.
# method = "lv" is spelled out on purpose: stringdist() defaults to "osa"
# (optimal string alignment), which additionally counts a swap of two adjacent
# characters as a single edit and is therefore not the Levenshtein distance.
# The first row has no word before it, so its distance is NA.
df <- df %>%
  mutate(dist = stringdist::stringdist(word, word_before, method = "lv"))

df

# Decide which rows to keep based on the Levenshtein distance.
# filter() drops rows where the condition is NA, so the first row (dist = NA)
# is never selected and `rows_to_keep - 1` below never produces row 0.
rows_to_keep <- df %>%
  filter(dist < 2) %>% # adapt here to keep pairs with more / less Levenshtein distance
  pull(row_num) # get the row_num variable as a vector

rows_to_keep

# We also want the word before each of these rows, so for each row number we
# also keep the row before it.
# The c() means we "concatenate" the two vectors.
rows_to_keep # we want this
rows_to_keep - 1 # "plus" this
all_rows_to_keep <- c(rows_to_keep, rows_to_keep - 1)

sort(all_rows_to_keep) # just to look at it

# get words!!
df %>%
  filter(row_num %in% all_rows_to_keep) %>%
  select(word)
	library(stringr)
	library(dplyr)
	library(stringdist)

	# NOTE(review): from here on the file appears to repeat the script above
	# (same data, same steps, tab-indented) -- consider keeping only one copy.

	# Example data: a column of words in which neighbouring entries may be
	# near-duplicates of each other (e.g. spelling variants).
	df <- tibble(word = c("Agrilus pilosivittatus",
	                      "Agrilus pilosovittatus",
	                      "foo",
	                      "bar",
	                      "baz",
	                      "something else",
	                      "some stuff",
	                      "some stff"))

	# Create a lag variable (this "shifts" the whole column down by one row, so
	# every word sits next to the word before it; the first row gets NA) and
	# record the row position of every word.
	df <- df %>%
	  mutate(
	    word_before = lag(word),
	    row_num = row_number()
	  )

	# look at the new data (see the "shift"?)
	df

	# Calculate the Levenshtein distance between each word and the word before it.
	# method = "lv" is spelled out on purpose: stringdist() defaults to "osa"
	# (optimal string alignment), which additionally counts a swap of two
	# adjacent characters as a single edit and is therefore not the Levenshtein
	# distance. The first row has no word before it, so its distance is NA.
	df <- df %>%
	  mutate(dist = stringdist::stringdist(word, word_before, method = "lv"))

	df

	# Decide which rows to keep based on the Levenshtein distance.
	# filter() drops rows where the condition is NA, so the first row (dist = NA)
	# is never selected and `rows_to_keep - 1` below never produces row 0.
	rows_to_keep <- df %>%
	  filter(dist < 2) %>% # adapt here to keep pairs with more / less Levenshtein distance
	  pull(row_num) # get the row_num variable as a vector

	rows_to_keep

	# We also want the word before each of these rows, so for each row number we
	# also keep the row before it.
	# The c() means we "concatenate" the two vectors.
	rows_to_keep # we want this
	rows_to_keep - 1 # "plus" this
	all_rows_to_keep <- c(rows_to_keep, rows_to_keep - 1)

	sort(all_rows_to_keep) # just to look at it

	# get words!!
	df %>%
	  filter(row_num %in% all_rows_to_keep) %>%
	  select(word)