jonocarroll/fuzzygroup.R

## fuzzygroup.md

      
    Raw
  

              fuzzygroup.md
            
          
    ## Medical Term Fuzzy Grouping
## J. Carroll 2024
## 
## Uses the {zoomerjoin} package: https://github.com/beniaminogreen/zoomerjoin

## read in a set of medical terms, lowercased
terms <- tolower(readLines("https://raw.githubusercontent.com/socd06/medical-nlp/master/data/vocab.txt"))

## example data with typos and inserted words
gi <- c("gastrointestinal disorders", "gastrointestinal tract disorders", "gastreinstestinal disorder")
hep <- c("hepatic encephalopathy", "hepatic encephalapathy", "hepatic encefalopathy")
co <- c("myocarditis", "myocardits", "myocardites")

## find the closest matching word in wordlist, either as a direct string match
## or the lowest Levenshtein distance of all the words in wordlist
match_word <- function(word, wordlist) {
  word <- tolower(word)
  if (word %in% wordlist) return(word)
  wordlist[which.min(adist(word, wordlist)[1, ])]
}

## apply spellchecking to each word of a phrase of words
## joining back into a space-delimited phrase afterwards
spellcheck_phrase <- function(phrase, wordlist) {
  sapply(phrase, \(w) paste(sapply(strsplit(w, " ")[[1]], \(word) match_word(word, wordlist)), collapse = " "), USE.NAMES = FALSE)
}

## e.g. spellcheck the gi terms
spellcheck_phrase(gi, terms)
#> [1] "gastrointestinal disorders"       "gastrointestinal tract disorders"
#> [3] "gastrointestinal disorder"

## create an example dataset containing the (misspelled) terms and some values
meddata <- data.frame(term = c(gi, hep, co), value = LETTERS[1:9])

## stir to ensure randomness works
meddata <- meddata[match(meddata$value, strsplit("FIABDEHCG", "")[[1]]), ]
meddata
#>                               term value
#> 3       gastreinstestinal disorder     C
#> 4           hepatic encephalopathy     D
#> 8                       myocardits     H
#> 5           hepatic encephalapathy     E
#> 6            hepatic encefalopathy     F
#> 1       gastrointestinal disorders     A
#> 9                      myocardites     I
#> 7                      myocarditis     G
#> 2 gastrointestinal tract disorders     B

## add the corrected phrases to the data
meddata$corrected <- sapply(meddata$term, \(x) spellcheck_phrase(x, terms), USE.NAMES = FALSE)
meddata
#>                               term value                        corrected
#> 3       gastreinstestinal disorder     C        gastrointestinal disorder
#> 4           hepatic encephalopathy     D           hepatic encephalopathy
#> 8                       myocardits     H                      myocarditis
#> 5           hepatic encephalapathy     E           hepatic encephalopathy
#> 6            hepatic encefalopathy     F           hepatic encephalopathy
#> 1       gastrointestinal disorders     A       gastrointestinal disorders
#> 9                      myocardites     I                      myocarditis
#> 7                      myocarditis     G                      myocarditis
#> 2 gastrointestinal tract disorders     B gastrointestinal tract disorders

## perform a grouping of the corrected terms, assigning a 'canonical' value to each group
## the parameters here may need to be adjusted, but seem to work for this example data
meddata$group <- zoomerjoin::jaccard_string_group(meddata$corrected, threshold = 0.1)
#> Loading required namespace: igraph
meddata
#>                               term value                        corrected
#> 3       gastreinstestinal disorder     C        gastrointestinal disorder
#> 4           hepatic encephalopathy     D           hepatic encephalopathy
#> 8                       myocardits     H                      myocarditis
#> 5           hepatic encephalapathy     E           hepatic encephalopathy
#> 6            hepatic encefalopathy     F           hepatic encephalopathy
#> 1       gastrointestinal disorders     A       gastrointestinal disorders
#> 9                      myocardites     I                      myocarditis
#> 7                      myocarditis     G                      myocarditis
#> 2 gastrointestinal tract disorders     B gastrointestinal tract disorders
#>                       group
#> 3 gastrointestinal disorder
#> 4    hepatic encephalopathy
#> 8               myocarditis
#> 5    hepatic encephalopathy
#> 6    hepatic encephalopathy
#> 1 gastrointestinal disorder
#> 9               myocarditis
#> 7               myocarditis
#> 2 gastrointestinal disorder

## grouping can now be done as usual
library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
meddata |> 
  group_by(group) |> 
  summarise(res = toString(sort(value)))
#> # A tibble: 3 × 2
#>   group                     res    
#>   <chr>                     <chr>  
#> 1 gastrointestinal disorder A, B, C
#> 2 hepatic encephalopathy    D, E, F
#> 3 myocarditis               G, H, I
^{Created on 2024-03-13 with reprex v2.0.2}

  
## fuzzygroup.R
## Medical Term Fuzzy Grouping
## J. Carroll 2024
##
## Uses the {zoomerjoin} package: https://github.com/beniaminogreen/zoomerjoin

## read in a set of medical terms, lowercased
terms <- tolower(readLines("https://raw.githubusercontent.com/socd06/medical-nlp/master/data/vocab.txt"))

## example data with typos and inserted words
gi <- c("gastrointestinal disorders", "gastrointestinal tract disorders", "gastreinstestinal disorder")
hep <- c("hepatic encephalopathy", "hepatic encephalapathy", "hepatic encefalopathy")
co <- c("myocarditis", "myocardits", "myocardites")

## find the closest matching word in wordlist, either as a direct string match
## or the lowest Levenshtein distance of all the words in wordlist
match_word <- function(word, wordlist) {
  word <- tolower(word)
  if (word %in% wordlist) return(word)
  wordlist[which.min(adist(word, wordlist)[1, ])]
}

## apply spellchecking to each word of a phrase of words
## joining back into a space-delimited phrase afterwards
spellcheck_phrase <- function(phrase, wordlist) {
  sapply(phrase, \(w) paste(sapply(strsplit(w, " ")[[1]], \(word) match_word(word, wordlist)), collapse = " "), USE.NAMES = FALSE)
}

## e.g. spellcheck the gi terms
spellcheck_phrase(gi, terms)

## create an example dataset containing the (misspelled) terms and some values
meddata <- data.frame(term = c(gi, hep, co), value = LETTERS[1:9])

## stir to ensure randomness works
meddata <- meddata[match(meddata$value, strsplit("FIABDEHCG", "")[[1]]), ]
meddata

## add the corrected phrases to the data
meddata$corrected <- sapply(meddata$term, \(x) spellcheck_phrase(x, terms), USE.NAMES = FALSE)
meddata

## perform a grouping of the corrected terms, assigning a 'canonical' value to each group
## the parameters here may need to be adjusted, but seem to work for this example data
meddata$group <- zoomerjoin::jaccard_string_group(meddata$corrected, threshold = 0.1)
meddata

## grouping can now be done as usual
library(dplyr)
meddata |>
  group_by(group) |>
  summarise(res = toString(sort(value)))
	## Medical Term Fuzzy Grouping
	## J. Carroll 2024
	##
	## Uses the {zoomerjoin} package: https://github.com/beniaminogreen/zoomerjoin

	## read in a set of medical terms, lowercased
	terms <- tolower(readLines("https://raw.githubusercontent.com/socd06/medical-nlp/master/data/vocab.txt"))

	## example data with typos and inserted words
	gi <- c("gastrointestinal disorders", "gastrointestinal tract disorders", "gastreinstestinal disorder")
	hep <- c("hepatic encephalopathy", "hepatic encephalapathy", "hepatic encefalopathy")
	co <- c("myocarditis", "myocardits", "myocardites")

	## find the closest matching word in wordlist, either as a direct string match
	## or the lowest Levenshtein distance of all the words in wordlist
	match_word <- function(word, wordlist) {
	word <- tolower(word)
	if (word %in% wordlist) return(word)
	wordlist[which.min(adist(word, wordlist)[1, ])]
	}

	## apply spellchecking to each word of a phrase of words
	## joining back into a space-delimited phrase afterwards
	spellcheck_phrase <- function(phrase, wordlist) {
	sapply(phrase, \(w) paste(sapply(strsplit(w, " ")[[1]], \(word) match_word(word, wordlist)), collapse = " "), USE.NAMES = FALSE)
	}

	## e.g. spellcheck the gi terms
	spellcheck_phrase(gi, terms)

	## create an example dataset containing the (misspelled) terms and some values
	meddata <- data.frame(term = c(gi, hep, co), value = LETTERS[1:9])

	## stir to ensure randomness works
	meddata <- meddata[match(meddata$value, strsplit("FIABDEHCG", "")[[1]]), ]
	meddata

	## add the corrected phrases to the data
	meddata$corrected <- sapply(meddata$term, \(x) spellcheck_phrase(x, terms), USE.NAMES = FALSE)
	meddata

	## perform a grouping of the corrected terms, assigning a 'canonical' value to each group
	## the parameters here may need to be adjusted, but seem to work for this example data
	meddata$group <- zoomerjoin::jaccard_string_group(meddata$corrected, threshold = 0.1)
	meddata

	## grouping can now be done as usual
	library(dplyr)
	meddata \|>
	group_by(group) \|>
	summarise(res = toString(sort(value)))