Skip to content

Instantly share code, notes, and snippets.

@jonocarroll
Created March 13, 2024 05:24
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jonocarroll/e18f6eeee39875ccd7c4a65411ff1d0a to your computer and use it in GitHub Desktop.
Save jonocarroll/e18f6eeee39875ccd7c4a65411ff1d0a to your computer and use it in GitHub Desktop.
Perform fuzzy grouping on medical terms
## Medical Term Fuzzy Grouping
## J. Carroll 2024
## 
## Uses the {zoomerjoin} package: https://github.com/beniaminogreen/zoomerjoin

## read in a set of medical terms, lowercased
terms <- tolower(readLines("https://raw.githubusercontent.com/socd06/medical-nlp/master/data/vocab.txt"))

## example data with typos and inserted words
gi <- c("gastrointestinal disorders", "gastrointestinal tract disorders", "gastreinstestinal disorder")
hep <- c("hepatic encephalopathy", "hepatic encephalapathy", "hepatic encefalopathy")
co <- c("myocarditis", "myocardits", "myocardites")

## find the closest matching word in wordlist, either as a direct string match
## or the lowest Levenshtein distance of all the words in wordlist
match_word <- function(word, wordlist) {
  word <- tolower(word)
  if (word %in% wordlist) return(word)
  wordlist[which.min(adist(word, wordlist)[1, ])]
}

## apply spellchecking to each word of a phrase of words
## joining back into a space-delimited phrase afterwards
spellcheck_phrase <- function(phrase, wordlist) {
  sapply(phrase, \(w) paste(sapply(strsplit(w, " ")[[1]], \(word) match_word(word, wordlist)), collapse = " "), USE.NAMES = FALSE)
}

## e.g. spellcheck the gi terms
spellcheck_phrase(gi, terms)
#> [1] "gastrointestinal disorders"       "gastrointestinal tract disorders"
#> [3] "gastrointestinal disorder"

## create an example dataset containing the (misspelled) terms and some values
meddata <- data.frame(term = c(gi, hep, co), value = LETTERS[1:9])

## stir to ensure randomness works
meddata <- meddata[match(meddata$value, strsplit("FIABDEHCG", "")[[1]]), ]
meddata
#>                               term value
#> 3       gastreinstestinal disorder     C
#> 4           hepatic encephalopathy     D
#> 8                       myocardits     H
#> 5           hepatic encephalapathy     E
#> 6            hepatic encefalopathy     F
#> 1       gastrointestinal disorders     A
#> 9                      myocardites     I
#> 7                      myocarditis     G
#> 2 gastrointestinal tract disorders     B

## add the corrected phrases to the data
meddata$corrected <- sapply(meddata$term, \(x) spellcheck_phrase(x, terms), USE.NAMES = FALSE)
meddata
#>                               term value                        corrected
#> 3       gastreinstestinal disorder     C        gastrointestinal disorder
#> 4           hepatic encephalopathy     D           hepatic encephalopathy
#> 8                       myocardits     H                      myocarditis
#> 5           hepatic encephalapathy     E           hepatic encephalopathy
#> 6            hepatic encefalopathy     F           hepatic encephalopathy
#> 1       gastrointestinal disorders     A       gastrointestinal disorders
#> 9                      myocardites     I                      myocarditis
#> 7                      myocarditis     G                      myocarditis
#> 2 gastrointestinal tract disorders     B gastrointestinal tract disorders

## perform a grouping of the corrected terms, assigning a 'canonical' value to each group
## the parameters here may need to be adjusted, but seem to work for this example data
meddata$group <- zoomerjoin::jaccard_string_group(meddata$corrected, threshold = 0.1)
#> Loading required namespace: igraph
meddata
#>                               term value                        corrected
#> 3       gastreinstestinal disorder     C        gastrointestinal disorder
#> 4           hepatic encephalopathy     D           hepatic encephalopathy
#> 8                       myocardits     H                      myocarditis
#> 5           hepatic encephalapathy     E           hepatic encephalopathy
#> 6            hepatic encefalopathy     F           hepatic encephalopathy
#> 1       gastrointestinal disorders     A       gastrointestinal disorders
#> 9                      myocardites     I                      myocarditis
#> 7                      myocarditis     G                      myocarditis
#> 2 gastrointestinal tract disorders     B gastrointestinal tract disorders
#>                       group
#> 3 gastrointestinal disorder
#> 4    hepatic encephalopathy
#> 8               myocarditis
#> 5    hepatic encephalopathy
#> 6    hepatic encephalopathy
#> 1 gastrointestinal disorder
#> 9               myocarditis
#> 7               myocarditis
#> 2 gastrointestinal disorder

## grouping can now be done as usual
library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
meddata |> 
  group_by(group) |> 
  summarise(res = toString(sort(value)))
#> # A tibble: 3 × 2
#>   group                     res    
#>   <chr>                     <chr>  
#> 1 gastrointestinal disorder A, B, C
#> 2 hepatic encephalopathy    D, E, F
#> 3 myocarditis               G, H, I

Created on 2024-03-13 with reprex v2.0.2

## Medical Term Fuzzy Grouping
## J. Carroll 2024
##
## Uses the {zoomerjoin} package: https://github.com/beniaminogreen/zoomerjoin
## read in a set of medical terms, lowercased
terms <- tolower(readLines("https://raw.githubusercontent.com/socd06/medical-nlp/master/data/vocab.txt"))
## example data with typos and inserted words
gi <- c("gastrointestinal disorders", "gastrointestinal tract disorders", "gastreinstestinal disorder")
hep <- c("hepatic encephalopathy", "hepatic encephalapathy", "hepatic encefalopathy")
co <- c("myocarditis", "myocardits", "myocardites")
## find the closest matching word in wordlist, either as a direct string match
## or the lowest Levenshtein distance of all the words in wordlist
match_word <- function(word, wordlist) {
word <- tolower(word)
if (word %in% wordlist) return(word)
wordlist[which.min(adist(word, wordlist)[1, ])]
}
## apply spellchecking to each word of a phrase of words
## joining back into a space-delimited phrase afterwards
spellcheck_phrase <- function(phrase, wordlist) {
sapply(phrase, \(w) paste(sapply(strsplit(w, " ")[[1]], \(word) match_word(word, wordlist)), collapse = " "), USE.NAMES = FALSE)
}
## e.g. spellcheck the gi terms
spellcheck_phrase(gi, terms)
## create an example dataset containing the (misspelled) terms and some values
meddata <- data.frame(term = c(gi, hep, co), value = LETTERS[1:9])
## stir to ensure randomness works
meddata <- meddata[match(meddata$value, strsplit("FIABDEHCG", "")[[1]]), ]
meddata
## add the corrected phrases to the data
meddata$corrected <- sapply(meddata$term, \(x) spellcheck_phrase(x, terms), USE.NAMES = FALSE)
meddata
## perform a grouping of the corrected terms, assigning a 'canonical' value to each group
## the parameters here may need to be adjusted, but seem to work for this example data
meddata$group <- zoomerjoin::jaccard_string_group(meddata$corrected, threshold = 0.1)
meddata
## grouping can now be done as usual
library(dplyr)
meddata |>
group_by(group) |>
summarise(res = toString(sort(value)))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment