Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save jdswinbank/456beb08453a12130b875300b832b106 to your computer and use it in GitHub Desktop.
Save jdswinbank/456beb08453a12130b875300b832b106 to your computer and use it in GitHub Desktop.
"""
Cross-match RxNorm-RxCUI with Medicare Part D drug names.
"""
from collections import defaultdict
from collections import namedtuple
from difflib import SequenceMatcher
import re
import sys
def normalize_text(text):
    """
    Expand common abbreviations in `text`.

    The abbreviations "ndl", "syring", "insul" and "sod" are expanded to
    "needle", "syringe", "insulin" and "sodium" when they are followed by
    one of the delimiters listed in the table below, or when they are the
    very last thing in `text`. The salt markers "hcl" and "hydrochloride"
    are removed wherever they occur.

    Matching is case-sensitive and purely textual (plain substring
    replacement); the replacements are applied in table order.

    Returns the rewritten string.
    """
    replacements = (
        ("ndl ", "needle "),
        ("ndl,", "needle,"),
        ("ndl-", "needle-"),
        ("syring ", "syringe "),
        ("insul ", "insulin "),
        ("insul,", "insulin,"),
        ("hcl", ""),
        ("hydrochloride", ""),
        ("sod ", "sodium "),
    )
    # Pad with a sentinel space so that an abbreviation sitting at the very
    # end of the input (e.g. "pen ndl") still matches its "abbr " pattern.
    padded = text + " "
    for abbreviation, expansion in replacements:
        padded = padded.replace(abbreviation, expansion)
    # No replacement can consume or move the final space (patterns ending in
    # a space are replaced by text ending in a space), so the last character
    # is still the sentinel and can be dropped.
    return padded[:-1]
def remove_quantities(text):
    """
    Remove anything that looks like a quantity with units.

    A quantity is an integer or decimal number, an optional single space,
    and one of the units "ml", "g", "mg", "unit"/"units" or "%". Matching
    is case-sensitive. Surrounding whitespace is left in place, so the
    result may contain runs of spaces.

    Returns `text` with every such quantity deleted.
    """
    number = r"\d+(?:\.\d+)? ?"
    # An alphabetic unit must not be the start of a longer word; otherwise
    # "1 gel" would lose its "1 g" and "100 units" would leave a stray "s".
    not_inside_word = r"(?![A-Za-z])"
    unit_patterns = (
        "ml" + not_inside_word,
        "g" + not_inside_word,
        "mg" + not_inside_word,
        "units?" + not_inside_word,
        "%",
    )
    for unit_pattern in unit_patterns:
        text = re.sub(number + unit_pattern, "", text)
    return text
def squeeze_spaces(text):
    """
    Collapse each run of consecutive space characters in `text` into a
    single space.

    Only the literal space character is affected; other whitespace such as
    tabs or newlines is left untouched.
    """
    space_run = re.compile(r" +")
    collapsed = space_run.sub(" ", text)
    return collapsed
def strip_text(input_string, to_remove):
    """
    Delete every match of the regular expression `to_remove` from
    `input_string`, ignoring case.

    `to_remove` is interpreted as a regular-expression pattern, not as a
    literal string. Returns the resulting string.
    """
    # The flag must be passed by keyword: the fourth positional parameter of
    # re.sub is `count`, so a positional re.I (== 2) would cap the number of
    # substitutions at two and leave the match case-sensitive.
    return re.sub(to_remove, "", input_string, flags=re.I)
def search_for_name(rxnorm_lookup, name, threshold=0.8):
    """
    Find the key of `rxnorm_lookup` that is most similar to `name`.

    `name` is first cleaned with `normalize_text`, `remove_quantities` and
    `squeeze_spaces`; the keys of `rxnorm_lookup` are compared as they are.
    Similarity is `difflib.SequenceMatcher.ratio()`.

    Parameters:
        rxnorm_lookup: mapping from a candidate name to the value that is
            reported for it (the caller in this file maps each name to a
            list of RxNorm identifiers).
        name: the drug name to look up.
        threshold: minimum similarity ratio a candidate must reach to be
            accepted.

    Returns a `Match(number, name, score)` named tuple holding the looked-up
    value, the matching key and its similarity ratio. If no key reaches
    `threshold`, returns `Match([None], "", 0)`.
    """
    Match = namedtuple("Match", ("number", "name", "score"))

    name = squeeze_spaces(remove_quantities(normalize_text(name)))

    best_match = Match([None], "", 0)
    seq = SequenceMatcher()
    # SequenceMatcher caches its analysis of the second sequence, so the
    # fixed query goes in seq2 and the varying candidates in seq1.
    seq.set_seq2(name)
    for rx_name, rx_numbers in rxnorm_lookup.items():
        seq.set_seq1(rx_name)

        # real_quick_ratio() and quick_ratio() are cheap upper bounds on
        # ratio(); use them to discard candidates that cannot beat the
        # current best before paying for the full comparison.
        bound = seq.real_quick_ratio()
        if bound <= best_match.score or bound < threshold:
            continue
        bound = seq.quick_ratio()
        if bound <= best_match.score or bound < threshold:
            continue
        score = seq.ratio()
        if score <= best_match.score or score < threshold:
            continue

        best_match = Match(rx_numbers, rx_name, score)
        if score == 1.0:
            # An exact match cannot be improved upon.
            break
    return best_match
if __name__ == "__main__":
    # Build the lookup from RxNorm name to every identifier listed for it.
    # Input: tab-separated "<identifier>\t<name>" lines; lines starting with
    # "#" are comments. This is deliberately left as a module-level variable
    # called `name_to_rxnorm`.
    name_to_rxnorm = defaultdict(list)
    with open("rxnorm_rxcui_name_assoc.uniq.csv", "r") as f:
        for line in f:
            # Skip comments and blank lines (a blank line would otherwise
            # fail the two-field unpacking below with a ValueError).
            if line[0] == "#" or not line.strip():
                continue
            rxnorm, name = line.strip().split('\t')
            name_to_rxnorm[name].append(rxnorm)

    # For each "<brand>\t<generic>" line, look up both names and report
    # whichever matched better (the generic match wins ties).
    with open("drugnames.csv", "r") as f:
        for line in f:
            if line[0] == "#" or not line.strip():
                continue
            brand, generic = line.strip().split('\t')
            best_brand = search_for_name(name_to_rxnorm, brand)
            best_generic = search_for_name(name_to_rxnorm, generic)
            if best_brand.score > best_generic.score:
                best = best_brand
            else:
                best = best_generic
            print(line.strip(), best.number, best.name, best.score)
            # Flush after every row so results are visible as they are
            # produced.
            sys.stdout.flush()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment