Created
April 2, 2017 00:00
-
-
Save jdswinbank/456beb08453a12130b875300b832b106 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Cross-match RxNorm-RxCUI with Medicare Part D drug names. | |
""" | |
from collections import defaultdict | |
from collections import namedtuple | |
from difflib import SequenceMatcher | |
import re | |
import sys | |
def normalize_text(text):
    """
    Spell out the common abbreviations that appear in `text`.

    Each rule is a plain, case-sensitive substring substitution, and the
    rules are applied one after another in the order listed. The salt
    markers "hcl" and "hydrochloride" are deleted rather than expanded.
    Most abbreviations are only recognised together with the character that
    follows them (a space, comma or hyphen), so an abbreviation at the very
    end of `text` is left untouched.
    """
    expansions = (
        ("ndl ", "needle "),
        ("ndl,", "needle,"),
        ("ndl-", "needle-"),
        ("syring ", "syringe "),
        ("insul ", "insulin "),
        ("insul,", "insulin,"),
        ("hcl", ""),
        ("hydrochloride", ""),
        ("sod ", "sodium "),
    )
    expanded = text
    for short_form, long_form in expansions:
        expanded = expanded.replace(short_form, long_form)
    return expanded
# A number (optionally with a decimal part), an optional single space, then a
# unit. Alphabetic units must not be followed by another letter, so that the
# start of an ordinary word ("5 gel", "30 gauge") is not mistaken for a unit;
# "units" is accepted as the plural of "unit". Matching is case-sensitive.
_QUANTITY_RE = re.compile(
    r"\d+(?:\.\d+)? ?(?:(?:ml|mg|g|units?)(?![A-Za-z])|%)"
)


def remove_quantities(text):
    """
    Remove anything in `text` that looks like a quantity with units.

    Recognised units are "ml", "mg", "g", "unit"/"units" and "%". Only the
    number, the optional space and the unit are deleted; surrounding
    whitespace is left in place, so callers that care should collapse
    repeated spaces afterwards.

    Returns the text with every such quantity deleted.
    """
    return _QUANTITY_RE.sub("", text)
def squeeze_spaces(text):
    """
    Collapse each run of consecutive space characters in `text` to one space.

    Only the space character itself is affected; tabs and newlines are
    returned unchanged.
    """
    space_run = re.compile(r" +")
    return space_run.sub(" ", text)
def strip_text(input_string, to_remove):
    """
    Delete every case-insensitive match of the regex `to_remove`.

    `to_remove` is interpreted as a regular expression, not a literal
    string. Returns `input_string` with all matches removed.
    """
    # `flags` must be passed by keyword: the fourth positional argument of
    # re.sub is `count`, so a positional re.I would cap the number of
    # replacements instead of enabling case-insensitive matching.
    return re.sub(to_remove, "", input_string, flags=re.I)
# Result record returned by `search_for_name`; defined once here rather than
# being rebuilt on every call.
Match = namedtuple("Match", ("number", "name", "score"))


def search_for_name(rxnorm_lookup, name, threshold=0.8):
    """
    Find the key of `rxnorm_lookup` that is most similar to `name`.

    `name` is first cleaned with `normalize_text`, `remove_quantities` and
    `squeeze_spaces`. The lookup keys are compared exactly as stored (they
    are NOT cleaned). Similarity is `difflib.SequenceMatcher.ratio()`.

    Parameters:
        rxnorm_lookup: mapping from candidate name to the value to report
            for that name (the script in this file passes a dict of
            name -> list of RxNorm identifiers).
        name: the drug name to look up.
        threshold: minimum similarity ratio a candidate must reach.

    Returns:
        A `Match(number, name, score)` namedtuple where `number` is the
        lookup value of the best candidate, `name` is its key and `score`
        its ratio. If no candidate reaches `threshold`, the placeholder
        `Match([None], "", 0)` is returned. On equal scores the candidate
        seen first wins.
    """
    name = squeeze_spaces(remove_quantities(normalize_text(name)))

    best_match = Match([None], "", 0)
    matcher = SequenceMatcher()
    # The query goes in seq2 because SequenceMatcher caches its analysis of
    # the second sequence; only seq1 changes from candidate to candidate.
    matcher.set_seq2(name)

    # Iterate the mapping that was passed in, not a module-level global.
    for rx_name, rx_numbers in rxnorm_lookup.items():
        matcher.set_seq1(rx_name)

        # real_quick_ratio() and quick_ratio() are cheap upper bounds on
        # ratio(), so a candidate that fails either one cannot win and the
        # expensive ratio() call is skipped.
        bound = matcher.real_quick_ratio()
        if bound <= best_match.score or bound < threshold:
            continue
        bound = matcher.quick_ratio()
        if bound <= best_match.score or bound < threshold:
            continue
        score = matcher.ratio()
        if score <= best_match.score or score < threshold:
            continue

        best_match = Match(rx_numbers, rx_name, score)
        if score == 1.0:
            # A perfect match cannot be beaten; stop searching.
            break

    return best_match
if __name__ == "__main__":
    # Script entry point: load the RxNorm name table, then print the best
    # RxNorm match for every brand/generic pair in the drug-name list.
    # Both inputs are tab-separated, two fields per line; lines starting
    # with "#" are comments.

    # RxNorm name -> every identifier listed for that name. Deliberately
    # assigned at module scope so it remains a module-level name.
    name_to_rxnorm = defaultdict(list)
    with open("rxnorm_rxcui_name_assoc.uniq.csv", "r") as f:
        for line in f:
            record = line.strip()
            # Skip comments, and blank lines that would otherwise break the
            # two-field unpacking below.
            if not record or line.startswith("#"):
                continue
            rxnorm, name = record.split("\t")
            name_to_rxnorm[name].append(rxnorm)

    with open("drugnames.csv", "r") as f:
        for line in f:
            record = line.strip()
            if not record or line.startswith("#"):
                continue
            brand, generic = record.split("\t")
            best_brand = search_for_name(name_to_rxnorm, brand)
            best_generic = search_for_name(name_to_rxnorm, generic)
            # Report whichever name matched better; the generic name wins
            # ties.
            if best_brand.score > best_generic.score:
                best = best_brand
            else:
                best = best_generic
            print(record, best.number, best.name, best.score)
            # Flush after every row so results appear as they are computed.
            sys.stdout.flush()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment