jdswinbank/gist:456beb08453a12130b875300b832b106

## gistfile1.txt
"""
Cross-match RxNorm-RxCUI with Medicare Part D drug names.
"""
from collections import defaultdict
from collections import namedtuple
from difflib import SequenceMatcher
import re
import sys

def normalize_text(text):
    """
    Spell out shorthand used in drug names and drop salt-form words.

    Every entry is a plain, case-sensitive substring substitution, applied
    in the order listed. "hcl" and "hydrochloride" are deleted outright;
    the other entries only fire when the shorthand is followed by the
    exact separator shown (space, comma or hyphen).
    """
    substitutions = {
        "ndl ": "needle ",
        "ndl,": "needle,",
        "ndl-": "needle-",
        "syring ": "syringe ",
        "insul ": "insulin ",
        "insul,": "insulin,",
        "hcl": "",
        "hydrochloride": "",
        "sod ": "sodium ",
    }
    # dicts iterate in insertion order, so the substitutions run top to bottom.
    for shorthand, expanded in substitutions.items():
        text = text.replace(shorthand, expanded)
    return text

def remove_quantities(text):
    """
    Remove anything that looks like a quantity with units.

    A quantity is an integer or decimal number, an optional single space
    and one of the units ml, mg, g, unit or %. The alphabetic units may
    carry a plural "s" and must not run straight into another letter, so
    the "g" of "1 gel" is not mistaken for grams. Matching is
    case-sensitive (lowercase units only).

    Returns `text` with each quantity deleted. The whitespace around a
    removed quantity is left in place.
    """
    # One pass with an alternation; the negative lookahead stops a unit
    # from swallowing the first letters of the following word.
    quantity = r"\d+(?:\.\d+)? ?(?:(?:ml|mg|g|unit)s?(?![A-Za-z])|%)"
    return re.sub(quantity, "", text)

def squeeze_spaces(text):
    """
    Collapse each run of consecutive space characters into one space.

    Only the space character is affected; tabs and newlines are kept.
    """
    space_run = re.compile(r" +")
    return space_run.sub(" ", text)

def strip_text(input_string, to_remove):
    """
    Delete every case-insensitive match of `to_remove` from `input_string`.

    `to_remove` is interpreted as a regular expression pattern. Returns the
    resulting string.
    """
    # The flag must be passed by keyword: the fourth positional parameter of
    # re.sub is `count`, so a positional re.I would cap the number of
    # replacements instead of making the match case-insensitive.
    return re.sub(to_remove, "", input_string, flags=re.I)

def search_for_name(rxnorm_lookup, name, threshold=0.8):
    """
    Find the key of `rxnorm_lookup` that is most similar to `name`.

    `name` is cleaned with normalize_text, remove_quantities and
    squeeze_spaces before comparison; the lookup keys are compared as
    stored. Similarity is difflib.SequenceMatcher.ratio().

    Parameters:
        rxnorm_lookup: mapping from a name to the value recorded for it.
        name: the name to search for.
        threshold: minimum ratio a candidate needs to count as a match.

    Returns a Match namedtuple (number, name, score) holding the lookup
    value, the matching key and its ratio. When no key reaches
    `threshold`, Match([None], "", 0) is returned.
    """
    name = squeeze_spaces(remove_quantities(normalize_text(name)))

    Match = namedtuple("Match", ("number", "name", "score"))
    best_match = Match([None], "", 0)

    # SequenceMatcher caches its analysis of the second sequence, so the
    # fixed query goes into seq2 and the candidates rotate through seq1.
    seq = SequenceMatcher()
    seq.set_seq2(name)
    for rx_name, rx_value in rxnorm_lookup.items():
        seq.set_seq1(rx_name)

        # real_quick_ratio() and quick_ratio() are upper bounds on ratio();
        # use them to discard candidates before the expensive computation.
        bound = seq.real_quick_ratio()
        if bound <= best_match.score or bound < threshold:
            continue
        bound = seq.quick_ratio()
        if bound <= best_match.score or bound < threshold:
            continue

        score = seq.ratio()
        if score > best_match.score and score >= threshold:
            best_match = Match(rx_value, rx_name, score)
            if score == 1.0:
                # Nothing can beat a perfect score.
                break
    return best_match

if __name__ == "__main__":
    # Maps a name to the list of identifiers recorded under that name.
    name_to_rxnorm = defaultdict(list)

    # Each data line holds two tab-separated fields: identifier, then name.
    with open("rxnorm_rxcui_name_assoc.uniq.csv", "r") as f:
        for line in f:
            if line.startswith("#"):
                continue
            record = line.strip()
            if not record:
                # A blank line has no fields and would fail the unpacking.
                continue
            rxnorm, name = record.split('\t')
            name_to_rxnorm[name].append(rxnorm)

    # Each data line holds two tab-separated fields: brand, then generic.
    with open("drugnames.csv", "r") as f:
        for line in f:
            if line.startswith("#"):
                continue
            record = line.strip()
            if not record:
                continue
            brand, generic = record.split('\t')

            best_brand = search_for_name(name_to_rxnorm, brand)
            best_generic = search_for_name(name_to_rxnorm, generic)
            # On equal scores the generic-name match is kept.
            best = best_brand if best_brand.score > best_generic.score else best_generic

            # Flush after every row so each result is written out immediately.
            print(record, best.number, best.name, best.score, flush=True)
	"""
	Cross-match RxNorm-RxCUI with Medicare Part D drug names.
	"""
	from collections import defaultdict
	from collections import namedtuple
	from difflib import SequenceMatcher
	import re
	import sys

	def normalize_text(text):
		"""
		Spell out shorthand used in drug names and drop salt-form words.

		Every entry is a plain, case-sensitive substring substitution,
		applied in the order listed. "hcl" and "hydrochloride" are deleted
		outright; the other entries only fire when the shorthand is followed
		by the exact separator shown (space, comma or hyphen).
		"""
		replacements = [
			("ndl ", "needle "),
			("ndl,", "needle,"),
			("ndl-", "needle-"),
			("syring ", "syringe "),
			("insul ", "insulin "),
			("insul,", "insulin,"),
			("hcl", ""),
			("hydrochloride", ""),
			("sod ", "sodium "),
		]
		for shorthand, expanded in replacements:
			text = text.replace(shorthand, expanded)
		return text

	def remove_quantities(text):
		"""
		Remove anything that looks like a quantity with units.

		A quantity is an integer or decimal number, an optional single space
		and one of the units ml, mg, g, unit or %. The alphabetic units may
		carry a plural "s" and must not run straight into another letter, so
		the "g" of "1 gel" is not mistaken for grams. Matching is
		case-sensitive (lowercase units only).

		Returns `text` with each quantity deleted. The whitespace around a
		removed quantity is left in place.
		"""
		# One pass with an alternation; the negative lookahead stops a unit
		# from swallowing the first letters of the following word.
		quantity = r"\d+(?:\.\d+)? ?(?:(?:ml|mg|g|unit)s?(?![A-Za-z])|%)"
		return re.sub(quantity, "", text)

	def squeeze_spaces(text):
		"""
		Collapse each run of consecutive space characters into one space.

		Only the space character is affected; tabs and newlines are kept.
		"""
		return re.sub(r" +", " ", text)

	def strip_text(input_string, to_remove):
		"""
		Delete every case-insensitive match of `to_remove` from `input_string`.

		`to_remove` is interpreted as a regular expression pattern. Returns
		the resulting string.
		"""
		# The flag must be passed by keyword: the fourth positional parameter
		# of re.sub is `count`, so a positional re.I would cap the number of
		# replacements instead of making the match case-insensitive.
		return re.sub(to_remove, "", input_string, flags=re.I)

	def search_for_name(rxnorm_lookup, name, threshold=0.8):
		"""
		Find the key of `rxnorm_lookup` that is most similar to `name`.

		`name` is cleaned with normalize_text, remove_quantities and
		squeeze_spaces before comparison; the lookup keys are compared as
		stored. Similarity is difflib.SequenceMatcher.ratio().

		Parameters:
			rxnorm_lookup: mapping from a name to the value recorded for it.
			name: the name to search for.
			threshold: minimum ratio a candidate needs to count as a match.

		Returns a Match namedtuple (number, name, score) holding the lookup
		value, the matching key and its ratio. When no key reaches
		`threshold`, Match([None], "", 0) is returned.
		"""
		name = squeeze_spaces(remove_quantities(normalize_text(name)))

		Match = namedtuple("Match", ("number", "name", "score"))
		best_match = Match([None], "", 0)

		# SequenceMatcher caches its analysis of the second sequence, so the
		# fixed query goes into seq2 and the candidates rotate through seq1.
		seq = SequenceMatcher()
		seq.set_seq2(name)
		for rx_name, rx_value in rxnorm_lookup.items():
			seq.set_seq1(rx_name)

			# real_quick_ratio() and quick_ratio() are upper bounds on
			# ratio(); use them to discard candidates cheaply.
			bound = seq.real_quick_ratio()
			if bound <= best_match.score or bound < threshold:
				continue
			bound = seq.quick_ratio()
			if bound <= best_match.score or bound < threshold:
				continue

			score = seq.ratio()
			if score > best_match.score and score >= threshold:
				best_match = Match(rx_value, rx_name, score)
				if score == 1.0:
					# Nothing can beat a perfect score.
					break
		return best_match

	if __name__ == "__main__":
		# Maps a name to the list of identifiers recorded under that name.
		name_to_rxnorm = defaultdict(list)

		# Each data line holds two tab-separated fields: identifier, then name.
		with open("rxnorm_rxcui_name_assoc.uniq.csv", "r") as f:
			for line in f:
				if line.startswith("#"):
					continue
				record = line.strip()
				if not record:
					# A blank line has no fields and would fail the unpacking.
					continue
				rxnorm, name = record.split('\t')
				name_to_rxnorm[name].append(rxnorm)

		# Each data line holds two tab-separated fields: brand, then generic.
		with open("drugnames.csv", "r") as f:
			for line in f:
				if line.startswith("#"):
					continue
				record = line.strip()
				if not record:
					continue
				brand, generic = record.split('\t')

				best_brand = search_for_name(name_to_rxnorm, brand)
				best_generic = search_for_name(name_to_rxnorm, generic)
				# On equal scores the generic-name match is kept.
				best = best_brand if best_brand.score > best_generic.score else best_generic

				# Flush after every row so each result is written out immediately.
				print(record, best.number, best.name, best.score, flush=True)