# nmolivo/acronym_text_matcher.py

## acronym_text_matcher.py
from nltk.corpus import stopwords

def find_org(text, acronym, *, stop_words=None):
    """Return the phrase in *text* that *acronym* abbreviates, or ``None``.

    Only the part of *text* before the first occurrence of *acronym* is
    searched.  A candidate phrase starts at a word whose first letter equals
    the acronym's first letter; every following acronym letter must then match
    the first letter of the next word, with up to three stop words skipped
    after each attempt.  Words are title-cased before comparing, so this works
    best for abbreviations written in all caps.

    Args:
        text: Text expected to contain the spelled-out name.
        acronym: Abbreviation to expand.  Parentheses are removed before the
            letters are matched, e.g. ``"(NASA)"`` is matched as ``"NASA"``.
        stop_words: Optional iterable of stop words.  Defaults to NLTK's
            English stop-word list.

    Returns:
        The matched phrase with trailing stop words removed, or ``None`` when
        no phrase matches.
    """
    # str.split raises ValueError on an empty separator, so guard instead of
    # swallowing every exception.
    if acronym:
        text = text.split(acronym)[0]

    # If the acronym is surrounded by parens, remove them.
    acronym = acronym.replace("(", "").replace(")", "")
    if not acronym:
        return None

    if stop_words is None:
        stop_words = stopwords.words("english")
    # Title-cased set: tokens are compared in title case, and set membership
    # is O(1).
    stop_w = {word.title() for word in stop_words}

    orig_tokens = text.split(" ")
    title_tokens = [token.title() for token in orig_tokens]
    n_tokens = len(title_tokens)
    first_letter, remaining_letters = acronym[0], acronym[1:]
    max_stop_words = 3  # stop words that may be skipped per acronym letter

    for start, token in enumerate(title_tokens):
        if token[:1] != first_letter:
            continue

        # The first word is taken from the title-cased tokens; later words
        # keep their original casing.
        candidate = [token]
        idx = start + 1
        for letter in remaining_letters:
            # Every lookup is bounds-checked: running out of tokens means
            # "no match" rather than an IndexError.
            matched = idx < n_tokens and title_tokens[idx][:1] == letter
            if matched:
                candidate.append(orig_tokens[idx])
                idx += 1

            skipped = 0
            while (
                skipped < max_stop_words
                and idx < n_tokens
                and title_tokens[idx] in stop_w
            ):
                candidate.append(orig_tokens[idx])
                skipped += 1
                idx += 1

            if not matched:
                if idx < n_tokens and title_tokens[idx][:1] == letter:
                    candidate.append(orig_tokens[idx])
                    idx += 1
                else:
                    # A missed letter can never be recovered; try the next
                    # starting word.
                    break
        else:
            # Every letter matched.  Stop words consumed after the last
            # matched word are not part of the name.
            while candidate and candidate[-1].title() in stop_w:
                candidate.pop()
            return " ".join(candidate) or None

    return None
	from nltk.corpus import stopwords

	def find_org(text, acronym, *, stop_words=None):
		"""Return the phrase in *text* that *acronym* abbreviates, or ``None``.

		Only the part of *text* before the first occurrence of *acronym* is
		searched.  A candidate phrase starts at a word whose first letter
		equals the acronym's first letter; every following acronym letter must
		then match the first letter of the next word, with up to three stop
		words skipped after each attempt.  Words are title-cased before
		comparing, so this works best for abbreviations written in all caps.

		Args:
			text: Text expected to contain the spelled-out name.
			acronym: Abbreviation to expand.  Parentheses are removed before
				the letters are matched, e.g. ``"(NASA)"`` is matched as
				``"NASA"``.
			stop_words: Optional iterable of stop words.  Defaults to NLTK's
				English stop-word list.

		Returns:
			The matched phrase with trailing stop words removed, or ``None``
			when no phrase matches.
		"""
		# str.split raises ValueError on an empty separator, so guard instead
		# of swallowing every exception.
		if acronym:
			text = text.split(acronym)[0]

		# If the acronym is surrounded by parens, remove them.
		acronym = acronym.replace("(", "").replace(")", "")
		if not acronym:
			return None

		if stop_words is None:
			stop_words = stopwords.words("english")
		# Title-cased set: tokens are compared in title case, and set
		# membership is O(1).
		stop_w = {word.title() for word in stop_words}

		orig_tokens = text.split(" ")
		title_tokens = [token.title() for token in orig_tokens]
		n_tokens = len(title_tokens)
		first_letter, remaining_letters = acronym[0], acronym[1:]
		max_stop_words = 3  # stop words that may be skipped per acronym letter

		for start, token in enumerate(title_tokens):
			if token[:1] != first_letter:
				continue

			# The first word is taken from the title-cased tokens; later
			# words keep their original casing.
			candidate = [token]
			idx = start + 1
			for letter in remaining_letters:
				# Every lookup is bounds-checked: running out of tokens means
				# "no match" rather than an IndexError.
				matched = idx < n_tokens and title_tokens[idx][:1] == letter
				if matched:
					candidate.append(orig_tokens[idx])
					idx += 1

				skipped = 0
				while skipped < max_stop_words and idx < n_tokens and title_tokens[idx] in stop_w:
					candidate.append(orig_tokens[idx])
					skipped += 1
					idx += 1

				if not matched:
					if idx < n_tokens and title_tokens[idx][:1] == letter:
						candidate.append(orig_tokens[idx])
						idx += 1
					else:
						# A missed letter can never be recovered; try the
						# next starting word.
						break
			else:
				# Every letter matched.  Stop words consumed after the last
				# matched word are not part of the name.
				while candidate and candidate[-1].title() in stop_w:
					candidate.pop()
				return " ".join(candidate) or None

		return None