Last active
February 20, 2020 08:02
-
-
Save nmolivo/ed07ccc158e230b8e7fcaa3b04dbabc1 to your computer and use it in GitHub Desktop.
Enter a text and an acronym to find what the acronym stands for, provided the un-abbreviated form of the acronym appears in the text. Assumes that the text containing the acronym's meaning appears before the acronym itself, if the acronym is mentioned at all. Case sensitive. Allows for 2 stop words.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from nltk.corpus import stopwords | |
def _match_acronym_tail(titled_tokens, idx, letters, stop_set, max_stop_words):
    """Match ``letters`` against the words starting at ``titled_tokens[idx]``.

    Each letter must equal the first character of a token. Before each
    letter, up to ``max_stop_words`` tokens found in ``stop_set`` may be
    skipped; a token that starts with the wanted letter is always taken as a
    match, even if it is itself a stop word.

    Returns the index one past the last matched token, or ``None`` when the
    letters cannot all be matched before a non-matching token or the end of
    the token list is reached.
    """
    total = len(titled_tokens)
    for letter in letters:
        skipped = 0
        # Step over stop words only while the current token is not the match.
        while (
            idx < total
            and titled_tokens[idx][:1] != letter
            and skipped < max_stop_words
            and titled_tokens[idx] in stop_set
        ):
            idx += 1
            skipped += 1
        if idx >= total or titled_tokens[idx][:1] != letter:
            return None
        idx += 1
    return idx


def find_org(text, acronym, *, stop_words=None, max_stop_words=2):
    """Return the phrase in ``text`` that ``acronym`` stands for, or ``None``.

    Works best for abbreviations written in all caps. Only the part of
    ``text`` before the first occurrence of ``acronym`` is searched, so the
    expansion is assumed to precede the acronym when the acronym is mentioned.
    Words are title-cased before comparison, so the first letter of each word
    is matched against the acronym's letters exactly as given (an upper-case
    acronym matches words regardless of their own capitalisation).

    Args:
        text: Text that may contain the spelled-out form of the acronym.
        acronym: The acronym to expand; surrounding parentheses are ignored.
        stop_words: Optional iterable of stop words that may sit between the
            words of the expansion. Defaults to NLTK's English stop words.
        max_stop_words: Maximum number of consecutive stop words that may be
            skipped before each matched word.

    Returns:
        The matched phrase using the words as they appear in ``text`` and
        joined by single spaces, or ``None`` if no expansion is found.
    """
    # str.split rejects an empty separator, so only split on a real acronym.
    if acronym:
        text = text.split(acronym)[0]
    # If the acronym is surrounded by parens, remove them.
    acronym = acronym.replace("(", "").replace(")", "")
    if not acronym:
        return None

    if stop_words is None:
        stop_words = stopwords.words("english")
    # Title-case the stop words so they compare equal to title-cased tokens.
    stop_set = {word.title() for word in stop_words}

    tokens = text.split()
    titled_tokens = [token.title() for token in tokens]

    first_letter, remaining = acronym[0], acronym[1:]
    # Try every word that starts with the acronym's first letter, in order.
    for start, token in enumerate(titled_tokens):
        if token[:1] != first_letter:
            continue
        end = _match_acronym_tail(
            titled_tokens, start + 1, remaining, stop_set, max_stop_words
        )
        if end is not None:
            # Slice the original tokens so the source capitalisation is kept.
            return " ".join(tokens[start:end])
    return None
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment