Skip to content

Instantly share code, notes, and snippets.

@nmolivo
Last active February 20, 2020 08:02
Show Gist options
  • Save nmolivo/ed07ccc158e230b8e7fcaa3b04dbabc1 to your computer and use it in GitHub Desktop.
Save nmolivo/ed07ccc158e230b8e7fcaa3b04dbabc1 to your computer and use it in GitHub Desktop.
Enter a text and an acronym to find what the acronym stands for, provided the unabbreviated form of the acronym appears in the text. Assumes that, if the acronym itself is mentioned, the text containing its meaning appears before it. Matching is case-sensitive. Allows for 2 stop words.
from nltk.corpus import stopwords
def find_org(text: str, acronym: str, *, max_stop_words: int = 2, stop_words=None):
    """Return the phrase in *text* that *acronym* abbreviates, or ``None``.

    Only the part of *text* that precedes the first occurrence of *acronym*
    is searched (the whole text when the acronym is not mentioned).  A phrase
    matches when consecutive words start with the consecutive letters of the
    acronym; a limited number of stop words may sit between those words
    ("National Aeronautics *and* Space Administration").

    Matching compares the acronym's letters against the title-cased first
    character of each word, so it works best for all-caps acronyms.

    Args:
        text: Text that may contain the spelled-out form of the acronym.
        acronym: The acronym exactly as it appears in *text*; surrounding
            parentheses (e.g. ``"(NASA)"``) are ignored when matching letters.
        max_stop_words: Maximum number of consecutive stop words that may be
            skipped after/before each matched word.
        stop_words: Optional iterable of stop words.  Defaults to NLTK's
            English stop-word list.

    Returns:
        The matched words joined by single spaces, with the casing they have
        in *text* and without trailing stop words, or ``None`` when no
        expansion is found.

    NOTE(review): with the default ``stop_words=None`` this calls
    ``stopwords.words("english")``; NLTK is expected to raise ``LookupError``
    when that corpus has not been downloaded -- confirm against NLTK docs.

    Example:
        find_org("the World Health Organization (WHO)", "(WHO)",
                 stop_words=["the", "of", "and"])
        returns "World Health Organization".
    """
    if not text or not acronym:
        return None

    # Split on the acronym exactly as given (possibly with parentheses) so
    # that only the text preceding its first mention is searched.
    text = text.split(acronym)[0]

    # Parentheses are decoration around the acronym, not letters to match.
    letters = acronym.replace("(", "").replace(")", "")
    if not letters:
        return None

    if stop_words is None:
        stop_words = stopwords.words("english")
    # Title-cased set: O(1) membership tests against title-cased tokens.
    stop_set = {word.title() for word in stop_words}

    orig_tokens = text.split(" ")
    title_tokens = [token.title() for token in orig_tokens]
    first_letter, rest = letters[0], letters[1:]

    # Try every word that starts with the acronym's first letter, in order,
    # and return the first one from which the whole acronym can be spelled.
    for start, token in enumerate(title_tokens):
        if token[:1] != first_letter:
            continue
        words = _expand_from(
            orig_tokens, title_tokens, start, rest, stop_set, max_stop_words
        )
        if words:
            return " ".join(words)
    return None


def _expand_from(orig_tokens, title_tokens, start, rest, stop_set, max_stop_words):
    """Spell the remaining letters *rest* with the words following *start*.

    Returns the matched words (original casing, trailing stop words removed)
    or ``None`` when the words after *start* do not spell the acronym.
    """
    n_tokens = len(title_tokens)
    words = [orig_tokens[start]]
    idx = start + 1

    for letter in rest:
        matched = False

        # Direct hit: the very next word starts with the expected letter.
        if idx < n_tokens and title_tokens[idx][:1] == letter:
            words.append(orig_tokens[idx])
            idx += 1
            matched = True

        # Skip a bounded run of stop words ("of", "and", ...); they are kept
        # in the result because they belong to the spelled-out name.
        skipped = 0
        while (
            skipped < max_stop_words
            and idx < n_tokens
            and title_tokens[idx] in stop_set
        ):
            words.append(orig_tokens[idx])
            skipped += 1
            idx += 1

        if not matched:
            # The letter may be matched by the word after the stop words;
            # otherwise this start position cannot spell the acronym.
            if idx < n_tokens and title_tokens[idx][:1] == letter:
                words.append(orig_tokens[idx])
                idx += 1
            else:
                return None

    # Stop words swallowed after the last matched word are not part of the
    # expansion.
    while words and words[-1].title() in stop_set:
        words.pop()
    return words or None
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment