Last active
April 23, 2022 21:32
-
-
Save tezansahu/88e64a5d4ac18bb7fd51792d33a12325 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from keybert import KeyBERT | |
import nltk | |
# Use NLTK's data downloader to fetch the required data packages (WordNet and
# Open Multilingual Wordnet) if they are not already present.
for resource in ["wordnet", "omw-1.4"]:
    try:
        # Only the lookup's success matters here; the returned path is not used.
        nltk.find("corpora/{0}".format(resource))
    except LookupError:
        # nltk.find raises LookupError when the resource is not installed
        # locally; any other error is a real problem and should surface.
        nltk.download(resource)
from nltk.corpus import wordnet | |
class KeywordSynonyms:
    """Extract keywords from text with KeyBERT and look up WordNet synonyms for them."""

    def __init__(self):
        """Create the KeyBERT model used for keyword extraction."""
        # KeyBERT model for Keyword Extraction
        print("Loading KeyBERT Model for Keyword Extraction.")
        self.keyword_extraction_model = KeyBERT()

    def extractKeywords(self, text):
        """Return the keywords extracted from ``text``, without their scores.

        Args:
            text: The document to extract keywords from.

        Returns:
            A list holding the first element (the keyword) of every entry
            returned by the model's ``extract_keywords``.
        """
        keywords = self.keyword_extraction_model.extract_keywords(text)
        # The output is of the format [('keyword1', score1), ('keyword2', score2), ...]
        return [entry[0] for entry in keywords]

    @staticmethod
    def _select_synonyms(candidates, word, max_synonyms):
        """Filter ``candidates`` down to at most ``max_synonyms`` distinct synonyms.

        Exact duplicates are removed and any candidate equal to ``word``
        (ignoring case) is dropped. First-seen order is preserved.
        """
        target = word.lower()
        # dict.fromkeys de-duplicates while keeping insertion order. Truncating
        # list(set(...)) instead would keep an arbitrary subset, because set
        # iteration order for strings can differ from one run to the next.
        unique = dict.fromkeys(candidates)
        return [name for name in unique if name.lower() != target][:max_synonyms]

    def getSynonyms(self, word, max_synonyms=6):
        """Return up to ``max_synonyms`` WordNet synonyms of ``word``.

        Args:
            word: The word to look up in WordNet.
            max_synonyms: Upper bound on the number of synonyms returned.

        Returns:
            A list of distinct lemma names, in the order WordNet yields them,
            excluding ``word`` itself (case-insensitively).
        """
        candidates = []
        for synset in wordnet.synsets(word):
            for lemma in synset.lemmas():
                # Multi-word synonyms contain a '_' between the words, which
                # needs to be replaced with a ' '
                candidates.append(lemma.name().replace("_", " "))
        # Consider those synonyms that are not the same as the original word
        return self._select_synonyms(candidates, word, max_synonyms)

    def getSynonymsForKeywords(self, text, max_synonyms=6):
        """Map each keyword extracted from ``text`` to its synonyms.

        Args:
            text: The document to extract keywords from.
            max_synonyms: Upper bound on the number of synonyms per keyword.

        Returns:
            A dict of keyword -> list of synonyms. Keywords for which no
            synonym was found are omitted.
        """
        kw_syn = {}
        for word in self.extractKeywords(text):
            synonyms = self.getSynonyms(word, max_synonyms)
            if synonyms:
                kw_syn[word] = synonyms
        return kw_syn
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment