Last active
February 2, 2021 16:16
-
-
Save gsasikiran/19d769c16c09c610cefe5236a45d282c to your computer and use it in GitHub Desktop.
Find all the synonyms for a given word in a text corpus
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import argparse | |
import regex as re | |
from nltk.corpus import wordnet | |
class SynonymSearch:
    """Look up WordNet synonyms of a word and locate them in a text."""

    def __init__(self):
        pass

    @staticmethod
    def __split_words(word):
        """
        Split a combined WordNet synonym into words separated by spaces

        :param word: string
            The synonym from WordNet, e.g. 'playing_period'
        :return: string
            The same synonym with every underscore replaced by a space

        Example
        > __split_words('python_is_good')
        >> 'python is good'
        """
        # WordNet stores compound synonyms in the format 'playing_period'
        return word.replace('_', ' ')

    @staticmethod
    def _build_pattern(lemma_names):
        r"""
        Build the regex alternation for an iterable of WordNet lemma names

        :param lemma_names: iterable of strings
            Raw lemma names, possibly containing underscores and duplicates
        :return: string
            '\bsynonym_1\b|\bsynonym_2\b|...' with duplicates dropped
            (first-seen order kept), or '' when no names are given
        """
        phrases = []
        seen = set()
        for lemma_name in lemma_names:
            phrase = SynonymSearch.__split_words(lemma_name)
            # Compare the *split* form so compound synonyms are deduplicated too
            if phrase in seen:
                continue
            seen.add(phrase)
            phrases.append(phrase)

        alternatives = []
        for phrase in phrases:
            # Escape word by word so characters such as '.' in a lemma are matched
            # literally while the separating spaces stay plain spaces
            escaped = ' '.join(re.escape(token) for token in phrase.split(' '))
            alternatives.append("\\b" + escaped + "\\b")
        return '|'.join(alternatives)

    def get_synonyms(self, word):
        r"""
        Return the synonyms of a word as a regex pattern

        :param word: string
            The word for which synonyms are needed
        :return: string
            Alternation of the synonyms wrapped in word boundaries, in the syntax
            '\bsynonym_1\b|\bsynonym_2\b| ... |\bsynonym_n\b'.
            Empty string when WordNet has no synsets for the word.
        """
        lemma_names = (
            lemma.name()
            for synset in wordnet.synsets(word)
            for lemma in synset.lemmas()
        )
        return self._build_pattern(lemma_names)

    @staticmethod
    def return_words_from_text(pattern, text):
        """
        Print the synonyms found in the text and their spans in character offsets

        :param pattern: string
            String pattern of synonyms to be input to the regex
        :param text: string
            Text from the user's file
        :return: None
        """
        # An empty pattern matches the empty string at every position, which
        # would print one meaningless line per character of the text
        if not pattern:
            return
        matcher = re.compile(pattern, flags=re.I | re.DOTALL)
        for match in matcher.finditer(text):
            print(match.group(), '(%d - %d)' % (match.start(), match.end()))
if __name__ == '__main__':
    # Command-line entry point: read a text file and print every occurrence of
    # the given word's synonyms together with its character span.
    # The text must be passed as ``description``; the first positional parameter
    # of ArgumentParser is ``prog`` and would replace the script name in usage.
    parser = argparse.ArgumentParser(description="Enter the path and the word")
    parser.add_argument('path', type=str, help='text file path, usually .txt extension')
    parser.add_argument('word', type=str, help='word for which synonyms have to be extracted')
    args = parser.parse_args()

    find_synonyms = SynonymSearch()
    with open(args.path, 'r', encoding='utf-8') as text_file:
        corpus_text = text_file.read()

    regex_pattern = find_synonyms.get_synonyms(args.word)
    if not regex_pattern:
        # An empty pattern would match the empty string at every position of the
        # text, so stop here instead of printing meaningless spans
        parser.exit(status=1, message="No synonyms found for '%s'\n" % args.word)
    find_synonyms.return_words_from_text(regex_pattern, corpus_text)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Description
The program finds occurrences of a search word and its synonyms in a text corpus and prints each match together with its character span in the corpus. Compound (multi-word) synonyms are matched as well. Synonyms are extracted from the WordNet corpus.
Packages
Run:
python synonym_search.py text_path word
Example
Command:
python synonym_search.py sample.txt play
Output:
`period of play (1262 - 1276)
play (1311 - 1315)
act (9912 - 9915)
turn (32793 - 32797)`