Skip to content

Instantly share code, notes, and snippets.

@gsasikiran
Last active February 2, 2021 16:16
Show Gist options
  • Save gsasikiran/19d769c16c09c610cefe5236a45d282c to your computer and use it in GitHub Desktop.
Save gsasikiran/19d769c16c09c610cefe5236a45d282c to your computer and use it in GitHub Desktop.
Find all the synonyms for a given word in a text corpus
import argparse
import regex as re
from nltk.corpus import wordnet
class SynonymSearch:
    """Look up WordNet synonyms of a word and locate them in a text."""

    def __init__(self):
        pass

    @staticmethod
    def __split_words(word):
        """
        Split a combined WordNet lemma name into space-separated words
        :param word: string
            The synonym from WordNet, e.g. 'playing_period'
        :return: string
            The same name with every underscore replaced by a space
        Example
        > __split_words('python_is_good')
        >> 'python is good'
        """
        # WordNet writes compound lemmas with underscores ('playing_period').
        return word.replace('_', ' ')

    @staticmethod
    def _build_pattern(lemma_names):
        r"""
        Build the regex alternation for an iterable of WordNet lemma names
        :param lemma_names: iterable of string
            Raw lemma names, possibly repeated and possibly containing '_'
        :return: string
            '\bsynonym_1\b|\bsynonym_2\b|...|\bsynonym_n\b' with one
            alternative per distinct synonym in first-seen order, or '' when
            there are no names
        """
        seen = set()
        alternatives = []
        for name in lemma_names:
            phrase = SynonymSearch.__split_words(name)
            # Deduplicate on the split form: the same lemma is listed by
            # several synsets, and comparing the raw name against the split
            # entries would never detect a repeated compound.
            if phrase in seen:
                continue
            seen.add(phrase)
            # Escape each word so metacharacters in a lemma (e.g. the dots in
            # 'St._Paul') are matched literally; spaces are kept as-is.
            escaped = ' '.join(re.escape(part) for part in phrase.split(' '))
            alternatives.append('\\b' + escaped + '\\b')
        return '|'.join(alternatives)

    def get_synonyms(self, word):
        r"""
        Return the synonyms of a word as a regex pattern
        :param word: string
            The word for which synonyms are needed
        :return: string
            Synonyms wrapped in word boundaries and joined with '|', in the
            syntax '\bsynonym_1\b|\bsynonym_2\b|...|\bsynonym_n\b'.
            Empty string when WordNet returns no synsets for the word.
        """
        lemma_names = (
            lemma.name()
            for synset in wordnet.synsets(word)
            for lemma in synset.lemmas()
        )
        return self._build_pattern(lemma_names)

    @staticmethod
    def return_words_from_text(pattern, text):
        """
        Print every match of the pattern in the text with its character span
        :param pattern: string
            Regex pattern of synonyms, as produced by get_synonyms
        :param text: string
            Text from the user's file
        :return: None
        """
        # An empty pattern would match the empty string at every position
        # and flood the output, so there is nothing to report in that case.
        if not pattern:
            return
        matcher = re.compile(pattern, flags=re.I | re.DOTALL)
        for m in matcher.finditer(text):
            print(m.group(), '(%d - %d)' % (m.start(), m.end()))
if __name__ == '__main__':
    # Command-line entry point: read the text file, build the synonym pattern
    # for the given word and print every match with its character span.
    # The text must be passed as ``description``; the first positional
    # parameter of ArgumentParser is ``prog``, which would put this sentence
    # in the usage line as the program name.
    parser = argparse.ArgumentParser(description="Enter the path and the word")
    parser.add_argument('path', type=str, help='text file path, usually .txt extension')
    parser.add_argument('word', type=str, help='word for which synonyms have to be extracted')
    args = parser.parse_args()

    find_synonyms = SynonymSearch()

    with open(args.path, 'r', encoding='utf-8') as text_file:
        corpus_text = text_file.read()

    regex_pattern = find_synonyms.get_synonyms(args.word)
    find_synonyms.return_words_from_text(regex_pattern, corpus_text)
@gsasikiran
Copy link
Author

gsasikiran commented Jul 31, 2020

Description

The program searches a text corpus for a given word and its synonyms and prints each match together with its character position (start and end offset) in the corpus. Synonyms made up of several words (compound terms) are matched as well. The synonyms are taken from the WordNet corpus.

Packages

  • nltk
  • regex

Run:

python synonym_search.py text_path word

Example

Command:
python synonym_search.py sample.txt play

Output:
`period of play (1262 - 1276)

play (1311 - 1315)

act (9912 - 9915)

turn (32793 - 32797)`

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment