gsasikiran/synonym_search.py

## synonym_search.py
import argparse

import regex as re
from nltk.corpus import wordnet


class SynonymSearch:
    """Find occurrences of a word's WordNet synonyms inside a text."""

    def __init__(self):
        pass

    @staticmethod
    def __split_words(word):
        """
            Turn a compound WordNet lemma into a space separated phrase
        :param word: string
            The synonym as named by wordnet, e.g. 'playing_period'
        :return: string
            The same lemma with every underscore replaced by a space

        Example
        > __split_words('python_is_good')
        >> 'python is good'
        """
        # Wordnet joins the words of a compound lemma with '_'
        return word.replace('_', ' ')

    @classmethod
    def _build_pattern(cls, lemma_names):
        r"""
            Build the regex alternation for an iterable of lemma names
        :param lemma_names: iterable of string
            Lemma names in wordnet format (compound words joined with '_')
        :return: string
            '\bsynonym_1\b|\bsynonym_2\b| ... |\bsynonym_n\b' with duplicates removed
            and first-seen order kept; an empty string when there are no names
        """
        seen = set()
        alternatives = []

        for name in lemma_names:
            synonym = cls.__split_words(name)
            # Compare the split form: the raw name ('playing_period') never equals
            # the stored form ('playing period'), so compounds would be repeated.
            if synonym in seen:
                continue
            seen.add(synonym)
            # Escape the lemma so characters such as '.' are matched literally
            # instead of being interpreted as regex operators.
            alternatives.append(r'\b' + re.escape(synonym) + r'\b')

        return '|'.join(alternatives)

    def get_synonyms(self, word):
        r"""
            Return the synonyms of a word as a single regex pattern
        :param word: string
            The word for which synonyms are needed
        :return: string
            Every lemma of every wordnet synset of the word, each wrapped in word boundaries
            Returns string in the syntax : '\bsynonym_1\b|\bsynonym_2\b| ... |\bsynonym_n\b'
            Returns an empty string when wordnet yields no lemma for the word
        """
        lemma_names = (
            lemma.name()
            for synset in wordnet.synsets(word)
            for lemma in synset.lemmas()
        )
        return self._build_pattern(lemma_names)

    @staticmethod
    def return_words_from_text(pattern, text):
        """
            Prints the synonyms in the text and their spans in the counts of characters
        :param pattern: string
            String pattern of synonyms to be input to the regex
        :param text: string
            Text from the user's file
        :return: None
        """
        # An empty pattern matches the empty string at every position, which would
        # print one blank match per character; there is nothing to search for.
        if not pattern:
            return

        matcher = re.compile(pattern, flags=re.I | re.DOTALL)

        for m in matcher.finditer(text):
            print(m.group(), '(%d - %d)' % (m.start(), m.end()))


if __name__ == '__main__':
    # Command-line entry point: print every synonym of WORD found in the text file at PATH.
    # The text is passed as ``description``: the first positional parameter of
    # ArgumentParser is ``prog``, which would replace the program name in the usage line.
    parser = argparse.ArgumentParser(description="Enter the path and the word")
    parser.add_argument('path', type=str, help='text file path, usually .txt extension')
    parser.add_argument('word', type=str, help='word for which synonyms have to be extracted')

    args = parser.parse_args()

    # Read the whole file at once; match spans are reported as character offsets into it.
    with open(args.path, 'r', encoding='utf-8') as text_file:
        corpus_text = text_file.read()

    find_synonyms = SynonymSearch()
    regex_pattern = find_synonyms.get_synonyms(args.word)

    find_synonyms.return_words_from_text(regex_pattern, corpus_text)
	import argparse

	import regex as re
	from nltk.corpus import wordnet


	class SynonymSearch:
	    """Find occurrences of a word's WordNet synonyms inside a text."""

	    def __init__(self):
	        pass

	    @staticmethod
	    def __split_words(word):
	        """
	            Turn a compound WordNet lemma into a space separated phrase
	        :param word: string
	            The synonym as named by wordnet, e.g. 'playing_period'
	        :return: string
	            The same lemma with every underscore replaced by a space

	        Example
	        > __split_words('python_is_good')
	        >> 'python is good'
	        """
	        # Wordnet joins the words of a compound lemma with '_'
	        return word.replace('_', ' ')

	    @classmethod
	    def _build_pattern(cls, lemma_names):
	        r"""
	            Build the regex alternation for an iterable of lemma names
	        :param lemma_names: iterable of string
	            Lemma names in wordnet format (compound words joined with '_')
	        :return: string
	            '\bsynonym_1\b|\bsynonym_2\b| ... |\bsynonym_n\b' with duplicates removed
	            and first-seen order kept; an empty string when there are no names
	        """
	        seen = set()
	        alternatives = []

	        for name in lemma_names:
	            synonym = cls.__split_words(name)
	            # Compare the split form: the raw name ('playing_period') never equals
	            # the stored form ('playing period'), so compounds would be repeated.
	            if synonym in seen:
	                continue
	            seen.add(synonym)
	            # Escape the lemma so characters such as '.' are matched literally
	            # instead of being interpreted as regex operators.
	            alternatives.append(r'\b' + re.escape(synonym) + r'\b')

	        return '|'.join(alternatives)

	    def get_synonyms(self, word):
	        r"""
	            Return the synonyms of a word as a single regex pattern
	        :param word: string
	            The word for which synonyms are needed
	        :return: string
	            Every lemma of every wordnet synset of the word, each wrapped in word boundaries
	            Returns string in the syntax : '\bsynonym_1\b|\bsynonym_2\b| ... |\bsynonym_n\b'
	            Returns an empty string when wordnet yields no lemma for the word
	        """
	        lemma_names = (
	            lemma.name()
	            for synset in wordnet.synsets(word)
	            for lemma in synset.lemmas()
	        )
	        return self._build_pattern(lemma_names)

	    @staticmethod
	    def return_words_from_text(pattern, text):
	        """
	            Prints the synonyms in the text and their spans in the counts of characters
	        :param pattern: string
	            String pattern of synonyms to be input to the regex
	        :param text: string
	            Text from the user's file
	        :return: None
	        """
	        # An empty pattern matches the empty string at every position, which would
	        # print one blank match per character; there is nothing to search for.
	        if not pattern:
	            return

	        matcher = re.compile(pattern, flags=re.I | re.DOTALL)

	        for m in matcher.finditer(text):
	            print(m.group(), '(%d - %d)' % (m.start(), m.end()))


	if __name__ == '__main__':
	    # Command-line entry point: print every synonym of WORD found in the text file at PATH.
	    # The text is passed as ``description``: the first positional parameter of
	    # ArgumentParser is ``prog``, which would replace the program name in the usage line.
	    parser = argparse.ArgumentParser(description="Enter the path and the word")
	    parser.add_argument('path', type=str, help='text file path, usually .txt extension')
	    parser.add_argument('word', type=str, help='word for which synonyms have to be extracted')

	    args = parser.parse_args()

	    # Read the whole file at once; match spans are reported as character offsets into it.
	    with open(args.path, 'r', encoding='utf-8') as text_file:
	        corpus_text = text_file.read()

	    find_synonyms = SynonymSearch()
	    regex_pattern = find_synonyms.get_synonyms(args.word)

	    find_synonyms.return_words_from_text(regex_pattern, corpus_text)