goophile/parse_words.py

## parse_words.py
#!/usr/bin/env python3

import os
import sys
import re
import subprocess
from collections import Counter, OrderedDict


# Directory containing this script; the word-list files below are resolved
# relative to it.
CUR_DIR = os.path.dirname(os.path.realpath(__file__))
# Characters that read_words() replaces with a space before splitting a line
# into words: the space itself, ASCII punctuation and the digits 0-9.
MARKS = r""" ,./?;':"|\<>[]{}()-_=+*&^%$#@!~0987654321` """
# Word list whose entries main() removes from its output.
KNOWN_FILE = os.path.join(CUR_DIR, 'known_words.txt')
# Name list whose entries main() also removes from its output.
NAME_FILE = os.path.join(CUR_DIR, 'names.txt')


def bash_cmd(c):
    """
    Execute the command string `c` through `bash -c`.

    stderr is merged into stdout and the output is decoded as text.
    Return a tuple (exit_code, combined_output).
    """
    finished = subprocess.run(
        ['bash', '-c', c],
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        universal_newlines=True,
        shell=False,
    )

    return (int(finished.returncode), finished.stdout)


def read_words(file_path, marks=None):
    """
    Read all words from the file, replace every character in `marks` with a
    space and change all letters to lower case.

    file_path: path of the text file to read (opened with the default
               text encoding).
    marks:     string of characters treated as word separators. When None
               (the default) the module-level MARKS is used.

    Words of one or two characters are dropped.

    Return an OrderedDict mapping each word to its number of occurrences,
    ordered from the most to the least frequent word. Both keys() and
    items() follow that order. Words with the same count keep the order in
    which they were first seen (relies on dicts preserving insertion order,
    i.e. Python 3.7+).
    """
    if marks is None:
        marks = MARKS

    # A single translation table turns every mark into a space in one pass,
    # instead of one str.replace() call per mark for every line.
    to_space = str.maketrans(marks, ' ' * len(marks))

    counts = Counter()
    with open(file_path) as f:
        # Stream the file line by line rather than loading it whole.
        for line in f:
            tokens = line.translate(to_space).lower().split()
            # remove short ones
            counts.update(token for token in tokens if len(token) > 2)

    # most_common() sorts by descending count with a stable sort.
    return OrderedDict(counts.most_common())


def search_wordnet(word):
    """
    Look `word` up with the `wordnet` command line tool (`wordnet <word> -over`).

    WordNet can handle tenses, comparatives and plurals, etc.
    The cmd wordnet returns the number of matches as its exit status
    (0 if no match, -1 if database error; this function checks for 255,
    which is how an exit status of -1 is reported).

    We only handle 4 types of words here: noun/verb/adj/adv.
    Some output examples:

    1) Overview of noun girl
       The noun girl has 5 senses (first 5 from tagged texts)

    2) Overview of noun walk
       The noun walk has 7 senses (first 6 from tagged texts)
       Overview of verb walk
       The verb walk has 10 senses (first 5 from tagged texts)

    3) Overview of adj able
       The adj able has 4 senses (first 3 from tagged texts)

    4) Overview of adv well
       The adv well has 13 senses (first 8 from tagged texts)

    Return value: None when the exit status is 0 (no match), otherwise a
    list of tuples. Each tuple has the type, basic form (remove
    tenses/plurals...), and a list of all their meanings.
    [
        ('noun', 'basic form', ['first meaning', 'second meaning', ]),
        ('verb', 'basic form', ['first meaning', 'second meaning', 'third meaning']),
        ('adj' , 'basic form', []),
        ('adv' , 'basic form', ['first meaning', ]),
    ]
    The list is empty when the output contains no recognisable
    "Overview of <type> <word>" section (for example when the output is a
    shell error message rather than WordNet output).

    Raises Exception when the exit status is 255 (database error).
    """
    # Function-scope import so this block does not depend on the file header.
    import shlex

    # Quote the word so that shell metacharacters in it cannot change the
    # command that bash runs. Plain alphabetic words are left untouched.
    rc, output = bash_cmd('wordnet {word} -over'.format(word=shlex.quote(word)))

    # no match
    if rc == 0:
        return None

    # database not found
    if rc == 255:
        print(output)
        raise Exception('Fatal error - cannot open WordNet database')

    word_types = ('noun', 'verb', 'adj', 'adv')
    # A numbered sense line, e.g. "1. (78) girl, miss -- (a young woman)".
    meaning_re = re.compile(r'^\d+\. (.*)')

    return_list = []
    for group in output.split('\nOverview of '):
        lines = group.splitlines()
        if not lines:
            continue

        header = lines[0].split()
        # Only accept a real "<type> <basic form>" header. This skips the
        # text before the first overview as well as any unrelated output,
        # which would otherwise be misread as a word entry.
        if len(header) < 2 or header[0] not in word_types:
            continue

        word_type, basic_form = header[0], header[1]

        meanings = []
        for line in lines[1:]:
            re_obj = meaning_re.search(line)
            if re_obj:
                meanings.append(re_obj.group(1))

        return_list.append((word_type, basic_form, meanings))

    return return_list


def _test_wordnet():
    """
    Manual smoke test: look up the word 'better' with search_wordnet() and
    print every entry it returns.
    """
    word = 'better'
    return_list = search_wordnet(word)

    # search_wordnet() returns None when there is no match; iterating over
    # None would raise a TypeError.
    if not return_list:
        print('no match for {}'.format(word))
        return

    for return_tuple in return_list:
        print('======>>>')
        print(return_tuple)
        print('<<<======')

def main():
    """
    Print the new words found in the text file given as the first command
    line argument, one word per line.

    Steps:
      1. Read the known words (KNOWN_FILE), the names (NAME_FILE) and the
         words of the input file, the latter ordered by frequency.
      2. Look every input word up with search_wordnet() and collect the
         basic forms it reports; words without a match are skipped.
      3. Drop every basic form that appears in the known words or names.
      4. Print the remaining basic forms that are longer than 2 characters.

    Exits with a usage message when no input file is given.
    """
    if len(sys.argv) < 2:
        sys.exit('usage: {} TEXT_FILE'.format(sys.argv[0]))

    known_words = read_words(KNOWN_FILE)
    name_words = read_words(NAME_FILE)
    all_words = read_words(sys.argv[1])

    # basic form -> meanings, in order of first appearance. When a basic form
    # is reported more than once, the last reported meanings are kept.
    basic_words = OrderedDict()
    for word in all_words:
        search_result = search_wordnet(word)
        if not search_result:
            continue

        for _word_type, basic_form, meanings in search_result:
            basic_words[basic_form] = meanings

    for word_list in (known_words, name_words):
        for word in word_list:
            basic_words.pop(word, None)

    for word in basic_words:
        if len(word) > 2:
            print(word)


# Script entry point: run main() only when this file is executed directly.
if __name__ == '__main__':
    # Uncomment the next line to also run the manual WordNet smoke test.
    # _test_wordnet()
    main()
	#!/usr/bin/env python3

	import os
	import sys
	import re
	import subprocess
	from collections import Counter, OrderedDict


	# Directory containing this script; the word-list files below are resolved
	# relative to it.
	CUR_DIR = os.path.dirname(os.path.realpath(__file__))
	# Characters that read_words() replaces with a space before splitting a line
	# into words: the space itself, ASCII punctuation and the digits 0-9.
	MARKS = r""" ,./?;':"\|\<>[]{}()-_=+*&^%$#@!~0987654321` """
	# Word list whose entries main() removes from its output.
	KNOWN_FILE = os.path.join(CUR_DIR, 'known_words.txt')
	# Name list whose entries main() also removes from its output.
	NAME_FILE = os.path.join(CUR_DIR, 'names.txt')


	def bash_cmd(c):
	    """
	    Execute the command string `c` through `bash -c`.

	    stderr is merged into stdout and the output is decoded as text.
	    Return a tuple (exit_code, combined_output).
	    """
	    finished = subprocess.run(
	        ['bash', '-c', c],
	        stdout=subprocess.PIPE,
	        stderr=subprocess.STDOUT,
	        universal_newlines=True,
	        shell=False,
	    )

	    return (int(finished.returncode), finished.stdout)


	def read_words(file_path, marks=None):
	    """
	    Read all words from the file, replace every character in `marks` with a
	    space and change all letters to lower case.

	    file_path: path of the text file to read (opened with the default
	               text encoding).
	    marks:     string of characters treated as word separators. When None
	               (the default) the module-level MARKS is used.

	    Words of one or two characters are dropped.

	    Return an OrderedDict mapping each word to its number of occurrences,
	    ordered from the most to the least frequent word. Both keys() and
	    items() follow that order. Words with the same count keep the order in
	    which they were first seen (relies on dicts preserving insertion order,
	    i.e. Python 3.7+).
	    """
	    if marks is None:
	        marks = MARKS

	    # A single translation table turns every mark into a space in one pass,
	    # instead of one str.replace() call per mark for every line.
	    to_space = str.maketrans(marks, ' ' * len(marks))

	    counts = Counter()
	    with open(file_path) as f:
	        # Stream the file line by line rather than loading it whole.
	        for line in f:
	            tokens = line.translate(to_space).lower().split()
	            # remove short ones
	            counts.update(token for token in tokens if len(token) > 2)

	    # most_common() sorts by descending count with a stable sort.
	    return OrderedDict(counts.most_common())


	def search_wordnet(word):
	    """
	    Look `word` up with the `wordnet` command line tool (`wordnet <word> -over`).

	    WordNet can handle tenses, comparatives and plurals, etc.
	    The cmd wordnet returns the number of matches as its exit status
	    (0 if no match, -1 if database error; this function checks for 255,
	    which is how an exit status of -1 is reported).

	    We only handle 4 types of words here: noun/verb/adj/adv.
	    Some output examples:

	    1) Overview of noun girl
	       The noun girl has 5 senses (first 5 from tagged texts)

	    2) Overview of noun walk
	       The noun walk has 7 senses (first 6 from tagged texts)
	       Overview of verb walk
	       The verb walk has 10 senses (first 5 from tagged texts)

	    3) Overview of adj able
	       The adj able has 4 senses (first 3 from tagged texts)

	    4) Overview of adv well
	       The adv well has 13 senses (first 8 from tagged texts)

	    Return value: None when the exit status is 0 (no match), otherwise a
	    list of tuples. Each tuple has the type, basic form (remove
	    tenses/plurals...), and a list of all their meanings.
	    [
	        ('noun', 'basic form', ['first meaning', 'second meaning', ]),
	        ('verb', 'basic form', ['first meaning', 'second meaning', 'third meaning']),
	        ('adj' , 'basic form', []),
	        ('adv' , 'basic form', ['first meaning', ]),
	    ]
	    The list is empty when the output contains no recognisable
	    "Overview of <type> <word>" section (for example when the output is a
	    shell error message rather than WordNet output).

	    Raises Exception when the exit status is 255 (database error).
	    """
	    # Function-scope import so this block does not depend on the file header.
	    import shlex

	    # Quote the word so that shell metacharacters in it cannot change the
	    # command that bash runs. Plain alphabetic words are left untouched.
	    rc, output = bash_cmd('wordnet {word} -over'.format(word=shlex.quote(word)))

	    # no match
	    if rc == 0:
	        return None

	    # database not found
	    if rc == 255:
	        print(output)
	        raise Exception('Fatal error - cannot open WordNet database')

	    word_types = ('noun', 'verb', 'adj', 'adv')
	    # A numbered sense line, e.g. "1. (78) girl, miss -- (a young woman)".
	    meaning_re = re.compile(r'^\d+\. (.*)')

	    return_list = []
	    for group in output.split('\nOverview of '):
	        lines = group.splitlines()
	        if not lines:
	            continue

	        header = lines[0].split()
	        # Only accept a real "<type> <basic form>" header. This skips the
	        # text before the first overview as well as any unrelated output,
	        # which would otherwise be misread as a word entry.
	        if len(header) < 2 or header[0] not in word_types:
	            continue

	        word_type, basic_form = header[0], header[1]

	        meanings = []
	        for line in lines[1:]:
	            re_obj = meaning_re.search(line)
	            if re_obj:
	                meanings.append(re_obj.group(1))

	        return_list.append((word_type, basic_form, meanings))

	    return return_list


	def _test_wordnet():
	    """
	    Manual smoke test: look up the word 'better' with search_wordnet() and
	    print every entry it returns.
	    """
	    word = 'better'
	    return_list = search_wordnet(word)

	    # search_wordnet() returns None when there is no match; iterating over
	    # None would raise a TypeError.
	    if not return_list:
	        print('no match for {}'.format(word))
	        return

	    for return_tuple in return_list:
	        print('======>>>')
	        print(return_tuple)
	        print('<<<======')


	def main():
	    """
	    Print the new words found in the text file given as the first command
	    line argument, one word per line.

	    Steps:
	      1. Read the known words (KNOWN_FILE), the names (NAME_FILE) and the
	         words of the input file, the latter ordered by frequency.
	      2. Look every input word up with search_wordnet() and collect the
	         basic forms it reports; words without a match are skipped.
	      3. Drop every basic form that appears in the known words or names.
	      4. Print the remaining basic forms that are longer than 2 characters.

	    Exits with a usage message when no input file is given.
	    """
	    if len(sys.argv) < 2:
	        sys.exit('usage: {} TEXT_FILE'.format(sys.argv[0]))

	    known_words = read_words(KNOWN_FILE)
	    name_words = read_words(NAME_FILE)
	    all_words = read_words(sys.argv[1])

	    # basic form -> meanings, in order of first appearance. When a basic form
	    # is reported more than once, the last reported meanings are kept.
	    basic_words = OrderedDict()
	    for word in all_words:
	        search_result = search_wordnet(word)
	        if not search_result:
	            continue

	        for _word_type, basic_form, meanings in search_result:
	            basic_words[basic_form] = meanings

	    for word_list in (known_words, name_words):
	        for word in word_list:
	            basic_words.pop(word, None)

	    for word in basic_words:
	        if len(word) > 2:
	            print(word)


	# Script entry point: run main() only when this file is executed directly.
	if __name__ == '__main__':
	    # Uncomment the next line to also run the manual WordNet smoke test.
	    # _test_wordnet()
	    main()