Skip to content

Instantly share code, notes, and snippets.

@goophile
Last active November 25, 2018 16:30
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save goophile/e0b5272bc04b5c4c2245161e63c81085 to your computer and use it in GitHub Desktop.
Save goophile/e0b5272bc04b5c4c2245161e63c81085 to your computer and use it in GitHub Desktop.
Get a list of words from a text file.
#!/usr/bin/env python3
import os
import re
import shlex
import subprocess
import sys
from collections import Counter, OrderedDict
# Directory that contains this script (realpath resolves symlinks first).
CUR_DIR = os.path.dirname(os.path.realpath(__file__))
# Characters that read_words() replaces with a space before splitting a line:
# the ASCII punctuation characters, the digits 0-9 and the space itself.
MARKS = r""" ,./?;':"|\<>[]{}()-_=+*&^%$#@!~0987654321` """
# Word list next to the script; main() drops these words from its output.
KNOWN_FILE = os.path.join(CUR_DIR, 'known_words.txt')
# Second word list next to the script, also dropped by main()
# (presumably proper names, judging by the file name -- confirm).
NAME_FILE = os.path.join(CUR_DIR, 'names.txt')
def bash_cmd(c):
    """
    Run a command line with bash and return the exit code and text output.

    Args:
        c: Command line handed to ``bash -c``. It is interpreted by the
            shell, so any untrusted text embedded in it must be quoted by
            the caller first.

    Returns:
        A ``(returncode, output)`` tuple. ``output`` is the decoded text the
        command wrote to stdout, with stderr merged into the same stream.
    """
    # subprocess.run() starts the child, waits for it and collects its
    # output in a single call.
    completed = subprocess.run(
        ['bash', '-c', c],
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,  # merge stderr into the captured stdout
        universal_newlines=True,   # return str instead of bytes
    )
    return (completed.returncode, completed.stdout)
def read_words(file_path):
    """
    Count the words of a text file.

    Every character listed in MARKS is treated as a word separator, all
    letters are lower-cased, and words of one or two characters are dropped.

    Args:
        file_path: Path of the text file to read.

    Returns:
        An OrderedDict mapping each word to its number of occurrences,
        ordered from the most frequent word to the least frequent one.
        keys(), values() and items() all iterate in that order.
    """
    # A single translation table replaces every mark with a space in one
    # pass per line, instead of one str.replace() call per mark.
    separators = str.maketrans(MARKS, ' ' * len(MARKS))

    counts = Counter()
    with open(file_path) as f:
        # Stream the file line by line rather than loading it whole.
        for line in f:
            words = line.translate(separators).lower().split()
            # remove short ones
            counts.update(word for word in words if len(word) > 2)

    # most_common() sorts by descending count and keeps first-seen order
    # among words with the same count.
    return OrderedDict(counts.most_common())
def _parse_wordnet_overview(output):
    """
    Turn the text printed by ``wordnet <word> -over`` into result tuples.

    Only sections whose header line reads ``Overview of <type> <form>`` with
    a type of noun/verb/adj/adv are kept; any other text is ignored.
    """
    header_re = re.compile(r'^Overview of (noun|verb|adj|adv)\s+(\S+)')
    meaning_re = re.compile(r'^\d+\. (.*)')

    results = []
    meanings = None  # meanings list of the section being read, if any
    for line in output.splitlines():
        if line.startswith('Overview of '):
            header = header_re.match(line)
            if header:
                meanings = []
                results.append((header.group(1), header.group(2), meanings))
            else:
                # Unrecognised section: do not attach its numbered lines to
                # the previous section.
                meanings = None
            continue
        if meanings is None:
            continue
        sense = meaning_re.match(line)
        if sense:
            meanings.append(sense.group(1))
    return results


def search_wordnet(word):
    """
    Look a word up with the ``wordnet`` command and return its meanings.

    WordNet can handle tenses, comparatives and plurals, etc.
    The cmd wordnet returns the number of matches. (return 0 if no match, -1 if database error.)
    We only handle 4 types of words here: noun/verb/adj/adv.

    Some output examples:
    1) Overview of noun girl
       The noun girl has 5 senses (first 5 from tagged texts)
    2) Overview of noun walk
       The noun walk has 7 senses (first 6 from tagged texts)
       Overview of verb walk
       The verb walk has 10 senses (first 5 from tagged texts)
    3) Overview of adj able
       The adj able has 4 senses (first 3 from tagged texts)
    4) Overview of adv well
       The adv well has 13 senses (first 8 from tagged texts)

    Args:
        word: The word to look up.

    Returns:
        None when the command reports no match (exit code 0). Otherwise a
        list of tuples; each tuple has the type, the basic form (tenses,
        plurals... removed) and a list of all the meanings:
        [
            ('noun', 'basic form', ['first meaning', 'second meaning', ]),
            ('verb', 'basic form', ['first meaning', 'second meaning', 'third meaning']),
            ('adj' , 'basic form', []),
            ('adv' , 'basic form', ['first meaning', ]),
        ]
        The list is empty when the output holds no recognisable overview.

    Raises:
        Exception: if the WordNet database cannot be opened (exit code 255),
            or if bash cannot find the wordnet command (exit code 127 with
            no overview in the output).
    """
    # bash_cmd() hands the command line to a shell, so the word must be
    # quoted to keep shell metacharacters in it from being interpreted.
    rc, output = bash_cmd('wordnet {word} -over'.format(word=shlex.quote(word)))

    # no match
    if rc == 0:
        return None

    # database not found (-1 is seen as exit code 255)
    if rc == 255:
        print(output)
        raise Exception('Fatal error - cannot open WordNet database')

    return_list = _parse_wordnet_overview(output)

    # bash exits with 127 when the command itself cannot be found; without
    # this check every lookup would silently come back empty.
    if rc == 127 and not return_list:
        print(output)
        raise Exception('Fatal error - cannot run the wordnet command')

    return return_list
def _test_wordnet():
    """
    Manual check of search_wordnet().

    Looks up the comparative form 'better' and prints every returned tuple
    between marker lines. Prints nothing when there is no match.
    """
    word = 'better'
    # search_wordnet() returns None when there is no match; treat that as
    # "nothing to print" instead of failing on iteration.
    return_list = search_wordnet(word) or []
    for return_tuple in return_list:
        print('======>>>')
        print(return_tuple)
        print('<<<======')
def main(show_meanings=False):
    """
    Print the new words found in the text file named on the command line.

    The words of the file given as the first command-line argument are looked
    up with search_wordnet() and reduced to their basic forms. Basic forms
    that appear in KNOWN_FILE or NAME_FILE, and those of one or two
    characters, are left out. The remaining words are printed one per line,
    in the order they were first met (most frequent source words first).

    Args:
        show_meanings: When true, additionally print the meanings collected
            for each new word after the word list. Defaults to False, which
            prints the word list only.
    """
    if len(sys.argv) < 2:
        # Exit with a usage message instead of an IndexError traceback.
        sys.exit('Usage: {} TEXT_FILE'.format(sys.argv[0]))

    known_words = read_words(KNOWN_FILE)
    name_words = read_words(NAME_FILE)
    all_words = read_words(sys.argv[1])

    basic_words = OrderedDict()
    for word in all_words:
        search_result = search_wordnet(word)
        if not search_result:
            # no meaning found for this word
            continue
        for _word_type, basic_form, meanings in search_result:
            # Merge the meanings of every word type (noun, verb, ...) of a
            # basic form instead of keeping only the last type. Several
            # inflected forms map to the same basic form, so skip meanings
            # that are already recorded.
            collected = basic_words.setdefault(basic_form, [])
            for meaning in meanings:
                if meaning not in collected:
                    collected.append(meaning)

    for word in list(known_words) + list(name_words):
        basic_words.pop(word, None)

    new_words = [word for word in basic_words if len(word) > 2]
    for word in new_words:
        print(word)

    if not show_meanings:
        return

    for word in new_words:
        print('\n===>>> {} <<<===\n'.format(word))
        for meaning in basic_words[word]:
            print(' {}\n'.format(meaning))
# Entry point: run the word extraction only when this file is executed as a
# script, not when it is imported. (_test_wordnet() can be called here
# instead for a quick manual check of the WordNet lookup.)
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment