Last active
February 6, 2019 15:19
-
-
Save bixb0012/d25e3f24c84a0e694053117480e19675 to your computer and use it in GitHub Desktop.
Python3: Generate English Word List
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!python3
# Reference: 1) https://www.gutenberg.org/ebooks/29765
# Reference: 2) https://wordnet.princeton.edu/download/current-version

from itertools import dropwhile, takewhile
import re
import urllib.request

# Example 1: Project Gutenberg's Webster's Unabridged Dictionary (Webster's Dictionary 1913)
#
# Downloads the plain-text file at `url`, keeps the lines from the start
# marker up to (not including) the end marker, and collects the entries found
# on all-uppercase lines into the sorted, de-duplicated, lower-cased list
# `words`.
url = r"https://www.gutenberg.org/cache/epub/29765/pg29765.txt"
# NOTE(review): both markers are compared for exact equality with a whole
# line; if the hosted file words them differently, `words` silently ends up
# empty -- confirm against the current download.
start_text = r"*** START OF THIS PROJECT GUTENBERG EBOOK WEBSTER'S UNABRIDGED DICTIONARY ***"
end_text = r"*** END OF THIS PROJECT GUTENBERG EBOOK WEBSTER'S UNABRIDGED DICTIONARY ***"
# Letters, spaces, apostrophes and hyphens only. The hyphen is placed last in
# the class so that it is a literal: written as " -'" it formed a range from
# space to apostrophe, which accepted !"#$%& and rejected hyphenated entries.
regex = re.compile(r"[a-zA-Z '-]*$")

with urllib.request.urlopen(url) as response:
    page_text = response.read().decode('utf-8')

# splitlines() handles both CRLF and bare LF line endings.
page_lines = page_text.splitlines()
lines = takewhile(
    lambda x: x != end_text,
    dropwhile(lambda x: x != start_text, page_lines)
)

words = sorted({
    word.lower()
    for line in lines
    # Keep only lines whose cased characters are all uppercase (presumably
    # the dictionary's headword lines).
    if line.isupper()
    # Strip *before* filtering: every alternative after a ';' starts with a
    # space and would otherwise fail the first-character check below.
    for word in (part.strip() for part in line.split(';'))
    # The regex also matches the empty string; the isalpha() check rejects
    # that, along with entries starting with a hyphen or an apostrophe.
    if regex.match(word) is not None and word[:1].isalpha()
})
# Example 2: Princeton University "About WordNet." WordNet. Princeton University. 2010
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment