Skip to content

Instantly share code, notes, and snippets.

@bixb0012
Last active February 6, 2019 15:19
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bixb0012/d25e3f24c84a0e694053117480e19675 to your computer and use it in GitHub Desktop.
Save bixb0012/d25e3f24c84a0e694053117480e19675 to your computer and use it in GitHub Desktop.
Python3: Generate English Word List
#!python3
# Reference: 1) https://www.gutenberg.org/ebooks/29765
# Reference: 2) https://wordnet.princeton.edu/download/current-version
from itertools import dropwhile, takewhile
import re
import urllib.request
# Example 1: Project Gutenberg's Webster's Unabridged Dictionary (Webster's Dictionary 1913)
#
# Downloads the plain-text e-book, keeps only the lines between the START and
# END markers, and builds `words`: a sorted, de-duplicated, lower-cased list of
# the entries found on all-upper-case lines.
url = r"https://www.gutenberg.org/cache/epub/29765/pg29765.txt"
# NOTE(review): both markers are compared for exact equality with a whole line.
# If the wording of the markers in the downloaded file differs in any way,
# dropwhile() consumes every line and `words` ends up empty -- confirm these
# strings against the current e-book text.
start_text = r"*** START OF THIS PROJECT GUTENBERG EBOOK WEBSTER'S UNABRIDGED DICTIONARY ***"
end_text = r"*** END OF THIS PROJECT GUTENBERG EBOOK WEBSTER'S UNABRIDGED DICTIONARY ***"
# An entry may contain only ASCII letters, spaces, apostrophes and hyphens.
# The hyphen is placed last in the class so that it is a literal character;
# written as " -'" it forms a range (space through apostrophe) that admits
# !"#$%& and rejects the hyphen itself.
regex = re.compile(r"[a-zA-Z '-]*$")

with urllib.request.urlopen(url) as response:
    page_text = response.read().decode('utf-8')

# splitlines() copes with "\r\n" as well as bare "\n" line endings.
page_lines = iter(page_text.splitlines())

# Skip everything up to the start marker, then stop at the end marker.
# dropwhile() still yields the start marker line itself; it is rejected by the
# filters below because it begins with "*".
lines = takewhile(
    lambda x: x != end_text,
    dropwhile(lambda x: x != start_text, page_lines)
)

words = sorted({
    word.lower()
    for line in lines
    # Only all-upper-case lines are considered.
    if line.isupper()
    # A line may list several entries separated by ";". Strip each piece
    # BEFORE filtering: the pieces after the first start with a space and
    # would otherwise fail the "first character is a letter" test.
    for word in (piece.strip() for piece in line.split(';'))
    # The regex also matches the empty string; the isalpha() test discards
    # that case along with entries that start with a space, hyphen or
    # apostrophe.
    if regex.match(word) is not None and word[:1].isalpha()
})
# Example 2: Princeton University "About WordNet." WordNet. Princeton University. 2010
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment