# bixb0012/python3_generate_english_word_list.py

## python3_generate_english_word_list.py
#!python3
# Reference: 1) https://www.gutenberg.org/ebooks/29765
# Reference: 2) https://wordnet.princeton.edu/download/current-version

from itertools import dropwhile, takewhile
import re
import urllib.request

# Example 1: Project Gutenberg's Webster's Unabridged Dictionary (Webster's Dictionary 1913)
# Location of the plain-text e-book that the word list is extracted from.
url = r"https://www.gutenberg.org/cache/epub/29765/pg29765.txt"
# Marker lines used to find where the dictionary text starts and ends in the
# download.  They are compared for exact equality against whole lines.
# NOTE(review): the marker wording may differ between regenerations of the
# e-book file -- confirm these still match the current download.
start_text = r"*** START OF THIS PROJECT GUTENBERG EBOOK WEBSTER'S UNABRIDGED DICTIONARY ***"
end_text = r"*** END OF THIS PROJECT GUTENBERG EBOOK WEBSTER'S UNABRIDGED DICTIONARY ***"
# Accept only candidates made of ASCII letters, spaces, apostrophes and
# hyphens.  The hyphen is placed last in the character class so it is taken
# literally; written as " -'" it formed a range from space to apostrophe,
# which let the characters !"#$%& through and rejected hyphenated words.
regex = re.compile(r"[a-zA-Z '-]*$")

# Download the whole e-book and decode it to text in one go.
with urllib.request.urlopen(url) as response:
    page_text = response.read().decode('utf-8')

# Split into lines, accepting both CRLF and bare-LF endings so the exact-match
# marker comparison below keeps working whichever form is served.
page_lines = iter(page_text.replace('\r\n', '\n').split('\n'))
# Skip everything before the start marker and stop at the end marker.
lines = takewhile(
    lambda x: x != end_text,
    dropwhile(lambda x: x != start_text, page_lines)
)
# Build the sorted, de-duplicated, lower-cased word list.
# Only all-uppercase lines are considered (presumably the headword lines --
# verify against the e-book layout).  A line may carry several spellings
# separated by ';'.  Each piece is stripped BEFORE it is validated, otherwise
# every spelling after the first starts with a space and is discarded by the
# "first character is a letter" check.
words = sorted({
    candidate.lower()
    for line in lines
    if line.isupper()
    for candidate in map(str.strip, line.split(';'))
    if regex.match(candidate) is not None and candidate[:1].isalpha()
})

# Example 2: Princeton University "About WordNet." WordNet. Princeton University. 2010
	#!python3
# NOTE(review): this section repeats Example 1 above.  It had lost its
# indentation structure and its "***" markers had been collapsed to "*"; both
# are restored here so the file parses and the markers can match.  Consider
# removing one of the two copies, since each one downloads the whole e-book.
# Reference: 1) https://www.gutenberg.org/ebooks/29765
# Reference: 2) https://wordnet.princeton.edu/download/current-version

from itertools import dropwhile, takewhile
import re
import urllib.request

# Example 1: Project Gutenberg's Webster's Unabridged Dictionary (Webster's Dictionary 1913)
url = r"https://www.gutenberg.org/cache/epub/29765/pg29765.txt"
# Marker lines compared for exact equality against whole lines of the download.
# NOTE(review): the marker wording may differ between regenerations of the
# e-book file -- confirm these still match the current download.
start_text = r"*** START OF THIS PROJECT GUTENBERG EBOOK WEBSTER'S UNABRIDGED DICTIONARY ***"
end_text = r"*** END OF THIS PROJECT GUTENBERG EBOOK WEBSTER'S UNABRIDGED DICTIONARY ***"
# Letters, spaces, apostrophes and hyphens only.  The hyphen sits last in the
# class so it is literal rather than forming a space-to-apostrophe range.
regex = re.compile(r"[a-zA-Z '-]*$")

# Download the whole e-book and decode it to text in one go.
with urllib.request.urlopen(url) as response:
    page_text = response.read().decode('utf-8')

# Split into lines, accepting both CRLF and bare-LF endings.
page_lines = iter(page_text.replace('\r\n', '\n').split('\n'))
# Skip everything before the start marker and stop at the end marker.
lines = takewhile(
    lambda x: x != end_text,
    dropwhile(lambda x: x != start_text, page_lines)
)
# Sorted, de-duplicated, lower-cased words taken from all-uppercase lines.
# Each ';'-separated piece is stripped before validation so that spellings
# after the first (which start with a space) are not discarded.
words = sorted({
    candidate.lower()
    for line in lines
    if line.isupper()
    for candidate in map(str.strip, line.split(';'))
    if regex.match(candidate) is not None and candidate[:1].isalpha()
})

	# Example 2: Princeton University "About WordNet." WordNet. Princeton University. 2010