bradmontgomery/count_words.py

## count_words.py
"""
Use a Counter to find the most common words in "The Wonderful Wizard of Oz" by
L. Frank Baum.

Available in (mostly) plain text at:
https://archive.org/stream/wonderfulwizardo00baumiala/wonderfulwizardo00baumiala_djvu.txt

Note: This code also counts the words in the header, so it's not a *realistic*
application, but more of a demonstration of Python's Counter.

Running this code should give you something like this:

    $ python count_words.py

    The Top 10 words
    the: 2808
    and: 1630
    to: 1143
    of: 869
    a: 819
    I: 597
    was: 502
    you: 486
    in: 476
    he: 408

"""
import re
from collections import Counter
from html.parser import HTMLParser

import requests


class PreParser(HTMLParser):
    """
    An HTML parser that collects the text found within <pre></pre> tags.

    Feed it a document with ``feed()`` and then read ``result``: it holds
    all of the text seen inside <pre> elements (joined in document order),
    or ``None`` when no such text was seen. See more in the html.parser docs:
    https://docs.python.org/3/library/html.parser.html

    """

    capture = False  # True while the parser is inside a <pre> element.
    result = None  # <--- we'll store some text here.

    def handle_starttag(self, tag, attrs):
        if tag == "pre":
            self.capture = True

    def handle_endtag(self, tag):
        if tag == "pre":
            self.capture = False

    def handle_data(self, data):
        # The text of one <pre> can arrive in several pieces (for example
        # around nested tags or comments), so append each piece instead of
        # overwriting what was captured before.
        if self.capture:
            self.result = data if self.result is None else self.result + data


def main(n=10):
    """
    Fetch "The Wonderful Wizard of Oz" and print its ``n`` most common words.

    The document is downloaded from archive.org, the text inside its <pre>
    tags is extracted, everything except ASCII letters and spaces is
    dropped, and the remaining words are tallied with a Counter.

    Parameters:
        n: how many of the most common words to print (default 10).

    Returns None. Prints a message and returns early when the download does
    not come back with a 200 status, or when no <pre> text was found.

    """
    # Create a parser to parse the HTML document.
    parser = PreParser()

    # Now fetch some content & do a little cleanup
    url = "https://archive.org/stream/wonderfulwizardo00baumiala/wonderfulwizardo00baumiala_djvu.txt"
    # A timeout keeps the script from hanging forever on a stalled server.
    resp = requests.get(url, timeout=30)
    if resp.status_code != 200:
        # The response body is exposed as `.text` (there is no `.txt`).
        print(f"Failed to fetch document: {resp.status_code}\n{resp.text}")
        return

    # Do some content cleaning...
    parser.feed(resp.text)
    parser.close()  # flush any text the parser is still buffering
    if parser.result is None:
        # Nothing was captured; re.sub() would fail on None.
        print("Failed to find any <pre> text in the document")
        return

    # Raw strings keep "\s" a regex escape instead of an invalid string escape.
    content = re.sub(r"\s+", " ", parser.result)
    content = re.sub(r"[^A-Za-z ]+", "", content)
    words = content.split()
    print(f"Found {len(words):,} words!")

    # Start counting
    word_count = Counter(words)

    # The Top-N words
    print(f"The Top {n} words")
    for word, count in word_count.most_common(n):
        print(f"{word}: {count}")


# Only run the word count when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

## counter.py
"""
This python file is exported from a Jupyter Notebook used
during the MEMpy presentation on 2022-08-15.

"""
#!/usr/bin/env python
# coding: utf-8

# # collections.Counter
#
# It's good stuff!
#
# Super-powerful utilities that do _very common_ operations.
#

# In[1]:


from collections import Counter


# ## what is `Counter`?
#
# - It's like a dictionary (keys & values)
# - Keys -> The things you want to count
# - Values -> The number of times the key appears in a _collection_ of stuff.

# In[2]:


# Before using Counter
# With a plain dict you must check for the key before incrementing it;
# otherwise `c["widgets"] += 1` raises a KeyError on the first use.
c = {}
if "widgets" in c:
    c["widgets"] += 1
else:
    c["widgets"] = 1


# OR, if you KNOW all of your keys...
# ...you can pre-populate every key with a zero count.
c = {
    "widgets": 0,
}
c["widgets"] += 1


# Counter lets you start counting without knowing keys in advance
# (a missing key counts as 0, so += works right away).
c = Counter()
c["widgets"] += 1
c  # notebook cell output: Counter({'widgets': 1})


# # Counter behavior
#
# - Most of the dict methods are available.
# - `.keys()` & `.values()`
# - `in` operations

# In[3]:


# The things being counted; at this point just 'widgets'.
c.keys()


# In[4]:


# The counts, in the same order as the keys.
c.values()


# In[5]:


# Update will create new keys or adjust
# counts for existing keys
# (unlike dict.update, the given counts are added, not replaced).
c.update({"foo": 1})
c


# In[6]:


c.update({"foo": 1})  # calling it a 2nd time ...
c  # ... so 'foo' is now 2


# In[7]:


# you can create a Counter based on keyword arguments
scores = Counter(grizzlies=134, warriors=95)
scores


# ## Where things start to get interesting
#
# You can create a Counter object from _any_ iterable!

# In[8]:


# Each distinct element becomes a key; its value is how often it appears.
c = Counter(["moe", "larry", "larry", "curly", "curly"])
c


# In[9]:


# you can also go "backwards" ... get a "list" of elements based on their counts
# (each key is repeated as many times as its count)
list(c.elements())


# ## most common occurrences?
#
# One of the best use-cases for a Counter!
#

# In[10]:


# Most common letters in a string?
# A string is an iterable of characters, so Counter tallies each letter;
# most_common(3) returns the three highest (letter, count) pairs.
word = "supercalifragilisticexpialidocious"
Counter(word).most_common(3)


# ## What are the 10 most common words in "The Wonderful Wizard of Oz"?

# In[11]:


import requests
import re

from html.parser import HTMLParser
from collections import Counter


# blergh, write a little HTML parser:
# https://docs.python.org/3/library/html.parser.html
class Parser(HTMLParser):
    """
    Collect the text found within <pre></pre> tags of an HTML document.

    After ``feed()``, ``result`` holds all text seen inside <pre> elements
    (joined in document order), or ``None`` when no such text was seen.

    """

    capture = False  # True while the parser is inside a <pre> element.
    result = None  # <--- we'll store some text here.

    def handle_starttag(self, tag, attrs):
        if tag == "pre":
            self.capture = True

    def handle_endtag(self, tag):
        if tag == "pre":
            self.capture = False

    def handle_data(self, data):
        # The text of one <pre> can arrive in several pieces (for example
        # around nested tags or comments), so append each piece instead of
        # overwriting what was captured before.
        if self.capture:
            self.result = data if self.result is None else self.result + data


parser = Parser()


# In[12]:


# Now fetch some content & do a little cleanup
url = "https://archive.org/stream/wonderfulwizardo00baumiala/wonderfulwizardo00baumiala_djvu.txt"
resp = requests.get(url)
parser.feed(resp.text)

# Raw strings keep "\s" a regex escape instead of an invalid string escape.
# Fall back to "" when no <pre> text was captured, since re.sub() would
# fail on None.
content = re.sub(r"\s+", " ", parser.result or "")
# Keep only ASCII letters and spaces, then split on whitespace.
content = re.sub(r"[^A-Za-z ]+", "", content)
words = content.split()
print(f"{len(words):,} words!")


# In[13]:


# Start counting
word_count = Counter(words)
word_count.most_common(10)


# ## ok that's neat, but....
#
# How can this help me build that RPG/MMO I've always wanted to build?

# # Answer: Let's go shopping!

# In[14]:


# Set up your purse!
# Keys are coin types; values are how many of each coin we hold.
purse = Counter(gold=1000, silver=500, copper=100)
purse


# In[15]:


# Create some items in the shop
# (each item is a plain dict mapping a coin type to its price)
shield = {"gold": 25}
sword = {"gold": 100, "silver": 50}
tunic = {"silver": 10, "copper": 50}


# In[16]:


# Let's make some purchases
# subtract() lowers the purse's counts in place by the item's prices.
purse.subtract(shield)
purse  # gold: 975, silver: 500, copper: 100


# In[17]:


# Buy the sword.
purse.subtract(sword)
purse  # gold: 875, silver: 450, copper: 100


# In[18]:


# Get the tunic too
purse.subtract(tunic)
purse  # gold: 875, silver: 440, copper: 50


# In[19]:


# Buy a castle!
# subtract() does not stop at zero, so counts can go negative.
castle = {"gold": 50_000, "silver": 10_000, "copper": 350}
purse.subtract(castle)
purse  # whoops -> gold: -49125, silver: -9560, copper: -300


# In[20]:


# New in 3.10
purse.total()  # -> Should sum all the values.


# In[ ]:


# or ...
# (works on Python versions older than 3.10 as well)
debt = sum(purse.values())
print(f"We owe {debt:,}!")


# In[ ]:


purse.clear()  # reset!
purse  # now an empty Counter()


# ## Resources
#
# - Python Collections: https://docs.python.org/3/library/collections.html
# - HTML Parser: https://docs.python.org/3/library/html.parser.html
# - The 2013 version of this talk: https://speakerdeck.com/bkmontgomery/pythons-counter-collection?slide=40
# - Sample Code: https://gist.github.com/bradmontgomery/4717521
#
# ### Other collections goodies!
#
# - ChainMap
# - deque
# - namedtuple
# - defaultdict
# - OrderedDict
# - UserDict, UserList, UserString

# # Thank you!
#
# Questions?

## requirements.txt
black==22.6.0
certifi==2022.6.15
isort==5.10.1
requests==2.28.1
	"""
	Use a Counter to find the most common words in "The Wonderful Wizard of Oz" by
	L. Frank Baum.

	Available in (mostly) plain text at:
	https://archive.org/stream/wonderfulwizardo00baumiala/wonderfulwizardo00baumiala_djvu.txt

	Note: This code also counts the words in the header, so it's not a realistic
	application, but more of a demonstration of Python's Counter.

	Running this code should give you something like this:

	$ python count_words.py

	The Top 10 words
	the: 2808
	and: 1630
	to: 1143
	of: 869
	a: 819
	I: 597
	was: 502
	you: 486
	in: 476
	he: 408

	"""
	import re
	from collections import Counter
	from html.parser import HTMLParser

	import requests


	class PreParser(HTMLParser):
	    """
	    An HTML parser that collects the text found within <pre></pre> tags.

	    Feed it a document with ``feed()`` and then read ``result``: it holds
	    all of the text seen inside <pre> elements (joined in document order),
	    or ``None`` when no such text was seen. See more in the html.parser docs:
	    https://docs.python.org/3/library/html.parser.html

	    """

	    capture = False  # True while the parser is inside a <pre> element.
	    result = None  # <--- we'll store some text here.

	    def handle_starttag(self, tag, attrs):
	        if tag == "pre":
	            self.capture = True

	    def handle_endtag(self, tag):
	        if tag == "pre":
	            self.capture = False

	    def handle_data(self, data):
	        # The text of one <pre> can arrive in several pieces (for example
	        # around nested tags or comments), so append each piece instead of
	        # overwriting what was captured before.
	        if self.capture:
	            self.result = data if self.result is None else self.result + data


	def main(n=10):
	    """
	    Fetch "The Wonderful Wizard of Oz" and print its ``n`` most common words.

	    The document is downloaded from archive.org, the text inside its <pre>
	    tags is extracted, everything except ASCII letters and spaces is
	    dropped, and the remaining words are tallied with a Counter.

	    Parameters:
	        n: how many of the most common words to print (default 10).

	    Returns None. Prints a message and returns early when the download does
	    not come back with a 200 status, or when no <pre> text was found.

	    """
	    # Create a parser to parse the HTML document.
	    parser = PreParser()

	    # Now fetch some content & do a little cleanup
	    url = "https://archive.org/stream/wonderfulwizardo00baumiala/wonderfulwizardo00baumiala_djvu.txt"
	    # A timeout keeps the script from hanging forever on a stalled server.
	    resp = requests.get(url, timeout=30)
	    if resp.status_code != 200:
	        # The response body is exposed as `.text` (there is no `.txt`).
	        print(f"Failed to fetch document: {resp.status_code}\n{resp.text}")
	        return

	    # Do some content cleaning...
	    parser.feed(resp.text)
	    parser.close()  # flush any text the parser is still buffering
	    if parser.result is None:
	        # Nothing was captured; re.sub() would fail on None.
	        print("Failed to find any <pre> text in the document")
	        return

	    # Raw strings keep "\s" a regex escape instead of an invalid string escape.
	    content = re.sub(r"\s+", " ", parser.result)
	    content = re.sub(r"[^A-Za-z ]+", "", content)
	    words = content.split()
	    print(f"Found {len(words):,} words!")

	    # Start counting
	    word_count = Counter(words)

	    # The Top-N words
	    print(f"The Top {n} words")
	    for word, count in word_count.most_common(n):
	        print(f"{word}: {count}")


	# Only run the word count when this file is executed as a script,
	# not when it is imported as a module.
	if __name__ == "__main__":
	    main()
	"""
	This python file is exported from a Jupyter Notebook used
	during the MEMpy presentation on 2022-08-15.

	"""
	#!/usr/bin/env python
	# coding: utf-8

	# # collections.Counter
	#
	# It's good stuff!
	#
	# Super-powerful utilities that do _very common_ operations.
	#

	# In[1]:


	from collections import Counter


	# ## what is `Counter`?
	#
	# - It's like a dictionary (keys & values)
	# - Keys -> The things you want to count
	# - Values -> The number of times the key appears in a _collection_ of stuff.

	# In[2]:


	# Before using Counter
	# With a plain dict you must check for the key before incrementing it;
	# otherwise `c["widgets"] += 1` raises a KeyError on the first use.
	c = {}
	if "widgets" in c:
	    c["widgets"] += 1
	else:
	    c["widgets"] = 1


	# OR, if you KNOW all of your keys...
	# ...you can pre-populate every key with a zero count.
	c = {
	    "widgets": 0,
	}
	c["widgets"] += 1


	# Counter lets you start counting without knowing keys in advance
	# (a missing key counts as 0, so += works right away).
	c = Counter()
	c["widgets"] += 1
	c  # notebook cell output: Counter({'widgets': 1})


	# # Counter behavior
	#
	# - Most of the dict methods are available.
	# - `.keys()` & `.values()`
	# - `in` operations

	# In[3]:


	# The things being counted; at this point just 'widgets'.
	c.keys()


	# In[4]:


	# The counts, in the same order as the keys.
	c.values()


	# In[5]:


	# Update will create new keys or adjust
	# counts for existing keys
	# (unlike dict.update, the given counts are added, not replaced).
	c.update({"foo": 1})
	c


	# In[6]:


	c.update({"foo": 1}) # calling it a 2nd time ...
	c  # ... so 'foo' is now 2


	# In[7]:


	# you can create a Counter based on keyword arguments
	scores = Counter(grizzlies=134, warriors=95)
	scores


	# ## Where things start to get interesting
	#
	# You can create a Counter object from _any_ iterable!

	# In[8]:


	# Each distinct element becomes a key; its value is how often it appears.
	c = Counter(["moe", "larry", "larry", "curly", "curly"])
	c


	# In[9]:


	# you can also go "backwards" ... get a "list" of elements based on their counts
	# (each key is repeated as many times as its count)
	list(c.elements())


	# ## most common occurrences?
	#
	# One of the best use-cases for a Counter!
	#

	# In[10]:


	# Most common letters in a string?
	# A string is an iterable of characters, so Counter tallies each letter;
	# most_common(3) returns the three highest (letter, count) pairs.
	word = "supercalifragilisticexpialidocious"
	Counter(word).most_common(3)


	# ## What are the 10 most common words in "The Wonderful Wizard of Oz"?

	# In[11]:


	import requests
	import re

	from html.parser import HTMLParser
	from collections import Counter


	# blergh, write a little HTML parser:
	# https://docs.python.org/3/library/html.parser.html
	class Parser(HTMLParser):
	    """
	    Collect the text found within <pre></pre> tags of an HTML document.

	    After ``feed()``, ``result`` holds all text seen inside <pre> elements
	    (joined in document order), or ``None`` when no such text was seen.

	    """

	    capture = False  # True while the parser is inside a <pre> element.
	    result = None  # <--- we'll store some text here.

	    def handle_starttag(self, tag, attrs):
	        if tag == "pre":
	            self.capture = True

	    def handle_endtag(self, tag):
	        if tag == "pre":
	            self.capture = False

	    def handle_data(self, data):
	        # The text of one <pre> can arrive in several pieces (for example
	        # around nested tags or comments), so append each piece instead of
	        # overwriting what was captured before.
	        if self.capture:
	            self.result = data if self.result is None else self.result + data


	parser = Parser()


	# In[12]:


	# Now fetch some content & do a little cleanup
	url = "https://archive.org/stream/wonderfulwizardo00baumiala/wonderfulwizardo00baumiala_djvu.txt"
	resp = requests.get(url)
	parser.feed(resp.text)

	# Raw strings keep "\s" a regex escape instead of an invalid string escape.
	# Fall back to "" when no <pre> text was captured, since re.sub() would
	# fail on None.
	content = re.sub(r"\s+", " ", parser.result or "")
	# Keep only ASCII letters and spaces, then split on whitespace.
	content = re.sub(r"[^A-Za-z ]+", "", content)
	words = content.split()
	print(f"{len(words):,} words!")


	# In[13]:


	# Start counting
	word_count = Counter(words)
	word_count.most_common(10)


	# ## ok that's neat, but....
	#
	# How can this help me build that RPG/MMO I've always wanted to build?

	# # Answer: Let's go shopping!

	# In[14]:


	# Set up your purse!
	# Keys are coin types; values are how many of each coin we hold.
	purse = Counter(gold=1000, silver=500, copper=100)
	purse


	# In[15]:


	# Create some items in the shop
	# (each item is a plain dict mapping a coin type to its price)
	shield = {"gold": 25}
	sword = {"gold": 100, "silver": 50}
	tunic = {"silver": 10, "copper": 50}


	# In[16]:


	# Let's make some purchases
	# subtract() lowers the purse's counts in place by the item's prices.
	purse.subtract(shield)
	purse  # gold: 975, silver: 500, copper: 100


	# In[17]:


	# Buy the sword.
	purse.subtract(sword)
	purse  # gold: 875, silver: 450, copper: 100


	# In[18]:


	# Get the tunic too
	purse.subtract(tunic)
	purse  # gold: 875, silver: 440, copper: 50


	# In[19]:


	# Buy a castle!
	# subtract() does not stop at zero, so counts can go negative.
	castle = {"gold": 50_000, "silver": 10_000, "copper": 350}
	purse.subtract(castle)
	purse # whoops -> gold: -49125, silver: -9560, copper: -300


	# In[20]:


	# New in 3.10
	purse.total() # -> Should sum all the values.


	# In[ ]:


	# or ...
	# (works on Python versions older than 3.10 as well)
	debt = sum(purse.values())
	print(f"We owe {debt:,}!")


	# In[ ]:


	purse.clear() # reset!
	purse  # now an empty Counter()


	# ## Resources
	#
	# - Python Collections: https://docs.python.org/3/library/collections.html
	# - HTML Parser: https://docs.python.org/3/library/html.parser.html
	# - The 2013 version of this talk: https://speakerdeck.com/bkmontgomery/pythons-counter-collection?slide=40
	# - Sample Code: https://gist.github.com/bradmontgomery/4717521
	#
	# ### Other collections goodies!
	#
	# - ChainMap
	# - deque
	# - namedtuple
	# - defaultdict
	# - OrderedDict
	# - UserDict, UserList, UserString

	# # Thank you!
	#
	# Questions?
	black==22.6.0
	certifi==2022.6.15
	isort==5.10.1
	requests==2.28.1