Skip to content

Instantly share code, notes, and snippets.

@bradmontgomery
Last active August 15, 2022 19:11
Show Gist options
  • Star 29 You must be signed in to star a gist
  • Fork 6 You must be signed in to fork a gist
  • Save bradmontgomery/4717521 to your computer and use it in GitHub Desktop.
Save bradmontgomery/4717521 to your computer and use it in GitHub Desktop.
playing with python's `collections.Counter`
"""
Use a Counter to find the most common words in "The Wonderful Wizard of Oz" by
L. Frank Baum.
Available in (mostly) plain text at:
https://archive.org/stream/wonderfulwizardo00baumiala/wonderfulwizardo00baumiala_djvu.txt
Note: This code also counts the words in the header, so it's not a *realistic*
application, but more of a demonstration of python's Counter.
Running this code should give you something like this:
$ python count_words.py
The Top 10 words
the: 2808
and: 1630
to: 1143
of: 869
a: 819
I: 597
was: 502
you: 486
in: 476
he: 408
"""
import re
from collections import Counter
from html.parser import HTMLParser
import requests
class PreParser(HTMLParser):
    """
    An HTML parser that collects the text found inside <pre></pre> tags.

    After ``feed()`` has been called, the captured text is available on
    ``result``; it stays ``None`` when no text inside a <pre> element was
    seen. See more in the html.parser docs:
    https://docs.python.org/3/library/html.parser.html
    """

    capture = False  # True while the parser is inside a <pre> element.
    result = None  # <--- we'll store some text here.

    def handle_starttag(self, tag, attrs):
        """Start capturing text when a <pre> tag opens."""
        if tag == "pre":
            self.capture = True

    def handle_endtag(self, tag):
        """Stop capturing text when a <pre> tag closes."""
        if tag == "pre":
            self.capture = False

    def handle_data(self, data):
        """Append text that appears inside a <pre> element to ``result``."""
        if not self.capture:
            return
        # handle_data can be called several times for one <pre> element
        # (for example when the text is interrupted by nested tags), so
        # accumulate the pieces instead of keeping only the last one.
        if self.result is None:
            self.result = data
        else:
            self.result += data
def main(n=10):
    """
    Fetch the book text and print its ``n`` most common words.

    Prints an error message and returns early when the download does not
    succeed or when the response contains no <pre> text to count.
    """
    # Create a parser to parse the HTML document.
    parser = PreParser()

    # Now fetch some content & do a little cleanup
    url = "https://archive.org/stream/wonderfulwizardo00baumiala/wonderfulwizardo00baumiala_djvu.txt"
    # Without a timeout, requests will wait forever on an unresponsive server.
    resp = requests.get(url, timeout=30)
    if resp.status_code != 200:
        print(f"Failed to fetch document: {resp.status_code}\n{resp.text}")
        return

    # Do some content cleaning...
    parser.feed(resp.text)
    if parser.result is None:
        # Nothing was captured, so there is nothing to count.
        print("Failed to find any <pre> text in the document.")
        return
    content = re.sub(r"\s+", " ", parser.result)
    content = re.sub("[^A-Za-z ]+", "", content)
    words = content.split()
    print(f"Found {len(words):,} words!")

    # Start counting
    word_count = Counter(words)

    # The Top-N words
    print(f"The Top {n} words")
    for word, count in word_count.most_common(n):
        print(f"{word}: {count}")


if __name__ == "__main__":
    main()
"""
This python file is exported from a Jupyter Notebook used
during the MEMpy presentation on 2022-08-15.
"""
#!/usr/bin/env python
# coding: utf-8
# # collections.Counter
#
# It's good stuff!
#
# Super-powerful utilities that do _very common_ operations.
#
# In[1]:
from collections import Counter
# ## what is `Counter`?
#
# - It's like a dictionary (keys & values)
# - Keys -> The things you want to count
# - Values -> The number of times the key appears in a _collection_ of stuff.
# In[2]:
# Before using Counter
c = {}
if "widgets" in c:
    c["widgets"] += 1
else:
    c["widgets"] = 1

# OR, if you KNOW all of your keys...
c = {
    "widgets": 0,
}
c["widgets"] += 1

# Counter lets you start counting without knowing keys in advance
c = Counter()
c["widgets"] += 1
# NOTE: bare expressions like the one below come from the notebook export,
# where a cell's last expression is displayed; in a script they do nothing.
c
# # Counter behavior
#
# - Most of the dict methods are available.
# - `.keys()` & `.values()`
# - `in` operations
# In[3]:
c.keys()
# In[4]:
c.values()
# In[5]:
# Update will create new keys or adjust
# counts for existing keys
c.update({"foo": 1})
c
# In[6]:
c.update({"foo": 1})  # calling it a 2nd time adds to the existing count
c
# In[7]:
# you can create a Counter based on keyword arguments
scores = Counter(grizzlies=134, warriors=95)
scores
# ## Where things start to get interesting
#
# You can create a Counter object from _any_ iterable!
# In[8]:
c = Counter(["moe", "larry", "larry", "curly", "curly"])
c
# In[9]:
# you can also go "backwards" ... get a "list" of elements based on their counts
list(c.elements())
# ## most common occurrences?
#
# One of the best use-cases for a Counter!
#
# In[10]:
# Most common letters in a string?
word = "supercalifragilisticexpialidocious"
Counter(word).most_common(3)
# ## What are the 10 most common words in "The Wonderful Wizard of Oz"?
# In[11]:
import requests
import re
from html.parser import HTMLParser
from collections import Counter
# blergh, write a little HTML parser:
# https://docs.python.org/3/library/html.parser.html
class Parser(HTMLParser):
    """
    Collect the text found inside <pre></pre> tags.

    After ``feed()`` has been called, the captured text is available on
    ``result``; it stays ``None`` when no text inside a <pre> element was
    seen.
    """

    capture = False  # True while the parser is inside a <pre> element.
    result = None  # <--- we'll store some text here.

    def handle_starttag(self, tag, attrs):
        """Start capturing text when a <pre> tag opens."""
        if tag == "pre":
            self.capture = True

    def handle_endtag(self, tag):
        """Stop capturing text when a <pre> tag closes."""
        if tag == "pre":
            self.capture = False

    def handle_data(self, data):
        """Append text that appears inside a <pre> element to ``result``."""
        if not self.capture:
            return
        # handle_data can be called several times for one <pre> element
        # (for example when the text is interrupted by nested tags), so
        # accumulate the pieces instead of keeping only the last one.
        if self.result is None:
            self.result = data
        else:
            self.result += data
parser = Parser()
# In[12]:
# Now fetch some content & do a little cleanup
url = "https://archive.org/stream/wonderfulwizardo00baumiala/wonderfulwizardo00baumiala_djvu.txt"
# Without a timeout, requests will wait forever on an unresponsive server.
resp = requests.get(url, timeout=30)
# Stop with a clear HTTP error instead of trying to parse an error page.
resp.raise_for_status()
parser.feed(resp.text)
if parser.result is None:
    # re.sub() below would otherwise fail with an unhelpful TypeError.
    raise ValueError("No <pre> text found in the downloaded document")
content = re.sub(r"\s+", " ", parser.result)
content = re.sub("[^A-Za-z ]+", "", content)
words = content.split()
print(f"{len(words):,} words!")
# In[13]:
# Start counting
word_count = Counter(words)
word_count.most_common(10)
# ## ok that's neat, but....
#
# How can this help me build that RPG/MMO I've always wanted to build?
# # Answer: Let's go shopping!
# In[14]:
# Set up your purse!
purse = Counter(gold=1000, silver=500, copper=100)
purse
# In[15]:
# Create some items in the shop
shield = {"gold": 25}
sword = {"gold": 100, "silver": 50}
tunic = {"silver": 10, "copper": 50}
# In[16]:
# Let's make some purchases
purse.subtract(shield)
purse
# In[17]:
# Buy the sword.
purse.subtract(sword)
purse
# In[18]:
# Get the tunic too
purse.subtract(tunic)
purse
# In[19]:
# Buy a castle!
castle = {"gold": 50_000, "silver": 10_000, "copper": 350}
purse.subtract(castle)
purse # whoops
# In[20]:
# New in 3.10
purse.total() # -> Should sum all the values.
# In[ ]:
# or ...
debt = sum(purse.values())
print(f"We owe {debt:,}!")
# In[ ]:
purse.clear() # reset!
purse
# ## Resources
#
# - Python Collections: https://docs.python.org/3/library/collections.html
# - HTML Parser: https://docs.python.org/3/library/html.parser.html
# - The 2013 version of this talk: https://speakerdeck.com/bkmontgomery/pythons-counter-collection?slide=40
# - Sample Code: https://gist.github.com/bradmontgomery/4717521
#
# ### Other collections goodies!
#
# - ChainMap
# - deque
# - namedtuple
# - defaultdict
# - OrderedDict
# - UserDict, UserList, UserString
# # Thank you!
#
# Questions?
black==22.6.0
certifi==2022.6.15
isort==5.10.1
requests==2.28.1
@salihkaragoz
Copy link

Thank you for sharing this code.
Can you update the download links? It seems like broken.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment