Skip to content

Instantly share code, notes, and snippets.

@evansde77
Last active February 10, 2021 12:14
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save evansde77/910ea58bd59d43dc9b0b3e6995f7b424 to your computer and use it in GitHub Desktop.
Save evansde77/910ea58bd59d43dc9b0b3e6995f7b424 to your computer and use it in GitHub Desktop.
Playing around with an ascii histogram example
#!/usr/bin/env python
"""
Example wordcount and ascii histogram script
- Writes a data file in a temporary dir
- defines function to parse file into a word iterable
- Histogram class that does the word count and draws it as
ascii strings with various sorting options
Example usage:
data_file = write_data_file()
histogram = Histogram()
histogram.populate(words(data_file))
print histogram.draw()
[    ] [    ] [====] [    ] [====] [    ] [    ] [====]
[    ] [    ] [====] [    ] [====] [    ] [    ] [====]
[    ] [    ] [====] [====] [====] [====] [====] [====]
[====] [====] [====] [====] [====] [====] [====] [====]
derp   wibble foo    baz    bork   womp   whizz  bar
"""
import os
import re
import string
import tempfile
import collections
# Character class matching any single character from string.punctuation;
# used to strip punctuation from lines before they are split into words.
PUNCTUATION = re.compile('[%s]' % re.escape(string.punctuation))

# Sample text written to the temporary data file by write_data_file().
DATA = """
foo bar baz womp
whizz bar foo
womp. derp, bork!
bork bork bork!!
foo foo bar bar
baz whizz wibble
"""
def write_data_file():
    """
    Write the DATA fixture to a file named data.txt inside a freshly
    created temporary directory.

    :returns: path of the file that was written
    """
    target = os.path.join(tempfile.mkdtemp(), "data.txt")
    with open(target, 'w') as out:
        out.write(DATA)
    return target
def words(filename):
    """
    Read a text file and yield its words one at a time.

    Every character matched by PUNCTUATION is removed from each line
    before the line is split on whitespace, so "bork!!" and "bork" both
    come out as "bork". Tokens that consist only of punctuation vanish.

    :param filename: path of the text file to read
    :returns: generator yielding each cleaned word in file order
    """
    with open(filename, 'r') as handle:
        for line in handle:
            # split() with no arguments already discards empty strings
            # and surrounding whitespace, so no per-word strip or
            # emptiness filter is needed.
            for word in PUNCTUATION.sub('', line).split():
                yield word
class Column(list):
    """
    Column

    List subclass holding the elements of one histogram column. It also
    carries two extra attributes that sorting code can key on:

    column_name - label of the column, None until assigned
    value       - numeric size of the column, 0 until assigned
    """

    def __init__(self, *elements):
        # Sorting metadata starts out unset; it is assigned after
        # construction by whoever builds the column.
        self.value = 0
        self.column_name = None
        super(Column, self).__init__(*elements)
class Histogram(dict):
    """
    Histogram

    Dictionary based helper object that counts words and draws the
    counts as an ascii histogram. Keys are the words, values are the
    number of times each word has been seen.

    Attributes (all None / empty until populate() has been called):

    height       - largest count of any word (0 when there are no words)
    width        - number of distinct words
    column_width - number of characters used for every column
    data_entry   - string drawn for a filled cell, e.g. '[====]'
    blank_entry  - string drawn for an empty cell, e.g. '[    ]'
    columns      - dict mapping the padded column name to its Column
    """

    # Narrowest column in which a filled cell '[=]' still looks different
    # from a blank cell '[ ]'; anything narrower renders both as '[]'.
    MIN_COLUMN_WIDTH = 3

    def __init__(self):
        super(Histogram, self).__init__()
        self.height = None
        self.width = None
        self.column_width = None
        self.data_entry = None
        self.blank_entry = None
        self.columns = {}

    def populate(self, iterable):
        """
        populate

        Consume the iterable provided, incrementing the count of each
        word seen, then recompute the drawing geometry and rebuild the
        columns. May be called more than once; counts accumulate.

        :param iterable: iterable of words (any hashable with a len())
        """
        for word in iterable:
            self[word] = self.get(word, 0) + 1

        self.width = len(self)
        # max() raises ValueError on an empty sequence, so an empty
        # histogram is handled explicitly.
        self.height = max(self.values()) if self else 0
        # Measure every stored word, not just the ones from this call,
        # so labels from an earlier populate() still fit their column.
        longest = max(len(word) for word in self) if self else 0
        self.column_width = max(longest, self.MIN_COLUMN_WIDTH)

        # two characters of every cell are taken by the brackets
        padding = self.column_width - 2
        self.data_entry = '[{}]'.format('=' * padding)
        self.blank_entry = '[{}]'.format(' ' * padding)
        self._build_columns()

    def _make_column(self, count, name):
        """
        Build a column instance holding the name followed by the filled
        entries and then the blank entries needed to reach the histogram
        height; set the name and value fields to aid sorting.

        :param count: number of filled entries in the column
        :param name: name of the column
        :returns: Column instance
        """
        cells = [name]
        cells.extend([self.data_entry] * count)
        cells.extend([self.blank_entry] * (self.height - count))
        col = Column(cells)
        col.value = count
        col.column_name = name
        return col

    def _build_columns(self):
        """
        Rebuild the internal column data structure from the current
        counts so that it can be consumed to draw the histogram.
        """
        self.columns = {}
        # items() rather than iteritems() so this runs on Python 2 and 3
        for word, count in self.items():
            column_name = word.ljust(self.column_width, ' ')
            self.columns[column_name] = self._make_column(count, column_name)

    def _format(self, columns):
        """
        Given a sequence of columns, draw the formatted histogram string.

        Columns are stored bottom-up (name first), so rows are produced
        by indexing each column from its end: the first row drawn is the
        top of the histogram and the last row is the column names.

        :param columns: iterable of column instances; empty ones are skipped
        :returns: histogram text, starting with a newline and with a
            newline after every row
        """
        # height is None before populate(); treat that as empty
        height = self.height or 0
        drawable = [col for col in columns if col]
        rows = [
            ' '.join(col[-i] for col in drawable)
            for i in range(1, height + 2)
        ]
        return "\n" + "".join(row + "\n" for row in rows)

    def _draw_sorted(self, key, descending):
        """
        Draw the histogram with columns ordered by the given key.

        :param key: function mapping a Column to its sort key
        :param descending: reverse the ordering when True
        :returns: histogram text
        """
        ordered = sorted(self.columns.values(), key=key, reverse=descending)
        return self._format(ordered)

    def draw(self):
        """
        draw - create an unsorted histogram string
        """
        # list() so _format gets a real sequence on Python 3 as well
        return self._format(list(self.columns.values()))

    def draw_sorted_size(self, descending=False):
        """
        draw sorted size

        Create a histogram sorted by bin population. Defaults to
        ascending order; this can be flipped using descending=True.
        """
        return self._draw_sorted(lambda col: col.value, descending)

    def draw_sorted_title(self, descending=False):
        """
        draw sorted title

        Create a histogram sorted by column title (alphabetically).
        Defaults to ascending order; this can be flipped using
        descending=True.
        """
        return self._draw_sorted(lambda col: col.column_name, descending)
if __name__ == '__main__':
    #
    # main test program
    #
    # create test data file (it lives in its own temporary directory)
    data_file = write_data_file()
    try:
        # create and populate the histogram instance
        histogram = Histogram()
        histogram.populate(words(data_file))
        # draw some histograms using various sorting approaches;
        # print(x) with a single argument behaves the same whether print
        # is a statement or a function
        print(histogram.draw())
        print(histogram.draw_sorted_size())
        print(histogram.draw_sorted_size(descending=True))
        print(histogram.draw_sorted_title())
        print(histogram.draw_sorted_title(descending=True))
    finally:
        # clean up the data file and the temporary directory that was
        # created to hold it, even if drawing failed
        os.remove(data_file)
        os.rmdir(os.path.dirname(data_file))
@omarmarquez
Copy link

Very nice implementation!

Thanks for getting back to us.

om

@nottings
Copy link

nottings commented Jun 1, 2016

Very nice. Don't forget some texts contain punctuation. Example: "The quick brown fox jumps over the lazy dog. The dog barks!"
"dog." and "dog" should both be treated as "dog"

@evansde77
Copy link
Author

@nottings The update should take care of punctuation; non-ASCII characters and Unicode are left as an exercise to the reader :)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment