Skip to content

Instantly share code, notes, and snippets.

@Seanmatthews
Created May 6, 2019 02:58
Show Gist options
  • Save Seanmatthews/9187fd5ffc568138114dfdcaa5d4ba3a to your computer and use it in GitHub Desktop.
Save Seanmatthews/9187fd5ffc568138114dfdcaa5d4ba3a to your computer and use it in GitHub Desktop.
Word cloud & word count
#!/usr/bin/env python
from collections import OrderedDict
from os import path
from PIL import Image
import argparse
import matplotlib.pyplot as plt
import numpy as np
import os
from wordcloud import WordCloud, STOPWORDS
# Command-line interface. Informal: these switches exist for the author's own
# testing rather than as an official, supported interface.
parser = argparse.ArgumentParser(description='Word cloud and bar graph')

# The two on/off switches share the same shape, so register them in one pass.
for option, target, description in (
    ('--cloud', 'cloud', 'Generate and save word cloud'),
    ('--plot', 'plot', 'Generate and save the word plot'),
):
    parser.add_argument(
        option,
        dest=target,
        action='store_true',
        default=False,
        help=description,
    )

# How many of the highest-ranked words to include (integer, 20 by default).
parser.add_argument(
    '--topn',
    dest='topn',
    action='store',
    type=int,
    default=20,
    help='Include the top N words in your plot or cloud',
)

args = parser.parse_args()
# Resolve the data directory: the folder containing this script when __file__
# is defined, otherwise the current working directory (getcwd() is needed to
# support running the example in a generated IPython notebook).
d = path.dirname(__file__) if "__file__" in locals() else os.getcwd()

# Read the whole text, lower-cased. The context manager closes the file handle
# as soon as the contents are in memory instead of leaving it open.
with open(path.join(d, 'words.txt')) as words_file:
    text = words_file.read().lower()

# Start from the library's built-in stop words and add section-heading words
# that should not be counted.
# NOTE(review): the two-word entries ("related areas", "open problems")
# presumably never match, since stop words appear to be compared against
# single tokens -- confirm against the wordcloud tokenizer before relying on
# them.
stopwords = set(STOPWORDS).union((
    "references", "summary", "questions", "notes", "introduction", "illustration", "bibliographical",
    "remarks", "history", "practical", "considerations", "overview", "conclusions", "definitions",
    "concepts", "related areas", "open problems", "background", "applications", "terminology",
    "bibliography", "using",
))

# Load the mask image as an array; the context manager releases the image
# file once the pixel data has been copied into the array.
with Image.open(path.join(d, "bender-mask-4k.png")) as mask_image:
    bender_mask = np.array(mask_image)

# Word cloud generator shared by the cloud and plot modes.
wc = WordCloud(
    background_color="black",
    max_words=2000,
    mask=bender_mask,
    stopwords=stopwords,
    contour_width=3,
    contour_color="grey",
)
# Alternative kept from the original for reference -- a fixed-size canvas
# instead of the mask:
# wc = WordCloud(background_color="black", max_words=2000,
#                height=2000, width=1000,
#                stopwords=stopwords, contour_width=3, contour_color="grey")
# Dispatch on the requested outputs. --cloud and --plot are independent
# switches, so each is honoured when given (previously --plot was silently
# ignored whenever --cloud was also present).
if not (args.cloud or args.plot):
    print('No options specified')

if args.cloud:
    #
    # Create word cloud
    #
    # Generate the cloud from the text, then save it in the data directory.
    wc.generate(text)
    wc.to_file(path.join(d, "bendercloud.png"))

if args.plot:
    #
    # Create plot
    #
    # process_text yields a mapping whose values are the counts used for
    # ranking. Sort ascending and reverse (this keeps the original ordering
    # among equal counts), then keep at most the first N entries; slicing
    # already copes with fewer than N entries.
    word_counts = wc.process_text(text)
    ranked = sorted(word_counts.items(), key=lambda item: item[1])
    ranked.reverse()
    ranked = ranked[:args.topn]
    labels = [word for word, _ in ranked]
    counts = [count for _, count in ranked]

    plt.rcdefaults()
    fig, ax = plt.subplots()
    ypos = np.arange(len(labels))
    ax.barh(ypos, counts, align='center', height=0.5)
    # Removes excess vertical space. Orig: (-5, 25). The upper limit keeps the
    # 25-row window for small N but grows with the number of bars so that
    # --topn values above 25 are not clipped.
    ax.set_ylim(bottom=-1, top=max(25, len(labels)))
    ax.set_yticks(ypos)
    ax.set_yticklabels(labels)
    ax.invert_yaxis()  # Highest count at the top
    ax.set_title("Robotics ToCs: Individual Word Counts")
    fig.tight_layout()
    fig.subplots_adjust(top=0.5)  # Compresses the whole plot
    plt.show()
    # To write the figure to disk instead:
    # fig.savefig('plot.png', dpi=fig.dpi)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment