Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Word cloud & word count
#!/usr/bin/env python
from collections import OrderedDict
from os import path
from PIL import Image
import argparse
import matplotlib.pyplot as plt
import numpy as np
import os
from wordcloud import WordCloud, STOPWORDS
# I'm not making this official, these are for my own testing


def _positive_int(raw):
    """argparse ``type=`` callable: parse *raw* as an int and require >= 1.

    Zero or negative values would produce an empty or tail-truncated chart,
    so they are rejected at parse time with a normal argparse usage error.
    """
    value = int(raw)  # a ValueError here is reported by argparse itself
    if value < 1:
        raise argparse.ArgumentTypeError(
            "expected a positive integer, got %r" % (raw,))
    return value


# Command-line interface. --cloud and --plot are plain on/off switches; the
# script checks --cloud first, so it wins when both are given.
parser = argparse.ArgumentParser(description='Word cloud and bar graph')
parser.add_argument('--cloud', action='store_true',
                    help='Generate and save word cloud')
# The plot is displayed in a window (the savefig call in the script is
# commented out), so the help text says "show" rather than "save".
parser.add_argument('--plot', action='store_true',
                    help='Generate and show the word plot')
# Only the bar plot reads this value; the word cloud is capped separately by
# the max_words argument given to WordCloud.
parser.add_argument('--topn', type=_positive_int, default=20,
                    help='Include the top N words in your plot (default: 20)')
args = parser.parse_args()
# Directory holding the input text and the mask image: the script's own
# directory, or the current working directory when __file__ is undefined
# (needed to support running the example in a generated IPython notebook).
d = path.dirname(__file__) if "__file__" in locals() else os.getcwd()

# Read the whole text. The context manager closes the file handle as soon as
# the contents are in memory instead of leaving it to the garbage collector.
with open(path.join(d, 'words.txt')) as words_file:
    text = words_file.read()
text = text.lower()

# Section-heading boilerplate to ignore, added on top of wordcloud's built-in
# stopword list.
# NOTE(review): the two-word entries ("related areas", "open problems") are
# presumably never matched, because stopwords appear to be compared against
# single tokens -- confirm against the wordcloud documentation.
stopwords = set(STOPWORDS)
stopwords = stopwords.union((
    "references", "summary", "questions", "notes", "introduction",
    "illustration", "bibliographical", "remarks", "history", "practical",
    "considerations", "overview", "conclusions", "definitions", "concepts",
    "related areas", "open problems", "background", "applications",
    "terminology", "bibliography", "using",
))

# Load the mask image into an array and release the image file right away.
with Image.open(path.join(d, "bender-mask-4k.png")) as mask_image:
    bender_mask = np.array(mask_image)

wc = WordCloud(background_color="black", max_words=2000, mask=bender_mask,
               stopwords=stopwords, contour_width=3, contour_color="grey")
# Unmasked alternative: drop mask=bender_mask and pass height=2000, width=1000.
if args.cloud:
    #
    # Create word cloud
    #
    # Generate the cloud from the text, then save it next to the input files.
    wc.generate(text)
    wc.to_file(path.join(d, "bendercloud.png"))
elif args.plot:
    #
    # Create plot
    #
    # process_text() yields a word -> count mapping (per the wordcloud docs,
    # with stopwords already filtered out).
    topwords = wc.process_text(text)

    # Highest counts first. An ascending sort followed by reverse() is kept
    # on purpose: sorted(..., reverse=True) would order tied words
    # differently and could change which of them make the top N.
    sorted_words = sorted(topwords.items(), key=lambda x: x[1])
    sorted_words.reverse()
    # A slice is a no-op when fewer than topn words exist, so no length
    # check is needed.
    sorted_words = sorted_words[:args.topn]

    labels = [word for word, _ in sorted_words]
    vals = [count for _, count in sorted_words]
    ypos = np.arange(len(labels))

    plt.rcdefaults()
    fig, ax = plt.subplots()
    ax.barh(ypos, vals, align='center', height=0.5)
    # Removes excess vertical space (orig: (-5, 25)). The upper limit grows
    # with the number of bars so that --topn values above 25 are not drawn
    # outside the visible axis range; up to 25 bars the layout is unchanged.
    ax.set_ylim(bottom=-1, top=max(25, len(labels)))
    ax.set_yticks(ypos)
    ax.set_yticklabels(labels)
    ax.invert_yaxis()  # most frequent word at the top
    ax.set_title("Robotics ToCs: Individual Word Counts")
    fig.tight_layout()
    fig.subplots_adjust(top=0.5)  # Compresses the whole plot
    plt.show()
    #fig.savefig('plot.png', dpi=fig.dpi)
else:
    print('No options specified')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.