Last active
February 16, 2019 19:20
-
-
Save arscan/ed4fac87cad217cb1a3272fbdb0cc70d to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
"""
Cures NPRM word cloud
=====================

Creates a simple word cloud out of the ONC Cures Act NPRM. For entertainment purposes only.

Uses https://github.com/amueller/word_cloud to create the cloud.
Based on the masked word cloud example:
http://amueller.github.io/word_cloud/auto_examples/masked.html

1. Download rule: wget https://www.healthit.gov/sites/default/files/nprm/ONCCuresActNPRM.pdf
2. Get text without footer: pdftotext -y 80 -H 600 -W 1000 -nopgbrk ONCCuresActNPRM.pdf nprm-content.txt
3. Download stencil: wget https://gist.githubusercontent.com/arscan/ed4fac87cad217cb1a3272fbdb0cc70d/raw/f4311e587967f2466c9592ff0c52cc752bd67ad9/cures-nprm-stencil.png
4. Install wordcloud: pip install wordcloud
5. Run this script: ./cures-nprm-cloud.py
"""
from os import path | |
from PIL import Image | |
import numpy as np | |
import os | |
import random | |
from wordcloud import WordCloud, STOPWORDS | |
def grey_color_func(word, font_size, position, orientation, random_state=None,
                    **kwargs):
    """Return a random ``"rgb(r, g, b)"`` color string for one word.

    Despite the name, the channel ranges used here (red 245-255,
    green 0-155, blue 0-40) yield red-to-orange tones, not grey.

    Args:
        word, font_size, position, orientation: Accepted so the function
            matches the keyword signature a color function is called with;
            they do not influence the result.
        random_state: Optional generator exposing ``randint(a, b)`` (for
            example a ``random.Random`` instance). When given, all draws
            come from it so the coloring is reproducible; when ``None``,
            the module-level ``random`` generator is used.
        **kwargs: Ignored; tolerated for forward compatibility.

    Returns:
        A string of the form ``"rgb(R, G, B)"``.
    """
    # Honor the caller-supplied generator instead of always using the global
    # one; otherwise passing random_state has no effect on the colors.
    rng = random if random_state is None else random_state

    # Draw in red, green, blue order so a seeded generator is consumed
    # in a fixed, predictable sequence.
    red = rng.randint(245, 255)
    green = rng.randint(0, 155)
    blue = rng.randint(0, 40)
    return "rgb(%d, %d, %d)" % (red, green, blue)
# Locate the data directory: the script's own directory when __file__ is
# defined, otherwise the current working directory (needed when the example
# runs inside a generated IPython notebook, where __file__ is absent).
d = path.dirname(__file__) if "__file__" in locals() else os.getcwd()

# Read the whole extracted NPRM text; the context manager closes the file
# promptly instead of leaving the handle open until garbage collection.
with open(path.join(d, 'nprm-content.txt')) as text_file:
    text = text_file.read()

# Load the stencil image as an array to use as the cloud's mask.
# np.array() copies the pixel data, so the image file can be closed right away.
with Image.open(path.join(d, "cures-nprm-stencil.png")) as stencil_image:
    alice_mask = np.array(stencil_image)

# Start from the library's default stopwords and additionally exclude "said".
stopwords = set(STOPWORDS)
stopwords.add("said")

# Build the cloud from the text, constrained to the stencil mask.
wc = WordCloud(background_color="white", max_words=2000, mask=alice_mask,
               stopwords=stopwords, contour_width=3,
               contour_color='rgb(255,255,255)').generate(text)

# Re-color the words with the custom color function, then write the PNG
# next to the input files.
wc.recolor(color_func=grey_color_func, random_state=3)
wc.to_file(path.join(d, "cures-nprm-cloud.png"))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment