singingwolfboy/file.py

## file.py
# Author: Andreas Christian Mueller <amueller@ais.uni-bonn.de>
# (c) 2012
#
# License: MIT
import os
import sys
import random

from PIL import Image
from PIL import ImageDraw
from PIL import ImageFont

import numpy as np
from query_integral_image import query_integral_image
from sklearn.feature_extraction.text import CountVectorizer

# Font file used by make_wordcloud when the caller passes no font_path.
FONT_PATH = "/usr/share/fonts/truetype/liberation/LiberationMono-Regular.ttf"
# Alternative font location, kept for reference (disabled):
#FONT_PATH = "/usr/share/fonts/truetype/droid/DroidSansMono.ttf"


def make_wordcloud(words, counts, fname, font_path=None, width=400, height=200,
                   margin=5, ranks_only=False):
    """Build a word cloud from word counts and store it as an image.

    Words are placed in order of decreasing count. For each word the font
    size is lowered one step at a time until ``query_integral_image``
    reports a position; once the size reaches zero, placement stops and the
    remaining words are left out. The layout is then redrawn in color,
    displayed with ``Image.show`` and saved to ``fname``.

    Parameters
    ----------
    words : numpy array of strings
        Words that will be drawn in the image.

    counts : numpy array of word counts
        Word counts or weighting of words. Determines the size of the word in
        the final image.
        Divided by the largest count before use.

    fname : string
        Output filename. The extension determines the image type
        (written with PIL).

    font_path : string, optional
        Path to the font that will be used.
        Defaults to the module-level ``FONT_PATH``.

    width : int (default=400)
        Width of the word cloud image.

    height : int (default=200)
        Height of the word cloud image.

    margin : int (default=5)
        Number of pixels added to both sides of a word's bounding box when
        searching for a position. The word itself is drawn offset by
        ``margin // 2`` from the position that was found.

    ranks_only : boolean (default=False)
        Only use the rank of the words, not the actual counts. When False,
        the font size is additionally capped by
        ``int(100 * log(count + 100))`` of the normalized count.

    Raises
    ------
    ValueError
        If ``counts`` is empty.

    Notes
    -----
    Larger images make the code significantly slower.
    If you need a large image, you can try running the algorithm at a lower
    resolution and then drawing the result at the desired resolution.

    Play with setting the font_size in the main loop for different styles.

    Colors are chosen at random: each word gets an HSL color with a random
    integer hue between 0 and 255, a fixed saturation of 80% and a fixed
    lightness of 50%.
    Adjusting the percentages at the very end gives different color ranges.

    """
    if len(counts) <= 0:
        # Fail early with a clear message instead of letting counts.max()
        # raise on an empty array further down.
        raise ValueError("We need at least 1 word to plot a word cloud, "
                         "got %d." % len(counts))

    if font_path is None:
        font_path = FONT_PATH

    # normalize counts
    counts = counts / float(counts.max())
    # sort words by counts, largest first
    inds = np.argsort(counts)[::-1]
    counts = counts[inds]
    words = words[inds]

    # Grey canvas used only for layout: drawn text is white, free space black.
    img_grey = Image.new("L", (width, height))
    draw = ImageDraw.Draw(img_grey)
    # Integral image (2-D cumulative sum) of the grey canvas, indexed
    # [row, column]; handed to query_integral_image to look for free boxes.
    integral = np.zeros((height, width), dtype=np.uint32)
    font_sizes, positions, orientations = [], [], []
    # initialize font size "large enough"; it only ever shrinks
    font_size = 1000

    for word, count in zip(words, counts):
        if not ranks_only:
            # NOTE(review): for non-negative counts the normalized value is
            # at most 1, so this cap only varies between about 460 and 461.
            font_size = min(font_size, int(100 * np.log(count + 100)))

        result = None
        # Shrink the font until the word fits. The size is checked before
        # the font is loaded so a zero-size font is never requested.
        while font_size > 0:
            font = ImageFont.truetype(font_path, font_size)
            # draw the word either as-is or rotated by 90 degrees
            orientation = random.choice([None, Image.ROTATE_90])
            transposed_font = ImageFont.TransposedFont(
                font, orientation=orientation)
            # The font is passed explicitly rather than stored on the draw
            # object through setfont().
            # NOTE(review): textsize() may be missing in recent Pillow
            # releases -- confirm against the targeted version.
            box_size = draw.textsize(word, font=transposed_font)
            # box_size is (width, height); the query takes rows first.
            # presumably returns the (row, column) of a free box, or None
            result = query_integral_image(integral, box_size[1] + margin,
                                          box_size[0] + margin)
            if result is not None:
                break
            # if we didn't find a place, make font smaller
            font_size -= 1

        if result is None:
            # font size hit zero: we were unable to draw any more
            break

        # x is the row, y the column of the word's top-left corner
        x, y = np.array(result) + margin // 2
        # actually draw the text; PIL expects (column, row)
        draw.text((y, x), word, fill="white", font=transposed_font)
        positions.append((x, y))
        orientations.append(orientation)
        font_sizes.append(font_size)

        # Only the part of the integral image below and to the right of
        # (x, y) is affected by the new word, so only that part is rebuilt.
        img_array = np.asarray(img_grey)
        # the order of the cumsum's is important for speed ?!
        partial_integral = np.cumsum(np.cumsum(img_array[x:, y:], axis=1),
                                     axis=0)
        # Add the contribution of the untouched area above and to the left.
        # if x or y is zero it is a bit annoying
        if x > 0:
            if y > 0:
                # row above, minus the corner that the column term
                # below would otherwise count a second time
                partial_integral += (integral[x - 1, y:]
                                     - integral[x - 1, y - 1])
            else:
                partial_integral += integral[x - 1, y:]
        if y > 0:
            partial_integral += integral[x:, y - 1][:, np.newaxis]

        integral[x:, y:] = partial_integral

    # redraw in color, reusing the sizes, positions and orientations found
    # above (zip stops at the last word that was actually placed)
    img = Image.new("RGB", (width, height))
    draw = ImageDraw.Draw(img)
    everything = zip(words, font_sizes, positions, orientations)
    for word, font_size, position, orientation in everything:
        font = ImageFont.truetype(font_path, font_size)
        transposed_font = ImageFont.TransposedFont(font,
                                                   orientation=orientation)
        draw.text((position[1], position[0]), word, font=transposed_font,
                  fill="hsl(%d, 80%%, 50%%)" % random.randint(0, 255))
    img.show()
    img.save(fname)


def main(*args):
    """Render a word cloud for text taken from files or standard input.

    Parameters
    ----------
    *args : strings
        Command-line style arguments. If ``"-"`` is among them, the text is
        read from standard input. Otherwise every argument that names an
        existing path is read and the contents are concatenated; when no
        argument does, ``"constitution.txt"`` is read instead.

    Notes
    -----
    Only words that occur more than once are kept. The image is written to
    a PNG named after the first source: its base name without extension,
    followed by ``"_.png"`` (``"stdin_.png"`` for standard input).
    """
    if "-" in args:
        sources = ['stdin']
        text = sys.stdin.read()
    else:
        existing = [path for path in args if os.path.exists(path)]
        sources = existing or ["constitution.txt"]
        chunks = []
        for path in sources:
            with open(path) as handle:
                chunks.append(handle.read())
        text = "".join(chunks)

    # NOTE(review): charset_error and get_feature_names belong to the
    # scikit-learn API this script was written against -- confirm they
    # exist in the installed version.
    vectorizer = CountVectorizer(min_df=1, charset_error="ignore",
                                 stop_words="english", max_features=200)
    frequencies = vectorizer.fit_transform([text]).toarray().ravel()
    vocabulary = np.array(vectorizer.get_feature_names())
    # throw away words that occur only once
    keep = frequencies > 1
    stem = os.path.splitext(os.path.basename(sources[0]))[0]
    make_wordcloud(vocabulary[keep], frequencies[keep], stem + "_.png")

# Script entry point: forward the command-line arguments (without the
# program name) to main().
if __name__ == "__main__":
    main(*sys.argv[1:])
	# Author: Andreas Christian Mueller <amueller@ais.uni-bonn.de>
	# (c) 2012
	#
	# License: MIT
	import os
	import sys
	import random

	from PIL import Image
	from PIL import ImageDraw
	from PIL import ImageFont

	import numpy as np
	from query_integral_image import query_integral_image
	from sklearn.feature_extraction.text import CountVectorizer

	# Font file used by make_wordcloud when the caller passes no font_path.
	FONT_PATH = "/usr/share/fonts/truetype/liberation/LiberationMono-Regular.ttf"
	# Alternative font location, kept for reference (disabled):
	#FONT_PATH = "/usr/share/fonts/truetype/droid/DroidSansMono.ttf"


	def make_wordcloud(words, counts, fname, font_path=None, width=400, height=200,
	                   margin=5, ranks_only=False):
	    """Build a word cloud from word counts and store it as an image.

	    Words are placed in order of decreasing count. For each word the font
	    size is lowered one step at a time until ``query_integral_image``
	    reports a position; once the size reaches zero, placement stops and the
	    remaining words are left out. The layout is then redrawn in color,
	    displayed with ``Image.show`` and saved to ``fname``.

	    Parameters
	    ----------
	    words : numpy array of strings
	        Words that will be drawn in the image.

	    counts : numpy array of word counts
	        Word counts or weighting of words. Determines the size of the word in
	        the final image.
	        Divided by the largest count before use.

	    fname : string
	        Output filename. The extension determines the image type
	        (written with PIL).

	    font_path : string, optional
	        Path to the font that will be used.
	        Defaults to the module-level ``FONT_PATH``.

	    width : int (default=400)
	        Width of the word cloud image.

	    height : int (default=200)
	        Height of the word cloud image.

	    margin : int (default=5)
	        Number of pixels added to both sides of a word's bounding box when
	        searching for a position. The word itself is drawn offset by
	        ``margin // 2`` from the position that was found.

	    ranks_only : boolean (default=False)
	        Only use the rank of the words, not the actual counts. When False,
	        the font size is additionally capped by
	        ``int(100 * log(count + 100))`` of the normalized count.

	    Raises
	    ------
	    ValueError
	        If ``counts`` is empty.

	    Notes
	    -----
	    Larger images make the code significantly slower.
	    If you need a large image, you can try running the algorithm at a lower
	    resolution and then drawing the result at the desired resolution.

	    Play with setting the font_size in the main loop for different styles.

	    Colors are chosen at random: each word gets an HSL color with a random
	    integer hue between 0 and 255, a fixed saturation of 80% and a fixed
	    lightness of 50%.
	    Adjusting the percentages at the very end gives different color ranges.

	    """
	    if len(counts) <= 0:
	        # Fail early with a clear message instead of letting counts.max()
	        # raise on an empty array further down.
	        raise ValueError("We need at least 1 word to plot a word cloud, "
	                         "got %d." % len(counts))

	    if font_path is None:
	        font_path = FONT_PATH

	    # normalize counts
	    counts = counts / float(counts.max())
	    # sort words by counts, largest first
	    inds = np.argsort(counts)[::-1]
	    counts = counts[inds]
	    words = words[inds]

	    # Grey canvas used only for layout: drawn text is white, free space black.
	    img_grey = Image.new("L", (width, height))
	    draw = ImageDraw.Draw(img_grey)
	    # Integral image (2-D cumulative sum) of the grey canvas, indexed
	    # [row, column]; handed to query_integral_image to look for free boxes.
	    integral = np.zeros((height, width), dtype=np.uint32)
	    font_sizes, positions, orientations = [], [], []
	    # initialize font size "large enough"; it only ever shrinks
	    font_size = 1000

	    for word, count in zip(words, counts):
	        if not ranks_only:
	            # NOTE(review): for non-negative counts the normalized value is
	            # at most 1, so this cap only varies between about 460 and 461.
	            font_size = min(font_size, int(100 * np.log(count + 100)))

	        result = None
	        # Shrink the font until the word fits. The size is checked before
	        # the font is loaded so a zero-size font is never requested.
	        while font_size > 0:
	            font = ImageFont.truetype(font_path, font_size)
	            # draw the word either as-is or rotated by 90 degrees
	            orientation = random.choice([None, Image.ROTATE_90])
	            transposed_font = ImageFont.TransposedFont(
	                font, orientation=orientation)
	            # The font is passed explicitly rather than stored on the draw
	            # object through setfont().
	            # NOTE(review): textsize() may be missing in recent Pillow
	            # releases -- confirm against the targeted version.
	            box_size = draw.textsize(word, font=transposed_font)
	            # box_size is (width, height); the query takes rows first.
	            # presumably returns the (row, column) of a free box, or None
	            result = query_integral_image(integral, box_size[1] + margin,
	                                          box_size[0] + margin)
	            if result is not None:
	                break
	            # if we didn't find a place, make font smaller
	            font_size -= 1

	        if result is None:
	            # font size hit zero: we were unable to draw any more
	            break

	        # x is the row, y the column of the word's top-left corner
	        x, y = np.array(result) + margin // 2
	        # actually draw the text; PIL expects (column, row)
	        draw.text((y, x), word, fill="white", font=transposed_font)
	        positions.append((x, y))
	        orientations.append(orientation)
	        font_sizes.append(font_size)

	        # Only the part of the integral image below and to the right of
	        # (x, y) is affected by the new word, so only that part is rebuilt.
	        img_array = np.asarray(img_grey)
	        # the order of the cumsum's is important for speed ?!
	        partial_integral = np.cumsum(np.cumsum(img_array[x:, y:], axis=1),
	                                     axis=0)
	        # Add the contribution of the untouched area above and to the left.
	        # if x or y is zero it is a bit annoying
	        if x > 0:
	            if y > 0:
	                # row above, minus the corner that the column term
	                # below would otherwise count a second time
	                partial_integral += (integral[x - 1, y:]
	                                     - integral[x - 1, y - 1])
	            else:
	                partial_integral += integral[x - 1, y:]
	        if y > 0:
	            partial_integral += integral[x:, y - 1][:, np.newaxis]

	        integral[x:, y:] = partial_integral

	    # redraw in color, reusing the sizes, positions and orientations found
	    # above (zip stops at the last word that was actually placed)
	    img = Image.new("RGB", (width, height))
	    draw = ImageDraw.Draw(img)
	    everything = zip(words, font_sizes, positions, orientations)
	    for word, font_size, position, orientation in everything:
	        font = ImageFont.truetype(font_path, font_size)
	        transposed_font = ImageFont.TransposedFont(font,
	                                                   orientation=orientation)
	        draw.text((position[1], position[0]), word, font=transposed_font,
	                  fill="hsl(%d, 80%%, 50%%)" % random.randint(0, 255))
	    img.show()
	    img.save(fname)


	def main(*args):
	    """Render a word cloud for text taken from files or standard input.

	    Parameters
	    ----------
	    *args : strings
	        Command-line style arguments. If ``"-"`` is among them, the text is
	        read from standard input. Otherwise every argument that names an
	        existing path is read and the contents are concatenated; when no
	        argument does, ``"constitution.txt"`` is read instead.

	    Notes
	    -----
	    Only words that occur more than once are kept. The image is written to
	    a PNG named after the first source: its base name without extension,
	    followed by ``"_.png"`` (``"stdin_.png"`` for standard input).
	    """
	    if "-" in args:
	        lines = sys.stdin.readlines()
	        sources = ['stdin']
	    else:
	        sources = ([arg for arg in args if os.path.exists(arg)]
	                   or ["constitution.txt"])
	        lines = []
	        for s in sources:
	            with open(s) as f:
	                lines.extend(f.readlines())
	    text = "".join(lines)

	    # NOTE(review): charset_error and get_feature_names belong to the
	    # scikit-learn API this script was written against -- confirm they
	    # exist in the installed version.
	    cv = CountVectorizer(min_df=1, charset_error="ignore",
	                         stop_words="english", max_features=200)
	    counts = cv.fit_transform([text]).toarray().ravel()
	    words = np.array(cv.get_feature_names())
	    # throw away words that occur only once
	    frequent = counts > 1
	    words = words[frequent]
	    counts = counts[frequent]
	    output_filename = (os.path.splitext(os.path.basename(sources[0]))[0]
	                       + "_.png")
	    # make_wordcloud returns nothing, so its result is not stored
	    make_wordcloud(words, counts, output_filename)

	# Script entry point: forward the command-line arguments (without the
	# program name) to main().
	if __name__ == "__main__":
	    main(*sys.argv[1:])