Skip to content

Instantly share code, notes, and snippets.

@Robin-Lord
Created November 28, 2019 22:17
Show Gist options
  • Save Robin-Lord/c19feef34186c64442e69f6a60ddc516 to your computer and use it in GitHub Desktop.
Save Robin-Lord/c19feef34186c64442e69f6a60ddc516 to your computer and use it in GitHub Desktop.
Manifestos Word Clouds
Display the source blob
Display the rendered blob
Raw
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "Manifestos Word Clouds",
"provenance": [],
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/Robin-Lord/c19feef34186c64442e69f6a60ddc516/manifestos-word-clouds.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "code",
"metadata": {
"id": "2uQzUJkJC9wE",
"colab_type": "code",
"colab": {}
},
"source": [
"# Importing libraries to read our pdf\n",
"\n",
"import io\n",
"from pdfminer.converter import TextConverter\n",
"from pdfminer.pdfinterp import PDFPageInterpreter\n",
"from pdfminer.pdfinterp import PDFResourceManager\n",
"from pdfminer.pdfpage import PDFPage\n",
"\n",
"\n",
"# Importing library to show progress bar\n",
"from tqdm import tqdm_notebook as tqdm\n",
"\n",
"# Library to read image and turn it into a mask\n",
"from PIL import Image\n",
"import numpy as np\n",
"\n",
"# Importing library to construct word clouds and our stopwords list\n",
"from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "MKOwALsyDHLn",
"colab_type": "code",
"colab": {}
},
"source": [
"# Function to extract text from pdf\n",
"def extract_text_from_pdf(pdf_path):\n",
" resource_manager = PDFResourceManager()\n",
" fake_file_handle = io.StringIO()\n",
" converter = TextConverter(resource_manager, fake_file_handle)\n",
" page_interpreter = PDFPageInterpreter(resource_manager, converter)\n",
" \n",
" with open(pdf_path, 'rb') as fh:\n",
" for page in tqdm(PDFPage.get_pages(fh, \n",
" caching=True,\n",
" check_extractable=True)):\n",
" page_interpreter.process_page(page)\n",
" \n",
" text = fake_file_handle.getvalue()\n",
" \n",
" print (\"Completed page loop\")\n",
" # close open handles\n",
" converter.close()\n",
" fake_file_handle.close()\n",
" \n",
" if text:\n",
" return text"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "UYX4fQ6UDCae",
"colab_type": "code",
"colab": {}
},
"source": [
"# Settings for this word cloud\n",
"\n",
"maniesto_file=r\"\"\"Real-Change-Labour-Manifesto-2019.pdf\"\"\"\n",
"party_name=\"Labour\"\n",
"party_logo=r\"\"\"labour rose bigger.jpg\"\"\"\n",
"my_background_colour=\"black\"\n",
"outfile=party_name+\"_\"+my_background_colour+\".png\""
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "gc1lxrmmDPSg",
"colab_type": "code",
"colab": {}
},
"source": [
"resulting_text=extract_text_from_pdf(maniesto_file)\n",
"\n",
"# Checking we were able to extract anything\n",
"print (len(resulting_text))\n",
"\n",
"# Joining the text together and printing a sample\n",
"joined_text=\"\".join(resulting_text)\n",
"print(joined_text[1800:1900])"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "_PP8i5p2DQkM",
"colab_type": "code",
"colab": {}
},
"source": [
"# Adding our stopwords (in this case we're also removing the additional words)\n",
"\n",
"stopwords_plus=list(STOPWORDS)+[\"will\",\"people\"]\n",
"print(stopwords_plus)"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "hhYfSIgMDSXQ",
"colab_type": "code",
"colab": {}
},
"source": [
"image_to_use=party_logo\n",
"\n",
"canvas_and_colours = np.array(Image.open(image_to_use))\n",
"image_colors = ImageColorGenerator(canvas_and_colours)\n",
"\n",
"wordcloud = WordCloud(\n",
" background_color=my_background_colour,\n",
" mask=canvas_and_colours,\n",
" color_func=image_colors,\n",
" max_font_size=800,\n",
" max_words=3000,\n",
" stopwords=stopwords_plus,\n",
" margin=0,\n",
" random_state=42,\n",
" relative_scaling=0.5,\n",
" collocations=True).generate(joined_text)\n",
"\n",
"wordcloud.to_file(outfile)"
],
"execution_count": 0,
"outputs": []
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment