-
-
Save Robin-Lord/c19feef34186c64442e69f6a60ddc516 to your computer and use it in GitHub Desktop.
Manifestos Word Clouds
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"nbformat": 4, | |
"nbformat_minor": 0, | |
"metadata": { | |
"colab": { | |
"name": "Manifestos Word Clouds", | |
"provenance": [], | |
"include_colab_link": true | |
}, | |
"kernelspec": { | |
"name": "python3", | |
"display_name": "Python 3" | |
} | |
}, | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "view-in-github", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"<a href=\"https://colab.research.google.com/gist/Robin-Lord/c19feef34186c64442e69f6a60ddc516/manifestos-word-clouds.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "2uQzUJkJC9wE", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"# Importing libraries to read our pdf\n", | |
"\n", | |
"import io\n", | |
"from pdfminer.converter import TextConverter\n", | |
"from pdfminer.pdfinterp import PDFPageInterpreter\n", | |
"from pdfminer.pdfinterp import PDFResourceManager\n", | |
"from pdfminer.pdfpage import PDFPage\n", | |
"\n", | |
"\n", | |
"# Importing library to show progress bar\n", | |
"from tqdm import tqdm_notebook as tqdm\n", | |
"\n", | |
"# Library to read image and turn it into a mask\n", | |
"from PIL import Image\n", | |
"import numpy as np\n", | |
"\n", | |
"# Importing library to construct word clouds and our stopwords list\n", | |
"from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "MKOwALsyDHLn", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"# Function to extract text from pdf\n", | |
"def extract_text_from_pdf(pdf_path):\n", | |
"    \"\"\"Extract the text of every page of a PDF as one string.\n", | |
"\n", | |
"    Args:\n", | |
"        pdf_path: Path of the PDF file to read.\n", | |
"\n", | |
"    Returns:\n", | |
"        The extracted text, or an empty string when the PDF yields no\n", | |
"        text, so callers can always treat the result as a string.\n", | |
"    \"\"\"\n", | |
"    resource_manager = PDFResourceManager()\n", | |
"    fake_file_handle = io.StringIO()\n", | |
"    converter = TextConverter(resource_manager, fake_file_handle)\n", | |
"    page_interpreter = PDFPageInterpreter(resource_manager, converter)\n", | |
"\n", | |
"    try:\n", | |
"        with open(pdf_path, 'rb') as fh:\n", | |
"            # tqdm wraps the page iterator to show a progress bar\n", | |
"            pages = PDFPage.get_pages(fh, caching=True, check_extractable=True)\n", | |
"            for page in tqdm(pages):\n", | |
"                page_interpreter.process_page(page)\n", | |
"\n", | |
"        text = fake_file_handle.getvalue()\n", | |
"        print(\"Completed page loop\")\n", | |
"    finally:\n", | |
"        # Close the handles even when a page fails to parse\n", | |
"        converter.close()\n", | |
"        fake_file_handle.close()\n", | |
"\n", | |
"    return text" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "UYX4fQ6UDCae", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"# Settings for this word cloud: input files, party label and output name\n", | |
"\n", | |
"# NOTE: 'maniesto_file' is misspelled but kept, because the extraction cell reads this name\n", | |
"maniesto_file = r\"\"\"Real-Change-Labour-Manifesto-2019.pdf\"\"\"\n", | |
"party_name = \"Labour\"\n", | |
"party_logo = r\"\"\"labour rose bigger.jpg\"\"\"\n", | |
"my_background_colour = \"black\"\n", | |
"\n", | |
"# Output image is named <party>_<background colour>.png\n", | |
"outfile = \"_\".join([party_name, my_background_colour]) + \".png\"" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "gc1lxrmmDPSg", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"# Extract the manifesto text; fall back to an empty string if nothing came back\n", | |
"resulting_text = extract_text_from_pdf(maniesto_file) or \"\"\n", | |
"\n", | |
"# Checking we were able to extract anything, and failing with a clear message if not\n", | |
"if not resulting_text:\n", | |
"    raise ValueError(\"No text could be extracted from \" + maniesto_file)\n", | |
"print(len(resulting_text))\n", | |
"\n", | |
"# The extracted text is already a single string, so no joining is needed;\n", | |
"# printing a sample to sanity-check the extraction\n", | |
"joined_text = resulting_text\n", | |
"print(joined_text[1800:1900])" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "_PP8i5p2DQkM", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"# Extending the library's default stopwords with extra words\n", | |
"# that we also want left out of the word cloud\n", | |
"\n", | |
"extra_stopwords = [\"will\", \"people\"]\n", | |
"stopwords_plus = [*STOPWORDS, *extra_stopwords]\n", | |
"print(stopwords_plus)" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "hhYfSIgMDSXQ", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"# The party logo is used both as the mask and as the source of word colours\n", | |
"image_to_use = party_logo\n", | |
"\n", | |
"canvas_and_colours = np.array(Image.open(image_to_use))\n", | |
"image_colors = ImageColorGenerator(canvas_and_colours)\n", | |
"\n", | |
"# Configure the cloud first, then feed it the manifesto text\n", | |
"wordcloud = WordCloud(\n", | |
"    mask=canvas_and_colours,\n", | |
"    color_func=image_colors,\n", | |
"    background_color=my_background_colour,\n", | |
"    stopwords=stopwords_plus,\n", | |
"    max_words=3000,\n", | |
"    max_font_size=800,\n", | |
"    relative_scaling=0.5,\n", | |
"    collocations=True,\n", | |
"    margin=0,\n", | |
"    random_state=42,\n", | |
")\n", | |
"wordcloud.generate(joined_text)\n", | |
"\n", | |
"# Write the finished image to disk\n", | |
"wordcloud.to_file(outfile)" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment