# Notebook available at:
# https://nbviewer.jupyter.org/gist/jsanz/22de75a321475485219d67a3740193f5
# coding: utf-8
# # Processing the abstracts submitted to FOSS4G 2019
# The libraries that are not part of the Python 3 standard library can be installed with this command
#
# ```bash
# $ pip install requests beautifulsoup4 wordcloud
# ```
# imports
import os
import requests
from bs4 import BeautifulSoup
import re
from collections import namedtuple
from wordcloud import WordCloud
# Read the review session token from the FOSS4G_SESSION environment variable.
SESSION = os.getenv('FOSS4G_SESSION')
if SESSION:
    print('all good!')
else:
    # Without a token the review page request would be sent with
    # "ranksession=None"; report it here rather than failing obscurely later.
    print('FOSS4G_SESSION is not set')
# retrieve the community review page
# A timeout is passed because requests otherwise waits indefinitely for the
# server to answer.
r = requests.get(
    url=f'https://community-review.foss4g.org//rank.php?ranksession={SESSION}',
    timeout=30,
)
if r.status_code == 200:
    print('request OK')
else:
    # Make a failed request visible instead of silently carrying on.
    print(f'request failed with HTTP status {r.status_code}')
# cook your soup and check the title
# The parser is named explicitly: when it is omitted BeautifulSoup picks
# whichever parser happens to be installed and emits a warning, so the
# resulting tree can differ from one machine to another.
soup = BeautifulSoup(r.text, 'html.parser')
soup.title
# Collect every element whose CSS class matches the abstract-div pattern,
# then display how many were found.
abstract_class = re.compile("abstract-div-[01]")
divs = soup.find_all(class_=abstract_class)
len(divs)
# Lightweight record for one submission: its title and its abstract text.
# The abstract divs are converted into a list of these records.
Submission = namedtuple('Submission', 'title abstract')
def process_div(div):
    """Build a Submission from one abstract div.

    Args:
        div: a BeautifulSoup element; only its ``find_all`` method is used.

    Returns:
        A ``Submission`` whose ``title`` is the text of the first ``<h3>``
        inside ``div`` (an empty string when there is none) and whose
        ``abstract`` is the text found inside the ``<div>`` elements nested
        in ``div``, joined with single spaces.
    """
    # Title is easy: the text of the first <h3>. A div without a heading
    # yields an empty title instead of raising IndexError.
    headings = div.find_all('h3')
    title = ' '.join(headings[0].strings).strip() if headings else ''

    # Abstract is a three level dive.
    # NOTE(review): find_all() searches all descendants by default, so text
    # inside a nested div is presumably collected once for every div that
    # encloses it -- confirm this repetition is acceptable.
    fragments = (
        text.strip()
        for el in div.find_all('div')
        for text in el.strings
    )
    # Whitespace-only fragments are dropped so the abstract does not end up
    # with runs of consecutive spaces.
    abstract = ' '.join(fragment for fragment in fragments if fragment)
    return Submission(title, abstract)
# Turn every abstract div into a Submission and print the first one as a check
subs = list(map(process_div, divs))
print(subs[0])
# Set up the word cloud renderer
cloud_generator = WordCloud(
    width=1024,
    height=768,
    scale=1,
    background_color="white",
    mode="RGBA",
)

# Gather the space-separated tokens of all titles first, then of all abstracts
texts = []
for field in ('title', 'abstract'):
    texts.extend(getattr(sub, field).split(' ') for sub in subs)

# Flatten the token lists into one lower-cased word list
words = []
for phrase in texts:
    words.extend(token.lower() for token in phrase)

# Feed the words back to the generator as a single string and render the image
cloud_generator.generate(' '.join(words)).to_image()