Skip to content

Instantly share code, notes, and snippets.

@jsanz
Last active April 22, 2019 16:57
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jsanz/22de75a321475485219d67a3740193f5 to your computer and use it in GitHub Desktop.
Save jsanz/22de75a321475485219d67a3740193f5 to your computer and use it in GitHub Desktop.
Python: word cloud of FOSS4G 2019 submissions

Notebook available at:

https://nbviewer.jupyter.org/gist/jsanz/22de75a321475485219d67a3740193f5

# coding: utf-8

# # Processing the abstracts submitted to FOSS4G 2019

# Third-party libraries (not part of the Python 3 standard library) can be installed with this command
# 
# ```bash
# $ pip install requests beautifulsoup4 wordcloud
# ```

# imports
import os
import requests
from bs4 import BeautifulSoup
import re
from collections import namedtuple
from wordcloud import WordCloud

# Session value for the community review site, read from the environment
SESSION = os.getenv('FOSS4G_SESSION')
if not SESSION:
    # fail early rather than request the page without a session value
    raise RuntimeError('FOSS4G_SESSION environment variable is not set')
print('all good!')

# retrieve the community review page
# NOTE(review): the double slash before rank.php is kept exactly as the
# original URL had it -- confirm whether it is intentional.
r = requests.get(
    url='https://community-review.foss4g.org//rank.php',
    params={'ranksession': SESSION},  # let requests URL-encode the value
    timeout=30,  # without a timeout requests can wait forever
)
r.raise_for_status()  # stop on 4xx/5xx instead of parsing an error page
if r.status_code == 200:
    print('request OK')

# cook your soup and check the title
# The parser is named explicitly so the result does not depend on which
# optional parsers happen to be installed (and bs4 does not warn about it).
soup = BeautifulSoup(r.text, 'html.parser')
soup.title  # bare expression: only displayed in an interactive/notebook session

# retrieve the abstract divs, and check how many
# A compiled pattern is matched with search(), so any CSS class containing
# "abstract-div-0" or "abstract-div-1" is selected.
divs = soup.find_all(class_=re.compile("abstract-div-[01]"))
len(divs)  # bare expression: only displayed in an interactive/notebook session

# process the divs to get a list of named tuples with title and abstract
Submission = namedtuple('Submission',['title','abstract'])


def _normalise(fragments):
    """Join text fragments and collapse every run of whitespace to one space."""
    return ' '.join(' '.join(fragments).split())


def process_div(div):
    """Build a ``Submission`` from one abstract container element.

    The title is the text of the first ``h3`` found inside ``div``; the
    abstract is the text of every ``div`` found inside it, in document order.
    Both are whitespace-normalised: leading/trailing blanks are removed and
    inner runs of spaces/newlines become a single space, so whitespace-only
    text nodes no longer leave empty "words" behind.

    Args:
        div: an element exposing ``find_all(name)`` whose results expose
            ``.strings`` (a BeautifulSoup tag in this script).

    Returns:
        Submission: ``(title, abstract)``; ``title`` is ``''`` when the
        element contains no ``h3`` (previously this raised ``IndexError``),
        ``abstract`` is ``''`` when it contains no ``div`` text.
    """
    headings = div.find_all('h3')
    title = _normalise(headings[0].strings) if headings else ''

    # NOTE(review): find_all() also returns nested divs and .strings walks all
    # descendants, so if these divs nest, inner text is collected once per
    # enclosing div -- confirm against the real page markup.
    abstract = _normalise(
        text for el in div.find_all('div') for text in el.strings
    )

    return Submission(title, abstract)

# Convert every abstract div into a Submission; show the first as a sanity check
subs = list(map(process_div, divs))
print(subs[0])

# Word cloud renderer: 1024x768 canvas, white background, RGBA image mode
cloud_generator = WordCloud(
    width=1024,
    height=768,
    scale=1,
    background_color="white",
    mode="RGBA",
)

# Word lists of all titles first, followed by the word lists of all abstracts
texts = [sub.title.split(' ') for sub in subs]
texts += [sub.abstract.split(' ') for sub in subs]

# Flatten the word lists into one lower-cased list of words
words = []
for phrase in texts:
    words.extend(word.lower() for word in phrase)

# Re-join the words into one text, feed it to the generator and render the image
cloud_text = ' '.join(words)
cloud_generator.generate(cloud_text).to_image()
Display the source blob
Display the rendered blob
Raw
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment