Last active
July 20, 2016 22:40
-
-
Save marciomazza/ed51a7e9b2e6f7c6b5bce9c78afccd19 to your computer and use it in GitHub Desktop.
Scraping das propostas no site da pybr 12
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# requirements: | |
# | |
# beautifulsoup4 | |
# requests | |
# unidecode | |
from itertools import groupby | |
from bs4 import BeautifulSoup | |
from requests import get | |
from unidecode import unidecode | |
def ler_propostas():
    """Yield ``(autor, titulo)`` pairs scraped from the event proposals page.

    Downloads the speakerfight page for the event, finds the ``div`` whose id
    is ``event-proposals`` and, for every ``panel-body`` element inside it,
    yields the text of the author link and the text of the title link.

    This is a generator: the HTTP request is only issued when iteration
    starts.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
    """
    # A timeout keeps the script from hanging forever on a stalled connection.
    res = get('http://speakerfight.com/events/python-brasil12-apresentacoes/',
              timeout=30)
    # Fail loudly on an error status instead of trying to parse an error page.
    res.raise_for_status()
    soup = BeautifulSoup(res.content, 'html.parser')
    div_event_proposals = soup.find('div', id='event-proposals')
    propostas = div_event_proposals.find_all(attrs={'class': 'panel-body'})
    for p in propostas:
        titulo = p.h3.a.text.strip()
        # Strip the author just like the title, so stray whitespace in the
        # markup cannot make one author look like several distinct ones.
        autor = p.find('p', attrs={'class': 'proposal-metadata'}).a.text.strip()
        yield autor, titulo
# Sort accent- and case-insensitively by author, then by title.  The raw
# author string is the second key component so that entries sharing exactly
# the same author are always adjacent, which groupby() below relies on
# (it only merges consecutive items with equal keys).
propostas = sorted(
    ler_propostas(),
    key=lambda p: (unidecode(p[0]).lower(), p[0], unidecode(p[1]).lower()),
)

# One (author, [titles]) entry per author, in the sorted order above.
propostas_por_autor = [
    (autor, [titulo for _, titulo in grupo])
    for autor, grupo in groupby(propostas, key=lambda p: p[0])
]

print('#### PROPOSTAS POR AUTOR ####\n')
for autor, titulos in propostas_por_autor:
    print(autor)
    for titulo in titulos:
        print(' ', titulo)

# (count, author) pairs for authors with more than one proposal; reverse
# tuple ordering puts the highest counts first.
autores_com_mais_de_uma_proposta = sorted(
    [(len(titulos), autor)
     for autor, titulos in propostas_por_autor
     if len(titulos) > 1],
    reverse=True,
)

print('\n#### AUTORES COM MAIS DE UMA PROPOSTA ####\n')
for num, autor in autores_com_mais_de_uma_proposta:
    print(num, autor)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment