Skip to content

Instantly share code, notes, and snippets.

@Atala
Last active May 11, 2016
Embed
What would you like to do?
#!/usr/bin/env python3
import re
import requests
from bs4 import BeautifulSoup as BS
# Scraped doléances, keyed by the date taken from each page title.
db = {}

BASE_URL = 'https://wiki.nuitdebout.fr'
# Index page whose navigation box links to the individual doléance pages.
INDEX_URL = BASE_URL + '/wiki/Villes/Paris/Cahiers_de_dol%C3%A9ances_et_d%27exigences/Dol%C3%A9ances'
# The navigation box also links to this page, which is not a doléance page.
PARENT_URL = BASE_URL + '/wiki/Villes/Paris/Cahiers_de_dol%C3%A9ances_et_d%27exigences'
# Seconds to wait for the wiki before giving up on a request.
REQUEST_TIMEOUT = 30

REGEX_FLAGS = re.DOTALL | re.UNICODE
# One numbered entry: "<number> <dash> <text>", ending just before the next
# "<number> <dash>" marker. The terminator is a lookahead so that it is NOT
# consumed; consuming it made every second entry invisible to findall().
# NOTE(review): the "|" inside [–|-] is a literal pipe, probably unintended;
# it is kept so that the set of accepted separators does not change.
ITEM_RE = re.compile(r'\d+\s{0,1}[–|-](.*?)(?=\d*\s{1}[–|-])', REGEX_FLAGS)
# Tail of the pattern for the final entry, which has no following marker and
# is instead delimited by the " mDoléances du ..." text (presumably the start
# of the page's navigation footer -- confirm against a live page).
LAST_ITEM_TAIL = r'\s{0,1}[–|-](.*?) mDoléances du ...'
# Trailing "(...)" group of a page title.
TITLE_DATE_RE = re.compile(r'\(([^()]*)\)\s*$')


def date_from_title(title):
    """Return the date written in parentheses at the end of *title*.

    Titles are expected to look like
    "Doléances du 44 mars (13 avril 2016)", for which "13 avril 2016" is
    returned. The date may have any length (e.g. "1 mai 2016"). When the
    title has no trailing parenthesised part, the stripped title itself is
    returned so the caller still gets a usable key.
    """
    match = TITLE_DATE_RE.search(title)
    if match:
        return match.group(1).strip()
    return title.strip()


def extract_doleances(text):
    """Split the plain text of a doléance page into its numbered entries.

    Entries are introduced by "<number> -" (or an en dash). Every entry that
    is followed by another marker is found with ITEM_RE; the final one is
    looked up separately by its expected number. Returns the stripped entry
    texts in page order, or an empty list when nothing matches.
    """
    entries = [entry.strip() for entry in ITEM_RE.findall(text)]
    # (?<!\d) keeps e.g. "1 -" from matching inside "11 -".
    last_pattern = r'(?<!\d)' + str(len(entries) + 1) + LAST_ITEM_TAIL
    last_match = re.search(last_pattern, text, REGEX_FLAGS)
    # The final entry is optional: a page without the footer text must not
    # raise, it simply contributes the entries found so far.
    if last_match:
        entries.append(last_match.group(1).strip())
    return entries


def fetch_soup(url, encoding=None):
    """Download *url* and return it parsed with the "html.parser" backend.

    The raw bytes are handed to BeautifulSoup; *encoding*, when given, is
    passed as ``from_encoding`` (which only has an effect on bytes input).
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    return BS(response.content, "html.parser", from_encoding=encoding)


def scrape_doleances():
    """Fetch every doléance page linked from the index page.

    Returns a dict mapping the date from each page title to the list of
    entries extracted from that page. Raises RuntimeError when the index
    page does not contain the expected navigation box.
    """
    index = fetch_soup(INDEX_URL)
    navbox = index.find(class_='navbox collapsible noprint uncollapsed')
    if navbox is None:
        raise RuntimeError('navigation box not found on ' + INDEX_URL)

    results = {}
    # href=True skips anchors without a target instead of raising KeyError.
    for link in navbox.find_all('a', href=True):
        url = BASE_URL + link['href']
        # filter links that are not doleance links
        if url == PARENT_URL:
            continue
        page = fetch_soup(url, encoding='UTF-8')
        headline = page.find(class_='mw-headline')
        content = page.find(id='mw-content-text')
        if headline is None or content is None:
            continue
        entries = extract_doleances(content.text)
        if entries:
            results[date_from_title(headline.text)] = entries
    return results


if __name__ == '__main__':
    db.update(scrape_doleances())
    print(db)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment