Skip to content

Instantly share code, notes, and snippets.

@nruigrok
Created January 28, 2020 19:41
Show Gist options
  • Save nruigrok/e3bcae831d9da99d6635572dbda84abc to your computer and use it in GitHub Desktop.
Save nruigrok/e3bcae831d9da99d6635572dbda84abc to your computer and use it in GitHub Desktop.
def scrape_pb(url):
    """Fetch one publication page and extract its article fields.

    The page is downloaded from ``URL_ROOT + url`` and parsed with lxml.
    Progress (the full URL, the headline and the collected ``<time>``
    values) is printed to stdout while scraping.

    Args:
        url: Path that is appended to ``URL_ROOT`` to form the page address.

    Returns:
        A dict with the keys ``"headline"``, ``"text"``, ``"date"``,
        ``"medium"`` and ``"url"`` (the full URL), or ``None`` when the page
        contains a ``div.alert.alert-info`` element and is therefore skipped.
        ``"headline"`` is ``None`` when the headline lookup fails.

    Raises:
        KeyError: If the page has neither a ``Datum vergadering`` nor a
            ``datePublished`` ``<time>`` element.
        ValueError: If a date string does not match the expected format, or
            if the page does not contain exactly one ``div.agendapunt``.
    """
    url = URL_ROOT + url
    print(url)
    page = requests.get(url)
    tree = html.fromstring(page.text)

    # Pages carrying an info alert are skipped. The original used `continue`
    # here, which is a SyntaxError outside a loop; returning None lets the
    # caller skip the page instead.
    # NOTE(review): presumably the alert marks a missing/unavailable
    # document - confirm against the site.
    if get_css(tree, "div.alert.alert-info"):
        return None

    medium = get_css(tree, "h1.resultheader-publicatietype")

    # A missing headline is tolerated (best effort), but the name must still
    # be bound so building the result below cannot raise NameError.
    # get_css is defined elsewhere, so its exact exception type is not
    # visible here; `Exception` keeps KeyboardInterrupt/SystemExit alive.
    try:
        headline = get_css(tree, "h1.title")
    except Exception:
        headline = None
    else:
        print(headline)

    # Map each <time> element's itemprop attribute to its datetime attribute.
    times = {t.get("itemprop"): t.get("datetime") for t in tree.cssselect("time")}
    print(times)

    # The two itemprops use different date formats; the meeting date wins
    # when both are present.
    if "Datum vergadering" in times:
        date = datetime.strptime(times["Datum vergadering"], "%Y-%m-%d")
    else:
        date = datetime.strptime(times["datePublished"], "%d-%m-%Y %H:%M:%S")

    # Single-target unpacking enforces exactly one agenda-item container.
    content, = tree.cssselect("div.agendapunt")
    body = content.text_content()

    return {
        "headline": headline,
        "text": body,
        "date": date,
        "medium": medium,
        "url": url,
    }
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment