Skip to content

Instantly share code, notes, and snippets.

@nruigrok
Created January 28, 2020 19:17
Show Gist options
  • Save nruigrok/27cbdf2648741b7362d2b768059c9694 to your computer and use it in GitHub Desktop.
Save nruigrok/27cbdf2648741b7362d2b768059c9694 to your computer and use it in GitHub Desktop.
def scrape_pb(url):
    """Fetch one page below ``URL_ROOT`` and extract its basic fields.

    The page is downloaded with ``requests`` and parsed with
    ``html.fromstring``; the medium, headline, date and body text are then
    read from the parsed tree. The full URL, the headline (when found), the
    collected ``<time>`` values and the chosen date string are printed as
    progress output.

    NOTE(review): the extracted values are neither returned nor stored
    here, so the function returns ``None`` -- the snippet looks unfinished;
    confirm what callers expect before adding a return value.

    Args:
        url: Path appended to ``URL_ROOT`` to form the address to fetch.

    Raises:
        ValueError: If no date is available under either the
            ``"Datum vergadering"`` or the ``"DatePublished"`` itemprop,
            if the date string does not match ``%Y-%m-%d``, or if the page
            does not contain exactly one ``div.agendapunt`` element.
    """
    url = URL_ROOT + url
    print(url)
    page = requests.get(url)
    tree = html.fromstring(page.text)
    medium = get_css(tree, "h1.resultheader-publicatietype")

    # The headline is optional: a page without one is still processed.
    # ``Exception`` (rather than a bare ``except``) keeps that best-effort
    # behaviour without swallowing KeyboardInterrupt/SystemExit.
    try:
        headline = get_css(tree, "h1.title")
    except Exception:
        headline = None
    else:
        print(headline)

    # Map each <time> element's itemprop to its datetime attribute.
    times = {
        t.get("itemprop"): t.get("datetime")
        for t in tree.cssselect("time")
    }
    print(times)

    # Prefer the meeting date and fall back to the publication date.
    # ``.get`` makes the fallback work when the first key is missing
    # entirely, not only when it is present but empty.
    date = times.get("Datum vergadering") or times.get("DatePublished")
    if not date:
        raise ValueError("could not get date")
    print(date)
    # Parse only after a date string was found, so a missing date surfaces
    # as the error above instead of a failure inside strptime.
    date = datetime.strptime(date, "%Y-%m-%d")

    # Single-element unpacking fails unless exactly one element matches.
    content, = tree.cssselect("div.agendapunt")
    body = content.text_content()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment