Skip to content

Instantly share code, notes, and snippets.

@jgarciab
Created March 20, 2025 15:57
Show Gist options
  • Select an option

  • Save jgarciab/8011e168ec2cc280be974c0f4ded1002 to your computer and use it in GitHub Desktop.

Select an option

Save jgarciab/8011e168ec2cc280be974c0f4ded1002 to your computer and use it in GitHub Desktop.
Python scraper for the website officielebekendmakingen.nl with all policy documents in the Netherlands
from urllib.parse import quote

import requests
import bs4 as bs
import pandas as pd
#from tqdm.notebook import tqdm
from tqdm import tqdm
def query_overheid(query="samenredzaamheid", max_docs=10000):
    """Search zoek.officielebekendmakingen.nl and return the parsed results page.

    Parameters
    ----------
    query : str
        Search term. Used both in the ``cql.textAndIndexes`` expression (the
        part the server actually filters on) and the ``zv`` display parameter.
    max_docs : int
        Page size requested via the ``pg`` parameter.

    Returns
    -------
    bs4.BeautifulSoup or None
        Parsed HTML of the results page, or ``None`` when the HTTP request
        failed (an error message is printed in that case).
    """
    # BUG FIX: the original hard-coded "samenredzaamheid" inside the cql
    # expression, so a different `query` only changed the cosmetic zv
    # parameter and the results stayed those of the default term.
    # quote() also makes terms with spaces / special characters URL-safe.
    term = quote(query)
    url = (
        "https://zoek.officielebekendmakingen.nl/resultaten"
        f"?q=(c.product-area==%22officielepublicaties%22)"
        f"and(cql.textAndIndexes=%22{term}%22)"
        f"&zv={term}&pg={max_docs}&col=&svel=Kenmerkendedatum&svol=Aflopend"
    )
    r = requests.get(url)
    if not r.ok:
        print("Error in query")
        return None  # explicit, same as the original's implicit None
    # Explicit parser avoids bs4's "no parser specified" warning and keeps
    # parsing reproducible across environments.
    return bs.BeautifulSoup(r.text, "html.parser")
def parse_block(block):
    """Extract the metadata of a single search-result list item.

    Returns a dict with the title, absolute document URL, subtitle,
    publication details (dt/dd pairs) and an absolute PDF download link.
    """
    base = "https://zoek.officielebekendmakingen.nl/"

    # Title and document URL both live in the anchor inside the h2 heading.
    anchor = block.find("h2", class_="result--title").find("a")

    # Subtitle anchor sits inside the first paragraph of the block.
    subtitle = (
        block.find("p").find("a", class_="result--subtitle").get_text(strip=True)
    )

    # Each <div> in the publication <dl> holds one <dt> (label) / <dd> (value) pair.
    detail_list = block.find("dl", class_="dl dl--publication")
    publication_info = {
        row.find("dt").get_text(strip=True): row.find("dd").get_text(strip=True)
        for row in detail_list.find_all("div")
    }

    # First anchor of the actions list is the PDF download link.
    pdf_href = block.find("ul", class_="result--actions").find("a")["href"]

    return {
        "title": anchor.get_text(strip=True),
        "url": base + anchor["href"],
        "subtitle": subtitle,
        "publication_info": publication_info,
        "pdf_link": base + pdf_href,
    }
# Fetch the results page with all hits for the search term.
html = query_overheid(query="samenredzaamheid")
if html is None:
    # query_overheid prints its own error; stop instead of crashing with
    # an AttributeError on html.find below.
    raise SystemExit("Query failed; no results page to parse.")

# Find the publication list items.
publications = html.find("div", {"id": "Publicaties"})
all_publications = publications.find_all("li")
# This number will be double the one shown on the website (nested <li>s).
print(len(all_publications))

# Extract the metadata of the publications as records.
all_json_pubs = []
for publication in all_publications:
    # Skip decorative / near-empty list items.
    if len(publication.text) < 20:
        continue
    all_json_pubs.append(parse_block(publication))
all_json_pubs = pd.DataFrame(all_json_pubs)
print(len(all_json_pubs))

# Download each publication page and extract its body text.
all_json_text = []
for url in tqdm(all_json_pubs["url"]):
    r = requests.get(url)
    # `page`, not `html`: avoid clobbering the results-page soup above.
    # Explicit parser: reproducible and silences bs4's warning.
    page = bs.BeautifulSoup(r.text, "html.parser")
    if page.find("div", {"class": "alert__inner"}) is not None:
        # An alert box means the document is only available as a PDF.
        print(url)
        all_json_text.append({"url": url, "text": "Only as PDF"})
    else:
        pub = page.find("div", {"id": "broodtekst"})
        all_json_text.append({"url": url, "text": pub.text})
all_json_text = pd.DataFrame(all_json_text)

# Merge metadata with the body text on the shared "url" column (made
# explicit: the original relied on implicit common-column detection)
# and save to disk.
all_data = pd.merge(all_json_pubs, all_json_text, on="url")
all_data.to_parquet("all_data.parquet", index=None)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment