Created
March 20, 2025 15:57
-
-
Save jgarciab/8011e168ec2cc280be974c0f4ded1002 to your computer and use it in GitHub Desktop.
Python scraper for the website officielebekendmakingen.nl with all policy documents in the Netherlands
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from urllib.parse import urljoin

import requests
import bs4 as bs
import pandas as pd
#from tqdm.notebook import tqdm
from tqdm import tqdm
def query_overheid(query="samenredzaamheid", max_docs=10000):
    """Fetch the officielebekendmakingen.nl search-results page for *query*.

    Parameters
    ----------
    query : str
        Search term, interpolated into both the CQL filter and the ``zv``
        display parameter of the results URL.
    max_docs : int
        Passed as the ``pg`` URL parameter (results-per-page on this site)
        so all hits land on a single page.

    Returns
    -------
    bs4.BeautifulSoup or None
        Parsed results page, or None if the HTTP request failed.
    """
    # BUG FIX: the CQL clause previously hard-coded "samenredzaamheid",
    # silently ignoring the `query` argument for the actual search filter.
    url = (
        "https://zoek.officielebekendmakingen.nl/resultaten"
        "?q=(c.product-area==%22officielepublicaties%22)"
        f"and(cql.textAndIndexes=%22{query}%22)"
        f"&zv={query}&pg={max_docs}&col=&svel=Kenmerkendedatum&svol=Aflopend"
    )
    r = requests.get(url)
    if not r.ok:
        # Best-effort: report and return None rather than raising,
        # matching the original behavior.
        print("Error in query")
        return None
    # Name the parser explicitly to avoid bs4's GuessedAtParserWarning and
    # parser-dependent output across installs.
    return bs.BeautifulSoup(r.text, "html.parser")
def parse_block(block):
    """Extract metadata from one search-result ``<li>`` element.

    Parameters
    ----------
    block : bs4.Tag
        A ``<li>`` from the "Publicaties" results list, expected to contain
        an ``h2.result--title`` link, a ``p > a.result--subtitle``, a
        ``dl.dl--publication`` key/value list and a ``ul.result--actions``
        download link.

    Returns
    -------
    dict
        Keys: title, url, subtitle, publication_info (dict), pdf_link.
    """
    base = "https://zoek.officielebekendmakingen.nl/"

    # Title and its (relative) URL from the h2 element
    title_link = block.find("h2", class_="result--title").find("a")
    title = title_link.get_text(strip=True)
    href = title_link["href"]

    # Subtitle from the p element
    subtitle = (
        block.find("p")
        .find("a", class_="result--subtitle")
        .get_text(strip=True)
    )

    # Publication details (dt/dd pairs) from the dl element
    publication_info = {}
    dl = block.find("dl", class_="dl dl--publication")
    for item in dl.find_all("div"):
        key = item.find("dt").get_text(strip=True)
        value = item.find("dd").get_text(strip=True)
        publication_info[key] = value

    # PDF download link from the ul element
    pdf_href = block.find("ul", class_="result--actions").find("a")["href"]

    # BUG FIX: use urljoin instead of string concatenation — plain "+"
    # produced "...nl//path" for root-relative hrefs and mangled absolute
    # hrefs; urljoin is identical for plain relative hrefs.
    return {
        "title": title,
        "url": urljoin(base, href),
        "subtitle": subtitle,
        "publication_info": publication_info,
        "pdf_link": urljoin(base, pdf_href),
    }
# --- Main script --------------------------------------------------------
# Get HTML with all results
html = query_overheid(query="samenredzaamheid")

# Find publications
publications = html.find("div", {"id": "Publicaties"})
all_publications = publications.find_all("li")
# This number will be double the one on the website (each result renders
# as more than one <li>; short decorative items are filtered out below).
print(len(all_publications))

# Extract the metadata of the publications as JSON
all_json_pubs = []
for publication in all_publications:
    # Skip empty/decorative <li> elements — real results have longer text.
    if len(publication.text) < 20:
        continue
    all_json_pubs.append(parse_block(publication))
all_json_pubs = pd.DataFrame(all_json_pubs)
print(len(all_json_pubs))

# Extract the text of each publication page
all_json_text = []
for url in tqdm(all_json_pubs["url"]):
    r = requests.get(url)
    # Use a distinct name (`page`) instead of reusing/shadowing `html`,
    # and name the parser explicitly to avoid GuessedAtParserWarning.
    page = bs.BeautifulSoup(r.text, "html.parser")
    if page.find("div", {"class": "alert__inner"}) is not None:
        # An alert box means the document is only available as a PDF.
        print(url)
        all_json_text.append({"url": url, "text": "Only as PDF"})
    else:
        pub = page.find("div", {"id": "broodtekst"})
        all_json_text.append({"url": url, "text": pub.text})
all_json_text = pd.DataFrame(all_json_text)

# Merge metadata with text on the shared "url" column (explicit key rather
# than relying on pandas' implicit column inference) and save to disk.
all_data = pd.merge(all_json_pubs, all_json_text, on="url")
all_data.to_parquet("all_data.parquet", index=False)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment