Scrape verkeersbesluiten (traffic decisions)

Background here

To run the code, create a project directory with subdirectories script and data; in the data subdirectory, create subdirectories html and processed. Store the Python files in the script subdirectory and run them from there.
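
If you prefer, the layout can also be created from Python; a minimal sketch, to be run once from the project directory:

from pathlib import Path

# Create the directory layout the scripts expect
for subdir in ['script', 'data/html', 'data/processed']:
    Path(subdir).mkdir(parents=True, exist_ok=True)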

Further note that the START_URL in download.py determines what content will be downloaded. You may want to inspect the page in a browser before running the script, to make sure you're getting the results you're after. In the browser you can apply filters to your results and then copy the url into download.py. Note, however, that filters use tags that may not be present in old traffic decisions; a safer approach may therefore be to simply use a search term.
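
For example, the search-term START_URL in download.py below can be parameterised; a minimal sketch, assuming the site's query syntax matches the example urls and that SEARCH_TERM needs no further url-encoding:

SEARCH_TERM = '30km'
START_URL = ('https://zoek.officielebekendmakingen.nl/resultaten'
             '?q=(c.product-area==%22officielepublicaties%22)'
             'and((w.publicatienaam==%22Staatscourant%22))'
             'and(cql.textAndIndexes=%22{}%22)'
             '&col=Staatscourant').format(SEARCH_TERM)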

"""Download traffic decisions from officielebekendmakingen.nl"""
from pathlib import Path
import requests
from bs4 import BeautifulSoup as bs
BASE_URL = 'https://zoek.officielebekendmakingen.nl'
START_URL = 'https://zoek.officielebekendmakingen.nl/resultaten?q=(c.product-area==%22officielepublicaties%22)and(dt.available%253e=%222016-01-01%22)and((w.publicatienaam==%22Staatscourant%22))%20AND%20w.verkeersbordcode==%22A1%22&zv=&pg=10&col=Staatscourant&svel=Publicatiedatum&svol=Aflopend&sf=vb|A1'
START_URL = 'https://zoek.officielebekendmakingen.nl/resultaten?q=(c.product-area==%22officielepublicaties%22)and((w.publicatienaam==%22Staatscourant%22))and((cql.textAndIndexes=%2230+km%22+or+cql.textAndIndexes=%2230km%22))&zv=%252230+km%2522+OR++%252230km%2522&col=Staatscourant'
DIR_HTML = Path('../data/html')
def extract_page_urls(soup):
    """Extract urls of web pages containing a decision"""
    links = soup.find_all('a', {'id': 'publicatieHyperLink'})
    return [BASE_URL + l.get('href') for l in links]


def extract_next_link(soup):
    """Extract link to next page with results"""
    next_link = soup.find('a', {'aria-label': 'Next'})
    try:
        return BASE_URL + next_link.get('href')
    except (AttributeError, TypeError):
        # No 'Next' anchor on the last results page
        return None
def collect_all_page_urls():
    """Go through all result pages to extract page urls"""
    next_link = START_URL
    page_urls = []
    while next_link:
        html = requests.get(next_link).text
        soup = bs(html, 'lxml')
        page_urls.extend(extract_page_urls(soup))
        next_link = extract_next_link(soup)
    return page_urls


def download(page_urls):
    """Download and save all html pages from page urls"""
    for url in page_urls:
        name = url.split('/')[-1]
        path = DIR_HTML / name
        html = requests.get(url).text
        path.write_text(html)
if __name__ == '__main__':
    downloaded = [p.name for p in DIR_HTML.glob('*.html')]
    print('collecting page urls')
    page_urls = collect_all_page_urls()
    # Skip pages that have already been downloaded
    page_urls = [u for u in page_urls if u.split('/')[-1] not in downloaded]
    print('downloading {} pages'.format(len(page_urls)))
    download(page_urls)
"""Process traffic decisions"""
from pathlib import Path
import re
from bs4 import BeautifulSoup as bs
import pandas as pd
PATTERN_KM_SIGN = r'A\s?0?1[\-]?([0-9]{2})'
PATTERN_KM_ZONE = r'A\s?0?1\s?\(zone ([0-9]{2})\)'
PATTERN_KM = r'[^0-9]([0-9]{2,3})\s?k[mi]'
PATTERN_SIGN = r'[Aa]\s?\.?0?1'
DIR_HTML = Path('../data/html')
TERMS_MAX_SPEED = [
'zone',
'zonale',
'maximum snelheid',
'maximumsnelheid',
'snelheidslimiet',
'maximum toegestane snelheid'
]
INTRODUCED = [
'geplaatst',
'plaatsen',
'plaatsing',
'vaststellen',
'instellen',
'uitbreiden',
'voorzien',
'aanbrengen',
'in te stellen'
]
CANCELLED = [
'verwijder',
'intrekken',
'opheffen'
]
UNDER_4M = 'tijdelijke verkeersmaatregel van kortere duur dan 4 maanden'
OVER_4M = 'tijdelijke verkeersmaatregel van langere duur dan 4 maanden'
def decision_starts(line):
    """Check if line announces start of decision"""
    if line.startswith('B E S L U I'):
        return True
    line = ''.join([c for c in line if c.isalpha()]).lower()
    for string in ['besluittot', 'besluitentot', 'besluitenwij']:
        if line.startswith(string):
            return True
    return line in ['besluit', 'besluiten', 'hetbesluit']


def process_line(line):
    """Extract speed decision from sentence"""
    speeds = re.findall(PATTERN_KM_SIGN, line)
    speeds.extend(re.findall(PATTERN_KM_ZONE, line))
    if re.findall(PATTERN_SIGN, line):
        speeds.extend(re.findall(PATTERN_KM, line))
    else:
        # Accept a bare speed only if the line also mentions a speed limit
        for term in TERMS_MAX_SPEED:
            if term in line.lower():
                speeds.extend(re.findall(PATTERN_KM, line))
                break
    if not speeds:
        return None
    introduced = any(term in line.lower() for term in INTRODUCED)
    cancelled = any(term in line.lower() for term in CANCELLED)
    return set(speeds), introduced, cancelled
def parse_html(html):
    """Extract relevant info from html"""
    soup = bs(html, 'lxml')
    lines = soup.text.split('\n')
    # Skip everything before the last line that announces the decision
    decision_start_idx = 0
    for i, line in enumerate(lines):
        if decision_starts(line):
            decision_start_idx = i
    lines = lines[decision_start_idx:]
    # Drop subordinate clauses starting with 'dat'
    lines = [l for l in lines if not l.lower().startswith('dat')]
    measures = [(l, process_line(l)) for l in lines]
    measures = [m for m in measures if m[1]]
    authority = soup.find('meta', {'name': 'OVERHEID.authority'})
    if authority:
        authority = authority.get('content')
    # Keep the last datetime on the page; None if there is no time element
    dt = None
    for t in soup.find_all('time'):
        dt = t.get('datetime')
    decision_types = soup.find_all('meta', {'name': 'Type verkeersbesluit'})
    if decision_types:
        decision_types = ';'.join([d.get('content') for d in decision_types])
    else:
        decision_types = ''
    coords = soup.find('meta', {'data-scheme': 'OVERHEID.EPSG28992'})
    if coords:
        coords = coords.get('content')
    return {
        'authority': authority,
        'datetime': dt,
        'decision_types': decision_types,
        'coords': coords,
        'measures': measures,
    }
def parse_dt(dt, to='month'):
    """Convert dt to month or year; assumes a 'D-M-YYYY' date"""
    if pd.isnull(dt):
        return None
    dt = dt.split(' ')[0]
    _, month, year = dt.split('-')
    if to == 'year':
        return year
    if len(month) == 1:
        month = '0{}'.format(month)
    return '{}-{}'.format(year, month)


def is_temporary(decision_type):
    """Check if decision is marked as temporary"""
    if not isinstance(decision_type, str):
        return 'not_temp'
    if UNDER_4M in decision_type.lower():
        return 'under_4m'
    if OVER_4M in decision_type.lower():
        return 'over_4m'
    return 'not_temp'


def count_measures(measures):
    """Count the number of measures, by type"""
    item = {}
    for _, (speeds, introduced, cancelled) in measures:
        for speed in speeds:
            if speed not in item:
                item[speed] = {'introduced': 0, 'cancelled': 0}
            if introduced:
                item[speed]['introduced'] += 1
            if cancelled:
                item[speed]['cancelled'] += 1
    return item
def create_df():
    """Create dataframe containing decisions"""
    decisions = []
    for path in DIR_HTML.glob('*.html'):
        html = path.read_text()
        decision = parse_html(html)
        measure_counts = count_measures(decision['measures'])
        for speed, value in measure_counts.items():
            decision['introduced_{}'.format(speed)] = value['introduced']
            decision['cancelled_{}'.format(speed)] = value['cancelled']
        decision['file_name'] = path.name
        decisions.append(decision)
    df = pd.DataFrame(decisions)
    df['month'] = df.datetime.apply(parse_dt)
    df['year'] = df.datetime.map(lambda x: parse_dt(x, 'year'))
    df['temp'] = df.decision_types.apply(is_temporary)
    return df


def main():
    df = create_df()
    df.to_csv('../data/processed/verkeersbesluiten.csv')


if __name__ == '__main__':
    main()
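
As a quick sanity check of the line parser, you can feed process_line a made-up decision sentence; the expected output follows from the regexes and term lists above:

# Made-up example: 'by placing signs A1 (zone 30)'
result = process_line('door het plaatsen van borden A1 (zone 30)')
print(result)  # ({'30'}, True, False): zone 30 introduced, not cancelled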