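"""Process traffic decisions.

Parse traffic decision HTML files, extract the speed-limit measures they
mention and export the result as a CSV file.
"""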
|
|
|
from pathlib import Path |
|
import re |
|
from bs4 import BeautifulSoup as bs |
|
import pandas as pd |
|
|
|
|
|
# Sign code with the speed attached, e.g. "A01-30", "A0130" or "A 1-50";
# captures the two-digit speed.
PATTERN_KM_SIGN = r'A\s?0?1[\-]?([0-9]{2})'

# Sign code followed by a zone speed, e.g. "A01 (zone 30)"; captures the speed.
PATTERN_KM_ZONE = r'A\s?0?1\s?\(zone ([0-9]{2})\)'

# A two- or three-digit number followed by "km" or "ki" (presumably the start
# of "kilometer"); captures the number.  The negative lookbehind rejects
# numbers that are the tail of a longer number without consuming a character,
# so a speed at the very start of a line ("30 km/u") or directly after a
# previous match is found as well.
PATTERN_KM = r'(?<![0-9])([0-9]{2,3})\s?k[mi]'

# Bare mention of the sign code, e.g. "A1", "a01" or "A.1".
PATTERN_SIGN = r'[Aa]\s?\.?0?1'

# Directory that holds the traffic decision HTML files.
DIR_HTML = Path('../data/html')

# Dutch terms indicating that a line is about a maximum speed
# (zone, zonal, maximum speed, speed limit, maximum permitted speed).
TERMS_MAX_SPEED = [
    'zone',
    'zonale',
    'maximum snelheid',
    'maximumsnelheid',
    'snelheidslimiet',
    'maximum toegestane snelheid',
]

# Dutch terms indicating that a measure is being introduced
# (placed, place, placement, establish, institute, extend, provide, apply).
INTRODUCED = [
    'geplaatst',
    'plaatsen',
    'plaatsing',
    'vaststellen',
    'instellen',
    'uitbreiden',
    'voorzien',
    'aanbrengen',
    'in te stellen',
]

# Dutch terms indicating that a measure is being cancelled
# (remove, withdraw, lift).
CANCELLED = [
    'verwijder',
    'intrekken',
    'opheffen',
]

# Lower-cased decision type labels for temporary measures
# ("temporary traffic measure of shorter/longer duration than 4 months").
UNDER_4M = 'tijdelijke verkeersmaatregel van kortere duur dan 4 maanden'
OVER_4M = 'tijdelijke verkeersmaatregel van langere duur dan 4 maanden'
|
|
|
|
|
def decision_starts(line):
    """Tell whether `line` marks the beginning of the decision section.

    A line qualifies when it starts with the spaced-out heading
    'B E S L U I', or when, reduced to its lower-cased letters only, it
    either starts with one of the known opening phrases or is exactly one
    of the known heading words.

    Returns:
        bool: True when the line announces the decision, False otherwise.
    """
    if line.startswith('B E S L U I'):
        return True

    # Drop spaces, digits and punctuation so that e.g. "Besluit:" and
    # "BESLUITEN tot ..." compare equal to their bare lower-case letters.
    letters = ''.join(filter(str.isalpha, line)).lower()

    if letters.startswith(('besluittot', 'besluitentot', 'besluitenwij')):
        return True
    return letters in ('besluit', 'besluiten', 'hetbesluit')
|
|
|
|
|
def process_line(line):
    """Extract the speed measure described by a single line of text.

    Speeds attached to a sign code (PATTERN_KM_SIGN, PATTERN_KM_ZONE) are
    always collected.  Free-standing "NN km" speeds (PATTERN_KM) are only
    collected when the line also mentions the sign code (PATTERN_SIGN) or
    one of the TERMS_MAX_SPEED terms.

    Returns:
        None when no speed was found, otherwise a tuple
        ``(speeds, introduced, cancelled)`` where ``speeds`` is a set of
        speed strings and the two flags tell whether any INTRODUCED /
        CANCELLED term occurs in the line (case-insensitive).
    """
    lowered = line.lower()

    found = re.findall(PATTERN_KM_SIGN, line) + re.findall(PATTERN_KM_ZONE, line)

    # Only trust a bare "NN km" when the line is clearly about a speed limit.
    about_speed = (
        re.search(PATTERN_SIGN, line) is not None
        or any(term in lowered for term in TERMS_MAX_SPEED)
    )
    if about_speed:
        found += re.findall(PATTERN_KM, line)

    if not found:
        return None

    introduced = any(term.lower() in lowered for term in INTRODUCED)
    cancelled = any(term.lower() in lowered for term in CANCELLED)
    return set(found), introduced, cancelled
|
|
|
|
|
def parse_html(html):
    """Extract the relevant information from a decision's HTML.

    Args:
        html: HTML source of a single traffic decision, as a string.

    Returns:
        dict with the keys:
            'authority': content of the 'OVERHEID.authority' meta tag, or
                None when that tag is absent.
            'datetime': `datetime` attribute of the last <time> element, or
                None when the document has no <time> element.
            'decision_types': contents of all 'Type verkeersbesluit' meta
                tags joined with ';' ('' when there are none).
            'coords': content of the meta tag whose data-scheme is
                'OVERHEID.EPSG28992', or None when that tag is absent.
            'measures': list of ``(line, process_line(line))`` pairs for the
                lines in which process_line found a speed.
    """
    soup = bs(html, 'lxml')

    lines = soup.text.split('\n')

    # Use the LAST line that announces a decision; when none does, the
    # whole text is scanned (index 0).
    start_idx = 0
    for idx, line in enumerate(lines):
        if decision_starts(line):
            start_idx = idx

    # Skip every line that starts with "dat" (presumably the "dat ..."
    # consideration clauses - TODO confirm).
    candidates = [
        line for line in lines[start_idx:]
        if not line.lower().startswith('dat')
    ]
    measures = []
    for line in candidates:
        result = process_line(line)
        if result:
            measures.append((line, result))

    authority = soup.find('meta', {'name': 'OVERHEID.authority'})
    if authority:
        authority = authority.get('content')

    # Default to None so a document without <time> elements no longer
    # raises UnboundLocalError at the return statement below.
    dt = None
    for time_tag in soup.find_all('time'):
        dt = time_tag.get('datetime')

    type_tags = soup.find_all('meta', {'name': 'Type verkeersbesluit'})
    # Joining an empty result yields '', matching the "no type" case.
    decision_types = ';'.join([tag.get('content') for tag in type_tags])

    coords = soup.find('meta', {'data-scheme': 'OVERHEID.EPSG28992'})
    if coords:
        coords = coords.get('content')

    return {
        'authority': authority,
        'datetime': dt,
        'decision_types': decision_types,
        'coords': coords,
        'measures': measures,
    }
|
|
|
|
|
def parse_dt(dt, to='month'):
    """Reduce a 'd-m-Y[ time]' string to its year or its 'Y-mm' month.

    Args:
        dt: date string whose first space-separated part has the form
            day-month-year, or a null value.
        to: 'year' to get the year; any other value yields the month.

    Returns:
        None for null input, the year part when ``to == 'year'``, otherwise
        'year-month' with a single-digit month zero-padded.
    """
    if pd.isnull(dt):
        return None

    date_part = dt.split(' ')[0]
    _day, month, year = date_part.split('-')

    if to == 'year':
        return year

    if len(month) == 1:
        month = '0' + month
    return '-'.join((year, month))
|
|
|
|
|
def is_temporary(decision_type):
    """Classify a decision type string by its temporary-measure label.

    Returns:
        'under_4m' when the lower-cased text contains UNDER_4M,
        'over_4m' when it contains OVER_4M (and not UNDER_4M),
        'not_temp' otherwise, including for non-string input.
    """
    if isinstance(decision_type, str):
        lowered = decision_type.lower()
        if UNDER_4M in lowered:
            return 'under_4m'
        if OVER_4M in lowered:
            return 'over_4m'
    return 'not_temp'
|
|
|
|
|
def count_measures(measures):
    """Count the number of measures per speed, by type.

    Args:
        measures: iterable of ``(line, (speeds, introduced, cancelled))``
            pairs; the line itself is ignored.

    Returns:
        dict mapping each speed to ``{'introduced': n, 'cancelled': m}``,
        the number of measures mentioning that speed that were flagged as
        introduced respectively cancelled.
    """
    tally = {}
    for _line, (speeds, introduced, cancelled) in measures:
        for speed in speeds:
            # Every mentioned speed gets an entry, even when neither flag
            # is set for this measure.
            entry = tally.setdefault(speed, {'introduced': 0, 'cancelled': 0})
            if introduced:
                entry['introduced'] += 1
            if cancelled:
                entry['cancelled'] += 1
    return tally
|
|
|
|
|
def create_df():
    """Create a dataframe with one row per decision HTML file in DIR_HTML.

    Each row holds the fields returned by parse_html, an
    'introduced_<speed>' and 'cancelled_<speed>' count for every speed
    found in that decision, the source 'file_name', and the derived
    'month', 'year' and 'temp' columns.

    Returns:
        pandas.DataFrame: the decisions, ordered by file path.
    """
    decisions = []

    # glob() order depends on the file system; sort so that the row order
    # (and hence the exported index) is the same on every run.
    for path in sorted(DIR_HTML.glob('*.html')):
        # NOTE(review): read_text() decodes with the platform default
        # encoding - confirm the files' encoding and pass it explicitly.
        decision = parse_html(path.read_text())

        for speed, counts in count_measures(decision['measures']).items():
            decision['introduced_{}'.format(speed)] = counts['introduced']
            decision['cancelled_{}'.format(speed)] = counts['cancelled']

        decision['file_name'] = path.name
        decisions.append(decision)

    df = pd.DataFrame(decisions)
    df['month'] = df['datetime'].apply(parse_dt)
    df['year'] = df['datetime'].map(lambda value: parse_dt(value, 'year'))
    df['temp'] = df['decision_types'].apply(is_temporary)
    return df
|
|
|
|
|
def main():
    """Build the decisions dataframe and write it to the processed CSV."""
    create_df().to_csv('../data/processed/verkeersbesluiten.csv')
|
|
|
|
|
# Run the export only when executed as a script, not when imported.
if __name__ == '__main__':
    main()